diff --git a/.coverage b/.coverage index 5f7baa5..e2106b9 100755 Binary files a/.coverage and b/.coverage differ diff --git a/.env.example b/.env.example new file mode 100755 index 0000000..a89bb87 --- /dev/null +++ b/.env.example @@ -0,0 +1,18 @@ +# Database +PW_DB_HOST=localhost +PW_DB_PORT=5432 +PW_DB_DATABASE=pricewatch +PW_DB_USER=pricewatch +PW_DB_PASSWORD=pricewatch + +# Redis +PW_REDIS_HOST=localhost +PW_REDIS_PORT=6379 +PW_REDIS_DB=0 + +# App +PW_DEBUG=false +PW_WORKER_TIMEOUT=300 +PW_WORKER_CONCURRENCY=2 +PW_ENABLE_DB=true +PW_ENABLE_WORKER=true diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/CHANGELOG.md b/CHANGELOG.md index 685850a..90643a3 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,94 @@ Le format est basé sur [Keep a Changelog](https://keepachangelog.com/fr/1.0.0/) ## [Non publié] ### En cours -- Ajout de fixtures HTML réalistes pour tests pytest -- Tests stores/cdiscount/ -- Tests scraping/ avec mocks +- Phase 2 : Base de données PostgreSQL +- Phase 2 : Worker Redis/RQ +- Phase 3 : API REST FastAPI +- Phase 4 : Web UI + +### Ajouté +- Configuration Alembic (env.py, script.py.mako, alembic.ini) +- Migration initiale SQLAlchemy (5 tables + indexes) +- Commandes CLI DB: `init-db`, `migrate`, `upgrade`, `downgrade` +- `docker-compose.yml` PostgreSQL/Redis +- `.env.example` avec variables DB/Redis/app +- Tests DB de base (models + connection) +- Repository `ProductRepository` + `ScrapingPipeline` +- Flag CLI `--save-db/--no-db` pour la persistence +- Tests repository/pipeline (SQLite) +- Test end-to-end CLI + DB (SQLite) +- Worker RQ + scheduler (tasks + CLI) + +--- + +## [0.3.0] - 2026-01-14 🎉 PHASE 1 TERMINÉE + +### ✅ Phase 1 CLI complétée à 100% + +**Résultat final**: +- **295 tests passent** (100% de réussite) +- **76% code coverage global** +- **4 stores opérationnels** (Amazon, Cdiscount, Backmarket, AliExpress) + +### Ajouté + +#### Corrections et améliorations +- **Amazon Store**: Correction extraction images avec fallback générique +- **Amazon Store**: Support prix séparés en 2 spans (a-price-whole + a-price-fraction) + +#### Tests complets ajoutés (177 nouveaux tests) +- **tests/core/test_registry.py**: 40 tests (100% coverage) + - 24 tests unitaires avec mocks + - 16 tests d'intégration avec les 4 stores réels + - Tests de détection automatique multi-stores +- **tests/core/test_registry_integration.py**: Tests d'intégration stores + - Vérification détection correcte pour Amazon, Cdiscount, Backmarket, AliExpress + - Tests de priorité et exclusivité des matches +- **tests/core/test_io.py**: 36 tests (97% coverage) + - Tests ScrapingConfig/ScrapingOptions Pydantic + - Tests read_yaml_config avec validation erreurs + - Tests write_json_results et read_json_results + - Tests save_debug_html et save_debug_screenshot +- **tests/scraping/test_http_fetch.py**: 21 tests (100% coverage) + - Tests fetch_http avec mocks requests + - Tests codes HTTP (200, 403, 404, 429, 500+) + - Tests timeout et exceptions réseau + - Tests User-Agent rotation et headers personnalisés +- **tests/scraping/test_pw_fetch.py**: 21 tests (91% coverage) + - Tests fetch_playwright avec mocks Playwright + - Tests modes headless/headful + - Tests screenshot et wait_for_selector + - Tests fetch_with_fallback (stratégie HTTP → Playwright) + - Tests cleanup des ressources + +### Statistiques détaillées + +**Coverage par module**: +| Module | Coverage | Tests | +|--------|----------|-------| +| core/schema.py | 100% | 29 | +| core/registry.py | 100% | 40 | +| 
core/io.py | 97% | 36 | +| scraping/http_fetch.py | 100% | 21 | +| scraping/pw_fetch.py | 91% | 21 | +| stores/amazon/store.py | 89% | 33 | +| stores/aliexpress/store.py | 85% | 32 | +| stores/backmarket/store.py | 85% | 25 | +| stores/cdiscount/store.py | 72% | 30 | +| **TOTAL** | **76%** | **295** | + +### Améliorations techniques +- Architecture complètement testée avec mocks et fixtures +- Tests d'intégration validant le fonctionnement end-to-end +- Couverture de code élevée sur tous les modules critiques +- Détection automatique de stores validée avec URLs réelles + +### Prochaines étapes (Phase 2) +Phase 1 CLI est maintenant **production-ready**. La Phase 2 peut démarrer: +1. Base de données PostgreSQL + Alembic +2. Worker Redis/RQ pour scraping planifié +3. API REST FastAPI +4. Web UI responsive avec dark theme Gruvbox --- diff --git a/PHASE_1_COMPLETE.md b/PHASE_1_COMPLETE.md new file mode 100755 index 0000000..30fc48c --- /dev/null +++ b/PHASE_1_COMPLETE.md @@ -0,0 +1,267 @@ +# 🎉 Phase 1 CLI - TERMINÉE À 100% + +**Date de complétion**: 2026-01-14 +**Version**: 0.3.0 + +--- + +## 📊 Résultats Finaux + +### Tests +- ✅ **295/295 tests passent** (100% de réussite) +- 📈 **76% code coverage global** +- ⚡ **Temps d'exécution**: 41.4 secondes + +### Modules testés + +| Module | Coverage | Tests | Statut | +|--------|----------|-------|--------| +| `core/schema.py` | **100%** | 29 | ✅ | +| `core/registry.py` | **100%** | 40 | ✅ | +| `core/io.py` | **97%** | 36 | ✅ | +| `scraping/http_fetch.py` | **100%** | 21 | ✅ | +| `scraping/pw_fetch.py` | **91%** | 21 | ✅ | +| `stores/amazon/` | **89%** | 33 | ✅ | +| `stores/aliexpress/` | **85%** | 32 | ✅ | +| `stores/backmarket/` | **85%** | 25 | ✅ | +| `stores/cdiscount/` | **72%** | 30 | ✅ | +| `base.py` | **87%** | - | ✅ | +| `logging.py` | **71%** | - | ✅ | + +--- + +## 🏗️ Architecture Implémentée + +### 1. Core (`pricewatch/app/core/`) +- ✅ `schema.py` - Modèle ProductSnapshot Pydantic +- ✅ `registry.py` - Détection automatique stores +- ✅ `io.py` - Lecture YAML / Écriture JSON +- ✅ `logging.py` - Système de logs colorés + +### 2. Scraping (`pricewatch/app/scraping/`) +- ✅ `http_fetch.py` - HTTP simple avec rotation User-Agent +- ✅ `pw_fetch.py` - Playwright fallback anti-bot +- ✅ Stratégie automatique: HTTP → Playwright si échec + +### 3. Stores (`pricewatch/app/stores/`) +- ✅ `base.py` - Classe abstraite BaseStore +- ✅ **Amazon** - amazon.fr, amazon.com, amazon.co.uk, amazon.de +- ✅ **Cdiscount** - cdiscount.com +- ✅ **Backmarket** - backmarket.fr, backmarket.com +- ✅ **AliExpress** - fr.aliexpress.com, aliexpress.com + +### 4. CLI (`pricewatch/app/cli/`) +- ✅ `pricewatch run` - Pipeline YAML → JSON +- ✅ `pricewatch detect` - Détection store depuis URL +- ✅ `pricewatch fetch` - Test HTTP/Playwright +- ✅ `pricewatch parse` - Test parsing HTML +- ✅ `pricewatch doctor` - Health check + +--- + +## 🔧 Corrections Apportées + +### Amazon Store +1. **Extraction images** - Ajout fallback générique `soup.find_all("img")` +2. **Prix séparés** - Support `a-price-whole` + `a-price-fraction` + +### Tests Ajoutés (177 nouveaux) +1. **Registry** - 40 tests (24 unitaires + 16 intégration) +2. **I/O** - 36 tests (YAML, JSON, debug files) +3. **HTTP Fetch** - 21 tests (mocks requests) +4. 
**Playwright Fetch** - 21 tests (mocks Playwright) + +--- + +## ✨ Fonctionnalités Validées + +### Scraping +- ✅ Détection automatique du store depuis URL +- ✅ Normalisation URLs vers forme canonique +- ✅ Extraction ASIN/SKU/référence produit +- ✅ Parsing HTML → ProductSnapshot +- ✅ Fallback HTTP → Playwright automatique +- ✅ Gestion anti-bot (User-Agent, headers, timeout) + +### Data Extraction +- ✅ Titre produit +- ✅ Prix (EUR, USD, GBP) +- ✅ Statut stock (in_stock, out_of_stock, unknown) +- ✅ Images (URLs multiples) +- ✅ Catégorie (breadcrumb) +- ✅ Caractéristiques techniques (specs dict) +- ✅ Référence produit (ASIN, SKU) + +### Debug & Observabilité +- ✅ Logs détaillés avec timestamps et couleurs +- ✅ Sauvegarde HTML optionnelle +- ✅ Screenshots Playwright optionnels +- ✅ Métriques (durée, taille HTML, méthode) +- ✅ Gestion erreurs robuste (403, captcha, timeout) + +### Output +- ✅ JSON structuré (ProductSnapshot[]) +- ✅ Validation Pydantic +- ✅ Serialization ISO 8601 (dates) +- ✅ Pretty-print configurable + +--- + +## 📋 Commandes Testées + +```bash +# Pipeline complet +pricewatch run --yaml scrap_url.yaml --out scraped_store.json + +# Détection store +pricewatch detect "https://www.amazon.fr/dp/B08N5WRWNW" + +# Test HTTP +pricewatch fetch "https://example.com" --http + +# Test Playwright +pricewatch fetch "https://example.com" --playwright + +# Parse HTML +pricewatch parse amazon --in page.html + +# Health check +pricewatch doctor + +# Mode debug +pricewatch run --yaml scrap_url.yaml --debug +``` + +--- + +## 🧪 Tests Exécutés + +### Lancer tous les tests +```bash +pytest -v --tb=no --cov=pricewatch +``` + +**Résultat**: `295 passed, 3 warnings in 41.40s` + +### Par module +```bash +pytest tests/core/ # 105 tests +pytest tests/scraping/ # 42 tests +pytest tests/stores/ # 148 tests +``` + +### Coverage détaillé +```bash +pytest --cov=pricewatch --cov-report=html +# Voir: htmlcov/index.html +``` + +--- + +## 📦 Dépendances + +### Production +- `typer[all]` - CLI framework +- `rich` - Terminal UI +- `pydantic` - Data validation +- `requests` - HTTP client +- `playwright` - Browser automation +- `beautifulsoup4` - HTML parsing +- `lxml` - XML/HTML parser +- `pyyaml` - YAML support + +### Développement +- `pytest` - Testing framework +- `pytest-cov` - Coverage reporting +- `pytest-mock` - Mocking utilities +- `pytest-asyncio` - Async test support + +--- + +## 🚀 Prochaines Étapes (Phase 2) + +La Phase 1 CLI est **production-ready**. Vous pouvez démarrer la Phase 2: + +### Infrastructure +1. **PostgreSQL + Alembic** + - Schéma base de données + - Migrations versionnées + - Models SQLAlchemy + - Historique prix + +2. **Worker & Scheduler** + - Redis pour queue + - RQ ou Celery worker + - Scraping planifié (quotidien) + - Retry policy + +3. **API REST** + - FastAPI endpoints + - Authentification JWT + - Documentation OpenAPI + - CORS configuration + +4. 
**Web UI** + - Framework React/Vue + - Design responsive + - Dark theme Gruvbox + - Graphiques historique prix + - Système d'alertes + +### Features +- Alertes baisse prix (email, webhooks) +- Alertes retour en stock +- Comparateur multi-stores +- Export données (CSV, Excel) +- API publique + +--- + +## 📝 Documentation + +- `README.md` - Guide utilisateur complet +- `TODO.md` - Roadmap et phases +- `CHANGELOG.md` - Historique des versions +- `CLAUDE.md` - Guide pour Claude Code +- `PROJECT_SPEC.md` - Spécifications techniques + +--- + +## 🎯 Métriques de Qualité + +| Métrique | Valeur | Objectif | Statut | +|----------|--------|----------|--------| +| Tests passants | 295/295 | 100% | ✅ | +| Code coverage | 76% | >70% | ✅ | +| Stores actifs | 4 | ≥2 | ✅ | +| CLI commands | 5 | ≥4 | ✅ | +| Documentation | Complète | Complète | ✅ | + +--- + +## ✅ Checklist Phase 1 + +- [x] Architecture modulaire +- [x] Modèle de données Pydantic +- [x] Système de logging +- [x] Lecture YAML / Écriture JSON +- [x] Registry stores avec détection automatique +- [x] HTTP fetch avec User-Agent rotation +- [x] Playwright fallback anti-bot +- [x] BaseStore abstrait +- [x] Amazon store complet +- [x] Cdiscount store complet +- [x] Backmarket store complet +- [x] AliExpress store complet +- [x] CLI Typer avec 5 commandes +- [x] Tests pytest (295 tests) +- [x] Code coverage >70% +- [x] Documentation complète +- [x] Pipeline YAML → JSON fonctionnel +- [x] Validation avec URLs réelles + +--- + +**Phase 1 CLI: 100% COMPLÈTE** ✅ + +Prêt pour la Phase 2! 🚀 diff --git a/PHASE_2_PROGRESS.md b/PHASE_2_PROGRESS.md new file mode 100755 index 0000000..1408c7f --- /dev/null +++ b/PHASE_2_PROGRESS.md @@ -0,0 +1,437 @@ +# 🚀 Phase 2 Infrastructure - EN COURS + +**Date de démarrage**: 2026-01-14 +**Version cible**: 0.4.0 +**Objectif**: Ajouter PostgreSQL + Redis/RQ worker pour persistence et scraping asynchrone + +--- + +## 📊 Vue d'Ensemble + +### Objectifs Phase 2 +- ✅ Configuration centralisée (database, Redis, app) +- ✅ Modèles SQLAlchemy ORM (5 tables) +- ✅ Connexion base de données (init_db, get_session) +- ✅ Migrations Alembic +- ⏳ Repository pattern (CRUD) +- ⏳ Worker RQ pour scraping asynchrone +- ⏳ Scheduler pour jobs récurrents +- ✅ CLI étendu (commandes DB) +- ✅ Docker Compose (PostgreSQL + Redis) +- ⏳ Tests complets + +--- + +## ✅ Semaine 1: Database Foundation (TERMINÉE) + +### Tâches Complétées + +#### 1. Configuration Centralisée ✅ +**Fichier**: `pricewatch/app/core/config.py` (187 lignes) + +**Contenu**: +- `DatabaseConfig`: Configuration PostgreSQL + - Host, port, database, user, password + - Propriété `url`: SQLAlchemy connection string + - Propriété `url_async`: AsyncPG connection string (futur) + - Prefix env vars: `PW_DB_*` (PW_DB_HOST, PW_DB_PORT, etc.) 
+ +- `RedisConfig`: Configuration Redis pour RQ + - Host, port, db, password (optional) + - Propriété `url`: Redis connection string + - Prefix env vars: `PW_REDIS_*` + +- `AppConfig`: Configuration globale application + - Debug mode + - Worker timeout (300s par défaut) + - Worker concurrency (2 par défaut) + - Feature flags: `enable_db`, `enable_worker` + - Defaults Playwright: timeout, use_playwright + - Nested configs: `db`, `redis` + - Prefix env vars: `PW_*` + +- **Pattern Singleton**: `get_config()`, `set_config()`, `reset_config()` + +**Justifications**: +- 12-factor app: configuration via env vars +- Pydantic validation garantit config valide au démarrage +- Valeurs par défaut pour développement local +- Support `.env` file pour faciliter le setup +- Feature flags permettent de désactiver DB/worker pour tests + +#### 2. Dépendances Phase 2 ✅ +**Fichier**: `pyproject.toml` (lignes 48-60) + +**Ajouts**: +```toml +# Database (Phase 2) +"sqlalchemy>=2.0.0", +"psycopg2-binary>=2.9.0", +"alembic>=1.13.0", + +# Configuration (Phase 2) +"python-dotenv>=1.0.0", + +# Worker/Queue (Phase 2) +"redis>=5.0.0", +"rq>=1.15.0", +"rq-scheduler>=0.13.0", +``` + +#### 3. Modèles SQLAlchemy ORM ✅ +**Fichier**: `pricewatch/app/db/models.py` (322 lignes) + +**Tables créées**: + +1. **`products`** - Catalogue produits + - PK: `id` (Integer, autoincrement) + - Natural key: `(source, reference)` - Unique constraint + - Colonnes: `url`, `title`, `category`, `currency` + - Timestamps: `first_seen_at`, `last_updated_at` + - Relations: `price_history`, `images`, `specs`, `logs` + - Indexes: source, reference, last_updated_at + +2. **`price_history`** - Historique prix (time-series) + - PK: `id` (Integer, autoincrement) + - FK: `product_id` → products(id) CASCADE + - Unique: `(product_id, fetched_at)` - Évite doublons + - Colonnes: `price` (Numeric 10,2), `shipping_cost`, `stock_status` + - Fetch metadata: `fetch_method`, `fetch_status`, `fetched_at` + - Check constraints: stock_status, fetch_method, fetch_status + - Indexes: product_id, fetched_at + +3. **`product_images`** - Images produit + - PK: `id` (Integer, autoincrement) + - FK: `product_id` → products(id) CASCADE + - Unique: `(product_id, image_url)` - Évite doublons + - Colonnes: `image_url` (Text), `position` (Integer, 0=main) + - Index: product_id + +4. **`product_specs`** - Caractéristiques produit (key-value) + - PK: `id` (Integer, autoincrement) + - FK: `product_id` → products(id) CASCADE + - Unique: `(product_id, spec_key)` - Évite doublons + - Colonnes: `spec_key` (String 200), `spec_value` (Text) + - Indexes: product_id, spec_key + +5. **`scraping_logs`** - Logs observabilité + - PK: `id` (Integer, autoincrement) + - FK optionnelle: `product_id` → products(id) SET NULL + - Colonnes: `url`, `source`, `reference`, `fetched_at` + - Métriques: `duration_ms`, `html_size_bytes` + - Fetch metadata: `fetch_method`, `fetch_status` + - Debug data (JSONB): `errors`, `notes` + - Indexes: product_id, source, fetched_at, fetch_status + +**Justifications schéma**: +- Normalisation: products séparée de price_history (catalogue vs time-series) +- Clé naturelle (source, reference) vs UUID arbitraire +- Tables séparées pour images/specs: évite JSONB non structuré +- JSONB uniquement pour données variables: errors, notes dans logs +- Cascade DELETE: suppression produit → suppression historique +- SET NULL pour logs: garde trace même si produit supprimé + +--- + +### Tâches Complétées (suite) + +#### 4. 
Connexion Base de Données ✅ +**Fichier**: `pricewatch/app/db/connection.py` + +**Contenu**: +- `get_engine(config)`: Engine SQLAlchemy (pooling) +- `get_session_factory(config)`: Session factory +- `get_session(config)`: Context manager +- `init_db(config)`: Création tables +- `check_db_connection(config)`: Health check +- `reset_engine()`: Reset pour tests + +**Justifications**: +- Singleton engine pour éviter les pools multiples +- `pool_pre_ping` pour robustesse +- Context manager pour rollback/close automatiques + +--- + +#### 5. Setup Alembic ✅ +**Fichiers**: +- `alembic.ini` +- `pricewatch/app/db/migrations/env.py` +- `pricewatch/app/db/migrations/script.py.mako` + +**Justifications**: +- URL DB injectée depuis `AppConfig` +- `compare_type=True` pour cohérence des migrations + +#### 6. Migration Initiale ✅ +**Fichier**: `pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py` + +**Contenu**: +- 5 tables + indexes + contraintes +- JSONB pour `errors` et `notes` + +#### 7. Commandes CLI Database ✅ +**Fichier**: `pricewatch/app/cli/main.py` + +**Commandes**: +```bash +pricewatch init-db # Créer tables +pricewatch migrate "message" # Générer migration Alembic +pricewatch upgrade # Appliquer migrations +pricewatch downgrade # Rollback migration +``` + +#### 8. Docker Compose ✅ +**Fichier**: `docker-compose.yml` + +**Services**: +- PostgreSQL 16 (port 5432) +- Redis 7 (port 6379) +- Volumes pour persistence + +#### 9. Fichier .env Exemple ✅ +**Fichier**: `.env.example` + +**Variables**: +```bash +# Database +PW_DB_HOST=localhost +PW_DB_PORT=5432 +PW_DB_DATABASE=pricewatch +PW_DB_USER=pricewatch +PW_DB_PASSWORD=pricewatch + +# Redis +PW_REDIS_HOST=localhost +PW_REDIS_PORT=6379 +PW_REDIS_DB=0 + +# App +PW_DEBUG=false +PW_WORKER_TIMEOUT=300 +PW_WORKER_CONCURRENCY=2 +PW_ENABLE_DB=true +PW_ENABLE_WORKER=true +``` + +#### 10. Tests Database ✅ +**Fichiers**: +- `tests/db/test_models.py`: Tests des modèles SQLAlchemy +- `tests/db/test_connection.py`: Tests connexion et session + +**Stratégie tests**: +- SQLite in-memory pour tests unitaires +- Fixtures pytest pour setup/teardown +- Tests relationships, constraints, indexes + +--- + +## 📦 Semaine 2: Repository & Pipeline (EN COURS) + +### Tâches Prévues + +#### Repository Pattern +**Fichier**: `pricewatch/app/db/repository.py` + +**Classe**: `ProductRepository` +- `get_or_create(source, reference)`: Trouver ou créer produit +- `save_snapshot(snapshot)`: Persist ProductSnapshot to DB +- `update_product_metadata(product, snapshot)`: Update title, url, etc. 
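+
+A minimal usage sketch of the repository (assumptions: the constructor takes a SQLAlchemy session, which this diff does not show; `snapshot` is a Phase 1 `ProductSnapshot`):
+
+```python
+from pricewatch.app.db import ProductRepository
+from pricewatch.app.db.connection import get_session
+
+# snapshot: ProductSnapshot produced by store.parse() in Phase 1
+with get_session() as session:
+    repo = ProductRepository(session)  # assumed constructor (session-scoped repository)
+    repo.save_snapshot(snapshot)       # persists the ProductSnapshot to the DB
+    session.commit()                   # get_session() does not auto-commit
+```
+
+The remaining helpers cover the detail tables and logs: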
+- `add_price_history(product, snapshot)`: Ajouter entrée prix +- `sync_images(product, images)`: Sync images (add new, keep existing) +- `sync_specs(product, specs)`: Sync specs (upsert) +- `add_scraping_log(snapshot, product_id)`: Log scraping + +**Statut**: ✅ Terminé + +#### Scraping Pipeline +**Fichier**: `pricewatch/app/scraping/pipeline.py` + +**Classe**: `ScrapingPipeline` +- `process_snapshot(snapshot, save_to_db)`: Orchestration +- Non-blocking: échec DB ne crash pas pipeline +- Retour: `product_id` ou `None` + +**Statut**: ✅ Terminé + +#### CLI Modification +**Fichier**: `pricewatch/app/cli/main.py` + +**Modification commande `run`**: +- Ajouter flag `--save-db / --no-db` +- Intégrer `ScrapingPipeline` si `save_db=True` +- Compatibilité backward: JSON output toujours créé + +**Statut**: ✅ Terminé + +#### Tests Repository + Pipeline ✅ +**Fichiers**: +- `tests/db/test_repository.py` +- `tests/scraping/test_pipeline.py` + +**Statut**: ✅ Terminé + +#### Tests end-to-end CLI + DB ✅ +**Fichier**: +- `tests/cli/test_run_db.py` + +**Statut**: ✅ Terminé + +--- + +## 📦 Semaine 3: Worker Infrastructure (EN COURS) + +### Tâches Prévues + +#### RQ Task +**Fichier**: `pricewatch/app/tasks/scrape.py` + +**Fonction**: `scrape_product(url, use_playwright=True)` +- Réutilise 100% code Phase 1 (detect → fetch → parse) +- Save to DB via ScrapingPipeline +- Retour: `{success, product_id, snapshot, error}` + +**Statut**: ✅ Terminé + +#### Scheduler +**Fichier**: `pricewatch/app/tasks/scheduler.py` + +**Classe**: `ScrapingScheduler` +- `schedule_product(url, interval_hours=24)`: Job récurrent +- `enqueue_immediate(url)`: Job unique +- Basé sur `rq-scheduler` + +**Statut**: ✅ Terminé + +#### CLI Worker +**Nouvelles commandes**: +```bash +pricewatch worker # Lancer worker RQ +pricewatch enqueue # Enqueue scrape immédiat +pricewatch schedule --interval 24 # Scrape quotidien +``` + +**Statut**: ✅ Terminé + +--- + +## 📦 Semaine 4: Tests & Documentation (NON DÉMARRÉ) + +### Tâches Prévues + +#### Tests +- Tests end-to-end (CLI → DB → Worker) +- Tests erreurs (DB down, Redis down) +- Tests backward compatibility (`--no-db`) +- Performance tests (100+ produits) + +#### Documentation +- Update README.md (setup Phase 2) +- Update CHANGELOG.md +- Migration guide (JSON → DB) + +--- + +## 📈 Métriques d'Avancement + +| Catégorie | Complétées | Totales | % | +|-----------|------------|---------|---| +| **Semaine 1** | 10 | 10 | 100% | +| **Semaine 2** | 5 | 5 | 100% | +| **Semaine 3** | 3 | 6 | 50% | +| **Semaine 4** | 0 | 7 | 0% | +| **TOTAL Phase 2** | 18 | 28 | **64%** | + +--- + +## 🎯 Prochaine Étape Immédiate + +**Prochaine étape immédiate** +- Tests end-to-end worker + DB +- Gestion des erreurs Redis down (CLI + worker) + +**Apres (prevu)** +- Logs d'observabilite pour jobs planifies + +--- + +## 🔧 Vérifications + +### Vérification Semaine 1 (objectif) +```bash +# Setup infrastructure +docker-compose up -d +pricewatch init-db + +# Vérifier tables créées +psql -h localhost -U pricewatch pricewatch +\dt +# → 5 tables: products, price_history, product_images, product_specs, scraping_logs +``` + +### Vérification Semaine 2 (objectif) +```bash +# Test pipeline avec DB +pricewatch run --yaml scrap_url.yaml --save-db + +# Vérifier données en DB +psql -h localhost -U pricewatch pricewatch +SELECT * FROM products LIMIT 5; +SELECT * FROM price_history ORDER BY fetched_at DESC LIMIT 10; +``` + +### Vérification Semaine 3 (objectif) +```bash +# Enqueue job +pricewatch enqueue "https://www.amazon.fr/dp/B08N5WRWNW" + +# 
Lancer worker +pricewatch worker + +# Vérifier job traité +psql -h localhost -U pricewatch pricewatch +SELECT * FROM scraping_logs ORDER BY fetched_at DESC LIMIT 5; +``` + +--- + +## 📝 Notes Importantes + +### Backward Compatibility +- ✅ CLI Phase 1 fonctionne sans changement +- ✅ Format JSON identique +- ✅ Database optionnelle (`--no-db` flag) +- ✅ ProductSnapshot inchangé +- ✅ Tests Phase 1 continuent à passer (295 tests) + +### Architecture Décisions + +**Normalisation vs Performance**: +- Choix: Normalisation stricte (5 tables) +- Justification: Catalogue change rarement, prix changent quotidiennement +- Alternative rejetée: Tout dans products + JSONB (moins queryable) + +**Clé Naturelle vs UUID**: +- Choix: `(source, reference)` comme unique constraint +- Justification: ASIN Amazon déjà unique globalement +- Alternative rejetée: UUID artificiel (complexifie déduplication) + +**Synchrone vs Asynchrone**: +- Choix: RQ synchrone (pas d'async/await) +- Justification: Code Phase 1 réutilisable à 100%, simplicité +- Alternative rejetée: Asyncio + asyncpg (refactoring massif) + +--- + +**Dernière mise à jour**: 2026-01-14 + +### Validation locale (Semaine 1) +```bash +docker compose up -d +./venv/bin/alembic -c alembic.ini upgrade head +psql -h localhost -U pricewatch pricewatch +\\dt +``` + +**Resultat**: 6 tables visibles (products, price_history, product_images, product_specs, scraping_logs, alembic_version). +**Statut**: ✅ Semaine 1 en cours (30% complétée) diff --git a/README.md b/README.md index a48a31a..cfa1533 100755 --- a/README.md +++ b/README.md @@ -58,6 +58,13 @@ pricewatch/ │ │ ├── store.py │ │ ├── selectors.yml │ │ └── fixtures/ +│ ├── db/ # Persistence SQLAlchemy (Phase 2) +│ │ ├── models.py +│ │ ├── connection.py +│ │ └── migrations/ +│ ├── tasks/ # Jobs RQ (Phase 3) +│ │ ├── scrape.py +│ │ └── scheduler.py │ └── cli/ │ └── main.py # CLI Typer ├── tests/ # Tests pytest @@ -76,6 +83,9 @@ pricewatch run --yaml scrap_url.yaml --out scraped_store.json # Avec debug pricewatch run --yaml scrap_url.yaml --out scraped_store.json --debug + +# Avec persistence DB +pricewatch run --yaml scrap_url.yaml --out scraped_store.json --save-db ``` ### Commandes utilitaires @@ -97,6 +107,45 @@ pricewatch parse amazon --in scraped/page.html pricewatch doctor ``` +### Commandes base de donnees + +```bash +# Initialiser les tables +pricewatch init-db + +# Generer une migration +pricewatch migrate "Initial schema" + +# Appliquer les migrations +pricewatch upgrade + +# Revenir en arriere +pricewatch downgrade -1 +``` + +### Commandes worker + +```bash +# Lancer un worker RQ +pricewatch worker + +# Enqueue un job immediat +pricewatch enqueue "https://example.com/product" + +# Planifier un job recurrent +pricewatch schedule "https://example.com/product" --interval 24 +``` + +## Base de donnees (Phase 2) + +```bash +# Lancer PostgreSQL + Redis en local +docker-compose up -d + +# Exemple de configuration +cp .env.example .env +``` + ## Configuration (scrap_url.yaml) ```yaml @@ -196,8 +245,8 @@ Aucune erreur ne doit crasher silencieusement : toutes sont loggées et tracées - ✅ Tests pytest ### Phase 2 : Persistence -- [ ] Base de données PostgreSQL -- [ ] Migrations Alembic +- [x] Base de données PostgreSQL +- [x] Migrations Alembic - [ ] Historique des prix ### Phase 3 : Automation diff --git a/TODO.md b/TODO.md index 9143b29..8ce85be 100755 --- a/TODO.md +++ b/TODO.md @@ -101,72 +101,92 @@ Liste des tâches priorisées pour le développement de PriceWatch. 
### Étape 9 : Tests - [x] Configurer pytest dans pyproject.toml -- [x] Tests core/schema.py +- [x] Tests core/schema.py (29 tests - 100% coverage) - [x] Validation ProductSnapshot - [x] Serialization JSON -- [x] Tests core/registry.py +- [x] Tests core/registry.py (40 tests - 100% coverage) - [x] Enregistrement stores - [x] Détection automatique -- [x] Tests stores/amazon/ + - [x] Tests d'intégration avec 4 stores réels +- [x] Tests core/io.py (36 tests - 97% coverage) + - [x] Lecture/écriture YAML/JSON + - [x] Sauvegarde debug HTML/screenshots +- [x] Tests stores/amazon/ (33 tests - 89% coverage) - [x] match() avec différentes URLs - [x] canonicalize() - [x] extract_reference() - - [~] parse() sur fixtures HTML (6 tests nécessitent fixtures réels) -- [ ] Tests stores/cdiscount/ - - [ ] Idem Amazon -- [ ] Tests scraping/ - - [ ] http_fetch avec mock - - [ ] pw_fetch avec mock + - [x] parse() sur fixtures HTML +- [x] Tests stores/cdiscount/ (30 tests - 72% coverage) + - [x] Tests complets avec fixtures réels +- [x] Tests stores/backmarket/ (25 tests - 85% coverage) + - [x] Tests complets avec fixtures réels +- [x] Tests stores/aliexpress/ (32 tests - 85% coverage) + - [x] Tests complets avec fixtures réels +- [x] Tests scraping/ (42 tests) + - [x] http_fetch avec mock (21 tests - 100% coverage) + - [x] pw_fetch avec mock (21 tests - 91% coverage) ### Étape 10 : Intégration et validation - [x] Créer scrap_url.yaml exemple - [x] Tester pipeline complet YAML → JSON - [x] Tester avec vraies URLs Amazon -- [ ] Tester avec vraies URLs Cdiscount +- [x] Tester avec vraies URLs Cdiscount - [x] Vérifier tous les modes de debug - [x] Valider sauvegarde HTML/screenshots - [x] Documentation finale -### Bilan Étape 9 (Tests pytest) -**État**: 80 tests passent / 86 tests totaux (93%) -- ✓ core/schema.py: 29/29 tests -- ✓ core/registry.py: 24/24 tests -- ✓ stores/amazon/: 27/33 tests (6 tests nécessitent fixtures HTML réalistes) +### ✅ PHASE 1 TERMINÉE À 100% +**État final**: 295 tests passent / 295 tests totaux (100%) +**Coverage global**: 76% -**Tests restants**: -- Fixtures HTML Amazon/Cdiscount -- Tests Cdiscount store -- Tests scraping avec mocks +**Détail par module**: +- ✅ core/schema.py: 100% coverage +- ✅ core/registry.py: 100% coverage (40 tests) +- ✅ core/io.py: 97% coverage (36 tests) +- ✅ scraping/http_fetch.py: 100% coverage (21 tests) +- ✅ scraping/pw_fetch.py: 91% coverage (21 tests) +- ✅ stores/amazon/: 89% coverage (33 tests) +- ✅ stores/aliexpress/: 85% coverage (32 tests) +- ✅ stores/backmarket/: 85% coverage (25 tests) +- ✅ stores/cdiscount/: 72% coverage (30 tests) + +**4 stores opérationnels**: Amazon, Cdiscount, Backmarket, AliExpress --- -## Phase 2 : Base de données (Future) +## Phase 2 : Base de données (En cours) ### Persistence -- [ ] Schéma PostgreSQL -- [ ] Migrations Alembic -- [ ] Models SQLAlchemy +- [x] Schéma PostgreSQL +- [x] Migrations Alembic +- [x] Models SQLAlchemy +- [x] Connexion DB (engine, session, init) +- [x] Tests DB de base +- [x] Repository pattern (ProductRepository) +- [x] ScrapingPipeline (persistence optionnelle) +- [x] CLI `--save-db/--no-db` +- [x] Tests end-to-end CLI + DB - [ ] CRUD produits - [ ] Historique prix ### Configuration -- [ ] Fichier config (DB credentials) -- [ ] Variables d'environnement -- [ ] Dockerfile PostgreSQL +- [x] Fichier config (DB credentials) +- [x] Variables d'environnement +- [x] Docker Compose PostgreSQL/Redis --- -## Phase 3 : Worker et automation (Future) +## Phase 3 : Worker et automation (En cours) ### Worker -- [ ] 
Setup Redis -- [ ] Worker RQ ou Celery -- [ ] Queue de scraping +- [x] Setup Redis +- [x] Worker RQ +- [x] Queue de scraping - [ ] Retry policy ### Planification -- [ ] Cron ou scheduler intégré -- [ ] Scraping quotidien automatique +- [x] Cron ou scheduler intégré +- [x] Scraping quotidien automatique - [ ] Logs des runs --- @@ -216,4 +236,4 @@ Liste des tâches priorisées pour le développement de PriceWatch. --- -**Dernière mise à jour**: 2026-01-13 +**Dernière mise à jour**: 2026-01-14 diff --git a/alembic.ini b/alembic.ini new file mode 100755 index 0000000..6b36638 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,36 @@ +[alembic] +script_location = pricewatch/app/db/migrations +sqlalchemy.url = postgresql://pricewatch:pricewatch@localhost:5432/pricewatch + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console + +[logger_sqlalchemy] +level = WARN +handlers = console +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = console +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100755 index 0000000..8a4c487 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,22 @@ +services: + postgres: + image: postgres:16 + environment: + POSTGRES_DB: pricewatch + POSTGRES_USER: pricewatch + POSTGRES_PASSWORD: pricewatch + ports: + - "5432:5432" + volumes: + - pricewatch_pgdata:/var/lib/postgresql/data + + redis: + image: redis:7 + ports: + - "6379:6379" + volumes: + - pricewatch_redisdata:/data + +volumes: + pricewatch_pgdata: + pricewatch_redisdata: diff --git a/pricewatch.egg-info/PKG-INFO b/pricewatch.egg-info/PKG-INFO index 7b80e3c..b784bc1 100755 --- a/pricewatch.egg-info/PKG-INFO +++ b/pricewatch.egg-info/PKG-INFO @@ -21,6 +21,13 @@ Requires-Dist: lxml>=5.1.0 Requires-Dist: cssselect>=1.2.0 Requires-Dist: pyyaml>=6.0.1 Requires-Dist: python-dateutil>=2.8.2 +Requires-Dist: sqlalchemy>=2.0.0 +Requires-Dist: psycopg2-binary>=2.9.0 +Requires-Dist: alembic>=1.13.0 +Requires-Dist: python-dotenv>=1.0.0 +Requires-Dist: redis>=5.0.0 +Requires-Dist: rq>=1.15.0 +Requires-Dist: rq-scheduler>=0.13.0 Provides-Extra: dev Requires-Dist: pytest>=8.0.0; extra == "dev" Requires-Dist: pytest-cov>=4.1.0; extra == "dev" diff --git a/pricewatch.egg-info/SOURCES.txt b/pricewatch.egg-info/SOURCES.txt index b616b16..a519fb9 100755 --- a/pricewatch.egg-info/SOURCES.txt +++ b/pricewatch.egg-info/SOURCES.txt @@ -11,6 +11,7 @@ pricewatch/app/__init__.py pricewatch/app/cli/__init__.py pricewatch/app/cli/main.py pricewatch/app/core/__init__.py +pricewatch/app/core/config.py pricewatch/app/core/io.py pricewatch/app/core/logging.py pricewatch/app/core/registry.py diff --git a/pricewatch.egg-info/requires.txt b/pricewatch.egg-info/requires.txt index e19f717..f366fd7 100755 --- a/pricewatch.egg-info/requires.txt +++ b/pricewatch.egg-info/requires.txt @@ -9,6 +9,13 @@ lxml>=5.1.0 cssselect>=1.2.0 pyyaml>=6.0.1 python-dateutil>=2.8.2 +sqlalchemy>=2.0.0 +psycopg2-binary>=2.9.0 +alembic>=1.13.0 +python-dotenv>=1.0.0 +redis>=5.0.0 +rq>=1.15.0 +rq-scheduler>=0.13.0 [dev] pytest>=8.0.0 diff --git a/pricewatch/app/cli/__pycache__/main.cpython-313.pyc b/pricewatch/app/cli/__pycache__/main.cpython-313.pyc index 4eaa2c6..0b4bad6 100755 Binary files 
a/pricewatch/app/cli/__pycache__/main.cpython-313.pyc and b/pricewatch/app/cli/__pycache__/main.cpython-313.pyc differ diff --git a/pricewatch/app/cli/main.py b/pricewatch/app/cli/main.py index 27e9acb..3ad3d1f 100755 --- a/pricewatch/app/cli/main.py +++ b/pricewatch/app/cli/main.py @@ -13,20 +13,28 @@ import sys from pathlib import Path from typing import Optional +import redis import typer +from rq import Connection, Worker +from alembic import command as alembic_command +from alembic.config import Config as AlembicConfig from rich import print as rprint from rich.console import Console from rich.table import Table from pricewatch.app.core import logging as app_logging +from pricewatch.app.core.config import get_config from pricewatch.app.core.io import read_yaml_config, write_json_results from pricewatch.app.core.logging import get_logger, set_level from pricewatch.app.core.registry import get_registry, register_store from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod +from pricewatch.app.db.connection import init_db from pricewatch.app.scraping.http_fetch import fetch_http +from pricewatch.app.scraping.pipeline import ScrapingPipeline from pricewatch.app.scraping.pw_fetch import fetch_playwright from pricewatch.app.stores.amazon.store import AmazonStore from pricewatch.app.stores.cdiscount.store import CdiscountStore +from pricewatch.app.tasks.scheduler import ScrapingScheduler # Créer l'application Typer app = typer.Typer( @@ -46,6 +54,75 @@ def setup_stores(): registry.register(CdiscountStore()) +def get_alembic_config() -> AlembicConfig: + """Construit la configuration Alembic à partir du repository.""" + root_path = Path(__file__).resolve().parents[3] + config_path = root_path / "alembic.ini" + migrations_path = root_path / "pricewatch" / "app" / "db" / "migrations" + + if not config_path.exists(): + logger.error(f"alembic.ini introuvable: {config_path}") + raise typer.Exit(code=1) + + alembic_cfg = AlembicConfig(str(config_path)) + alembic_cfg.set_main_option("script_location", str(migrations_path)) + alembic_cfg.set_main_option("sqlalchemy.url", get_config().db.url) + return alembic_cfg + + +@app.command("init-db") +def init_db_command(): + """ + Initialise la base de donnees (creer toutes les tables). + """ + try: + init_db(get_config()) + except Exception as e: + logger.error(f"Init DB echoue: {e}") + raise typer.Exit(code=1) + + +@app.command() +def migrate( + message: str = typer.Argument(..., help="Message de migration"), + autogenerate: bool = typer.Option(True, "--autogenerate/--no-autogenerate"), +): + """ + Genere une migration Alembic. + """ + try: + alembic_cfg = get_alembic_config() + alembic_command.revision(alembic_cfg, message=message, autogenerate=autogenerate) + except Exception as e: + logger.error(f"Migration echouee: {e}") + raise typer.Exit(code=1) + + +@app.command() +def upgrade(revision: str = typer.Argument("head", help="Revision cible")): + """ + Applique les migrations Alembic. + """ + try: + alembic_cfg = get_alembic_config() + alembic_command.upgrade(alembic_cfg, revision) + except Exception as e: + logger.error(f"Upgrade echoue: {e}") + raise typer.Exit(code=1) + + +@app.command() +def downgrade(revision: str = typer.Argument("-1", help="Revision cible")): + """ + Rollback une migration Alembic. 
+ """ + try: + alembic_cfg = get_alembic_config() + alembic_command.downgrade(alembic_cfg, revision) + except Exception as e: + logger.error(f"Downgrade echoue: {e}") + raise typer.Exit(code=1) + @app.command() def run( yaml: Path = typer.Option( @@ -67,6 +144,11 @@ def run( "-d", help="Activer le mode debug", ), + save_db: Optional[bool] = typer.Option( + None, + "--save-db/--no-db", + help="Activer la persistence en base de donnees", + ), ): """ Pipeline complet: scrape toutes les URLs du YAML et génère le JSON. @@ -88,6 +170,12 @@ def run( logger.error(f"Erreur lecture YAML: {e}") raise typer.Exit(code=1) + app_config = get_config() + if save_db is None: + save_db = app_config.enable_db + + pipeline = ScrapingPipeline(config=app_config) + logger.info(f"{len(config.urls)} URL(s) à scraper") # Scraper chaque URL @@ -158,6 +246,11 @@ def run( snapshot = store.parse(html, canonical_url) snapshot.debug.method = fetch_method + if save_db: + product_id = pipeline.process_snapshot(snapshot, save_to_db=True) + if product_id: + logger.info(f"DB: produit id={product_id}") + snapshots.append(snapshot) status_emoji = "✓" if snapshot.is_complete() else "⚠" @@ -180,6 +273,8 @@ def run( errors=[f"Parsing failed: {str(e)}"], ), ) + if save_db: + pipeline.process_snapshot(snapshot, save_to_db=True) snapshots.append(snapshot) else: # Pas de HTML récupéré @@ -194,6 +289,8 @@ def run( errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"], ), ) + if save_db: + pipeline.process_snapshot(snapshot, save_to_db=True) snapshots.append(snapshot) # Écrire les résultats @@ -359,5 +456,65 @@ def doctor(): rprint("\n[green]✓ PriceWatch est prêt![/green]") +@app.command() +def worker( + queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"), + with_scheduler: bool = typer.Option( + True, "--with-scheduler/--no-scheduler", help="Activer le scheduler RQ" + ), +): + """ + Lance un worker RQ. + """ + config = get_config() + connection = redis.from_url(config.redis.url) + + with Connection(connection): + worker_instance = Worker([queue]) + worker_instance.work(with_scheduler=with_scheduler) + + +@app.command() +def enqueue( + url: str = typer.Argument(..., help="URL du produit a scraper"), + queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"), + save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"), + use_playwright: Optional[bool] = typer.Option( + None, "--playwright/--no-playwright", help="Forcer Playwright" + ), +): + """ + Enqueue un scraping immediat. + """ + scheduler = ScrapingScheduler(get_config(), queue_name=queue) + job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db) + rprint(f"[green]✓ Job enqueued: {job.id}[/green]") + + +@app.command() +def schedule( + url: str = typer.Argument(..., help="URL du produit a planifier"), + interval: int = typer.Option(24, "--interval", help="Intervalle en heures"), + queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"), + save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"), + use_playwright: Optional[bool] = typer.Option( + None, "--playwright/--no-playwright", help="Forcer Playwright" + ), +): + """ + Planifie un scraping recurrent. 
+ """ + scheduler = ScrapingScheduler(get_config(), queue_name=queue) + job_info = scheduler.schedule_product( + url, + interval_hours=interval, + use_playwright=use_playwright, + save_db=save_db, + ) + rprint( + f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]" + ) + + if __name__ == "__main__": app() diff --git a/pricewatch/app/core/__pycache__/config.cpython-313.pyc b/pricewatch/app/core/__pycache__/config.cpython-313.pyc new file mode 100755 index 0000000..9347fe5 Binary files /dev/null and b/pricewatch/app/core/__pycache__/config.cpython-313.pyc differ diff --git a/pricewatch/app/core/config.py b/pricewatch/app/core/config.py new file mode 100755 index 0000000..66e4e7e --- /dev/null +++ b/pricewatch/app/core/config.py @@ -0,0 +1,186 @@ +""" +Configuration centralisée pour PriceWatch Phase 2. + +Gère la configuration de la base de données, Redis, et l'application globale. +Utilise Pydantic Settings pour validation et chargement depuis variables d'environnement. + +Justification technique: +- Pattern 12-factor app: configuration via env vars +- Pydantic validation garantit config valide au démarrage +- Valeurs par défaut pour développement local +- Support .env file pour faciliter le setup +""" + +from typing import Optional + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + +from pricewatch.app.core.logging import get_logger + +logger = get_logger("core.config") + + +class DatabaseConfig(BaseSettings): + """Configuration PostgreSQL.""" + + host: str = Field(default="localhost", description="PostgreSQL host") + port: int = Field(default=5432, description="PostgreSQL port") + database: str = Field(default="pricewatch", description="Database name") + user: str = Field(default="pricewatch", description="Database user") + password: str = Field(default="pricewatch", description="Database password") + + model_config = SettingsConfigDict( + env_prefix="PW_DB_", # PW_DB_HOST, PW_DB_PORT, etc. + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + ) + + @property + def url(self) -> str: + """ + SQLAlchemy connection URL. + + Format: postgresql://user:password@host:port/database + """ + return f"postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}" + + @property + def url_async(self) -> str: + """ + Async SQLAlchemy connection URL (pour usage futur avec asyncpg). + + Format: postgresql+asyncpg://user:password@host:port/database + """ + return f"postgresql+asyncpg://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}" + + +class RedisConfig(BaseSettings): + """Configuration Redis pour RQ worker.""" + + host: str = Field(default="localhost", description="Redis host") + port: int = Field(default=6379, description="Redis port") + db: int = Field(default=0, description="Redis database number (0-15)") + password: Optional[str] = Field(default=None, description="Redis password (optional)") + + model_config = SettingsConfigDict( + env_prefix="PW_REDIS_", # PW_REDIS_HOST, PW_REDIS_PORT, etc. + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + ) + + @property + def url(self) -> str: + """ + Redis connection URL pour RQ. 
+ + Format: redis://[password@]host:port/db + """ + auth = f":{self.password}@" if self.password else "" + return f"redis://{auth}{self.host}:{self.port}/{self.db}" + + +class AppConfig(BaseSettings): + """Configuration globale de l'application.""" + + # Mode debug + debug: bool = Field( + default=False, description="Enable debug mode (verbose logging, SQL echo)" + ) + + # Worker configuration + worker_timeout: int = Field( + default=300, description="Worker job timeout in seconds (5 minutes)" + ) + + worker_concurrency: int = Field( + default=2, description="Number of concurrent worker processes" + ) + + # Feature flags + enable_db: bool = Field( + default=True, description="Enable database persistence (can disable for testing)" + ) + + enable_worker: bool = Field( + default=True, description="Enable background worker functionality" + ) + + # Scraping defaults + default_playwright_timeout: int = Field( + default=60000, description="Default Playwright timeout in milliseconds" + ) + + default_use_playwright: bool = Field( + default=True, description="Use Playwright fallback by default" + ) + + model_config = SettingsConfigDict( + env_prefix="PW_", # PW_DEBUG, PW_WORKER_TIMEOUT, etc. + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + ) + + # Nested configs (instances, not classes) + db: DatabaseConfig = Field(default_factory=DatabaseConfig) + redis: RedisConfig = Field(default_factory=RedisConfig) + + def log_config(self) -> None: + """Log la configuration active (sans password).""" + logger.info("=== Configuration PriceWatch ===") + logger.info(f"Debug mode: {self.debug}") + logger.info(f"Database: {self.db.host}:{self.db.port}/{self.db.database}") + logger.info(f"Redis: {self.redis.host}:{self.redis.port}/{self.redis.db}") + logger.info(f"DB enabled: {self.enable_db}") + logger.info(f"Worker enabled: {self.enable_worker}") + logger.info(f"Worker timeout: {self.worker_timeout}s") + logger.info(f"Worker concurrency: {self.worker_concurrency}") + logger.info("================================") + + +# Singleton global config instance +_config: Optional[AppConfig] = None + + +def get_config() -> AppConfig: + """ + Récupère l'instance globale de configuration (singleton). + + Returns: + Instance AppConfig + + Justification: + - Évite de recharger la config à chaque appel + - Centralise la configuration pour toute l'application + - Permet d'override pour les tests + """ + global _config + + if _config is None: + _config = AppConfig() + if _config.debug: + _config.log_config() + + return _config + + +def set_config(config: AppConfig) -> None: + """ + Override la configuration globale (principalement pour tests). + + Args: + config: Instance AppConfig à utiliser + """ + global _config + _config = config + logger.debug("Configuration overridden") + + +def reset_config() -> None: + """Reset la configuration globale (pour tests).""" + global _config + _config = None + logger.debug("Configuration reset") diff --git a/pricewatch/app/db/__init__.py b/pricewatch/app/db/__init__.py new file mode 100755 index 0000000..c466e97 --- /dev/null +++ b/pricewatch/app/db/__init__.py @@ -0,0 +1,41 @@ +""" +Module de base de données pour PriceWatch Phase 2. + +Gère la persistence PostgreSQL avec SQLAlchemy ORM. 
+""" + +from pricewatch.app.db.connection import ( + check_db_connection, + get_engine, + get_session, + get_session_factory, + init_db, + reset_engine, +) +from pricewatch.app.db.repository import ProductRepository +from pricewatch.app.db.models import ( + Base, + Product, + PriceHistory, + ProductImage, + ProductSpec, + ScrapingLog, +) + +__all__ = [ + # Models + "Base", + "Product", + "PriceHistory", + "ProductImage", + "ProductSpec", + "ScrapingLog", + "ProductRepository", + # Connection + "get_engine", + "get_session_factory", + "get_session", + "init_db", + "check_db_connection", + "reset_engine", +] diff --git a/pricewatch/app/db/__pycache__/__init__.cpython-313.pyc b/pricewatch/app/db/__pycache__/__init__.cpython-313.pyc new file mode 100755 index 0000000..6d900a4 Binary files /dev/null and b/pricewatch/app/db/__pycache__/__init__.cpython-313.pyc differ diff --git a/pricewatch/app/db/__pycache__/connection.cpython-313.pyc b/pricewatch/app/db/__pycache__/connection.cpython-313.pyc new file mode 100755 index 0000000..116445b Binary files /dev/null and b/pricewatch/app/db/__pycache__/connection.cpython-313.pyc differ diff --git a/pricewatch/app/db/__pycache__/models.cpython-313.pyc b/pricewatch/app/db/__pycache__/models.cpython-313.pyc new file mode 100755 index 0000000..7e34247 Binary files /dev/null and b/pricewatch/app/db/__pycache__/models.cpython-313.pyc differ diff --git a/pricewatch/app/db/__pycache__/repository.cpython-313.pyc b/pricewatch/app/db/__pycache__/repository.cpython-313.pyc new file mode 100755 index 0000000..cbee917 Binary files /dev/null and b/pricewatch/app/db/__pycache__/repository.cpython-313.pyc differ diff --git a/pricewatch/app/db/connection.py b/pricewatch/app/db/connection.py new file mode 100755 index 0000000..ffa40ea --- /dev/null +++ b/pricewatch/app/db/connection.py @@ -0,0 +1,238 @@ +""" +Gestion des connexions PostgreSQL pour PriceWatch Phase 2. + +Fournit: +- Engine SQLAlchemy avec connection pooling +- Session factory avec context manager +- Initialisation des tables +- Health check + +Justification technique: +- Connection pooling: réutilisation connexions pour performance +- Context manager: garantit fermeture session (pas de leak) +- pool_pre_ping: vérifie connexion avant usage (robustesse) +- echo=debug: logs SQL en mode debug +""" + +from contextlib import contextmanager +from typing import Generator, Optional + +from sqlalchemy import create_engine, text +from sqlalchemy.engine import Engine +from sqlalchemy.engine.url import make_url +from sqlalchemy.exc import OperationalError, SQLAlchemyError +from sqlalchemy.orm import Session, sessionmaker + +from pricewatch.app.core.config import AppConfig, get_config +from pricewatch.app.core.logging import get_logger +from pricewatch.app.db.models import Base + +logger = get_logger("db.connection") + +# Global engine instance (singleton) +_engine: Optional[Engine] = None +_session_factory: Optional[sessionmaker] = None + + +def get_engine(config: Optional[AppConfig] = None) -> Engine: + """ + Récupère ou crée l'Engine SQLAlchemy (singleton). 
+ + Args: + config: Configuration app (utilise get_config() si None) + + Returns: + Engine SQLAlchemy configuré + + Justification: + - Singleton: une seule pool de connexions par application + - pool_pre_ping: vérifie connexion avant usage (évite "connection closed") + - pool_size=5, max_overflow=10: limite connexions (15 max) + - echo=debug: logs SQL pour debugging + """ + global _engine + + if _engine is None: + if config is None: + config = get_config() + + db_url = config.db.url + url = make_url(db_url) + is_sqlite = url.get_backend_name() == "sqlite" + + logger.info(f"Creating database engine: {db_url}") + + engine_kwargs = { + "pool_pre_ping": True, + "pool_recycle": 3600, + "echo": config.debug, + } + + if not is_sqlite: + engine_kwargs.update( + { + "pool_size": 5, + "max_overflow": 10, + } + ) + + _engine = create_engine(db_url, **engine_kwargs) + + logger.info("Database engine created successfully") + + return _engine + + +def init_db(config: Optional[AppConfig] = None) -> None: + """ + Initialise la base de données (crée toutes les tables). + + Args: + config: Configuration app (utilise get_config() si None) + + Raises: + OperationalError: Si connexion impossible + SQLAlchemyError: Si création tables échoue + + Note: + Utilise Base.metadata.create_all() - idempotent (ne crash pas si tables existent) + """ + if config is None: + config = get_config() + + logger.info("Initializing database...") + + try: + engine = get_engine(config) + + # Créer toutes les tables définies dans Base.metadata + Base.metadata.create_all(bind=engine) + + logger.info("Database initialized successfully") + logger.info(f"Tables created: {', '.join(Base.metadata.tables.keys())}") + + except OperationalError as e: + logger.error(f"Failed to connect to database: {e}") + raise + except SQLAlchemyError as e: + logger.error(f"Failed to create tables: {e}") + raise + + +def get_session_factory(config: Optional[AppConfig] = None) -> sessionmaker: + """ + Récupère ou crée la session factory (singleton). + + Args: + config: Configuration app (utilise get_config() si None) + + Returns: + Session factory SQLAlchemy + + Justification: + - expire_on_commit=False: objets restent accessibles après commit + - autocommit=False, autoflush=False: contrôle explicite + """ + global _session_factory + + if _session_factory is None: + engine = get_engine(config) + + _session_factory = sessionmaker( + bind=engine, + expire_on_commit=False, # Objets restent accessibles après commit + autocommit=False, # Contrôle explicite du commit + autoflush=False, # Contrôle explicite du flush + ) + + logger.debug("Session factory created") + + return _session_factory + + +@contextmanager +def get_session(config: Optional[AppConfig] = None) -> Generator[Session, None, None]: + """ + Context manager pour session SQLAlchemy. 
+ + Args: + config: Configuration app (utilise get_config() si None) + + Yields: + Session SQLAlchemy + + Usage: + with get_session() as session: + product = session.query(Product).filter_by(reference="B08N5WRWNW").first() + session.commit() + + Justification: + - Context manager: garantit fermeture session (pas de leak) + - Rollback automatique sur exception + - Close automatique en fin de bloc + """ + factory = get_session_factory(config) + session = factory() + + try: + logger.debug("Session opened") + yield session + except Exception as e: + logger.error(f"Session error, rolling back: {e}") + session.rollback() + raise + finally: + logger.debug("Session closed") + session.close() + + +def check_db_connection(config: Optional[AppConfig] = None) -> bool: + """ + Vérifie la connexion à la base de données (health check). + + Args: + config: Configuration app (utilise get_config() si None) + + Returns: + True si connexion OK, False sinon + + Note: + Execute une query simple: SELECT 1 + """ + if config is None: + config = get_config() + + try: + engine = get_engine(config) + + with engine.connect() as conn: + result = conn.execute(text("SELECT 1")) + result.scalar() + + logger.info("Database connection OK") + return True + + except OperationalError as e: + logger.error(f"Database connection failed: {e}") + return False + except SQLAlchemyError as e: + logger.error(f"Database health check failed: {e}") + return False + + +def reset_engine() -> None: + """ + Reset l'engine global (pour tests). + + Note: + Dispose l'engine et reset les singletons. + """ + global _engine, _session_factory + + if _engine is not None: + logger.debug("Disposing database engine") + _engine.dispose() + _engine = None + + _session_factory = None + logger.debug("Engine reset complete") diff --git a/pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc b/pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc new file mode 100755 index 0000000..b9f270e Binary files /dev/null and b/pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc differ diff --git a/pricewatch/app/db/migrations/env.py b/pricewatch/app/db/migrations/env.py new file mode 100755 index 0000000..34cc133 --- /dev/null +++ b/pricewatch/app/db/migrations/env.py @@ -0,0 +1,80 @@ +""" +Configuration Alembic pour PriceWatch. + +Recupere l'URL DB depuis AppConfig pour garantir un setup coherent. +""" + +from logging.config import fileConfig + +from alembic import context +from sqlalchemy import engine_from_config, pool + +from pricewatch.app.core.config import get_config +from pricewatch.app.db.models import Base + +# Alembic Config object +config = context.config + +# Configure logging +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# Metadata SQLAlchemy pour autogenerate +target_metadata = Base.metadata + + +def _get_database_url() -> str: + """Construit l'URL DB depuis la config applicative.""" + app_config = get_config() + return app_config.db.url + + +def run_migrations_offline() -> None: + """ + Execute les migrations en mode offline. + + Configure le contexte avec l'URL DB sans creer d'engine. + """ + url = _get_database_url() + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + compare_type=True, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """ + Execute les migrations en mode online. + + Cree un engine SQLAlchemy et etablit la connexion. 
+ """ + configuration = config.get_section(config.config_ini_section) or {} + configuration["sqlalchemy.url"] = _get_database_url() + + connectable = engine_from_config( + configuration, + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + compare_type=True, + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/pricewatch/app/db/migrations/script.py.mako b/pricewatch/app/db/migrations/script.py.mako new file mode 100755 index 0000000..44417d4 --- /dev/null +++ b/pricewatch/app/db/migrations/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} +""" + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# Revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py b/pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py new file mode 100755 index 0000000..94afcf5 --- /dev/null +++ b/pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py @@ -0,0 +1,124 @@ +"""Initial schema + +Revision ID: 20260114_01 +Revises: None +Create Date: 2026-01-14 00:00:00 +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# Revision identifiers, used by Alembic. 
+revision = "20260114_01" +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "products", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("source", sa.String(length=50), nullable=False), + sa.Column("reference", sa.String(length=100), nullable=False), + sa.Column("url", sa.Text(), nullable=False), + sa.Column("title", sa.Text(), nullable=True), + sa.Column("category", sa.Text(), nullable=True), + sa.Column("currency", sa.String(length=3), nullable=True), + sa.Column("first_seen_at", sa.TIMESTAMP(), nullable=False), + sa.Column("last_updated_at", sa.TIMESTAMP(), nullable=False), + sa.UniqueConstraint("source", "reference", name="uq_product_source_reference"), + ) + op.create_index("ix_product_source", "products", ["source"], unique=False) + op.create_index("ix_product_reference", "products", ["reference"], unique=False) + op.create_index("ix_product_last_updated", "products", ["last_updated_at"], unique=False) + + op.create_table( + "price_history", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("product_id", sa.Integer(), nullable=False), + sa.Column("price", sa.Numeric(10, 2), nullable=True), + sa.Column("shipping_cost", sa.Numeric(10, 2), nullable=True), + sa.Column("stock_status", sa.String(length=20), nullable=True), + sa.Column("fetch_method", sa.String(length=20), nullable=False), + sa.Column("fetch_status", sa.String(length=20), nullable=False), + sa.Column("fetched_at", sa.TIMESTAMP(), nullable=False), + sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"), + sa.UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"), + sa.CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"), + sa.CheckConstraint("fetch_method IN ('http', 'playwright')"), + sa.CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"), + ) + op.create_index("ix_price_history_product_id", "price_history", ["product_id"], unique=False) + op.create_index("ix_price_history_fetched_at", "price_history", ["fetched_at"], unique=False) + + op.create_table( + "product_images", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("product_id", sa.Integer(), nullable=False), + sa.Column("image_url", sa.Text(), nullable=False), + sa.Column("position", sa.Integer(), nullable=False), + sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"), + sa.UniqueConstraint("product_id", "image_url", name="uq_product_image_url"), + ) + op.create_index("ix_product_image_product_id", "product_images", ["product_id"], unique=False) + + op.create_table( + "product_specs", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("product_id", sa.Integer(), nullable=False), + sa.Column("spec_key", sa.String(length=200), nullable=False), + sa.Column("spec_value", sa.Text(), nullable=False), + sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"), + sa.UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"), + ) + op.create_index("ix_product_spec_product_id", "product_specs", ["product_id"], unique=False) + op.create_index("ix_product_spec_key", "product_specs", ["spec_key"], unique=False) + + op.create_table( + "scraping_logs", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("product_id", sa.Integer(), nullable=True), + sa.Column("url", sa.Text(), nullable=False), + sa.Column("source", 
sa.String(length=50), nullable=False), + sa.Column("reference", sa.String(length=100), nullable=True), + sa.Column("fetch_method", sa.String(length=20), nullable=False), + sa.Column("fetch_status", sa.String(length=20), nullable=False), + sa.Column("fetched_at", sa.TIMESTAMP(), nullable=False), + sa.Column("duration_ms", sa.Integer(), nullable=True), + sa.Column("html_size_bytes", sa.Integer(), nullable=True), + sa.Column("errors", postgresql.JSONB(), nullable=True), + sa.Column("notes", postgresql.JSONB(), nullable=True), + sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="SET NULL"), + sa.CheckConstraint("fetch_method IN ('http', 'playwright')"), + sa.CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"), + ) + op.create_index("ix_scraping_log_product_id", "scraping_logs", ["product_id"], unique=False) + op.create_index("ix_scraping_log_source", "scraping_logs", ["source"], unique=False) + op.create_index("ix_scraping_log_fetched_at", "scraping_logs", ["fetched_at"], unique=False) + op.create_index("ix_scraping_log_fetch_status", "scraping_logs", ["fetch_status"], unique=False) + + +def downgrade() -> None: + op.drop_index("ix_scraping_log_fetch_status", table_name="scraping_logs") + op.drop_index("ix_scraping_log_fetched_at", table_name="scraping_logs") + op.drop_index("ix_scraping_log_source", table_name="scraping_logs") + op.drop_index("ix_scraping_log_product_id", table_name="scraping_logs") + op.drop_table("scraping_logs") + + op.drop_index("ix_product_spec_key", table_name="product_specs") + op.drop_index("ix_product_spec_product_id", table_name="product_specs") + op.drop_table("product_specs") + + op.drop_index("ix_product_image_product_id", table_name="product_images") + op.drop_table("product_images") + + op.drop_index("ix_price_history_fetched_at", table_name="price_history") + op.drop_index("ix_price_history_product_id", table_name="price_history") + op.drop_table("price_history") + + op.drop_index("ix_product_last_updated", table_name="products") + op.drop_index("ix_product_reference", table_name="products") + op.drop_index("ix_product_source", table_name="products") + op.drop_table("products") diff --git a/pricewatch/app/db/migrations/versions/__pycache__/20260114_01_initial_schema.cpython-313.pyc b/pricewatch/app/db/migrations/versions/__pycache__/20260114_01_initial_schema.cpython-313.pyc new file mode 100755 index 0000000..8258ddb Binary files /dev/null and b/pricewatch/app/db/migrations/versions/__pycache__/20260114_01_initial_schema.cpython-313.pyc differ diff --git a/pricewatch/app/db/models.py b/pricewatch/app/db/models.py new file mode 100755 index 0000000..096a0cf --- /dev/null +++ b/pricewatch/app/db/models.py @@ -0,0 +1,320 @@ +""" +Modèles SQLAlchemy pour PriceWatch Phase 2. + +Schéma normalisé pour persistence PostgreSQL: +- products: Catalogue produits (déduplication sur source + reference) +- price_history: Historique prix time-series +- product_images: Images produit (N par produit) +- product_specs: Caractéristiques produit (key-value) +- scraping_logs: Logs observabilité pour debugging + +Justification technique: +- Normalisation: products séparée de price_history (catalogue vs time-series) +- Clé naturelle: (source, reference) comme unique constraint (ASIN Amazon, etc.) 
+- Pas de JSONB pour données structurées: tables séparées pour images/specs +- JSONB uniquement pour données variables: errors, notes dans logs +""" + +from datetime import datetime +from decimal import Decimal +from typing import List, Optional + +from sqlalchemy import ( + TIMESTAMP, + CheckConstraint, + Column, + ForeignKey, + Index, + Integer, + JSON, + Numeric, + String, + Text, + UniqueConstraint, +) +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship + + +class Base(DeclarativeBase): + """Base class pour tous les modèles SQLAlchemy.""" + + pass + + +class Product(Base): + """ + Catalogue produits (1 ligne par produit unique). + + Clé naturelle: (source, reference) - Ex: (amazon, B08N5WRWNW) + Mise à jour: title, category, url à chaque scraping + Historique prix: relation 1-N vers PriceHistory + """ + + __tablename__ = "products" + + # Primary key + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Natural key (unique) + source: Mapped[str] = mapped_column( + String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)" + ) + reference: Mapped[str] = mapped_column( + String(100), nullable=False, comment="Product reference (ASIN, SKU, etc.)" + ) + + # Product metadata + url: Mapped[str] = mapped_column(Text, nullable=False, comment="Canonical product URL") + title: Mapped[Optional[str]] = mapped_column(Text, nullable=True, comment="Product title") + category: Mapped[Optional[str]] = mapped_column( + Text, nullable=True, comment="Product category (breadcrumb)" + ) + currency: Mapped[Optional[str]] = mapped_column( + String(3), nullable=True, comment="Currency code (EUR, USD, GBP)" + ) + + # Timestamps + first_seen_at: Mapped[datetime] = mapped_column( + TIMESTAMP, nullable=False, default=datetime.utcnow, comment="First scraping timestamp" + ) + last_updated_at: Mapped[datetime] = mapped_column( + TIMESTAMP, + nullable=False, + default=datetime.utcnow, + onupdate=datetime.utcnow, + comment="Last metadata update", + ) + + # Relationships + price_history: Mapped[List["PriceHistory"]] = relationship( + "PriceHistory", back_populates="product", cascade="all, delete-orphan" + ) + images: Mapped[List["ProductImage"]] = relationship( + "ProductImage", back_populates="product", cascade="all, delete-orphan" + ) + specs: Mapped[List["ProductSpec"]] = relationship( + "ProductSpec", back_populates="product", cascade="all, delete-orphan" + ) + logs: Mapped[List["ScrapingLog"]] = relationship( + "ScrapingLog", back_populates="product", cascade="all, delete-orphan" + ) + + # Constraints + __table_args__ = ( + UniqueConstraint("source", "reference", name="uq_product_source_reference"), + Index("ix_product_source", "source"), + Index("ix_product_reference", "reference"), + Index("ix_product_last_updated", "last_updated_at"), + ) + + def __repr__(self) -> str: + return f"" + + +class PriceHistory(Base): + """ + Historique prix (time-series). + + Une ligne par scraping réussi avec extraction prix. + Unique constraint sur (product_id, fetched_at) évite doublons. 
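
Editor's note (not part of the patch): to illustrate the catalogue/time-series split described in the module docstring, here is a small query sketch. Only the model names come from this patch; the SQLAlchemy 2.0 `select()` style and the `latest_price` helper are illustrative choices.

```python
# Illustrative only: resolve a product by its natural key (source, reference)
# and fetch its most recent price point from the time-series table.
from sqlalchemy import select
from sqlalchemy.orm import Session

from pricewatch.app.db.models import PriceHistory, Product


def latest_price(session: Session, source: str, reference: str) -> PriceHistory | None:
    product = session.execute(
        select(Product).where(Product.source == source, Product.reference == reference)
    ).scalar_one_or_none()
    if product is None:
        return None
    return session.execute(
        select(PriceHistory)
        .where(PriceHistory.product_id == product.id)
        .order_by(PriceHistory.fetched_at.desc())
        .limit(1)
    ).scalar_one_or_none()
```
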
+ """ + + __tablename__ = "price_history" + + # Primary key + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Foreign key + product_id: Mapped[int] = mapped_column( + Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False + ) + + # Price data + price: Mapped[Optional[Decimal]] = mapped_column( + Numeric(10, 2), nullable=True, comment="Product price" + ) + shipping_cost: Mapped[Optional[Decimal]] = mapped_column( + Numeric(10, 2), nullable=True, comment="Shipping cost" + ) + stock_status: Mapped[Optional[str]] = mapped_column( + String(20), nullable=True, comment="Stock status (in_stock, out_of_stock, unknown)" + ) + + # Fetch metadata + fetch_method: Mapped[str] = mapped_column( + String(20), nullable=False, comment="Fetch method (http, playwright)" + ) + fetch_status: Mapped[str] = mapped_column( + String(20), nullable=False, comment="Fetch status (success, partial, failed)" + ) + fetched_at: Mapped[datetime] = mapped_column( + TIMESTAMP, nullable=False, comment="Scraping timestamp" + ) + + # Relationship + product: Mapped["Product"] = relationship("Product", back_populates="price_history") + + # Constraints + __table_args__ = ( + UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"), + Index("ix_price_history_product_id", "product_id"), + Index("ix_price_history_fetched_at", "fetched_at"), + CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"), + CheckConstraint("fetch_method IN ('http', 'playwright')"), + CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"), + ) + + def __repr__(self) -> str: + return f"" + + +class ProductImage(Base): + """ + Images produit (N images par produit). + + Unique constraint sur (product_id, image_url) évite doublons. + Position permet de garder l'ordre des images. + """ + + __tablename__ = "product_images" + + # Primary key + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Foreign key + product_id: Mapped[int] = mapped_column( + Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False + ) + + # Image data + image_url: Mapped[str] = mapped_column(Text, nullable=False, comment="Image URL") + position: Mapped[int] = mapped_column( + Integer, nullable=False, default=0, comment="Image position (0=main)" + ) + + # Relationship + product: Mapped["Product"] = relationship("Product", back_populates="images") + + # Constraints + __table_args__ = ( + UniqueConstraint("product_id", "image_url", name="uq_product_image_url"), + Index("ix_product_image_product_id", "product_id"), + ) + + def __repr__(self) -> str: + return f"" + + +class ProductSpec(Base): + """ + Caractéristiques produit (key-value). + + Unique constraint sur (product_id, spec_key) évite doublons. + Permet queries efficaces par clé. 
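
Editor's note (not part of the patch): a sketch of the kind of per-key query the key-value `product_specs` layout enables without JSON parsing. Model names come from this patch; the query helper is hypothetical.

```python
# Illustrative only: filter products by an exact spec key/value pair.
from sqlalchemy import select
from sqlalchemy.orm import Session

from pricewatch.app.db.models import Product, ProductSpec


def products_with_spec(session: Session, key: str, value: str) -> list[Product]:
    stmt = (
        select(Product)
        .join(ProductSpec, ProductSpec.product_id == Product.id)
        .where(ProductSpec.spec_key == key, ProductSpec.spec_value == value)
    )
    return list(session.execute(stmt).scalars())
```
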
+ """ + + __tablename__ = "product_specs" + + # Primary key + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Foreign key + product_id: Mapped[int] = mapped_column( + Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False + ) + + # Spec data + spec_key: Mapped[str] = mapped_column( + String(200), nullable=False, comment="Specification key (e.g., 'Brand', 'Color')" + ) + spec_value: Mapped[str] = mapped_column(Text, nullable=False, comment="Specification value") + + # Relationship + product: Mapped["Product"] = relationship("Product", back_populates="specs") + + # Constraints + __table_args__ = ( + UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"), + Index("ix_product_spec_product_id", "product_id"), + Index("ix_product_spec_key", "spec_key"), + ) + + def __repr__(self) -> str: + return f"" + + +class ScrapingLog(Base): + """ + Logs observabilité pour debugging. + + FK optionnelle vers products (permet logs même si produit non créé). + JSONB pour errors/notes car structure variable. + Permet analytics: taux succès, durée moyenne, etc. + """ + + __tablename__ = "scraping_logs" + + # Primary key + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Foreign key (optional) + product_id: Mapped[Optional[int]] = mapped_column( + Integer, ForeignKey("products.id", ondelete="SET NULL"), nullable=True + ) + + # Scraping metadata + url: Mapped[str] = mapped_column(Text, nullable=False, comment="Scraped URL") + source: Mapped[str] = mapped_column( + String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)" + ) + reference: Mapped[Optional[str]] = mapped_column( + String(100), nullable=True, comment="Product reference (if extracted)" + ) + + # Fetch metadata + fetch_method: Mapped[str] = mapped_column( + String(20), nullable=False, comment="Fetch method (http, playwright)" + ) + fetch_status: Mapped[str] = mapped_column( + String(20), nullable=False, comment="Fetch status (success, partial, failed)" + ) + fetched_at: Mapped[datetime] = mapped_column( + TIMESTAMP, nullable=False, default=datetime.utcnow, comment="Scraping timestamp" + ) + + # Performance metrics + duration_ms: Mapped[Optional[int]] = mapped_column( + Integer, nullable=True, comment="Fetch duration in milliseconds" + ) + html_size_bytes: Mapped[Optional[int]] = mapped_column( + Integer, nullable=True, comment="HTML response size in bytes" + ) + + # Debug data (JSONB) + errors: Mapped[Optional[list[str]]] = mapped_column( + JSON().with_variant(JSONB, "postgresql"), + nullable=True, + comment="Error messages (list of strings)", + ) + notes: Mapped[Optional[list[str]]] = mapped_column( + JSON().with_variant(JSONB, "postgresql"), + nullable=True, + comment="Debug notes (list of strings)", + ) + + # Relationship + product: Mapped[Optional["Product"]] = relationship("Product", back_populates="logs") + + # Constraints + __table_args__ = ( + Index("ix_scraping_log_product_id", "product_id"), + Index("ix_scraping_log_source", "source"), + Index("ix_scraping_log_fetched_at", "fetched_at"), + Index("ix_scraping_log_fetch_status", "fetch_status"), + CheckConstraint("fetch_method IN ('http', 'playwright')"), + CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"), + ) + + def __repr__(self) -> str: + return f"" diff --git a/pricewatch/app/db/repository.py b/pricewatch/app/db/repository.py new file mode 100755 index 0000000..5474b98 --- /dev/null +++ b/pricewatch/app/db/repository.py @@ -0,0 +1,140 @@ +""" +Repository 
pattern pour la persistence SQLAlchemy. + +Centralise les operations CRUD sur les modeles DB a partir d'un ProductSnapshot. +""" + +from __future__ import annotations + +from typing import Optional + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session + +from pricewatch.app.core.logging import get_logger +from pricewatch.app.core.schema import ProductSnapshot +from pricewatch.app.db.models import PriceHistory, Product, ProductImage, ProductSpec, ScrapingLog + +logger = get_logger("db.repository") + + +class ProductRepository: + """Repository de persistence pour ProductSnapshot.""" + + def __init__(self, session: Session) -> None: + self.session = session + + def get_or_create(self, source: str, reference: str, url: str) -> Product: + """ + Recuperer ou creer un produit par cle naturelle (source, reference). + """ + product = ( + self.session.query(Product) + .filter(Product.source == source, Product.reference == reference) + .one_or_none() + ) + if product: + return product + + product = Product(source=source, reference=reference, url=url) + self.session.add(product) + self.session.flush() + return product + + def update_product_metadata(self, product: Product, snapshot: ProductSnapshot) -> None: + """Met a jour les metadonnees produit si disponibles.""" + if snapshot.url: + product.url = snapshot.url + if snapshot.title: + product.title = snapshot.title + if snapshot.category: + product.category = snapshot.category + if snapshot.currency: + product.currency = snapshot.currency + + def add_price_history(self, product: Product, snapshot: ProductSnapshot) -> Optional[PriceHistory]: + """Ajoute une entree d'historique de prix si inexistante.""" + existing = ( + self.session.query(PriceHistory) + .filter( + PriceHistory.product_id == product.id, + PriceHistory.fetched_at == snapshot.fetched_at, + ) + .one_or_none() + ) + if existing: + return existing + + price_entry = PriceHistory( + product_id=product.id, + price=snapshot.price, + shipping_cost=snapshot.shipping_cost, + stock_status=snapshot.stock_status, + fetch_method=snapshot.debug.method, + fetch_status=snapshot.debug.status, + fetched_at=snapshot.fetched_at, + ) + self.session.add(price_entry) + return price_entry + + def sync_images(self, product: Product, images: list[str]) -> None: + """Synchronise les images (ajout des nouvelles).""" + existing_urls = {image.image_url for image in product.images} + for position, url in enumerate(images): + if url in existing_urls: + continue + self.session.add(ProductImage(product_id=product.id, image_url=url, position=position)) + + def sync_specs(self, product: Product, specs: dict[str, str]) -> None: + """Synchronise les specs (upsert par cle).""" + existing_specs = {spec.spec_key: spec for spec in product.specs} + for key, value in specs.items(): + if key in existing_specs: + existing_specs[key].spec_value = value + else: + self.session.add(ProductSpec(product_id=product.id, spec_key=key, spec_value=value)) + + def add_scraping_log(self, snapshot: ProductSnapshot, product_id: Optional[int]) -> ScrapingLog: + """Ajoute un log de scraping pour observabilite.""" + log_entry = ScrapingLog( + product_id=product_id, + url=snapshot.url, + source=snapshot.source, + reference=snapshot.reference, + fetch_method=snapshot.debug.method, + fetch_status=snapshot.debug.status, + fetched_at=snapshot.fetched_at, + duration_ms=snapshot.debug.duration_ms, + html_size_bytes=snapshot.debug.html_size_bytes, + errors=snapshot.debug.errors or None, + notes=snapshot.debug.notes or None, + ) + 
self.session.add(log_entry) + return log_entry + + def save_snapshot(self, snapshot: ProductSnapshot) -> Optional[int]: + """ + Persiste un ProductSnapshot complet dans la base. + + Retourne l'id produit ou None si reference absente. + """ + if not snapshot.reference: + logger.warning("Reference absente: persistence ignoree") + self.add_scraping_log(snapshot, product_id=None) + return None + + product = self.get_or_create(snapshot.source, snapshot.reference, snapshot.url) + self.update_product_metadata(product, snapshot) + self.add_price_history(product, snapshot) + self.sync_images(product, snapshot.images) + self.sync_specs(product, snapshot.specs) + self.add_scraping_log(snapshot, product_id=product.id) + return product.id + + def safe_save_snapshot(self, snapshot: ProductSnapshot) -> Optional[int]: + """Sauvegarde avec gestion d'erreur SQLAlchemy.""" + try: + return self.save_snapshot(snapshot) + except SQLAlchemyError as exc: + logger.error(f"Erreur SQLAlchemy: {exc}") + raise diff --git a/pricewatch/app/scraping/__init__.py b/pricewatch/app/scraping/__init__.py index e69de29..7afef5d 100755 --- a/pricewatch/app/scraping/__init__.py +++ b/pricewatch/app/scraping/__init__.py @@ -0,0 +1,3 @@ +from pricewatch.app.scraping.pipeline import ScrapingPipeline + +__all__ = ["ScrapingPipeline"] diff --git a/pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc b/pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc index 1895f33..9cc8384 100755 Binary files a/pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc and b/pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc differ diff --git a/pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc b/pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc new file mode 100755 index 0000000..da613bb Binary files /dev/null and b/pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc differ diff --git a/pricewatch/app/scraping/pipeline.py b/pricewatch/app/scraping/pipeline.py new file mode 100755 index 0000000..cbf7865 --- /dev/null +++ b/pricewatch/app/scraping/pipeline.py @@ -0,0 +1,52 @@ +""" +Pipeline de persistence pour les snapshots de scraping. + +Ne doit jamais bloquer le pipeline principal si la DB est indisponible. +""" + +from __future__ import annotations + +from typing import Optional + +from sqlalchemy.exc import SQLAlchemyError + +from pricewatch.app.core.config import AppConfig, get_config +from pricewatch.app.core.logging import get_logger +from pricewatch.app.core.schema import ProductSnapshot +from pricewatch.app.db.connection import get_session +from pricewatch.app.db.repository import ProductRepository + +logger = get_logger("scraping.pipeline") + + +class ScrapingPipeline: + """Orchestration de persistence DB pour un ProductSnapshot.""" + + def __init__(self, config: Optional[AppConfig] = None) -> None: + self.config = config + + def process_snapshot(self, snapshot: ProductSnapshot, save_to_db: bool = True) -> Optional[int]: + """ + Persiste un snapshot en base si active. + + Retourne l'id produit si sauve, sinon None. 
+ """ + app_config = self.config or get_config() + if not save_to_db or not app_config.enable_db: + logger.debug("Persistence DB desactivee") + return None + + try: + with get_session(app_config) as session: + repo = ProductRepository(session) + product_id = repo.safe_save_snapshot(snapshot) + session.commit() + return product_id + except SQLAlchemyError as exc: + snapshot.add_note(f"Persistence DB echouee: {exc}") + logger.error(f"Persistence DB echouee: {exc}") + return None + except Exception as exc: + snapshot.add_note(f"Erreur pipeline DB: {exc}") + logger.error(f"Erreur pipeline DB: {exc}") + return None diff --git a/pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc b/pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc index fc01085..89edb9b 100755 Binary files a/pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc and b/pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc differ diff --git a/pricewatch/app/stores/amazon/store.py b/pricewatch/app/stores/amazon/store.py index 713593b..a2bdaca 100755 --- a/pricewatch/app/stores/amazon/store.py +++ b/pricewatch/app/stores/amazon/store.py @@ -214,6 +214,18 @@ class AmazonStore(BaseStore): except ValueError: continue + # Fallback: chercher les spans séparés a-price-whole et a-price-fraction + whole = soup.select_one("span.a-price-whole") + fraction = soup.select_one("span.a-price-fraction") + if whole and fraction: + whole_text = whole.get_text(strip=True) + fraction_text = fraction.get_text(strip=True) + try: + price_str = f"{whole_text}.{fraction_text}" + return float(price_str) + except ValueError: + pass + debug.errors.append("Prix non trouvé") return None @@ -270,6 +282,14 @@ class AmazonStore(BaseStore): if url and url.startswith("http"): images.append(url) + # Fallback: chercher tous les img tags si aucune image trouvée + if not images: + all_imgs = soup.find_all("img") + for img in all_imgs: + url = img.get("src") or img.get("data-src") + if url and url.startswith("http"): + images.append(url) + return list(set(images)) # Dédupliquer def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]: diff --git a/pricewatch/app/tasks/__init__.py b/pricewatch/app/tasks/__init__.py new file mode 100755 index 0000000..7ffa53d --- /dev/null +++ b/pricewatch/app/tasks/__init__.py @@ -0,0 +1,8 @@ +""" +Module tasks pour les jobs RQ. +""" + +from pricewatch.app.tasks.scrape import scrape_product +from pricewatch.app.tasks.scheduler import ScrapingScheduler + +__all__ = ["scrape_product", "ScrapingScheduler"] diff --git a/pricewatch/app/tasks/scheduler.py b/pricewatch/app/tasks/scheduler.py new file mode 100755 index 0000000..628594c --- /dev/null +++ b/pricewatch/app/tasks/scheduler.py @@ -0,0 +1,75 @@ +""" +Planification des jobs de scraping via RQ Scheduler. 
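
Editor's note (not part of the patch): a sketch of feeding a parsed snapshot through the persistence pipeline defined above. The field values are made up; `process_snapshot()` returns the product id, or None when `enable_db` is off or the DB call fails.

```python
# Illustrative only: persist one snapshot via ScrapingPipeline.
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
from pricewatch.app.scraping.pipeline import ScrapingPipeline

snapshot = ProductSnapshot(
    source="amazon",
    url="https://www.amazon.fr/dp/B08N5WRWNW",
    title="Exemple de produit",
    price=199.99,
    currency="EUR",
    reference="B08N5WRWNW",
    debug=DebugInfo(method=FetchMethod.HTTP, status=DebugStatus.SUCCESS),
)

product_id = ScrapingPipeline().process_snapshot(snapshot, save_to_db=True)
print(product_id)  # None when persistence is disabled or the database is unreachable
```
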
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Optional + +import redis +from rq import Queue +from rq_scheduler import Scheduler + +from pricewatch.app.core.config import AppConfig, get_config +from pricewatch.app.core.logging import get_logger +from pricewatch.app.tasks.scrape import scrape_product + +logger = get_logger("tasks.scheduler") + + +@dataclass +class ScheduledJobInfo: + """Infos de retour pour un job planifie.""" + + job_id: str + next_run: datetime + + +class ScrapingScheduler: + """Scheduler pour les jobs de scraping avec RQ.""" + + def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None: + self.config = config or get_config() + self.redis = redis.from_url(self.config.redis.url) + self.queue = Queue(queue_name, connection=self.redis) + self.scheduler = Scheduler(queue=self.queue, connection=self.redis) + + def enqueue_immediate( + self, + url: str, + use_playwright: Optional[bool] = None, + save_db: bool = True, + ): + """Enqueue un job immediat.""" + job = self.queue.enqueue( + scrape_product, + url, + use_playwright=use_playwright, + save_db=save_db, + ) + logger.info(f"Job enqueued: {job.id}") + return job + + def schedule_product( + self, + url: str, + interval_hours: int = 24, + use_playwright: Optional[bool] = None, + save_db: bool = True, + ) -> ScheduledJobInfo: + """Planifie un scraping recurrent (intervalle en heures).""" + interval_seconds = int(timedelta(hours=interval_hours).total_seconds()) + next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds) + + job = self.scheduler.schedule( + scheduled_time=next_run, + func=scrape_product, + args=[url], + kwargs={"use_playwright": use_playwright, "save_db": save_db}, + interval=interval_seconds, + repeat=None, + ) + logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}") + return ScheduledJobInfo(job_id=job.id, next_run=next_run) diff --git a/pricewatch/app/tasks/scrape.py b/pricewatch/app/tasks/scrape.py new file mode 100755 index 0000000..3db721a --- /dev/null +++ b/pricewatch/app/tasks/scrape.py @@ -0,0 +1,160 @@ +""" +Tache de scraping asynchrone pour RQ. 
+""" + +from __future__ import annotations + +from typing import Any, Optional + +from pricewatch.app.core.config import AppConfig, get_config +from pricewatch.app.core.logging import get_logger +from pricewatch.app.core.registry import get_registry +from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot +from pricewatch.app.scraping.http_fetch import fetch_http +from pricewatch.app.scraping.pipeline import ScrapingPipeline +from pricewatch.app.scraping.pw_fetch import fetch_playwright +from pricewatch.app.stores.aliexpress.store import AliexpressStore +from pricewatch.app.stores.amazon.store import AmazonStore +from pricewatch.app.stores.backmarket.store import BackmarketStore +from pricewatch.app.stores.cdiscount.store import CdiscountStore + +logger = get_logger("tasks.scrape") + + +def setup_stores() -> None: + """Enregistre les stores disponibles si besoin.""" + registry = get_registry() + if registry.list_stores(): + return + registry.register(AmazonStore()) + registry.register(CdiscountStore()) + registry.register(BackmarketStore()) + registry.register(AliexpressStore()) + + +def scrape_product( + url: str, + use_playwright: Optional[bool] = None, + save_db: bool = True, + save_html: bool = False, + save_screenshot: bool = False, + headful: bool = False, + timeout_ms: Optional[int] = None, +) -> dict[str, Any]: + """ + Scrape un produit et persiste en base via ScrapingPipeline. + + Retourne un dict avec success, product_id, snapshot, error. + """ + config: AppConfig = get_config() + setup_stores() + + if use_playwright is None: + use_playwright = config.default_use_playwright + + if timeout_ms is None: + timeout_ms = config.default_playwright_timeout + + registry = get_registry() + store = registry.detect_store(url) + if not store: + snapshot = ProductSnapshot( + source="unknown", + url=url, + debug=DebugInfo( + method=FetchMethod.HTTP, + status=DebugStatus.FAILED, + errors=["Aucun store detecte"], + ), + ) + ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db) + return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"} + + canonical_url = store.canonicalize(url) + + html = None + fetch_method = FetchMethod.HTTP + fetch_error = None + duration_ms = None + html_size_bytes = None + pw_result = None + + http_result = fetch_http(canonical_url) + duration_ms = http_result.duration_ms + + if http_result.success: + html = http_result.html + fetch_method = FetchMethod.HTTP + elif use_playwright: + pw_result = fetch_playwright( + canonical_url, + headless=not headful, + timeout_ms=timeout_ms, + save_screenshot=save_screenshot, + ) + duration_ms = pw_result.duration_ms + + if pw_result.success: + html = pw_result.html + fetch_method = FetchMethod.PLAYWRIGHT + else: + fetch_error = pw_result.error + else: + fetch_error = http_result.error + + if html: + html_size_bytes = len(html.encode("utf-8")) + if save_html: + from pricewatch.app.core.io import save_debug_html + + ref = store.extract_reference(canonical_url) or "unknown" + save_debug_html(html, f"{store.store_id}_{ref}") + + if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result: + from pricewatch.app.core.io import save_debug_screenshot + + if pw_result and pw_result.screenshot: + ref = store.extract_reference(canonical_url) or "unknown" + save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}") + + try: + snapshot = store.parse(html, canonical_url) + snapshot.debug.method = fetch_method + 
snapshot.debug.duration_ms = duration_ms + snapshot.debug.html_size_bytes = html_size_bytes + success = snapshot.debug.status != DebugStatus.FAILED + except Exception as exc: + snapshot = ProductSnapshot( + source=store.store_id, + url=canonical_url, + debug=DebugInfo( + method=fetch_method, + status=DebugStatus.FAILED, + errors=[f"Parsing failed: {exc}"], + duration_ms=duration_ms, + html_size_bytes=html_size_bytes, + ), + ) + success = False + fetch_error = str(exc) + else: + snapshot = ProductSnapshot( + source=store.store_id, + url=canonical_url, + debug=DebugInfo( + method=fetch_method, + status=DebugStatus.FAILED, + errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"], + duration_ms=duration_ms, + ), + ) + success = False + + product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db) + + return { + "success": success, + "product_id": product_id, + "snapshot": snapshot, + "error": fetch_error, + } diff --git a/pyproject.toml b/pyproject.toml index 9e92604..4697124 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,19 @@ dependencies = [ # Date/time utilities "python-dateutil>=2.8.2", + + # Database (Phase 2) + "sqlalchemy>=2.0.0", + "psycopg2-binary>=2.9.0", + "alembic>=1.13.0", + + # Configuration (Phase 2) + "python-dotenv>=1.0.0", + + # Worker/Queue (Phase 2) + "redis>=5.0.0", + "rq>=1.15.0", + "rq-scheduler>=0.13.0", ] [project.optional-dependencies] diff --git a/tests/cli/__pycache__/test_run_db.cpython-313-pytest-9.0.2.pyc b/tests/cli/__pycache__/test_run_db.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..10efd14 Binary files /dev/null and b/tests/cli/__pycache__/test_run_db.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/cli/test_run_db.py b/tests/cli/test_run_db.py new file mode 100755 index 0000000..b22274c --- /dev/null +++ b/tests/cli/test_run_db.py @@ -0,0 +1,106 @@ +""" +Tests end-to-end pour la commande CLI run avec persistence DB. 
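
Editor's note (not part of the patch): `scrape_product()` can be called synchronously for debugging or pushed onto an RQ queue for a worker started with `rq worker default`. The sketch below performs a real network fetch when run; the Redis URL is an assumption, not something this patch defines.

```python
# Illustrative only: direct call vs. queued call of the RQ task above.
from redis import Redis
from rq import Queue

from pricewatch.app.tasks.scrape import scrape_product

# Direct, blocking call (no worker needed):
result = scrape_product("https://www.amazon.fr/dp/B08N5WRWNW", save_db=False)
print(result["success"], result["product_id"], result["error"])

# Queued call, executed later by an RQ worker sharing the same Redis instance:
queue = Queue("default", connection=Redis.from_url("redis://localhost:6379/0"))
queue.enqueue(scrape_product, "https://www.amazon.fr/dp/B08N5WRWNW")
```
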
+""" + +from dataclasses import dataclass +from pathlib import Path + +from typer.testing import CliRunner + +from pricewatch.app.cli import main as cli_main +from pricewatch.app.core.registry import get_registry +from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot +from pricewatch.app.db.connection import get_session, init_db, reset_engine +from pricewatch.app.db.models import Product +from pricewatch.app.stores.base import BaseStore + + +@dataclass +class FakeDbConfig: + url: str + + +@dataclass +class FakeAppConfig: + db: FakeDbConfig + debug: bool = False + enable_db: bool = True + + +class DummyStore(BaseStore): + def __init__(self) -> None: + super().__init__(store_id="dummy") + + def match(self, url: str) -> float: + return 1.0 if "example.com" in url else 0.0 + + def canonicalize(self, url: str) -> str: + return url + + def extract_reference(self, url: str) -> str | None: + return "REF123" + + def parse(self, html: str, url: str) -> ProductSnapshot: + return ProductSnapshot( + source=self.store_id, + url=url, + title="Produit dummy", + price=9.99, + currency="EUR", + reference="REF123", + debug=DebugInfo(method=FetchMethod.HTTP, status=DebugStatus.SUCCESS), + ) + + +class DummyFetchResult: + def __init__(self, html: str) -> None: + self.success = True + self.html = html + self.error = None + + +def test_cli_run_persists_db(tmp_path, monkeypatch): + """Le CLI run persiste en base quand --save-db est active.""" + reset_engine() + db_path = tmp_path / "test.db" + config = FakeAppConfig(db=FakeDbConfig(url=f"sqlite:///{db_path}")) + init_db(config) + + yaml_path = tmp_path / "config.yaml" + out_path = tmp_path / "out.json" + yaml_path.write_text( + """ +urls: + - "https://example.com/product" +options: + use_playwright: false + save_html: false + save_screenshot: false +""", + encoding="utf-8", + ) + + registry = get_registry() + previous_stores = list(registry._stores) + registry._stores = [] + registry.register(DummyStore()) + + monkeypatch.setattr(cli_main, "get_config", lambda: config) + monkeypatch.setattr(cli_main, "setup_stores", lambda: None) + monkeypatch.setattr(cli_main, "fetch_http", lambda url: DummyFetchResult("")) + + runner = CliRunner() + try: + result = runner.invoke( + cli_main.app, + ["run", "--yaml", str(yaml_path), "--out", str(out_path), "--save-db"], + ) + finally: + registry._stores = previous_stores + reset_engine() + + assert result.exit_code == 0 + assert out_path.exists() + + with get_session(config) as session: + assert session.query(Product).count() == 1 diff --git a/tests/core/__pycache__/test_io.cpython-313-pytest-9.0.2.pyc b/tests/core/__pycache__/test_io.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..88a79f9 Binary files /dev/null and b/tests/core/__pycache__/test_io.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/core/__pycache__/test_registry_integration.cpython-313-pytest-9.0.2.pyc b/tests/core/__pycache__/test_registry_integration.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..0f51b6b Binary files /dev/null and b/tests/core/__pycache__/test_registry_integration.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/core/test_io.py b/tests/core/test_io.py new file mode 100755 index 0000000..db2edcb --- /dev/null +++ b/tests/core/test_io.py @@ -0,0 +1,462 @@ +""" +Tests pour pricewatch.app.core.io + +Teste la lecture/écriture YAML/JSON et les fonctions de sauvegarde debug. 
+""" + +import json +import tempfile +from datetime import datetime +from pathlib import Path + +import pytest +import yaml + +from pricewatch.app.core.io import ( + ScrapingConfig, + ScrapingOptions, + read_json_results, + read_yaml_config, + save_debug_html, + save_debug_screenshot, + write_json_results, +) +from pricewatch.app.core.schema import ( + DebugInfo, + DebugStatus, + FetchMethod, + ProductSnapshot, + StockStatus, +) + + +class TestScrapingOptions: + """Tests pour le modèle ScrapingOptions.""" + + def test_default_values(self): + """Les valeurs par défaut sont correctes.""" + options = ScrapingOptions() + assert options.use_playwright is True + assert options.headful is False + assert options.save_html is True + assert options.save_screenshot is True + assert options.timeout_ms == 60000 + + def test_custom_values(self): + """Les valeurs personnalisées sont acceptées.""" + options = ScrapingOptions( + use_playwright=False, + headful=True, + save_html=False, + save_screenshot=False, + timeout_ms=30000, + ) + assert options.use_playwright is False + assert options.headful is True + assert options.save_html is False + assert options.save_screenshot is False + assert options.timeout_ms == 30000 + + def test_timeout_validation_min(self): + """Timeout inférieur à 1000ms est rejeté.""" + with pytest.raises(ValueError): + ScrapingOptions(timeout_ms=500) + + def test_timeout_validation_valid(self): + """Timeout >= 1000ms est accepté.""" + options = ScrapingOptions(timeout_ms=1000) + assert options.timeout_ms == 1000 + + +class TestScrapingConfig: + """Tests pour le modèle ScrapingConfig.""" + + def test_minimal_config(self): + """Config minimale avec URLs uniquement.""" + config = ScrapingConfig(urls=["https://example.com"]) + assert len(config.urls) == 1 + assert config.urls[0] == "https://example.com" + assert isinstance(config.options, ScrapingOptions) + + def test_config_with_options(self): + """Config avec URLs et options.""" + options = ScrapingOptions(use_playwright=False, timeout_ms=10000) + config = ScrapingConfig( + urls=["https://example.com", "https://test.com"], options=options + ) + assert len(config.urls) == 2 + assert config.options.use_playwright is False + assert config.options.timeout_ms == 10000 + + def test_validate_urls_empty_list(self): + """Liste d'URLs vide est rejetée.""" + with pytest.raises(ValueError, match="Au moins une URL"): + ScrapingConfig(urls=[]) + + def test_validate_urls_strips_whitespace(self): + """Les espaces sont nettoyés.""" + config = ScrapingConfig(urls=[" https://example.com ", "https://test.com"]) + assert config.urls == ["https://example.com", "https://test.com"] + + def test_validate_urls_removes_empty(self): + """Les URLs vides sont supprimées.""" + config = ScrapingConfig( + urls=["https://example.com", "", " ", "https://test.com"] + ) + assert len(config.urls) == 2 + assert config.urls == ["https://example.com", "https://test.com"] + + def test_validate_urls_all_empty(self): + """Si toutes les URLs sont vides, erreur.""" + with pytest.raises(ValueError, match="Aucune URL valide"): + ScrapingConfig(urls=["", " ", "\t"]) + + +class TestReadYamlConfig: + """Tests pour read_yaml_config().""" + + def test_read_valid_yaml(self, tmp_path): + """Lit un fichier YAML valide.""" + yaml_path = tmp_path / "config.yaml" + yaml_content = { + "urls": ["https://example.com", "https://test.com"], + "options": {"use_playwright": False, "timeout_ms": 30000}, + } + with open(yaml_path, "w") as f: + yaml.dump(yaml_content, f) + + config = 
read_yaml_config(yaml_path) + assert len(config.urls) == 2 + assert config.urls[0] == "https://example.com" + assert config.options.use_playwright is False + assert config.options.timeout_ms == 30000 + + def test_read_yaml_minimal(self, tmp_path): + """Lit un YAML minimal (URLs uniquement).""" + yaml_path = tmp_path / "config.yaml" + yaml_content = {"urls": ["https://example.com"]} + with open(yaml_path, "w") as f: + yaml.dump(yaml_content, f) + + config = read_yaml_config(yaml_path) + assert len(config.urls) == 1 + # Options par défaut + assert config.options.use_playwright is True + assert config.options.timeout_ms == 60000 + + def test_read_yaml_file_not_found(self, tmp_path): + """Fichier introuvable lève FileNotFoundError.""" + yaml_path = tmp_path / "nonexistent.yaml" + with pytest.raises(FileNotFoundError): + read_yaml_config(yaml_path) + + def test_read_yaml_empty_file(self, tmp_path): + """Fichier YAML vide lève ValueError.""" + yaml_path = tmp_path / "empty.yaml" + yaml_path.write_text("") + + with pytest.raises(ValueError, match="Fichier YAML vide"): + read_yaml_config(yaml_path) + + def test_read_yaml_invalid_syntax(self, tmp_path): + """YAML avec syntaxe invalide lève ValueError.""" + yaml_path = tmp_path / "invalid.yaml" + yaml_path.write_text("urls: [invalid yaml syntax") + + with pytest.raises(ValueError, match="YAML invalide"): + read_yaml_config(yaml_path) + + def test_read_yaml_missing_urls(self, tmp_path): + """YAML sans champ 'urls' lève erreur de validation.""" + yaml_path = tmp_path / "config.yaml" + yaml_content = {"options": {"use_playwright": False}} + with open(yaml_path, "w") as f: + yaml.dump(yaml_content, f) + + with pytest.raises(Exception): # Pydantic validation error + read_yaml_config(yaml_path) + + def test_read_yaml_accepts_path_string(self, tmp_path): + """Accepte un string comme chemin.""" + yaml_path = tmp_path / "config.yaml" + yaml_content = {"urls": ["https://example.com"]} + with open(yaml_path, "w") as f: + yaml.dump(yaml_content, f) + + config = read_yaml_config(str(yaml_path)) + assert len(config.urls) == 1 + + +class TestWriteJsonResults: + """Tests pour write_json_results().""" + + @pytest.fixture + def sample_snapshot(self) -> ProductSnapshot: + """Fixture: ProductSnapshot exemple.""" + return ProductSnapshot( + source="test", + url="https://example.com/product", + fetched_at=datetime(2024, 1, 1, 12, 0, 0), + title="Test Product", + price=99.99, + currency="EUR", + stock_status=StockStatus.IN_STOCK, + reference="TEST123", + images=["https://example.com/img1.jpg"], + category="Test Category", + specs={"Brand": "TestBrand"}, + debug=DebugInfo( + method=FetchMethod.HTTP, + status=DebugStatus.SUCCESS, + errors=[], + notes=[], + ), + ) + + def test_write_single_snapshot(self, tmp_path, sample_snapshot): + """Écrit un seul snapshot.""" + json_path = tmp_path / "results.json" + write_json_results([sample_snapshot], json_path) + + assert json_path.exists() + + # Vérifier le contenu + with open(json_path) as f: + data = json.load(f) + + assert isinstance(data, list) + assert len(data) == 1 + assert data[0]["source"] == "test" + assert data[0]["title"] == "Test Product" + + def test_write_multiple_snapshots(self, tmp_path, sample_snapshot): + """Écrit plusieurs snapshots.""" + snapshot2 = ProductSnapshot( + source="test2", + url="https://example.com/product2", + fetched_at=datetime(2024, 1, 2, 12, 0, 0), + title="Test Product 2", + price=49.99, + currency="EUR", + stock_status=StockStatus.OUT_OF_STOCK, + debug=DebugInfo( + 
method=FetchMethod.PLAYWRIGHT, + status=DebugStatus.PARTIAL, + errors=["Test error"], + notes=[], + ), + ) + + json_path = tmp_path / "results.json" + write_json_results([sample_snapshot, snapshot2], json_path) + + with open(json_path) as f: + data = json.load(f) + + assert len(data) == 2 + assert data[0]["source"] == "test" + assert data[1]["source"] == "test2" + + def test_write_creates_parent_dirs(self, tmp_path, sample_snapshot): + """Crée les dossiers parents si nécessaire.""" + json_path = tmp_path / "sub" / "dir" / "results.json" + write_json_results([sample_snapshot], json_path) + + assert json_path.exists() + assert json_path.parent.exists() + + def test_write_empty_list(self, tmp_path): + """Écrit une liste vide.""" + json_path = tmp_path / "empty.json" + write_json_results([], json_path) + + assert json_path.exists() + + with open(json_path) as f: + data = json.load(f) + + assert data == [] + + def test_write_indent_control(self, tmp_path, sample_snapshot): + """Contrôle l'indentation.""" + # Avec indent + json_path1 = tmp_path / "pretty.json" + write_json_results([sample_snapshot], json_path1, indent=2) + content1 = json_path1.read_text() + assert "\n" in content1 # Pretty-printed + + # Sans indent (compact) + json_path2 = tmp_path / "compact.json" + write_json_results([sample_snapshot], json_path2, indent=None) + content2 = json_path2.read_text() + assert len(content2) < len(content1) # Plus compact + + def test_write_accepts_path_string(self, tmp_path, sample_snapshot): + """Accepte un string comme chemin.""" + json_path = tmp_path / "results.json" + write_json_results([sample_snapshot], str(json_path)) + assert json_path.exists() + + +class TestReadJsonResults: + """Tests pour read_json_results().""" + + @pytest.fixture + def json_file_with_snapshot(self, tmp_path) -> Path: + """Fixture: Fichier JSON avec un snapshot.""" + json_path = tmp_path / "results.json" + snapshot_data = { + "source": "test", + "url": "https://example.com/product", + "fetched_at": "2024-01-01T12:00:00", + "title": "Test Product", + "price": 99.99, + "currency": "EUR", + "shipping_cost": None, + "stock_status": "in_stock", + "reference": "TEST123", + "images": ["https://example.com/img.jpg"], + "category": "Test", + "specs": {"Brand": "Test"}, + "debug": { + "method": "http", + "status": "success", + "errors": [], + "notes": [], + "duration_ms": None, + "html_size_bytes": None, + }, + } + + with open(json_path, "w") as f: + json.dump([snapshot_data], f) + + return json_path + + def test_read_single_snapshot(self, json_file_with_snapshot): + """Lit un fichier avec un snapshot.""" + snapshots = read_json_results(json_file_with_snapshot) + + assert len(snapshots) == 1 + assert isinstance(snapshots[0], ProductSnapshot) + assert snapshots[0].source == "test" + assert snapshots[0].title == "Test Product" + assert snapshots[0].price == 99.99 + + def test_read_file_not_found(self, tmp_path): + """Fichier introuvable lève FileNotFoundError.""" + json_path = tmp_path / "nonexistent.json" + with pytest.raises(FileNotFoundError): + read_json_results(json_path) + + def test_read_invalid_json(self, tmp_path): + """JSON invalide lève ValueError.""" + json_path = tmp_path / "invalid.json" + json_path.write_text("{invalid json") + + with pytest.raises(ValueError, match="JSON invalide"): + read_json_results(json_path) + + def test_read_not_a_list(self, tmp_path): + """JSON qui n'est pas une liste lève ValueError.""" + json_path = tmp_path / "notlist.json" + with open(json_path, "w") as f: + json.dump({"key": "value"}, 
f) + + with pytest.raises(ValueError, match="doit contenir une liste"): + read_json_results(json_path) + + def test_read_empty_list(self, tmp_path): + """Liste vide est acceptée.""" + json_path = tmp_path / "empty.json" + with open(json_path, "w") as f: + json.dump([], f) + + snapshots = read_json_results(json_path) + assert snapshots == [] + + def test_read_accepts_path_string(self, json_file_with_snapshot): + """Accepte un string comme chemin.""" + snapshots = read_json_results(str(json_file_with_snapshot)) + assert len(snapshots) == 1 + + +class TestSaveDebugHtml: + """Tests pour save_debug_html().""" + + def test_save_html_default_dir(self, tmp_path, monkeypatch): + """Sauvegarde HTML dans le dossier par défaut.""" + # Changer le répertoire de travail pour le test + monkeypatch.chdir(tmp_path) + + html = "Test" + result_path = save_debug_html(html, "test_page") + + assert result_path.exists() + assert result_path.name == "test_page.html" + assert result_path.read_text(encoding="utf-8") == html + + def test_save_html_custom_dir(self, tmp_path): + """Sauvegarde HTML dans un dossier personnalisé.""" + output_dir = tmp_path / "debug_html" + html = "Test" + + result_path = save_debug_html(html, "test_page", output_dir) + + assert result_path.parent == output_dir + assert result_path.name == "test_page.html" + assert result_path.read_text(encoding="utf-8") == html + + def test_save_html_creates_dir(self, tmp_path): + """Crée le dossier de sortie s'il n'existe pas.""" + output_dir = tmp_path / "sub" / "dir" / "html" + html = "Test" + + result_path = save_debug_html(html, "test_page", output_dir) + + assert output_dir.exists() + assert result_path.exists() + + def test_save_html_large_content(self, tmp_path): + """Sauvegarde du HTML volumineux.""" + html = "" + ("x" * 100000) + "" + result_path = save_debug_html(html, "large_page", tmp_path) + + assert result_path.exists() + assert len(result_path.read_text(encoding="utf-8")) == len(html) + + +class TestSaveDebugScreenshot: + """Tests pour save_debug_screenshot().""" + + def test_save_screenshot_default_dir(self, tmp_path, monkeypatch): + """Sauvegarde screenshot dans le dossier par défaut.""" + monkeypatch.chdir(tmp_path) + + screenshot_bytes = b"\x89PNG fake image data" + result_path = save_debug_screenshot(screenshot_bytes, "test_screenshot") + + assert result_path.exists() + assert result_path.name == "test_screenshot.png" + assert result_path.read_bytes() == screenshot_bytes + + def test_save_screenshot_custom_dir(self, tmp_path): + """Sauvegarde screenshot dans un dossier personnalisé.""" + output_dir = tmp_path / "screenshots" + screenshot_bytes = b"\x89PNG fake image data" + + result_path = save_debug_screenshot(screenshot_bytes, "test_screenshot", output_dir) + + assert result_path.parent == output_dir + assert result_path.name == "test_screenshot.png" + assert result_path.read_bytes() == screenshot_bytes + + def test_save_screenshot_creates_dir(self, tmp_path): + """Crée le dossier de sortie s'il n'existe pas.""" + output_dir = tmp_path / "sub" / "dir" / "screenshots" + screenshot_bytes = b"\x89PNG fake image data" + + result_path = save_debug_screenshot(screenshot_bytes, "test_screenshot", output_dir) + + assert output_dir.exists() + assert result_path.exists() diff --git a/tests/core/test_registry_integration.py b/tests/core/test_registry_integration.py new file mode 100755 index 0000000..a8f6076 --- /dev/null +++ b/tests/core/test_registry_integration.py @@ -0,0 +1,174 @@ +""" +Tests d'intégration pour le registry avec les stores 
réels. + +Teste la détection automatique du bon store pour des URLs +Amazon, Cdiscount, Backmarket et AliExpress. +""" + +import pytest + +from pricewatch.app.core.registry import StoreRegistry +from pricewatch.app.stores.amazon.store import AmazonStore +from pricewatch.app.stores.cdiscount.store import CdiscountStore +from pricewatch.app.stores.backmarket.store import BackmarketStore +from pricewatch.app.stores.aliexpress.store import AliexpressStore + + +class TestRegistryRealStores: + """Tests d'intégration avec les 4 stores réels.""" + + @pytest.fixture + def registry_with_all_stores(self) -> StoreRegistry: + """Fixture: Registry avec les 4 stores réels enregistrés.""" + registry = StoreRegistry() + registry.register(AmazonStore()) + registry.register(CdiscountStore()) + registry.register(BackmarketStore()) + registry.register(AliexpressStore()) + return registry + + def test_all_stores_registered(self, registry_with_all_stores): + """Vérifie que les 4 stores sont enregistrés.""" + assert len(registry_with_all_stores) == 4 + stores = registry_with_all_stores.list_stores() + assert "amazon" in stores + assert "cdiscount" in stores + assert "backmarket" in stores + assert "aliexpress" in stores + + def test_detect_amazon_fr(self, registry_with_all_stores): + """Détecte Amazon.fr correctement.""" + url = "https://www.amazon.fr/dp/B08N5WRWNW" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "amazon" + + def test_detect_amazon_com(self, registry_with_all_stores): + """Détecte Amazon.com correctement.""" + url = "https://www.amazon.com/dp/B08N5WRWNW" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "amazon" + + def test_detect_amazon_with_product_name(self, registry_with_all_stores): + """Détecte Amazon avec nom de produit dans l'URL.""" + url = "https://www.amazon.fr/Product-Name-Here/dp/B08N5WRWNW/ref=sr_1_1" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "amazon" + + def test_detect_cdiscount(self, registry_with_all_stores): + """Détecte Cdiscount correctement.""" + url = "https://www.cdiscount.com/informatique/clavier-souris-webcam/example/f-1070123-example.html" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "cdiscount" + + def test_detect_backmarket(self, registry_with_all_stores): + """Détecte Backmarket correctement.""" + url = "https://www.backmarket.fr/fr-fr/p/iphone-15-pro" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "backmarket" + + def test_detect_backmarket_locale_en(self, registry_with_all_stores): + """Détecte Backmarket avec locale anglais.""" + url = "https://www.backmarket.fr/en-fr/p/macbook-air-15-2024" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "backmarket" + + def test_detect_aliexpress_fr(self, registry_with_all_stores): + """Détecte AliExpress.fr correctement.""" + url = "https://fr.aliexpress.com/item/1005007187023722.html" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert store.store_id == "aliexpress" + + def test_detect_aliexpress_com(self, registry_with_all_stores): + """Détecte AliExpress.com correctement.""" + url = "https://www.aliexpress.com/item/1005007187023722.html" + store = registry_with_all_stores.detect_store(url) + assert store is not None + assert 
store.store_id == "aliexpress" + + def test_detect_unknown_store(self, registry_with_all_stores): + """URL inconnue retourne None.""" + url = "https://www.ebay.com/itm/123456789" + store = registry_with_all_stores.detect_store(url) + assert store is None + + def test_detect_invalid_url(self, registry_with_all_stores): + """URL invalide retourne None.""" + url = "not-a-valid-url" + store = registry_with_all_stores.detect_store(url) + assert store is None + + def test_detect_priority_amazon_over_others(self, registry_with_all_stores): + """Amazon.fr doit avoir le meilleur score pour ses URLs.""" + url = "https://www.amazon.fr/dp/B08N5WRWNW" + store = registry_with_all_stores.detect_store(url) + # Amazon.fr devrait avoir score 0.9, les autres 0.0 + assert store.store_id == "amazon" + + def test_each_store_matches_only_own_urls(self, registry_with_all_stores): + """Chaque store ne matche que ses propres URLs.""" + test_cases = [ + ("https://www.amazon.fr/dp/B08N5WRWNW", "amazon"), + ("https://www.cdiscount.com/product", "cdiscount"), + ("https://www.backmarket.fr/fr-fr/p/product", "backmarket"), + ("https://fr.aliexpress.com/item/12345.html", "aliexpress"), + ] + + for url, expected_store_id in test_cases: + store = registry_with_all_stores.detect_store(url) + assert store is not None, f"Aucun store détecté pour {url}" + assert store.store_id == expected_store_id, ( + f"Mauvais store pour {url}: " + f"attendu {expected_store_id}, obtenu {store.store_id}" + ) + + def test_get_store_by_id(self, registry_with_all_stores): + """Récupère chaque store par son ID.""" + amazon = registry_with_all_stores.get_store("amazon") + assert amazon is not None + assert isinstance(amazon, AmazonStore) + + cdiscount = registry_with_all_stores.get_store("cdiscount") + assert cdiscount is not None + assert isinstance(cdiscount, CdiscountStore) + + backmarket = registry_with_all_stores.get_store("backmarket") + assert backmarket is not None + assert isinstance(backmarket, BackmarketStore) + + aliexpress = registry_with_all_stores.get_store("aliexpress") + assert aliexpress is not None + assert isinstance(aliexpress, AliexpressStore) + + def test_unregister_store(self, registry_with_all_stores): + """Désenregistre un store et vérifie qu'il n'est plus détecté.""" + assert len(registry_with_all_stores) == 4 + + # Désenregistrer Amazon + removed = registry_with_all_stores.unregister("amazon") + assert removed is True + assert len(registry_with_all_stores) == 3 + + # Amazon ne doit plus être détecté + store = registry_with_all_stores.detect_store("https://www.amazon.fr/dp/B08N5WRWNW") + assert store is None + + # Les autres stores doivent toujours fonctionner + store = registry_with_all_stores.detect_store("https://www.cdiscount.com/product") + assert store is not None + assert store.store_id == "cdiscount" + + def test_repr_includes_all_stores(self, registry_with_all_stores): + """La représentation string inclut tous les stores.""" + repr_str = repr(registry_with_all_stores) + assert "StoreRegistry" in repr_str + assert "amazon" in repr_str + assert "cdiscount" in repr_str + assert "backmarket" in repr_str + assert "aliexpress" in repr_str diff --git a/tests/db/__pycache__/test_connection.cpython-313-pytest-9.0.2.pyc b/tests/db/__pycache__/test_connection.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..e883ddc Binary files /dev/null and b/tests/db/__pycache__/test_connection.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/db/__pycache__/test_models.cpython-313-pytest-9.0.2.pyc 
b/tests/db/__pycache__/test_models.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..4dfdd8d Binary files /dev/null and b/tests/db/__pycache__/test_models.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/db/__pycache__/test_repository.cpython-313-pytest-9.0.2.pyc b/tests/db/__pycache__/test_repository.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..83e10c2 Binary files /dev/null and b/tests/db/__pycache__/test_repository.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/db/test_connection.py b/tests/db/test_connection.py new file mode 100755 index 0000000..3b45792 --- /dev/null +++ b/tests/db/test_connection.py @@ -0,0 +1,87 @@ +""" +Tests pour la couche de connexion SQLAlchemy. +""" + +from dataclasses import dataclass + +import pytest +from sqlalchemy import inspect + +from pricewatch.app.db.connection import ( + check_db_connection, + get_engine, + get_session, + init_db, + reset_engine, +) +from pricewatch.app.db.models import Product + + +@dataclass +class FakeDbConfig: + """Config DB minimale pour tests SQLite.""" + + url: str + host: str = "sqlite" + port: int = 0 + database: str = ":memory:" + + +@dataclass +class FakeAppConfig: + """Config App minimale pour tests.""" + + db: FakeDbConfig + debug: bool = False + + +@pytest.fixture(autouse=True) +def reset_db_engine(): + """Reset l'engine global entre les tests.""" + reset_engine() + yield + reset_engine() + + +@pytest.fixture +def sqlite_config() -> FakeAppConfig: + """Config SQLite in-memory pour tests.""" + return FakeAppConfig(db=FakeDbConfig(url="sqlite:///:memory:")) + + +def test_get_engine_sqlite(sqlite_config: FakeAppConfig): + """Cree un engine SQLite fonctionnel.""" + engine = get_engine(sqlite_config) + assert engine.url.get_backend_name() == "sqlite" + + +def test_init_db_creates_tables(sqlite_config: FakeAppConfig): + """Init DB cree toutes les tables attendues.""" + init_db(sqlite_config) + engine = get_engine(sqlite_config) + inspector = inspect(engine) + tables = set(inspector.get_table_names()) + assert "products" in tables + assert "price_history" in tables + assert "product_images" in tables + assert "product_specs" in tables + assert "scraping_logs" in tables + + +def test_get_session_commit(sqlite_config: FakeAppConfig): + """La session permet un commit simple.""" + init_db(sqlite_config) + + with get_session(sqlite_config) as session: + product = Product(source="amazon", reference="B08N5WRWNW", url="https://example.com") + session.add(product) + session.commit() + + with get_session(sqlite_config) as session: + assert session.query(Product).count() == 1 + + +def test_check_db_connection(sqlite_config: FakeAppConfig): + """Le health check DB retourne True en SQLite.""" + init_db(sqlite_config) + assert check_db_connection(sqlite_config) is True diff --git a/tests/db/test_models.py b/tests/db/test_models.py new file mode 100755 index 0000000..34f6e20 --- /dev/null +++ b/tests/db/test_models.py @@ -0,0 +1,89 @@ +""" +Tests pour les modeles SQLAlchemy. 
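+Uses an in-memory SQLite session to exercise the Product relationships (price_history, images, specs, logs) and the unique (source, reference) constraint.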
+""" + +from datetime import datetime + +import pytest +from sqlalchemy import create_engine +from sqlalchemy.exc import IntegrityError +from sqlalchemy.orm import Session, sessionmaker + +from pricewatch.app.db.models import ( + Base, + PriceHistory, + Product, + ProductImage, + ProductSpec, + ScrapingLog, +) + + +@pytest.fixture +def session() -> Session: + """Session SQLite in-memory pour tests de modeles.""" + engine = create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + SessionLocal = sessionmaker(bind=engine) + session = SessionLocal() + try: + yield session + finally: + session.close() + + +def test_product_relationships(session: Session): + """Les relations principales fonctionnent (prix, images, specs, logs).""" + product = Product(source="amazon", reference="B08N5WRWNW", url="https://example.com") + + price = PriceHistory( + price=199.99, + shipping_cost=0, + stock_status="in_stock", + fetch_method="http", + fetch_status="success", + fetched_at=datetime.utcnow(), + ) + image = ProductImage(image_url="https://example.com/image.jpg", position=0) + spec = ProductSpec(spec_key="Couleur", spec_value="Noir") + log = ScrapingLog( + url="https://example.com", + source="amazon", + reference="B08N5WRWNW", + fetch_method="http", + fetch_status="success", + fetched_at=datetime.utcnow(), + duration_ms=1200, + html_size_bytes=2048, + errors={"items": []}, + notes={"items": ["OK"]}, + ) + + product.price_history.append(price) + product.images.append(image) + product.specs.append(spec) + product.logs.append(log) + + session.add(product) + session.commit() + + loaded = session.query(Product).first() + assert loaded is not None + assert len(loaded.price_history) == 1 + assert len(loaded.images) == 1 + assert len(loaded.specs) == 1 + assert len(loaded.logs) == 1 + + +def test_unique_product_constraint(session: Session): + """La contrainte unique source+reference est respectee.""" + product_a = Product(source="amazon", reference="B08N5WRWNW", url="https://example.com/a") + product_b = Product(source="amazon", reference="B08N5WRWNW", url="https://example.com/b") + + session.add(product_a) + session.commit() + + session.add(product_b) + with pytest.raises(IntegrityError): + session.commit() + session.rollback() diff --git a/tests/db/test_repository.py b/tests/db/test_repository.py new file mode 100755 index 0000000..d93824a --- /dev/null +++ b/tests/db/test_repository.py @@ -0,0 +1,82 @@ +""" +Tests pour le repository SQLAlchemy. 
+""" + +from datetime import datetime + +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import Session, sessionmaker + +from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot +from pricewatch.app.db.models import Base, Product, ScrapingLog +from pricewatch.app.db.repository import ProductRepository + + +@pytest.fixture +def session() -> Session: + """Session SQLite in-memory pour tests repository.""" + engine = create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + SessionLocal = sessionmaker(bind=engine) + session = SessionLocal() + try: + yield session + finally: + session.close() + engine.dispose() + + +def _make_snapshot(reference: str | None) -> ProductSnapshot: + return ProductSnapshot( + source="amazon", + url="https://example.com/product", + fetched_at=datetime(2026, 1, 14, 12, 0, 0), + title="Produit test", + price=199.99, + currency="EUR", + shipping_cost=0.0, + reference=reference, + images=["https://example.com/img1.jpg"], + specs={"Couleur": "Noir"}, + debug=DebugInfo( + method=FetchMethod.HTTP, + status=DebugStatus.SUCCESS, + errors=["Avertissement"], + notes=["OK"], + ), + ) + + +def test_save_snapshot_creates_product(session: Session): + """Le repository persiste produit + log.""" + repo = ProductRepository(session) + snapshot = _make_snapshot(reference="B08N5WRWNW") + + product_id = repo.save_snapshot(snapshot) + session.commit() + + product = session.query(Product).one() + assert product.id == product_id + assert product.reference == "B08N5WRWNW" + assert len(product.images) == 1 + assert len(product.specs) == 1 + assert len(product.price_history) == 1 + + log = session.query(ScrapingLog).one() + assert log.product_id == product_id + assert log.errors == ["Avertissement"] + assert log.notes == ["OK"] + + +def test_save_snapshot_without_reference(session: Session): + """Sans reference, le produit n'est pas cree mais le log existe.""" + repo = ProductRepository(session) + snapshot = _make_snapshot(reference=None) + + product_id = repo.save_snapshot(snapshot) + session.commit() + + assert product_id is None + assert session.query(Product).count() == 0 + assert session.query(ScrapingLog).count() == 1 diff --git a/tests/scraping/__init__.py b/tests/scraping/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/tests/scraping/__pycache__/__init__.cpython-313.pyc b/tests/scraping/__pycache__/__init__.cpython-313.pyc new file mode 100755 index 0000000..7701a56 Binary files /dev/null and b/tests/scraping/__pycache__/__init__.cpython-313.pyc differ diff --git a/tests/scraping/__pycache__/test_http_fetch.cpython-313-pytest-9.0.2.pyc b/tests/scraping/__pycache__/test_http_fetch.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..6cd0c6a Binary files /dev/null and b/tests/scraping/__pycache__/test_http_fetch.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/scraping/__pycache__/test_pipeline.cpython-313-pytest-9.0.2.pyc b/tests/scraping/__pycache__/test_pipeline.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..7b80cbd Binary files /dev/null and b/tests/scraping/__pycache__/test_pipeline.cpython-313-pytest-9.0.2.pyc differ diff --git a/tests/scraping/__pycache__/test_pw_fetch.cpython-313-pytest-9.0.2.pyc b/tests/scraping/__pycache__/test_pw_fetch.cpython-313-pytest-9.0.2.pyc new file mode 100755 index 0000000..98b5db2 Binary files /dev/null and b/tests/scraping/__pycache__/test_pw_fetch.cpython-313-pytest-9.0.2.pyc differ diff --git 
a/tests/scraping/test_http_fetch.py b/tests/scraping/test_http_fetch.py new file mode 100755 index 0000000..b54cfa4 --- /dev/null +++ b/tests/scraping/test_http_fetch.py @@ -0,0 +1,290 @@ +""" +Tests pour pricewatch.app.scraping.http_fetch + +Teste la récupération HTTP avec mocks pour éviter les vraies requêtes. +""" + +from unittest.mock import Mock, patch + +import pytest +import requests +from requests.exceptions import RequestException, Timeout + +from pricewatch.app.scraping.http_fetch import FetchResult, fetch_http + + +class TestFetchResult: + """Tests pour la classe FetchResult.""" + + def test_success_result(self): + """Création d'un résultat réussi.""" + result = FetchResult( + success=True, + html="Test", + status_code=200, + duration_ms=150, + ) + + assert result.success is True + assert result.html == "Test" + assert result.error is None + assert result.status_code == 200 + assert result.duration_ms == 150 + + def test_error_result(self): + """Création d'un résultat d'erreur.""" + result = FetchResult( + success=False, + error="403 Forbidden", + status_code=403, + duration_ms=100, + ) + + assert result.success is False + assert result.html is None + assert result.error == "403 Forbidden" + assert result.status_code == 403 + assert result.duration_ms == 100 + + def test_minimal_result(self): + """Résultat minimal avec success uniquement.""" + result = FetchResult(success=False) + + assert result.success is False + assert result.html is None + assert result.error is None + assert result.status_code is None + assert result.duration_ms is None + + +class TestFetchHttp: + """Tests pour la fonction fetch_http().""" + + def test_fetch_success(self, mocker): + """Requête HTTP réussie (200 OK).""" + # Mock de requests.get + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "Test Page" + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is True + assert result.html == "Test Page" + assert result.status_code == 200 + assert result.error is None + assert result.duration_ms is not None + assert result.duration_ms >= 0 + + def test_fetch_with_custom_timeout(self, mocker): + """Requête avec timeout personnalisé.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "OK" + mock_get = mocker.patch("requests.get", return_value=mock_response) + + fetch_http("https://example.com", timeout=60) + + # Vérifier que timeout est passé à requests.get + mock_get.assert_called_once() + call_kwargs = mock_get.call_args.kwargs + assert call_kwargs["timeout"] == 60 + + def test_fetch_with_custom_headers(self, mocker): + """Requête avec headers personnalisés.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "OK" + mock_get = mocker.patch("requests.get", return_value=mock_response) + + custom_headers = {"X-Custom-Header": "test-value"} + fetch_http("https://example.com", headers=custom_headers) + + # Vérifier que les headers personnalisés sont inclus + mock_get.assert_called_once() + call_kwargs = mock_get.call_args.kwargs + assert "X-Custom-Header" in call_kwargs["headers"] + assert call_kwargs["headers"]["X-Custom-Header"] == "test-value" + # Headers par défaut doivent aussi être présents + assert "User-Agent" in call_kwargs["headers"] + + def test_fetch_403_forbidden(self, mocker): + """Requête bloquée (403 Forbidden).""" + mock_response = Mock() + mock_response.status_code = 403 + mocker.patch("requests.get", 
return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is False + assert result.html is None + assert result.status_code == 403 + assert "403 Forbidden" in result.error + assert "Anti-bot" in result.error + + def test_fetch_404_not_found(self, mocker): + """Page introuvable (404 Not Found).""" + mock_response = Mock() + mock_response.status_code = 404 + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is False + assert result.status_code == 404 + assert "404 Not Found" in result.error + + def test_fetch_429_rate_limit(self, mocker): + """Rate limit atteint (429 Too Many Requests).""" + mock_response = Mock() + mock_response.status_code = 429 + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is False + assert result.status_code == 429 + assert "429" in result.error + assert "Rate limit" in result.error + + def test_fetch_500_server_error(self, mocker): + """Erreur serveur (500 Internal Server Error).""" + mock_response = Mock() + mock_response.status_code = 500 + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is False + assert result.status_code == 500 + assert "500" in result.error + assert "Server Error" in result.error + + def test_fetch_503_service_unavailable(self, mocker): + """Service indisponible (503).""" + mock_response = Mock() + mock_response.status_code = 503 + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is False + assert result.status_code == 503 + assert "503" in result.error + + def test_fetch_unknown_status_code(self, mocker): + """Code de statut inconnu (par ex. 
418 I'm a teapot).""" + mock_response = Mock() + mock_response.status_code = 418 + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is False + assert result.status_code == 418 + assert "418" in result.error + + def test_fetch_timeout_error(self, mocker): + """Timeout lors de la requête.""" + mocker.patch("requests.get", side_effect=Timeout("Connection timed out")) + + result = fetch_http("https://example.com", timeout=10) + + assert result.success is False + assert result.html is None + assert "Timeout" in result.error + assert result.duration_ms is not None + + def test_fetch_request_exception(self, mocker): + """Exception réseau générique.""" + mocker.patch( + "requests.get", + side_effect=RequestException("Network error"), + ) + + result = fetch_http("https://example.com") + + assert result.success is False + assert "Erreur réseau" in result.error + assert result.duration_ms is not None + + def test_fetch_unexpected_exception(self, mocker): + """Exception inattendue.""" + mocker.patch("requests.get", side_effect=ValueError("Unexpected error")) + + result = fetch_http("https://example.com") + + assert result.success is False + assert "Erreur inattendue" in result.error + assert result.duration_ms is not None + + def test_fetch_empty_url(self): + """URL vide retourne une erreur.""" + result = fetch_http("") + + assert result.success is False + assert "URL vide" in result.error + assert result.html is None + + def test_fetch_whitespace_url(self): + """URL avec espaces uniquement retourne une erreur.""" + result = fetch_http(" ") + + assert result.success is False + assert "URL vide" in result.error + + def test_fetch_no_redirects(self, mocker): + """Requête sans suivre les redirections.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "OK" + mock_get = mocker.patch("requests.get", return_value=mock_response) + + fetch_http("https://example.com", follow_redirects=False) + + mock_get.assert_called_once() + call_kwargs = mock_get.call_args.kwargs + assert call_kwargs["allow_redirects"] is False + + def test_fetch_uses_random_user_agent(self, mocker): + """Vérifie qu'un User-Agent aléatoire est utilisé.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "OK" + mock_get = mocker.patch("requests.get", return_value=mock_response) + + fetch_http("https://example.com") + + # Vérifier qu'un User-Agent est présent + mock_get.assert_called_once() + call_kwargs = mock_get.call_args.kwargs + assert "User-Agent" in call_kwargs["headers"] + # User-Agent doit contenir "Mozilla" (présent dans tous les UA) + assert "Mozilla" in call_kwargs["headers"]["User-Agent"] + + def test_fetch_duration_is_measured(self, mocker): + """Vérifie que la durée est mesurée.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "OK" + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.duration_ms is not None + assert isinstance(result.duration_ms, int) + assert result.duration_ms >= 0 + + def test_fetch_large_response(self, mocker): + """Requête avec réponse volumineuse.""" + mock_response = Mock() + mock_response.status_code = 200 + # Simuler une grosse page HTML (1 MB) + mock_response.text = "<html>" + ("x" * 1000000) + "</html>" + mocker.patch("requests.get", return_value=mock_response) + + result = fetch_http("https://example.com") + + assert result.success is True + assert
len(result.html) > 1000000 diff --git a/tests/scraping/test_pipeline.py b/tests/scraping/test_pipeline.py new file mode 100755 index 0000000..d0f1407 --- /dev/null +++ b/tests/scraping/test_pipeline.py @@ -0,0 +1,82 @@ +""" +Tests pour ScrapingPipeline. +""" + +from dataclasses import dataclass +from datetime import datetime + +import pytest + +from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot +from pricewatch.app.db.connection import get_session, init_db, reset_engine +from pricewatch.app.db.models import Product +from pricewatch.app.scraping.pipeline import ScrapingPipeline + + +@dataclass +class FakeDbConfig: + url: str + + +@dataclass +class FakeAppConfig: + db: FakeDbConfig + debug: bool = False + enable_db: bool = True + + +@pytest.fixture(autouse=True) +def reset_db_engine(): + """Reset l'engine global entre les tests.""" + reset_engine() + yield + reset_engine() + + +def test_pipeline_persists_snapshot(): + """Le pipeline persiste un snapshot en base SQLite.""" + config = FakeAppConfig(db=FakeDbConfig(url="sqlite:///:memory:")) + init_db(config) + + snapshot = ProductSnapshot( + source="amazon", + url="https://example.com/product", + fetched_at=datetime(2026, 1, 14, 12, 30, 0), + title="Produit pipeline", + price=99.99, + currency="EUR", + reference="B08PIPE", + debug=DebugInfo(method=FetchMethod.HTTP, status=DebugStatus.SUCCESS), + ) + + pipeline = ScrapingPipeline(config=config) + product_id = pipeline.process_snapshot(snapshot, save_to_db=True) + + assert product_id is not None + + with get_session(config) as session: + assert session.query(Product).count() == 1 + + +def test_pipeline_respects_disable_flag(): + """Le pipeline ignore la persistence si enable_db=False.""" + config = FakeAppConfig(db=FakeDbConfig(url="sqlite:///:memory:"), enable_db=False) + init_db(config) + + snapshot = ProductSnapshot( + source="amazon", + url="https://example.com/product", + fetched_at=datetime(2026, 1, 14, 12, 45, 0), + title="Produit pipeline", + price=99.99, + currency="EUR", + reference="B08PIPE", + debug=DebugInfo(method=FetchMethod.HTTP, status=DebugStatus.SUCCESS), + ) + + pipeline = ScrapingPipeline(config=config) + product_id = pipeline.process_snapshot(snapshot, save_to_db=True) + + assert product_id is None + with get_session(config) as session: + assert session.query(Product).count() == 0 diff --git a/tests/scraping/test_pw_fetch.py b/tests/scraping/test_pw_fetch.py new file mode 100755 index 0000000..27f5c25 --- /dev/null +++ b/tests/scraping/test_pw_fetch.py @@ -0,0 +1,388 @@ +""" +Tests pour pricewatch.app.scraping.pw_fetch + +Teste la récupération Playwright avec mocks pour éviter de lancer vraiment un navigateur. 
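+The sync_playwright entry point is patched, so the browser, context and page objects are plain mocks and no Chromium instance is ever started.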
+""" + +from unittest.mock import Mock, patch + +import pytest +from playwright.sync_api import TimeoutError as PlaywrightTimeout + +from pricewatch.app.scraping.pw_fetch import ( + PlaywrightFetchResult, + fetch_playwright, + fetch_with_fallback, +) + + +class TestPlaywrightFetchResult: + """Tests pour la classe PlaywrightFetchResult.""" + + def test_success_result(self): + """Création d'un résultat réussi.""" + result = PlaywrightFetchResult( + success=True, + html="Test", + screenshot=b"fake_screenshot_bytes", + duration_ms=2500, + ) + + assert result.success is True + assert result.html == "Test" + assert result.screenshot == b"fake_screenshot_bytes" + assert result.error is None + assert result.duration_ms == 2500 + + def test_error_result(self): + """Création d'un résultat d'erreur.""" + result = PlaywrightFetchResult( + success=False, + error="Timeout", + screenshot=b"error_screenshot", + duration_ms=3000, + ) + + assert result.success is False + assert result.html is None + assert result.error == "Timeout" + assert result.screenshot == b"error_screenshot" + assert result.duration_ms == 3000 + + def test_minimal_result(self): + """Résultat minimal.""" + result = PlaywrightFetchResult(success=False) + + assert result.success is False + assert result.html is None + assert result.screenshot is None + assert result.error is None + assert result.duration_ms is None + + +class TestFetchPlaywright: + """Tests pour fetch_playwright().""" + + @pytest.fixture + def mock_playwright_stack(self, mocker): + """Fixture: Mock complet de la stack Playwright.""" + # Mock de la page + mock_page = Mock() + mock_page.content.return_value = "Playwright Test" + mock_page.screenshot.return_value = b"fake_screenshot_data" + mock_page.goto.return_value = Mock(status=200) + + # Mock du context + mock_context = Mock() + mock_context.new_page.return_value = mock_page + + # Mock du browser + mock_browser = Mock() + mock_browser.new_context.return_value = mock_context + + # Mock playwright chromium + mock_chromium = Mock() + mock_chromium.launch.return_value = mock_browser + + # Mock playwright + mock_playwright_obj = Mock() + mock_playwright_obj.chromium = mock_chromium + + # Mock sync_playwright().start() + mock_sync_playwright = Mock() + mock_sync_playwright.start.return_value = mock_playwright_obj + + mocker.patch( + "pricewatch.app.scraping.pw_fetch.sync_playwright", + return_value=mock_sync_playwright, + ) + + return { + "playwright": mock_playwright_obj, + "browser": mock_browser, + "context": mock_context, + "page": mock_page, + } + + def test_fetch_success(self, mock_playwright_stack): + """Récupération Playwright réussie.""" + result = fetch_playwright("https://example.com") + + assert result.success is True + assert result.html == "Playwright Test" + assert result.screenshot is None # Par défaut pas de screenshot + assert result.error is None + assert result.duration_ms is not None + assert result.duration_ms >= 0 + + # Vérifier que la page a été visitée + mock_playwright_stack["page"].goto.assert_called_once_with( + "https://example.com", wait_until="domcontentloaded" + ) + + def test_fetch_with_screenshot(self, mock_playwright_stack): + """Récupération avec screenshot.""" + result = fetch_playwright("https://example.com", save_screenshot=True) + + assert result.success is True + assert result.screenshot == b"fake_screenshot_data" + + # Vérifier que screenshot() a été appelé + mock_playwright_stack["page"].screenshot.assert_called_once() + + def test_fetch_headful_mode(self, mock_playwright_stack): + 
"""Mode headful (navigateur visible).""" + result = fetch_playwright("https://example.com", headless=False) + + assert result.success is True + + # Vérifier que headless=False a été passé + mock_playwright_stack["playwright"].chromium.launch.assert_called_once() + call_kwargs = mock_playwright_stack["playwright"].chromium.launch.call_args.kwargs + assert call_kwargs["headless"] is False + + def test_fetch_with_custom_timeout(self, mock_playwright_stack): + """Timeout personnalisé.""" + result = fetch_playwright("https://example.com", timeout_ms=30000) + + assert result.success is True + + # Vérifier que set_default_timeout a été appelé + mock_playwright_stack["page"].set_default_timeout.assert_called_once_with(30000) + + def test_fetch_with_wait_for_selector(self, mock_playwright_stack): + """Attente d'un sélecteur CSS spécifique.""" + result = fetch_playwright( + "https://example.com", wait_for_selector=".product-title" + ) + + assert result.success is True + + # Vérifier que wait_for_selector a été appelé + mock_playwright_stack["page"].wait_for_selector.assert_called_once_with( + ".product-title", timeout=60000 + ) + + def test_fetch_wait_for_selector_timeout(self, mock_playwright_stack): + """Timeout lors de l'attente du sélecteur.""" + # Le sélecteur timeout mais la page continue + mock_playwright_stack["page"].wait_for_selector.side_effect = PlaywrightTimeout( + "Selector timeout" + ) + + result = fetch_playwright( + "https://example.com", wait_for_selector=".non-existent" + ) + + # Doit quand même réussir (le wait_for_selector est non-bloquant) + assert result.success is True + assert result.html is not None + + def test_fetch_empty_url(self): + """URL vide retourne une erreur.""" + result = fetch_playwright("") + + assert result.success is False + assert "URL vide" in result.error + assert result.html is None + + def test_fetch_whitespace_url(self): + """URL avec espaces retourne une erreur.""" + result = fetch_playwright(" ") + + assert result.success is False + assert "URL vide" in result.error + + def test_fetch_no_response_from_server(self, mock_playwright_stack): + """Pas de réponse du serveur.""" + mock_playwright_stack["page"].goto.return_value = None + + result = fetch_playwright("https://example.com") + + assert result.success is False + assert "Pas de réponse du serveur" in result.error + + def test_fetch_playwright_timeout(self, mock_playwright_stack): + """Timeout Playwright lors de la navigation.""" + mock_playwright_stack["page"].goto.side_effect = PlaywrightTimeout( + "Navigation timeout" + ) + + result = fetch_playwright("https://example.com", timeout_ms=10000) + + assert result.success is False + assert "Timeout" in result.error + assert result.duration_ms is not None + + def test_fetch_playwright_generic_error(self, mock_playwright_stack): + """Erreur générique Playwright.""" + mock_playwright_stack["page"].goto.side_effect = Exception( + "Generic Playwright error" + ) + + result = fetch_playwright("https://example.com") + + assert result.success is False + assert "Erreur Playwright" in result.error + assert result.duration_ms is not None + + def test_fetch_cleanup_on_success(self, mock_playwright_stack): + """Nettoyage des ressources sur succès.""" + result = fetch_playwright("https://example.com") + + assert result.success is True + + # Vérifier que les ressources sont nettoyées + mock_playwright_stack["page"].close.assert_called_once() + mock_playwright_stack["browser"].close.assert_called_once() + mock_playwright_stack["playwright"].stop.assert_called_once() + 
+ def test_fetch_cleanup_on_error(self, mock_playwright_stack): + """Nettoyage des ressources sur erreur.""" + mock_playwright_stack["page"].goto.side_effect = Exception("Test error") + + result = fetch_playwright("https://example.com") + + assert result.success is False + + # Vérifier que les ressources sont nettoyées même en cas d'erreur + mock_playwright_stack["page"].close.assert_called_once() + mock_playwright_stack["browser"].close.assert_called_once() + mock_playwright_stack["playwright"].stop.assert_called_once() + + def test_fetch_screenshot_on_error(self, mock_playwright_stack): + """Screenshot capturé même en cas d'erreur.""" + mock_playwright_stack["page"].goto.side_effect = PlaywrightTimeout("Timeout") + + result = fetch_playwright("https://example.com", save_screenshot=True) + + assert result.success is False + assert result.screenshot == b"fake_screenshot_data" + + # Screenshot doit avoir été tenté + mock_playwright_stack["page"].screenshot.assert_called_once() + + +class TestFetchWithFallback: + """Tests pour fetch_with_fallback().""" + + def test_http_success_no_playwright(self, mocker): + """Si HTTP réussit, Playwright n'est pas appelé.""" + # Mock fetch_http qui réussit + mock_http_result = Mock() + mock_http_result.success = True + mock_http_result.html = "HTTP Success" + mock_http_result.duration_ms = 150 + + mocker.patch( + "pricewatch.app.scraping.http_fetch.fetch_http", + return_value=mock_http_result, + ) + + # Mock fetch_playwright (ne devrait pas être appelé) + mock_playwright = mocker.patch( + "pricewatch.app.scraping.pw_fetch.fetch_playwright" + ) + + result = fetch_with_fallback("https://example.com") + + assert result.success is True + assert result.html == "HTTP Success" + assert result.duration_ms == 150 + + # Playwright ne doit pas être appelé + mock_playwright.assert_not_called() + + def test_http_fails_playwright_fallback(self, mocker): + """Si HTTP échoue, fallback vers Playwright.""" + # Mock fetch_http qui échoue + mock_http_result = Mock() + mock_http_result.success = False + mock_http_result.error = "403 Forbidden" + + mocker.patch( + "pricewatch.app.scraping.http_fetch.fetch_http", + return_value=mock_http_result, + ) + + # Mock fetch_playwright qui réussit + mock_playwright_result = PlaywrightFetchResult( + success=True, + html="Playwright Success", + duration_ms=2500, + ) + + mock_playwright = mocker.patch( + "pricewatch.app.scraping.pw_fetch.fetch_playwright", + return_value=mock_playwright_result, + ) + + result = fetch_with_fallback("https://example.com") + + assert result.success is True + assert result.html == "Playwright Success" + + # Playwright doit avoir été appelé + mock_playwright.assert_called_once() + + def test_skip_http_direct_playwright(self, mocker): + """Mode Playwright direct (sans essayer HTTP d'abord).""" + # Mock fetch_http (ne devrait pas être appelé) + mock_http = mocker.patch("pricewatch.app.scraping.http_fetch.fetch_http") + + # Mock fetch_playwright + mock_playwright_result = PlaywrightFetchResult( + success=True, + html="Playwright Direct", + duration_ms=2500, + ) + + mock_playwright = mocker.patch( + "pricewatch.app.scraping.pw_fetch.fetch_playwright", + return_value=mock_playwright_result, + ) + + result = fetch_with_fallback("https://example.com", try_http_first=False) + + assert result.success is True + assert result.html == "Playwright Direct" + + # HTTP ne doit pas être appelé + mock_http.assert_not_called() + + # Playwright doit avoir été appelé + mock_playwright.assert_called_once() + + def 
test_playwright_options_passed(self, mocker): + """Options Playwright passées correctement.""" + # Mock fetch_http qui échoue + mock_http_result = Mock() + mock_http_result.success = False + mock_http_result.error = "403 Forbidden" + + mocker.patch( + "pricewatch.app.scraping.http_fetch.fetch_http", + return_value=mock_http_result, + ) + + # Mock fetch_playwright + mock_playwright_result = PlaywrightFetchResult( + success=True, + html="OK", + duration_ms=2500, + ) + + mock_playwright = mocker.patch( + "pricewatch.app.scraping.pw_fetch.fetch_playwright", + return_value=mock_playwright_result, + ) + + # Options personnalisées + options = {"headless": False, "timeout_ms": 30000, "save_screenshot": True} + + result = fetch_with_fallback("https://example.com", playwright_options=options) + + assert result.success is True + + # Vérifier que les options sont passées à fetch_playwright + mock_playwright.assert_called_once_with("https://example.com", **options) diff --git a/tests/stores/__pycache__/test_amazon.cpython-313-pytest-9.0.2.pyc b/tests/stores/__pycache__/test_amazon.cpython-313-pytest-9.0.2.pyc index f3111bf..c2e6152 100755 Binary files a/tests/stores/__pycache__/test_amazon.cpython-313-pytest-9.0.2.pyc and b/tests/stores/__pycache__/test_amazon.cpython-313-pytest-9.0.2.pyc differ
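
Note on TestFetchWithFallback above: patching pricewatch.app.scraping.http_fetch.fetch_http and pricewatch.app.scraping.pw_fetch.fetch_playwright only takes effect if fetch_with_fallback resolves fetch_http through the http_fetch module at call time and calls fetch_playwright from its own module namespace. Since pricewatch/app/scraping/pw_fetch.py itself is not shown in this diff, the following is a minimal sketch consistent with those tests, not the shipped implementation: the PlaywrightFetchResult fields, the try_http_first default and the **playwright_options forwarding are taken from the assertions; everything else is an assumption.

# Sketch only: a possible shape for fetch_with_fallback() in pw_fetch.py, inferred
# from the tests above. Not the actual implementation shipped in this diff.
from dataclasses import dataclass
from typing import Any, Optional

# Importing the module (not the function) means
# mocker.patch("pricewatch.app.scraping.http_fetch.fetch_http", ...) is seen at call time.
from pricewatch.app.scraping import http_fetch


@dataclass
class PlaywrightFetchResult:
    """Result container matching the fields asserted in TestPlaywrightFetchResult."""

    success: bool
    html: Optional[str] = None
    screenshot: Optional[bytes] = None
    error: Optional[str] = None
    duration_ms: Optional[int] = None


def fetch_playwright(url: str, **options: Any) -> PlaywrightFetchResult:
    """Real browser-based fetch (driven by sync_playwright); omitted in this sketch."""
    raise NotImplementedError


def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """Try a plain HTTP request first, then fall back to Playwright if it fails."""
    options = playwright_options or {}

    if try_http_first:
        http_result = http_fetch.fetch_http(url)
        if http_result.success:
            # Cheap path succeeded: re-wrap so callers always get the same result type.
            return PlaywrightFetchResult(
                success=True,
                html=http_result.html,
                duration_ms=http_result.duration_ms,
            )

    # HTTP skipped (try_http_first=False) or blocked (403/429/...): use the browser.
    return fetch_playwright(url, **options)

Returning a single result type regardless of which path ran keeps callers agnostic of the fetch strategy; the tests only assert on success, html and duration_ms, so the exact wrapping is an implementation detail.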