""" CLI PriceWatch - Interface en ligne de commande. Commandes disponibles: - run: Pipeline complet YAML → JSON - detect: Détection du store depuis une URL - fetch: Récupération d'une page (HTTP ou Playwright) - parse: Parsing d'un fichier HTML - doctor: Vérification de l'installation """ import sys from pathlib import Path from typing import Optional import redis import typer from rq import Worker from alembic import command as alembic_command from alembic.config import Config as AlembicConfig from rich import print as rprint from rich.console import Console from rich.table import Table from pricewatch.app.core import logging as app_logging from pricewatch.app.core.config import get_config from pricewatch.app.core.io import read_yaml_config, write_json_results from pricewatch.app.core.logging import get_logger, set_level from pricewatch.app.core.registry import get_registry, register_store from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod from pricewatch.app.db.connection import init_db from pricewatch.app.scraping.http_fetch import fetch_http from pricewatch.app.scraping.pipeline import ScrapingPipeline from pricewatch.app.scraping.pw_fetch import fetch_playwright from pricewatch.app.stores.amazon.store import AmazonStore from pricewatch.app.stores.cdiscount.store import CdiscountStore from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler # Créer l'application Typer app = typer.Typer( name="pricewatch", help="Application de suivi de prix e-commerce", add_completion=False, ) console = Console() logger = get_logger("cli") def setup_stores(): """Enregistre tous les stores disponibles dans le registry.""" registry = get_registry() registry.register(AmazonStore()) registry.register(CdiscountStore()) def get_alembic_config() -> AlembicConfig: """Construit la configuration Alembic à partir du repository.""" root_path = Path(__file__).resolve().parents[3] config_path = root_path / "alembic.ini" migrations_path = root_path / "pricewatch" / "app" / "db" / "migrations" if not config_path.exists(): logger.error(f"alembic.ini introuvable: {config_path}") raise typer.Exit(code=1) alembic_cfg = AlembicConfig(str(config_path)) alembic_cfg.set_main_option("script_location", str(migrations_path)) alembic_cfg.set_main_option("sqlalchemy.url", get_config().db.url) return alembic_cfg @app.command("init-db") def init_db_command(): """ Initialise la base de donnees (creer toutes les tables). """ try: init_db(get_config()) except Exception as e: logger.error(f"Init DB echoue: {e}") raise typer.Exit(code=1) @app.command() def migrate( message: str = typer.Argument(..., help="Message de migration"), autogenerate: bool = typer.Option(True, "--autogenerate/--no-autogenerate"), ): """ Genere une migration Alembic. """ try: alembic_cfg = get_alembic_config() alembic_command.revision(alembic_cfg, message=message, autogenerate=autogenerate) except Exception as e: logger.error(f"Migration echouee: {e}") raise typer.Exit(code=1) @app.command() def upgrade(revision: str = typer.Argument("head", help="Revision cible")): """ Applique les migrations Alembic. """ try: alembic_cfg = get_alembic_config() alembic_command.upgrade(alembic_cfg, revision) except Exception as e: logger.error(f"Upgrade echoue: {e}") raise typer.Exit(code=1) @app.command() def downgrade(revision: str = typer.Argument("-1", help="Revision cible")): """ Rollback une migration Alembic. """ try: alembic_cfg = get_alembic_config() alembic_command.downgrade(alembic_cfg, revision) except Exception as e: logger.error(f"Downgrade echoue: {e}") raise typer.Exit(code=1) @app.command() def run( yaml: Path = typer.Option( "scrap_url.yaml", "--yaml", "-y", help="Fichier YAML de configuration", exists=True, ), out: Path = typer.Option( "scraped_store.json", "--out", "-o", help="Fichier JSON de sortie", ), debug: bool = typer.Option( False, "--debug", "-d", help="Activer le mode debug", ), save_db: Optional[bool] = typer.Option( None, "--save-db/--no-db", help="Activer la persistence en base de donnees", ), ): """ Pipeline complet: scrape toutes les URLs du YAML et génère le JSON. """ if debug: set_level("DEBUG") logger.info("=== Démarrage du pipeline PriceWatch ===") # Initialiser les stores setup_stores() registry = get_registry() logger.info(f"Stores enregistrés: {', '.join(registry.list_stores())}") # Lire la configuration try: config = read_yaml_config(yaml) except Exception as e: logger.error(f"Erreur lecture YAML: {e}") raise typer.Exit(code=1) app_config = get_config() if save_db is None: save_db = app_config.enable_db pipeline = ScrapingPipeline(config=app_config) logger.info(f"{len(config.urls)} URL(s) à scraper") # Scraper chaque URL snapshots = [] for i, url in enumerate(config.urls, 1): logger.info(f"[{i}/{len(config.urls)}] Traitement: {url}") # Détecter le store store = registry.detect_store(url) if not store: logger.error(f"Aucun store trouvé pour: {url}") continue # Canoniser l'URL canonical_url = store.canonicalize(url) logger.info(f"URL canonique: {canonical_url}") # Récupérer la page html = None fetch_method = FetchMethod.HTTP fetch_error = None http_result = None if config.options.force_playwright: logger.info("Playwright force, skip HTTP") else: logger.info("Tentative HTTP...") http_result = fetch_http(canonical_url) if http_result and http_result.success: html = http_result.html fetch_method = FetchMethod.HTTP logger.info("✓ HTTP réussi") elif config.options.use_playwright: fallback_reason = http_result.error if http_result else "force_playwright" logger.warning(f"HTTP échoué: {fallback_reason}, fallback Playwright") pw_result = fetch_playwright( canonical_url, headless=not config.options.headful, timeout_ms=config.options.timeout_ms, save_screenshot=config.options.save_screenshot, ) if pw_result.success: html = pw_result.html fetch_method = FetchMethod.PLAYWRIGHT logger.info("✓ Playwright réussi") # Sauvegarder screenshot si demandé if config.options.save_screenshot and pw_result.screenshot: from pricewatch.app.core.io import save_debug_screenshot ref = store.extract_reference(canonical_url) or f"url_{i}" save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}") else: fetch_error = pw_result.error logger.error(f"✗ Playwright échoué: {fetch_error}") else: fetch_error = http_result.error if http_result else "skip_http" logger.error(f"✗ HTTP échoué: {fetch_error}") # Parser si on a du HTML if html: try: # Sauvegarder HTML si demandé if config.options.save_html: from pricewatch.app.core.io import save_debug_html ref = store.extract_reference(canonical_url) or f"url_{i}" save_debug_html(html, f"{store.store_id}_{ref}") snapshot = store.parse(html, canonical_url) snapshot.debug.method = fetch_method if save_db: product_id = pipeline.process_snapshot(snapshot, save_to_db=True) if product_id: logger.info(f"DB: produit id={product_id}") snapshots.append(snapshot) status_emoji = "✓" if snapshot.is_complete() else "⚠" logger.info( f"{status_emoji} Parsing: title={bool(snapshot.title)}, " f"price={snapshot.price is not None}" ) except Exception as e: logger.error(f"✗ Erreur parsing: {e}") # Créer un snapshot failed from pricewatch.app.core.schema import ProductSnapshot snapshot = ProductSnapshot( source=store.store_id, url=canonical_url, debug=DebugInfo( method=fetch_method, status=DebugStatus.FAILED, errors=[f"Parsing failed: {str(e)}"], ), ) if save_db: pipeline.process_snapshot(snapshot, save_to_db=True) snapshots.append(snapshot) else: # Pas de HTML récupéré from pricewatch.app.core.schema import ProductSnapshot snapshot = ProductSnapshot( source=store.store_id if store else "unknown", url=canonical_url, debug=DebugInfo( method=fetch_method, status=DebugStatus.FAILED, errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"], ), ) if save_db: pipeline.process_snapshot(snapshot, save_to_db=True) snapshots.append(snapshot) # Écrire les résultats logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {out}") try: write_json_results(snapshots, out) logger.info("✓ Pipeline terminé avec succès") except Exception as e: logger.error(f"✗ Erreur écriture JSON: {e}") raise typer.Exit(code=1) @app.command() def detect(url: str): """ Détecte le store correspondant à une URL. """ logger.info(f"Détection du store pour: {url}") setup_stores() registry = get_registry() store = registry.detect_store(url) if store: rprint(f"[green]✓ Store détecté: {store.store_id}[/green]") rprint(f" URL canonique: {store.canonicalize(url)}") rprint(f" Référence: {store.extract_reference(url)}") else: rprint("[red]✗ Aucun store trouvé[/red]") raise typer.Exit(code=1) @app.command() def fetch( url: str, http: bool = typer.Option(False, "--http", help="Forcer HTTP"), playwright: bool = typer.Option(False, "--playwright", help="Forcer Playwright"), headful: bool = typer.Option(False, "--headful", help="Mode Playwright visible"), debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"), ): """ Récupère une page via HTTP ou Playwright. """ if debug: set_level("DEBUG") if http and playwright: rprint("[red]✗ Impossible de spécifier --http et --playwright ensemble[/red]") raise typer.Exit(code=1) if playwright or (not http and not playwright): # Playwright par défaut ou explicite logger.info(f"Récupération via Playwright: {url}") result = fetch_playwright(url, headless=not headful) if result.success: rprint(f"[green]✓ Succès[/green]") rprint(f" Taille HTML: {len(result.html)} chars") rprint(f" Durée: {result.duration_ms}ms") else: rprint(f"[red]✗ Échec: {result.error}[/red]") raise typer.Exit(code=1) else: # HTTP explicite logger.info(f"Récupération via HTTP: {url}") result = fetch_http(url) if result.success: rprint(f"[green]✓ Succès[/green]") rprint(f" Taille HTML: {len(result.html)} chars") rprint(f" Status: {result.status_code}") rprint(f" Durée: {result.duration_ms}ms") else: rprint(f"[red]✗ Échec: {result.error}[/red]") raise typer.Exit(code=1) @app.command() def parse( store: str = typer.Argument(..., help="Store ID (amazon, cdiscount)"), html_file: Path = typer.Option( ..., "--in", "-i", help="Fichier HTML à parser", exists=True ), debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"), ): """ Parse un fichier HTML avec un store spécifique. """ if debug: set_level("DEBUG") setup_stores() registry = get_registry() store_obj = registry.get_store(store) if not store_obj: rprint(f"[red]✗ Store inconnu: {store}[/red]") rprint(f"Stores disponibles: {', '.join(registry.list_stores())}") raise typer.Exit(code=1) logger.info(f"Parsing avec {store}: {html_file}") with open(html_file, "r", encoding="utf-8") as f: html = f.read() try: snapshot = store_obj.parse(html, url="file://local") if snapshot.is_complete(): rprint("[green]✓ Parsing réussi[/green]") else: rprint("[yellow]⚠ Parsing partiel[/yellow]") rprint(f" Titre: {snapshot.title or 'N/A'}") rprint(f" Prix: {snapshot.price} {snapshot.currency}") rprint(f" Référence: {snapshot.reference or 'N/A'}") rprint(f" Stock: {snapshot.stock_status}") rprint(f" Images: {len(snapshot.images)}") rprint(f" Specs: {len(snapshot.specs)}") except Exception as e: rprint(f"[red]✗ Erreur parsing: {e}[/red]") raise typer.Exit(code=1) @app.command() def doctor(): """ Vérifie l'installation de PriceWatch. """ table = Table(title="PriceWatch Doctor") table.add_column("Composant", style="cyan") table.add_column("Statut", style="green") # Python version table.add_row("Python", f"{sys.version.split()[0]} ✓") # Dépendances deps = [ ("typer", "typer"), ("pydantic", "pydantic"), ("requests", "requests"), ("playwright", "playwright"), ("beautifulsoup4", "bs4"), ("pyyaml", "yaml"), ] for name, module in deps: try: __import__(module) table.add_row(name, "✓ Installé") except ImportError: table.add_row(name, "✗ Manquant") # Stores setup_stores() registry = get_registry() table.add_row("Stores", f"{len(registry)} enregistrés: {', '.join(registry.list_stores())}") console.print(table) rprint("\n[green]✓ PriceWatch est prêt![/green]") @app.command() def worker( queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"), with_scheduler: bool = typer.Option( True, "--with-scheduler/--no-scheduler", help="Activer le scheduler RQ" ), ): """ Lance un worker RQ. """ config = get_config() try: connection = redis.from_url(config.redis.url) # Verification connexion avant de lancer le worker connection.ping() except redis.exceptions.ConnectionError as e: rprint(f"[red]✗ Impossible de se connecter a Redis ({config.redis.url})[/red]") rprint(f"[red] Erreur: {e}[/red]") rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]") rprint(" docker compose up -d redis") rprint(" # ou") rprint(" redis-server") raise typer.Exit(code=1) except redis.exceptions.RedisError as e: rprint(f"[red]✗ Erreur Redis: {e}[/red]") raise typer.Exit(code=1) # RQ 2.x: connexion passee directement au Worker worker_instance = Worker([queue], connection=connection) worker_instance.work(with_scheduler=with_scheduler) @app.command() def enqueue( url: str = typer.Argument(..., help="URL du produit a scraper"), queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"), save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"), use_playwright: Optional[bool] = typer.Option( None, "--playwright/--no-playwright", help="Forcer Playwright" ), ): """ Enqueue un scraping immediat. """ try: scheduler = ScrapingScheduler(get_config(), queue_name=queue) job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db) rprint(f"[green]✓ Job enqueued: {job.id}[/green]") except RedisUnavailableError as e: rprint(f"[red]✗ {e.message}[/red]") rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]") rprint(" docker compose up -d redis") raise typer.Exit(code=1) @app.command() def schedule( url: str = typer.Argument(..., help="URL du produit a planifier"), interval: int = typer.Option(24, "--interval", help="Intervalle en heures"), queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"), save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"), use_playwright: Optional[bool] = typer.Option( None, "--playwright/--no-playwright", help="Forcer Playwright" ), ): """ Planifie un scraping recurrent. """ try: scheduler = ScrapingScheduler(get_config(), queue_name=queue) job_info = scheduler.schedule_product( url, interval_hours=interval, use_playwright=use_playwright, save_db=save_db, ) rprint( f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]" ) except RedisUnavailableError as e: rprint(f"[red]✗ {e.message}[/red]") rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]") rprint(" docker compose up -d redis") raise typer.Exit(code=1) if __name__ == "__main__": app()