# scrap/pricewatch/app/cli/main.py

"""
CLI PriceWatch - Interface en ligne de commande.

Commandes disponibles:
- run: Pipeline complet YAML → JSON
- detect: Détection du store depuis une URL
- fetch: Récupération d'une page (HTTP ou Playwright)
- parse: Parsing d'un fichier HTML
- doctor: Vérification de l'installation
"""

import sys
from pathlib import Path
from typing import Optional

import typer
from rich import print as rprint
from rich.console import Console
from rich.table import Table

from pricewatch.app.core import logging as app_logging
from pricewatch.app.core.io import read_yaml_config, write_json_results
from pricewatch.app.core.logging import get_logger, set_level
from pricewatch.app.core.registry import get_registry, register_store
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore

# Create the Typer application; every @app.command() function below
# is attached to this object as a CLI sub-command.
app = typer.Typer(
    name="pricewatch",
    help="Application de suivi de prix e-commerce",
    add_completion=False,
)

# Shared Rich console (used by `doctor` to render its status table).
console = Console()
# Logger for this CLI module, obtained through the project's logging helper.
logger = get_logger("cli")


def setup_stores():
    """Register one instance of every available store in the registry."""
    target = get_registry()
    # Stores are instantiated and registered one after the other, in this order.
    for store_cls in (AmazonStore, CdiscountStore):
        target.register(store_cls())


def _debug_artifact_name(store, canonical_url: str, index: int) -> str:
    """Build the base name used for the debug artifacts of one URL.

    The name is ``<store_id>_<reference>``; when the store cannot extract a
    reference from the URL, ``url_<index>`` is used instead, where ``index``
    is the 1-based position of the URL in the current run.
    """
    ref = store.extract_reference(canonical_url) or f"url_{index}"
    return f"{store.store_id}_{ref}"


def _failed_snapshot(store, canonical_url: str, fetch_method, error: str):
    """Build a ProductSnapshot with FAILED debug status and one error message."""
    # Imported locally, as the original command body did.
    from pricewatch.app.core.schema import ProductSnapshot

    return ProductSnapshot(
        source=store.store_id,
        url=canonical_url,
        debug=DebugInfo(
            method=fetch_method,
            status=DebugStatus.FAILED,
            errors=[error],
        ),
    )


def _fetch_html(store, canonical_url: str, options, index: int):
    """Fetch a page over HTTP, falling back to Playwright when enabled.

    Args:
        store: Store that matched the URL (used to name the screenshot).
        canonical_url: URL to fetch.
        options: The ``options`` object of the loaded configuration.
        index: 1-based position of the URL in the run.

    Returns:
        A ``(html, fetch_method, fetch_error)`` tuple. ``fetch_method`` is the
        last method actually attempted, so a failed Playwright fallback is
        reported as PLAYWRIGHT rather than HTTP. ``fetch_error`` is ``None``
        when the fetch succeeded.
    """
    logger.info("Tentative HTTP...")
    http_result = fetch_http(canonical_url)

    if http_result.success:
        logger.info("✓ HTTP réussi")
        return http_result.html, FetchMethod.HTTP, None

    if not options.use_playwright:
        logger.error(f"✗ HTTP échoué: {http_result.error}")
        return None, FetchMethod.HTTP, http_result.error

    # HTTP failed and the Playwright fallback is enabled.
    logger.warning(f"HTTP échoué: {http_result.error}, fallback Playwright")
    pw_result = fetch_playwright(
        canonical_url,
        headless=not options.headful,
        timeout_ms=options.timeout_ms,
        save_screenshot=options.save_screenshot,
    )

    if not pw_result.success:
        logger.error(f"✗ Playwright échoué: {pw_result.error}")
        return None, FetchMethod.PLAYWRIGHT, pw_result.error

    logger.info("✓ Playwright réussi")

    # Save the screenshot when requested and one was actually captured.
    if options.save_screenshot and pw_result.screenshot:
        from pricewatch.app.core.io import save_debug_screenshot

        save_debug_screenshot(
            pw_result.screenshot,
            _debug_artifact_name(store, canonical_url, index),
        )

    return pw_result.html, FetchMethod.PLAYWRIGHT, None


@app.command()
def run(
    yaml: Path = typer.Option(
        "scrap_url.yaml",
        "--yaml",
        "-y",
        help="Fichier YAML de configuration",
        exists=True,
    ),
    out: Path = typer.Option(
        "scraped_store.json",
        "--out",
        "-o",
        help="Fichier JSON de sortie",
    ),
    debug: bool = typer.Option(
        False,
        "--debug",
        "-d",
        help="Activer le mode debug",
    ),
):
    """
    Pipeline complet: scrape toutes les URLs du YAML et génère le JSON.
    """
    # NOTE: the docstring above is kept verbatim because Typer displays it as
    # the command's help text.
    #
    # Full pipeline: read the YAML configuration, fetch and parse every URL,
    # then write all snapshots to the JSON output file. Each URL matched by a
    # store yields exactly one snapshot (parsed or FAILED); URLs that no store
    # recognises are logged and skipped. Exits with code 1 when the YAML
    # cannot be read or the JSON cannot be written.
    if debug:
        set_level("DEBUG")

    logger.info("=== Démarrage du pipeline PriceWatch ===")

    # Initialise the stores.
    setup_stores()
    registry = get_registry()
    logger.info(f"Stores enregistrés: {', '.join(registry.list_stores())}")

    # Load the configuration.
    try:
        config = read_yaml_config(yaml)
    except Exception as e:
        logger.error(f"Erreur lecture YAML: {e}")
        raise typer.Exit(code=1) from e

    total = len(config.urls)
    logger.info(f"{total} URL(s) à scraper")

    # Scrape every URL.
    snapshots = []
    for i, url in enumerate(config.urls, 1):
        logger.info(f"[{i}/{total}] Traitement: {url}")

        # Detect the store.
        store = registry.detect_store(url)
        if not store:
            logger.error(f"Aucun store trouvé pour: {url}")
            continue

        # Canonicalise the URL.
        canonical_url = store.canonicalize(url)
        logger.info(f"URL canonique: {canonical_url}")

        # Fetch the page (HTTP first, optional Playwright fallback).
        html, fetch_method, fetch_error = _fetch_html(
            store, canonical_url, config.options, i
        )

        if not html:
            # No HTML retrieved: record the failure and move on.
            snapshots.append(
                _failed_snapshot(
                    store,
                    canonical_url,
                    fetch_method,
                    f"Fetch failed: {fetch_error or 'Unknown error'}",
                )
            )
            continue

        try:
            # Save the raw HTML when requested.
            if config.options.save_html:
                from pricewatch.app.core.io import save_debug_html

                save_debug_html(html, _debug_artifact_name(store, canonical_url, i))

            snapshot = store.parse(html, canonical_url)
            snapshot.debug.method = fetch_method

            status_emoji = "✓" if snapshot.is_complete() else "⚠"
            logger.info(
                f"{status_emoji} Parsing: title={bool(snapshot.title)}, "
                f"price={snapshot.price is not None}"
            )
            # Appended last so that an exception above can never leave both a
            # parsed and a failed snapshot for the same URL.
            snapshots.append(snapshot)
        except Exception as e:
            logger.error(f"✗ Erreur parsing: {e}")
            snapshots.append(
                _failed_snapshot(
                    store, canonical_url, fetch_method, f"Parsing failed: {e}"
                )
            )

    # Write the results.
    logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {out}")
    try:
        write_json_results(snapshots, out)
        logger.info("✓ Pipeline terminé avec succès")
    except Exception as e:
        logger.error(f"✗ Erreur écriture JSON: {e}")
        raise typer.Exit(code=1) from e


@app.command()
def detect(url: str):
    """
    Détecte le store correspondant à une URL.
    """
    # Prints the store matching `url` together with its canonical URL and
    # product reference, or exits with code 1 when no store matches.
    logger.info(f"Détection du store pour: {url}")

    setup_stores()
    matched = get_registry().detect_store(url)

    # Guard clause: no registered store recognises this URL.
    if not matched:
        rprint("[red]✗ Aucun store trouvé[/red]")
        raise typer.Exit(code=1)

    rprint(f"[green]✓ Store détecté: {matched.store_id}[/green]")
    rprint(f"  URL canonique: {matched.canonicalize(url)}")
    rprint(f"  Référence: {matched.extract_reference(url)}")


@app.command()
def fetch(
    url: str,
    http: bool = typer.Option(False, "--http", help="Forcer HTTP"),
    playwright: bool = typer.Option(False, "--playwright", help="Forcer Playwright"),
    headful: bool = typer.Option(False, "--headful", help="Mode Playwright visible"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
    """
    Récupère une page via HTTP ou Playwright.
    """
    # Fetches `url` and prints a short report. Playwright is used unless
    # --http is given; --http and --playwright are mutually exclusive.
    # Exits with code 1 on conflicting flags or on a failed fetch.
    if debug:
        set_level("DEBUG")

    if http and playwright:
        rprint("[red]✗ Impossible de spécifier --http et --playwright ensemble[/red]")
        raise typer.Exit(code=1)

    # Past the check above, HTTP is selected exactly when --http was passed;
    # every other combination (explicit --playwright or no flag) uses Playwright.
    via_http = http

    if via_http:
        logger.info(f"Récupération via HTTP: {url}")
        outcome = fetch_http(url)
    else:
        logger.info(f"Récupération via Playwright: {url}")
        outcome = fetch_playwright(url, headless=not headful)

    if not outcome.success:
        rprint(f"[red]✗ Échec: {outcome.error}[/red]")
        raise typer.Exit(code=1)

    rprint("[green]✓ Succès[/green]")
    rprint(f"  Taille HTML: {len(outcome.html)} chars")
    if via_http:
        # The status code is only reported for HTTP fetches.
        rprint(f"  Status: {outcome.status_code}")
    rprint(f"  Durée: {outcome.duration_ms}ms")


@app.command()
def parse(
    store: str = typer.Argument(..., help="Store ID (amazon, cdiscount)"),
    html_file: Path = typer.Option(
        ..., "--in", "-i", help="Fichier HTML à parser", exists=True
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
    """
    Parse un fichier HTML avec un store spécifique.
    """
    # NOTE: the docstring above is kept verbatim because Typer displays it as
    # the command's help text.
    #
    # Parses a local HTML file with the store identified by `store` and prints
    # a summary of the resulting snapshot. Exits with code 1 when the store is
    # unknown, the file cannot be read as UTF-8 text, or parsing raises.
    if debug:
        set_level("DEBUG")

    setup_stores()
    registry = get_registry()

    store_obj = registry.get_store(store)
    if not store_obj:
        rprint(f"[red]✗ Store inconnu: {store}[/red]")
        rprint(f"Stores disponibles: {', '.join(registry.list_stores())}")
        raise typer.Exit(code=1)

    logger.info(f"Parsing avec {store}: {html_file}")

    # `exists=True` only guarantees that the path exists: it may still be a
    # directory, unreadable, or not valid UTF-8. Report those cases cleanly
    # instead of letting a traceback escape.
    try:
        html = html_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        rprint(f"[red]✗ Erreur lecture fichier: {e}[/red]")
        raise typer.Exit(code=1) from e

    try:
        # The file has no real URL; a placeholder is passed to the parser.
        snapshot = store_obj.parse(html, url="file://local")

        if snapshot.is_complete():
            rprint("[green]✓ Parsing réussi[/green]")
        else:
            rprint("[yellow]⚠ Parsing partiel[/yellow]")

        rprint(f"  Titre: {snapshot.title or 'N/A'}")
        rprint(f"  Prix: {snapshot.price} {snapshot.currency}")
        rprint(f"  Référence: {snapshot.reference or 'N/A'}")
        rprint(f"  Stock: {snapshot.stock_status}")
        rprint(f"  Images: {len(snapshot.images)}")
        rprint(f"  Specs: {len(snapshot.specs)}")

    except Exception as e:
        rprint(f"[red]✗ Erreur parsing: {e}[/red]")
        raise typer.Exit(code=1) from e


@app.command()
def doctor():
    """
    Vérifie l'installation de PriceWatch.
    """
    # NOTE: the docstring above is kept verbatim because Typer displays it as
    # the command's help text.
    #
    # Prints a table with the Python version, the import status of each
    # dependency and the registered stores. Exits with code 1 when at least
    # one dependency cannot be imported, instead of reporting "ready".
    import importlib

    table = Table(title="PriceWatch Doctor")
    table.add_column("Composant", style="cyan")
    table.add_column("Statut", style="green")

    # Python version (first token of sys.version).
    table.add_row("Python", f"{sys.version.split()[0]} ✓")

    # Dependencies as (displayed package name, importable module name).
    deps = [
        ("typer", "typer"),
        ("pydantic", "pydantic"),
        ("requests", "requests"),
        ("playwright", "playwright"),
        ("beautifulsoup4", "bs4"),
        ("pyyaml", "yaml"),
    ]

    missing = []
    for name, module in deps:
        try:
            importlib.import_module(module)
        except ImportError:
            missing.append(name)
            # Explicit red markup so a missing dependency does not inherit the
            # column's green style.
            table.add_row(name, "[red]✗ Manquant[/red]")
        else:
            table.add_row(name, "✓ Installé")

    # Stores
    setup_stores()
    registry = get_registry()
    table.add_row("Stores", f"{len(registry)} enregistrés: {', '.join(registry.list_stores())}")

    console.print(table)

    if missing:
        rprint(f"\n[red]✗ Dépendances manquantes: {', '.join(missing)}[/red]")
        raise typer.Exit(code=1)

    rprint("\n[green]✓ PriceWatch est prêt![/green]")


# Run the Typer application when this file is executed directly as a script.
if __name__ == "__main__":
    app()