scrap/pricewatch/app/cli/main.py

"""
CLI PriceWatch - Interface en ligne de commande.

Commandes disponibles:
- run: Pipeline complet YAML → JSON
- detect: Détection du store depuis une URL
- fetch: Récupération d'une page (HTTP ou Playwright)
- parse: Parsing d'un fichier HTML
- doctor: Vérification de l'installation
"""

import sys
from pathlib import Path
from typing import Optional

import redis
import typer
from rq import Worker
from alembic import command as alembic_command
from alembic.config import Config as AlembicConfig
from rich import print as rprint
from rich.console import Console
from rich.table import Table

from pricewatch.app.core import logging as app_logging
from pricewatch.app.core.config import get_config
from pricewatch.app.core.io import read_yaml_config, write_json_results
from pricewatch.app.core.logging import get_logger, set_level
from pricewatch.app.core.registry import get_registry, register_store
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
from pricewatch.app.db.connection import init_db
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore
from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler

# Créer l'application Typer
app = typer.Typer(
    name="pricewatch",
    help="Application de suivi de prix e-commerce",
    add_completion=False,
)

console = Console()
logger = get_logger("cli")


def setup_stores():
    """Enregistre tous les stores disponibles dans le registry."""
    registry = get_registry()
    registry.register(AmazonStore())
    registry.register(CdiscountStore())


def get_alembic_config() -> AlembicConfig:
    """Construit la configuration Alembic à partir du repository."""
    root_path = Path(__file__).resolve().parents[3]
    config_path = root_path / "alembic.ini"
    migrations_path = root_path / "pricewatch" / "app" / "db" / "migrations"

    if not config_path.exists():
        logger.error(f"alembic.ini introuvable: {config_path}")
        raise typer.Exit(code=1)

    alembic_cfg = AlembicConfig(str(config_path))
    alembic_cfg.set_main_option("script_location", str(migrations_path))
    alembic_cfg.set_main_option("sqlalchemy.url", get_config().db.url)
    return alembic_cfg


@app.command("init-db")
def init_db_command():
    """
    Initialise la base de donnees (creer toutes les tables).
    """
    try:
        init_db(get_config())
    except Exception as e:
        logger.error(f"Init DB echoue: {e}")
        raise typer.Exit(code=1)


@app.command()
def migrate(
    message: str = typer.Argument(..., help="Message de migration"),
    autogenerate: bool = typer.Option(True, "--autogenerate/--no-autogenerate"),
):
    """
    Genere une migration Alembic.
    """
    try:
        alembic_cfg = get_alembic_config()
        alembic_command.revision(alembic_cfg, message=message, autogenerate=autogenerate)
    except Exception as e:
        logger.error(f"Migration echouee: {e}")
        raise typer.Exit(code=1)


@app.command()
def upgrade(revision: str = typer.Argument("head", help="Revision cible")):
    """
    Applique les migrations Alembic.
    """
    try:
        alembic_cfg = get_alembic_config()
        alembic_command.upgrade(alembic_cfg, revision)
    except Exception as e:
        logger.error(f"Upgrade echoue: {e}")
        raise typer.Exit(code=1)


@app.command()
def downgrade(revision: str = typer.Argument("-1", help="Revision cible")):
    """
    Rollback une migration Alembic.
    """
    try:
        alembic_cfg = get_alembic_config()
        alembic_command.downgrade(alembic_cfg, revision)
    except Exception as e:
        logger.error(f"Downgrade echoue: {e}")
        raise typer.Exit(code=1)

@app.command()
def run(
    yaml: Path = typer.Option(
        "scrap_url.yaml",
        "--yaml",
        "-y",
        help="Fichier YAML de configuration",
        exists=True,
    ),
    out: Path = typer.Option(
        "scraped_store.json",
        "--out",
        "-o",
        help="Fichier JSON de sortie",
    ),
    debug: bool = typer.Option(
        False,
        "--debug",
        "-d",
        help="Activer le mode debug",
    ),
    save_db: Optional[bool] = typer.Option(
        None,
        "--save-db/--no-db",
        help="Activer la persistence en base de donnees",
    ),
):
    """
    Pipeline complet: scrape toutes les URLs du YAML et génère le JSON.
    """
    if debug:
        set_level("DEBUG")

    logger.info("=== Démarrage du pipeline PriceWatch ===")

    # Initialiser les stores
    setup_stores()
    registry = get_registry()
    logger.info(f"Stores enregistrés: {', '.join(registry.list_stores())}")

    # Lire la configuration
    try:
        config = read_yaml_config(yaml)
    except Exception as e:
        logger.error(f"Erreur lecture YAML: {e}")
        raise typer.Exit(code=1)

    app_config = get_config()
    if save_db is None:
        save_db = app_config.enable_db

    pipeline = ScrapingPipeline(config=app_config)

    logger.info(f"{len(config.urls)} URL(s) à scraper")

    # Scraper chaque URL
    snapshots = []
    for i, url in enumerate(config.urls, 1):
        logger.info(f"[{i}/{len(config.urls)}] Traitement: {url}")

        # Détecter le store
        store = registry.detect_store(url)
        if not store:
            logger.error(f"Aucun store trouvé pour: {url}")
            continue

        # Canoniser l'URL
        canonical_url = store.canonicalize(url)
        logger.info(f"URL canonique: {canonical_url}")

        # Récupérer la page
        html = None
        fetch_method = FetchMethod.HTTP
        fetch_error = None
        http_result = None

        if config.options.force_playwright:
            logger.info("Playwright force, skip HTTP")
        else:
            logger.info("Tentative HTTP...")
            http_result = fetch_http(canonical_url)

        if http_result and http_result.success:
            html = http_result.html
            fetch_method = FetchMethod.HTTP
            logger.info("✓ HTTP réussi")
        elif config.options.use_playwright:
            fallback_reason = http_result.error if http_result else "force_playwright"
            logger.warning(f"HTTP échoué: {fallback_reason}, fallback Playwright")
            pw_result = fetch_playwright(
                canonical_url,
                headless=not config.options.headful,
                timeout_ms=config.options.timeout_ms,
                save_screenshot=config.options.save_screenshot,
            )

            if pw_result.success:
                html = pw_result.html
                fetch_method = FetchMethod.PLAYWRIGHT
                logger.info("✓ Playwright réussi")

                # Sauvegarder screenshot si demandé
                if config.options.save_screenshot and pw_result.screenshot:
                    from pricewatch.app.core.io import save_debug_screenshot

                    ref = store.extract_reference(canonical_url) or f"url_{i}"
                    save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
            else:
                fetch_error = pw_result.error
                logger.error(f"✗ Playwright échoué: {fetch_error}")
        else:
            fetch_error = http_result.error if http_result else "skip_http"
            logger.error(f"✗ HTTP échoué: {fetch_error}")

        # Parser si on a du HTML
        if html:
            try:
                # Sauvegarder HTML si demandé
                if config.options.save_html:
                    from pricewatch.app.core.io import save_debug_html

                    ref = store.extract_reference(canonical_url) or f"url_{i}"
                    save_debug_html(html, f"{store.store_id}_{ref}")

                snapshot = store.parse(html, canonical_url)
                snapshot.debug.method = fetch_method
                if save_db:
                    product_id = pipeline.process_snapshot(snapshot, save_to_db=True)
                    if product_id:
                        logger.info(f"DB: produit id={product_id}")

                snapshots.append(snapshot)

                status_emoji = "✓" if snapshot.is_complete() else "⚠"
                logger.info(
                    f"{status_emoji} Parsing: title={bool(snapshot.title)}, "
                    f"price={snapshot.price is not None}"
                )

            except Exception as e:
                logger.error(f"✗ Erreur parsing: {e}")
                # Créer un snapshot failed
                from pricewatch.app.core.schema import ProductSnapshot

                snapshot = ProductSnapshot(
                    source=store.store_id,
                    url=canonical_url,
                    debug=DebugInfo(
                        method=fetch_method,
                        status=DebugStatus.FAILED,
                        errors=[f"Parsing failed: {str(e)}"],
                    ),
                )
                if save_db:
                    pipeline.process_snapshot(snapshot, save_to_db=True)
                snapshots.append(snapshot)
        else:
            # Pas de HTML récupéré
            from pricewatch.app.core.schema import ProductSnapshot

            snapshot = ProductSnapshot(
                source=store.store_id if store else "unknown",
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                ),
            )
            if save_db:
                pipeline.process_snapshot(snapshot, save_to_db=True)
            snapshots.append(snapshot)

    # Écrire les résultats
    logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {out}")
    try:
        write_json_results(snapshots, out)
        logger.info("✓ Pipeline terminé avec succès")
    except Exception as e:
        logger.error(f"✗ Erreur écriture JSON: {e}")
        raise typer.Exit(code=1)


@app.command()
def detect(url: str):
    """
    Détecte le store correspondant à une URL.
    """
    logger.info(f"Détection du store pour: {url}")

    setup_stores()
    registry = get_registry()

    store = registry.detect_store(url)

    if store:
        rprint(f"[green]✓ Store détecté: {store.store_id}[/green]")
        rprint(f"  URL canonique: {store.canonicalize(url)}")
        rprint(f"  Référence: {store.extract_reference(url)}")
    else:
        rprint("[red]✗ Aucun store trouvé[/red]")
        raise typer.Exit(code=1)


@app.command()
def fetch(
    url: str,
    http: bool = typer.Option(False, "--http", help="Forcer HTTP"),
    playwright: bool = typer.Option(False, "--playwright", help="Forcer Playwright"),
    headful: bool = typer.Option(False, "--headful", help="Mode Playwright visible"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
    """
    Récupère une page via HTTP ou Playwright.
    """
    if debug:
        set_level("DEBUG")

    if http and playwright:
        rprint("[red]✗ Impossible de spécifier --http et --playwright ensemble[/red]")
        raise typer.Exit(code=1)

    if playwright or (not http and not playwright):
        # Playwright par défaut ou explicite
        logger.info(f"Récupération via Playwright: {url}")
        result = fetch_playwright(url, headless=not headful)

        if result.success:
            rprint(f"[green]✓ Succès[/green]")
            rprint(f"  Taille HTML: {len(result.html)} chars")
            rprint(f"  Durée: {result.duration_ms}ms")
        else:
            rprint(f"[red]✗ Échec: {result.error}[/red]")
            raise typer.Exit(code=1)
    else:
        # HTTP explicite
        logger.info(f"Récupération via HTTP: {url}")
        result = fetch_http(url)

        if result.success:
            rprint(f"[green]✓ Succès[/green]")
            rprint(f"  Taille HTML: {len(result.html)} chars")
            rprint(f"  Status: {result.status_code}")
            rprint(f"  Durée: {result.duration_ms}ms")
        else:
            rprint(f"[red]✗ Échec: {result.error}[/red]")
            raise typer.Exit(code=1)


@app.command()
def parse(
    store: str = typer.Argument(..., help="Store ID (amazon, cdiscount)"),
    html_file: Path = typer.Option(
        ..., "--in", "-i", help="Fichier HTML à parser", exists=True
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
    """
    Parse un fichier HTML avec un store spécifique.
    """
    if debug:
        set_level("DEBUG")

    setup_stores()
    registry = get_registry()

    store_obj = registry.get_store(store)
    if not store_obj:
        rprint(f"[red]✗ Store inconnu: {store}[/red]")
        rprint(f"Stores disponibles: {', '.join(registry.list_stores())}")
        raise typer.Exit(code=1)

    logger.info(f"Parsing avec {store}: {html_file}")

    with open(html_file, "r", encoding="utf-8") as f:
        html = f.read()

    try:
        snapshot = store_obj.parse(html, url="file://local")

        if snapshot.is_complete():
            rprint("[green]✓ Parsing réussi[/green]")
        else:
            rprint("[yellow]⚠ Parsing partiel[/yellow]")

        rprint(f"  Titre: {snapshot.title or 'N/A'}")
        rprint(f"  Prix: {snapshot.price} {snapshot.currency}")
        rprint(f"  Référence: {snapshot.reference or 'N/A'}")
        rprint(f"  Stock: {snapshot.stock_status}")
        rprint(f"  Images: {len(snapshot.images)}")
        rprint(f"  Specs: {len(snapshot.specs)}")

    except Exception as e:
        rprint(f"[red]✗ Erreur parsing: {e}[/red]")
        raise typer.Exit(code=1)


@app.command()
def doctor():
    """
    Vérifie l'installation de PriceWatch.
    """
    table = Table(title="PriceWatch Doctor")
    table.add_column("Composant", style="cyan")
    table.add_column("Statut", style="green")

    # Python version
    table.add_row("Python", f"{sys.version.split()[0]} ✓")

    # Dépendances
    deps = [
        ("typer", "typer"),
        ("pydantic", "pydantic"),
        ("requests", "requests"),
        ("playwright", "playwright"),
        ("beautifulsoup4", "bs4"),
        ("pyyaml", "yaml"),
    ]

    for name, module in deps:
        try:
            __import__(module)
            table.add_row(name, "✓ Installé")
        except ImportError:
            table.add_row(name, "✗ Manquant")

    # Stores
    setup_stores()
    registry = get_registry()
    table.add_row("Stores", f"{len(registry)} enregistrés: {', '.join(registry.list_stores())}")

    console.print(table)

    rprint("\n[green]✓ PriceWatch est prêt![/green]")


@app.command()
def worker(
    queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
    with_scheduler: bool = typer.Option(
        True, "--with-scheduler/--no-scheduler", help="Activer le scheduler RQ"
    ),
):
    """
    Lance un worker RQ.
    """
    config = get_config()
    try:
        connection = redis.from_url(config.redis.url)
        # Verification connexion avant de lancer le worker
        connection.ping()
    except redis.exceptions.ConnectionError as e:
        rprint(f"[red]✗ Impossible de se connecter a Redis ({config.redis.url})[/red]")
        rprint(f"[red]  Erreur: {e}[/red]")
        rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
        rprint("  docker compose up -d redis")
        rprint("  # ou")
        rprint("  redis-server")
        raise typer.Exit(code=1)
    except redis.exceptions.RedisError as e:
        rprint(f"[red]✗ Erreur Redis: {e}[/red]")
        raise typer.Exit(code=1)

    # RQ 2.x: connexion passee directement au Worker
    worker_instance = Worker([queue], connection=connection)
    worker_instance.work(with_scheduler=with_scheduler)


@app.command()
def enqueue(
    url: str = typer.Argument(..., help="URL du produit a scraper"),
    queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
    save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"),
    use_playwright: Optional[bool] = typer.Option(
        None, "--playwright/--no-playwright", help="Forcer Playwright"
    ),
):
    """
    Enqueue un scraping immediat.
    """
    try:
        scheduler = ScrapingScheduler(get_config(), queue_name=queue)
        job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
        rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
    except RedisUnavailableError as e:
        rprint(f"[red]✗ {e.message}[/red]")
        rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
        rprint("  docker compose up -d redis")
        raise typer.Exit(code=1)


@app.command()
def schedule(
    url: str = typer.Argument(..., help="URL du produit a planifier"),
    interval: int = typer.Option(24, "--interval", help="Intervalle en heures"),
    queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
    save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"),
    use_playwright: Optional[bool] = typer.Option(
        None, "--playwright/--no-playwright", help="Forcer Playwright"
    ),
):
    """
    Planifie un scraping recurrent.
    """
    try:
        scheduler = ScrapingScheduler(get_config(), queue_name=queue)
        job_info = scheduler.schedule_product(
            url,
            interval_hours=interval,
            use_playwright=use_playwright,
            save_db=save_db,
        )
        rprint(
            f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
        )
    except RedisUnavailableError as e:
        rprint(f"[red]✗ {e.message}[/red]")
        rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
        rprint("  docker compose up -d redis")
        raise typer.Exit(code=1)


if __name__ == "__main__":
    app()