Files
scrap/pricewatch/app/cli/main.py
2026-01-13 19:49:04 +01:00

364 lines
11 KiB
Python
Executable File

"""
CLI PriceWatch - Interface en ligne de commande.
Commandes disponibles:
- run: Pipeline complet YAML → JSON
- detect: Détection du store depuis une URL
- fetch: Récupération d'une page (HTTP ou Playwright)
- parse: Parsing d'un fichier HTML
- doctor: Vérification de l'installation
"""
import sys
from pathlib import Path
from typing import Optional
import typer
from rich import print as rprint
from rich.console import Console
from rich.table import Table
from pricewatch.app.core import logging as app_logging
from pricewatch.app.core.io import read_yaml_config, write_json_results
from pricewatch.app.core.logging import get_logger, set_level
from pricewatch.app.core.registry import get_registry, register_store
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore
# Créer l'application Typer
app = typer.Typer(
name="pricewatch",
help="Application de suivi de prix e-commerce",
add_completion=False,
)
console = Console()
logger = get_logger("cli")
def setup_stores():
"""Enregistre tous les stores disponibles dans le registry."""
registry = get_registry()
registry.register(AmazonStore())
registry.register(CdiscountStore())
@app.command()
def run(
yaml: Path = typer.Option(
"scrap_url.yaml",
"--yaml",
"-y",
help="Fichier YAML de configuration",
exists=True,
),
out: Path = typer.Option(
"scraped_store.json",
"--out",
"-o",
help="Fichier JSON de sortie",
),
debug: bool = typer.Option(
False,
"--debug",
"-d",
help="Activer le mode debug",
),
):
"""
Pipeline complet: scrape toutes les URLs du YAML et génère le JSON.
"""
if debug:
set_level("DEBUG")
logger.info("=== Démarrage du pipeline PriceWatch ===")
# Initialiser les stores
setup_stores()
registry = get_registry()
logger.info(f"Stores enregistrés: {', '.join(registry.list_stores())}")
# Lire la configuration
try:
config = read_yaml_config(yaml)
except Exception as e:
logger.error(f"Erreur lecture YAML: {e}")
raise typer.Exit(code=1)
logger.info(f"{len(config.urls)} URL(s) à scraper")
# Scraper chaque URL
snapshots = []
for i, url in enumerate(config.urls, 1):
logger.info(f"[{i}/{len(config.urls)}] Traitement: {url}")
# Détecter le store
store = registry.detect_store(url)
if not store:
logger.error(f"Aucun store trouvé pour: {url}")
continue
# Canoniser l'URL
canonical_url = store.canonicalize(url)
logger.info(f"URL canonique: {canonical_url}")
# Récupérer la page
html = None
fetch_method = FetchMethod.HTTP
fetch_error = None
# Tenter HTTP d'abord
logger.info("Tentative HTTP...")
http_result = fetch_http(canonical_url)
if http_result.success:
html = http_result.html
fetch_method = FetchMethod.HTTP
logger.info("✓ HTTP réussi")
elif config.options.use_playwright:
# Fallback Playwright
logger.warning(f"HTTP échoué: {http_result.error}, fallback Playwright")
pw_result = fetch_playwright(
canonical_url,
headless=not config.options.headful,
timeout_ms=config.options.timeout_ms,
save_screenshot=config.options.save_screenshot,
)
if pw_result.success:
html = pw_result.html
fetch_method = FetchMethod.PLAYWRIGHT
logger.info("✓ Playwright réussi")
# Sauvegarder screenshot si demandé
if config.options.save_screenshot and pw_result.screenshot:
from pricewatch.app.core.io import save_debug_screenshot
ref = store.extract_reference(canonical_url) or f"url_{i}"
save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
else:
fetch_error = pw_result.error
logger.error(f"✗ Playwright échoué: {fetch_error}")
else:
fetch_error = http_result.error
logger.error(f"✗ HTTP échoué: {fetch_error}")
# Parser si on a du HTML
if html:
try:
# Sauvegarder HTML si demandé
if config.options.save_html:
from pricewatch.app.core.io import save_debug_html
ref = store.extract_reference(canonical_url) or f"url_{i}"
save_debug_html(html, f"{store.store_id}_{ref}")
snapshot = store.parse(html, canonical_url)
snapshot.debug.method = fetch_method
snapshots.append(snapshot)
status_emoji = "" if snapshot.is_complete() else ""
logger.info(
f"{status_emoji} Parsing: title={bool(snapshot.title)}, "
f"price={snapshot.price is not None}"
)
except Exception as e:
logger.error(f"✗ Erreur parsing: {e}")
# Créer un snapshot failed
from pricewatch.app.core.schema import ProductSnapshot
snapshot = ProductSnapshot(
source=store.store_id,
url=canonical_url,
debug=DebugInfo(
method=fetch_method,
status=DebugStatus.FAILED,
errors=[f"Parsing failed: {str(e)}"],
),
)
snapshots.append(snapshot)
else:
# Pas de HTML récupéré
from pricewatch.app.core.schema import ProductSnapshot
snapshot = ProductSnapshot(
source=store.store_id if store else "unknown",
url=canonical_url,
debug=DebugInfo(
method=fetch_method,
status=DebugStatus.FAILED,
errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
),
)
snapshots.append(snapshot)
# Écrire les résultats
logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {out}")
try:
write_json_results(snapshots, out)
logger.info("✓ Pipeline terminé avec succès")
except Exception as e:
logger.error(f"✗ Erreur écriture JSON: {e}")
raise typer.Exit(code=1)
@app.command()
def detect(url: str):
"""
Détecte le store correspondant à une URL.
"""
logger.info(f"Détection du store pour: {url}")
setup_stores()
registry = get_registry()
store = registry.detect_store(url)
if store:
rprint(f"[green]✓ Store détecté: {store.store_id}[/green]")
rprint(f" URL canonique: {store.canonicalize(url)}")
rprint(f" Référence: {store.extract_reference(url)}")
else:
rprint("[red]✗ Aucun store trouvé[/red]")
raise typer.Exit(code=1)
@app.command()
def fetch(
url: str,
http: bool = typer.Option(False, "--http", help="Forcer HTTP"),
playwright: bool = typer.Option(False, "--playwright", help="Forcer Playwright"),
headful: bool = typer.Option(False, "--headful", help="Mode Playwright visible"),
debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
"""
Récupère une page via HTTP ou Playwright.
"""
if debug:
set_level("DEBUG")
if http and playwright:
rprint("[red]✗ Impossible de spécifier --http et --playwright ensemble[/red]")
raise typer.Exit(code=1)
if playwright or (not http and not playwright):
# Playwright par défaut ou explicite
logger.info(f"Récupération via Playwright: {url}")
result = fetch_playwright(url, headless=not headful)
if result.success:
rprint(f"[green]✓ Succès[/green]")
rprint(f" Taille HTML: {len(result.html)} chars")
rprint(f" Durée: {result.duration_ms}ms")
else:
rprint(f"[red]✗ Échec: {result.error}[/red]")
raise typer.Exit(code=1)
else:
# HTTP explicite
logger.info(f"Récupération via HTTP: {url}")
result = fetch_http(url)
if result.success:
rprint(f"[green]✓ Succès[/green]")
rprint(f" Taille HTML: {len(result.html)} chars")
rprint(f" Status: {result.status_code}")
rprint(f" Durée: {result.duration_ms}ms")
else:
rprint(f"[red]✗ Échec: {result.error}[/red]")
raise typer.Exit(code=1)
@app.command()
def parse(
store: str = typer.Argument(..., help="Store ID (amazon, cdiscount)"),
html_file: Path = typer.Option(
..., "--in", "-i", help="Fichier HTML à parser", exists=True
),
debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
"""
Parse un fichier HTML avec un store spécifique.
"""
if debug:
set_level("DEBUG")
setup_stores()
registry = get_registry()
store_obj = registry.get_store(store)
if not store_obj:
rprint(f"[red]✗ Store inconnu: {store}[/red]")
rprint(f"Stores disponibles: {', '.join(registry.list_stores())}")
raise typer.Exit(code=1)
logger.info(f"Parsing avec {store}: {html_file}")
with open(html_file, "r", encoding="utf-8") as f:
html = f.read()
try:
snapshot = store_obj.parse(html, url="file://local")
if snapshot.is_complete():
rprint("[green]✓ Parsing réussi[/green]")
else:
rprint("[yellow]⚠ Parsing partiel[/yellow]")
rprint(f" Titre: {snapshot.title or 'N/A'}")
rprint(f" Prix: {snapshot.price} {snapshot.currency}")
rprint(f" Référence: {snapshot.reference or 'N/A'}")
rprint(f" Stock: {snapshot.stock_status}")
rprint(f" Images: {len(snapshot.images)}")
rprint(f" Specs: {len(snapshot.specs)}")
except Exception as e:
rprint(f"[red]✗ Erreur parsing: {e}[/red]")
raise typer.Exit(code=1)
@app.command()
def doctor():
"""
Vérifie l'installation de PriceWatch.
"""
table = Table(title="PriceWatch Doctor")
table.add_column("Composant", style="cyan")
table.add_column("Statut", style="green")
# Python version
table.add_row("Python", f"{sys.version.split()[0]}")
# Dépendances
deps = [
("typer", "typer"),
("pydantic", "pydantic"),
("requests", "requests"),
("playwright", "playwright"),
("beautifulsoup4", "bs4"),
("pyyaml", "yaml"),
]
for name, module in deps:
try:
__import__(module)
table.add_row(name, "✓ Installé")
except ImportError:
table.add_row(name, "✗ Manquant")
# Stores
setup_stores()
registry = get_registry()
table.add_row("Stores", f"{len(registry)} enregistrés: {', '.join(registry.list_stores())}")
console.print(table)
rprint("\n[green]✓ PriceWatch est prêt![/green]")
if __name__ == "__main__":
app()