# scrap/pricewatch/app/cli/main.py
"""
CLI PriceWatch - Interface en ligne de commande.
Commandes disponibles:
- run: Pipeline complet YAML → JSON
- detect: Détection du store depuis une URL
- fetch: Récupération d'une page (HTTP ou Playwright)
- parse: Parsing d'un fichier HTML
- doctor: Vérification de l'installation
"""
import sys
from pathlib import Path
from typing import Optional

import redis
import typer
from alembic import command as alembic_command
from alembic.config import Config as AlembicConfig
from rich import print as rprint
from rich.console import Console
from rich.table import Table
from rq import Worker

from pricewatch.app.core import logging as app_logging
from pricewatch.app.core.config import get_config
from pricewatch.app.core.io import read_yaml_config, write_json_results
from pricewatch.app.core.logging import get_logger, set_level
from pricewatch.app.core.registry import get_registry, register_store
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
from pricewatch.app.db.connection import init_db
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore
from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler

# Create the Typer application
app = typer.Typer(
    name="pricewatch",
    help="E-commerce price tracking application",
    add_completion=False,
)

console = Console()
logger = get_logger("cli")


def setup_stores():
    """Register all available stores in the registry."""
    registry = get_registry()
    registry.register(AmazonStore())
    registry.register(CdiscountStore())
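
# Adding support for a new store would follow the same pattern; a sketch,
# assuming a hypothetical EbayStore implementing the same store interface:
#
#   from pricewatch.app.stores.ebay.store import EbayStore
#   registry.register(EbayStore())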


def get_alembic_config() -> AlembicConfig:
    """Build the Alembic configuration from the repository layout."""
    root_path = Path(__file__).resolve().parents[3]
    config_path = root_path / "alembic.ini"
    migrations_path = root_path / "pricewatch" / "app" / "db" / "migrations"
    if not config_path.exists():
        logger.error(f"alembic.ini not found: {config_path}")
        raise typer.Exit(code=1)
    alembic_cfg = AlembicConfig(str(config_path))
    alembic_cfg.set_main_option("script_location", str(migrations_path))
    alembic_cfg.set_main_option("sqlalchemy.url", get_config().db.url)
    return alembic_cfg
@app.command("init-db")
def init_db_command():
"""
Initialise la base de donnees (creer toutes les tables).
"""
try:
init_db(get_config())
except Exception as e:
logger.error(f"Init DB echoue: {e}")
raise typer.Exit(code=1)


@app.command()
def migrate(
    message: str = typer.Argument(..., help="Migration message"),
    autogenerate: bool = typer.Option(True, "--autogenerate/--no-autogenerate"),
):
    """
    Generate an Alembic migration.
    """
    try:
        alembic_cfg = get_alembic_config()
        alembic_command.revision(alembic_cfg, message=message, autogenerate=autogenerate)
    except Exception as e:
        logger.error(f"Migration failed: {e}")
        raise typer.Exit(code=1)


@app.command()
def upgrade(revision: str = typer.Argument("head", help="Target revision")):
    """
    Apply Alembic migrations.
    """
    try:
        alembic_cfg = get_alembic_config()
        alembic_command.upgrade(alembic_cfg, revision)
    except Exception as e:
        logger.error(f"Upgrade failed: {e}")
        raise typer.Exit(code=1)


@app.command()
def downgrade(revision: str = typer.Argument("-1", help="Target revision")):
    """
    Roll back an Alembic migration.
    """
    try:
        alembic_cfg = get_alembic_config()
        alembic_command.downgrade(alembic_cfg, revision)
    except Exception as e:
        logger.error(f"Downgrade failed: {e}")
        raise typer.Exit(code=1)
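
# Typical migration workflow (messages and revisions are illustrative):
#
#   pricewatch init-db                      # create tables directly, or:
#   pricewatch migrate "add price column"   # autogenerate a revision
#   pricewatch upgrade head                 # apply pending migrations
#   pricewatch downgrade -1                 # roll back the latest revision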


@app.command()
def run(
    yaml: Path = typer.Option(
        "scrap_url.yaml",
        "--yaml",
        "-y",
        help="YAML configuration file",
        exists=True,
    ),
    out: Path = typer.Option(
        "scraped_store.json",
        "--out",
        "-o",
        help="Output JSON file",
    ),
    debug: bool = typer.Option(
        False,
        "--debug",
        "-d",
        help="Enable debug mode",
    ),
    save_db: Optional[bool] = typer.Option(
        None,
        "--save-db/--no-db",
        help="Enable database persistence",
    ),
):
    """
    Full pipeline: scrape every URL in the YAML file and write the JSON output.
    """
    if debug:
        set_level("DEBUG")
    logger.info("=== Starting the PriceWatch pipeline ===")

    # Initialize the stores
    setup_stores()
    registry = get_registry()
    logger.info(f"Registered stores: {', '.join(registry.list_stores())}")

    # Read the configuration
    try:
        config = read_yaml_config(yaml)
    except Exception as e:
        logger.error(f"YAML read error: {e}")
        raise typer.Exit(code=1)

    app_config = get_config()
    if save_db is None:
        save_db = app_config.enable_db
    pipeline = ScrapingPipeline(config=app_config)
    logger.info(f"{len(config.urls)} URL(s) to scrape")

    # Scrape each URL
    snapshots = []
    for i, url in enumerate(config.urls, 1):
        logger.info(f"[{i}/{len(config.urls)}] Processing: {url}")

        # Detect the store
        store = registry.detect_store(url)
        if not store:
            logger.error(f"No store found for: {url}")
            continue

        # Canonicalize the URL
        canonical_url = store.canonicalize(url)
        logger.info(f"Canonical URL: {canonical_url}")

        # Fetch the page (HTTP first, Playwright as fallback)
        html = None
        fetch_method = FetchMethod.HTTP
        fetch_error = None
        http_result = None
        if config.options.force_playwright:
            logger.info("Playwright forced, skipping HTTP")
        else:
            logger.info("Trying HTTP...")
            http_result = fetch_http(canonical_url)
        if http_result and http_result.success:
            html = http_result.html
            fetch_method = FetchMethod.HTTP
            logger.info("✓ HTTP succeeded")
        elif config.options.use_playwright:
            fallback_reason = http_result.error if http_result else "force_playwright"
            logger.warning(f"HTTP failed: {fallback_reason}, falling back to Playwright")
            pw_result = fetch_playwright(
                canonical_url,
                headless=not config.options.headful,
                timeout_ms=config.options.timeout_ms,
                save_screenshot=config.options.save_screenshot,
            )
            if pw_result.success:
                html = pw_result.html
                fetch_method = FetchMethod.PLAYWRIGHT
                logger.info("✓ Playwright succeeded")
                # Save the screenshot if requested
                if config.options.save_screenshot and pw_result.screenshot:
                    from pricewatch.app.core.io import save_debug_screenshot

                    ref = store.extract_reference(canonical_url) or f"url_{i}"
                    save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
            else:
                fetch_error = pw_result.error
                logger.error(f"✗ Playwright failed: {fetch_error}")
        else:
            fetch_error = http_result.error if http_result else "skip_http"
            logger.error(f"✗ HTTP failed: {fetch_error}")
        # Parse if we have HTML
        if html:
            try:
                # Save the HTML if requested
                if config.options.save_html:
                    from pricewatch.app.core.io import save_debug_html

                    ref = store.extract_reference(canonical_url) or f"url_{i}"
                    save_debug_html(html, f"{store.store_id}_{ref}")
                snapshot = store.parse(html, canonical_url)
                snapshot.debug.method = fetch_method
                if save_db:
                    product_id = pipeline.process_snapshot(snapshot, save_to_db=True)
                    if product_id:
                        logger.info(f"DB: product id={product_id}")
                snapshots.append(snapshot)
                status_emoji = "✓" if snapshot.is_complete() else "⚠"
                logger.info(
                    f"{status_emoji} Parsing: title={bool(snapshot.title)}, "
                    f"price={snapshot.price is not None}"
                )
            except Exception as e:
                logger.error(f"✗ Parsing error: {e}")
                # Create a failed snapshot
                from pricewatch.app.core.schema import ProductSnapshot

                snapshot = ProductSnapshot(
                    source=store.store_id,
                    url=canonical_url,
                    debug=DebugInfo(
                        method=fetch_method,
                        status=DebugStatus.FAILED,
                        errors=[f"Parsing failed: {str(e)}"],
                    ),
                )
                if save_db:
                    pipeline.process_snapshot(snapshot, save_to_db=True)
                snapshots.append(snapshot)
        else:
            # No HTML retrieved
            from pricewatch.app.core.schema import ProductSnapshot

            snapshot = ProductSnapshot(
                source=store.store_id if store else "unknown",
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                ),
            )
            if save_db:
                pipeline.process_snapshot(snapshot, save_to_db=True)
            snapshots.append(snapshot)
    # Write the results
    logger.info(f"Writing {len(snapshots)} snapshot(s) to: {out}")
    try:
        write_json_results(snapshots, out)
        logger.info("✓ Pipeline completed successfully")
    except Exception as e:
        logger.error(f"✗ JSON write error: {e}")
        raise typer.Exit(code=1)
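
# Minimal sketch of the YAML input consumed by `run`, inferred from the fields
# read above (`config.urls` and `config.options.*`); the authoritative schema
# is whatever `read_yaml_config` validates:
#
#   urls:
#     - https://www.amazon.fr/dp/B0EXAMPLE
#   options:
#     use_playwright: true
#     force_playwright: false
#     headful: false
#     timeout_ms: 30000
#     save_html: false
#     save_screenshot: false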


@app.command()
def detect(url: str):
    """
    Detect the store matching a URL.
    """
    logger.info(f"Detecting store for: {url}")
    setup_stores()
    registry = get_registry()
    store = registry.detect_store(url)
    if store:
        rprint(f"[green]✓ Store detected: {store.store_id}[/green]")
        rprint(f"  Canonical URL: {store.canonicalize(url)}")
        rprint(f"  Reference: {store.extract_reference(url)}")
    else:
        rprint("[red]✗ No store found[/red]")
        raise typer.Exit(code=1)


@app.command()
def fetch(
    url: str,
    http: bool = typer.Option(False, "--http", help="Force HTTP"),
    playwright: bool = typer.Option(False, "--playwright", help="Force Playwright"),
    headful: bool = typer.Option(False, "--headful", help="Visible (headful) Playwright mode"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Debug mode"),
):
    """
    Fetch a page via HTTP or Playwright.
    """
    if debug:
        set_level("DEBUG")
    if http and playwright:
        rprint("[red]✗ Cannot specify both --http and --playwright[/red]")
        raise typer.Exit(code=1)

    if playwright or (not http and not playwright):
        # Playwright, either explicit or by default
        logger.info(f"Fetching via Playwright: {url}")
        result = fetch_playwright(url, headless=not headful)
        if result.success:
            rprint("[green]✓ Success[/green]")
            rprint(f"  HTML size: {len(result.html)} chars")
            rprint(f"  Duration: {result.duration_ms}ms")
        else:
            rprint(f"[red]✗ Failure: {result.error}[/red]")
            raise typer.Exit(code=1)
    else:
        # Explicit HTTP
        logger.info(f"Fetching via HTTP: {url}")
        result = fetch_http(url)
        if result.success:
            rprint("[green]✓ Success[/green]")
            rprint(f"  HTML size: {len(result.html)} chars")
            rprint(f"  Status: {result.status_code}")
            rprint(f"  Duration: {result.duration_ms}ms")
        else:
            rprint(f"[red]✗ Failure: {result.error}[/red]")
            raise typer.Exit(code=1)


@app.command()
def parse(
    store: str = typer.Argument(..., help="Store ID (amazon, cdiscount)"),
    html_file: Path = typer.Option(
        ..., "--in", "-i", help="HTML file to parse", exists=True
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Debug mode"),
):
    """
    Parse an HTML file with a specific store.
    """
    if debug:
        set_level("DEBUG")
    setup_stores()
    registry = get_registry()
    store_obj = registry.get_store(store)
    if not store_obj:
        rprint(f"[red]✗ Unknown store: {store}[/red]")
        rprint(f"Available stores: {', '.join(registry.list_stores())}")
        raise typer.Exit(code=1)

    logger.info(f"Parsing with {store}: {html_file}")
    with open(html_file, "r", encoding="utf-8") as f:
        html = f.read()

    try:
        snapshot = store_obj.parse(html, url="file://local")
        if snapshot.is_complete():
            rprint("[green]✓ Parsing succeeded[/green]")
        else:
            rprint("[yellow]⚠ Partial parse[/yellow]")
        rprint(f"  Title: {snapshot.title or 'N/A'}")
        rprint(f"  Price: {snapshot.price} {snapshot.currency}")
        rprint(f"  Reference: {snapshot.reference or 'N/A'}")
        rprint(f"  Stock: {snapshot.stock_status}")
        rprint(f"  Images: {len(snapshot.images)}")
        rprint(f"  Specs: {len(snapshot.specs)}")
    except Exception as e:
        rprint(f"[red]✗ Parsing error: {e}[/red]")
        raise typer.Exit(code=1)
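
# Example, assuming a locally saved product page (the path is illustrative):
#
#   pricewatch parse amazon --in page.html --debug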


@app.command()
def doctor():
    """
    Check the PriceWatch installation.
    """
    table = Table(title="PriceWatch Doctor")
    table.add_column("Component", style="cyan")
    table.add_column("Status", style="green")

    # Python version
    table.add_row("Python", sys.version.split()[0])

    # Dependencies
    deps = [
        ("typer", "typer"),
        ("pydantic", "pydantic"),
        ("requests", "requests"),
        ("playwright", "playwright"),
        ("beautifulsoup4", "bs4"),
        ("pyyaml", "yaml"),
    ]
    for name, module in deps:
        try:
            __import__(module)
            table.add_row(name, "✓ Installed")
        except ImportError:
            table.add_row(name, "✗ Missing")

    # Stores
    setup_stores()
    registry = get_registry()
    table.add_row("Stores", f"{len(registry)} registered: {', '.join(registry.list_stores())}")

    console.print(table)
    rprint("\n[green]✓ PriceWatch is ready![/green]")


@app.command()
def worker(
    queue: str = typer.Option("default", "--queue", "-q", help="RQ queue name"),
    with_scheduler: bool = typer.Option(
        True, "--with-scheduler/--no-scheduler", help="Enable the RQ scheduler"
    ),
):
    """
    Start an RQ worker.
    """
    config = get_config()
    try:
        connection = redis.from_url(config.redis.url)
        # Check the connection before starting the worker
        connection.ping()
    except redis.exceptions.ConnectionError as e:
        rprint(f"[red]✗ Cannot connect to Redis ({config.redis.url})[/red]")
        rprint(f"[red]  Error: {e}[/red]")
        rprint("\n[yellow]Check that Redis is running:[/yellow]")
        rprint("  docker compose up -d redis")
        rprint("  # or")
        rprint("  redis-server")
        raise typer.Exit(code=1)
    except redis.exceptions.RedisError as e:
        rprint(f"[red]✗ Redis error: {e}[/red]")
        raise typer.Exit(code=1)

    # RQ 2.x: the connection is passed directly to the Worker
    worker_instance = Worker([queue], connection=connection)
    worker_instance.work(with_scheduler=with_scheduler)
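
# Example: start Redis, then run a worker on the default queue (the Redis URL
# comes from the application config):
#
#   docker compose up -d redis
#   pricewatch worker --queue default --with-scheduler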


@app.command()
def enqueue(
    url: str = typer.Argument(..., help="URL of the product to scrape"),
    queue: str = typer.Option("default", "--queue", "-q", help="RQ queue name"),
    save_db: bool = typer.Option(True, "--save-db/--no-db", help="Enable the DB"),
    use_playwright: Optional[bool] = typer.Option(
        None, "--playwright/--no-playwright", help="Force Playwright"
    ),
):
    """
    Enqueue an immediate scraping job.
    """
    try:
        scheduler = ScrapingScheduler(get_config(), queue_name=queue)
        job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
        rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
    except RedisUnavailableError as e:
        rprint(f"[red]✗ {e.message}[/red]")
        rprint("\n[yellow]Check that Redis is running:[/yellow]")
        rprint("  docker compose up -d redis")
        raise typer.Exit(code=1)


@app.command()
def schedule(
    url: str = typer.Argument(..., help="URL of the product to schedule"),
    interval: int = typer.Option(24, "--interval", help="Interval in hours"),
    queue: str = typer.Option("default", "--queue", "-q", help="RQ queue name"),
    save_db: bool = typer.Option(True, "--save-db/--no-db", help="Enable the DB"),
    use_playwright: Optional[bool] = typer.Option(
        None, "--playwright/--no-playwright", help="Force Playwright"
    ),
):
    """
    Schedule a recurring scraping job.
    """
    try:
        scheduler = ScrapingScheduler(get_config(), queue_name=queue)
        job_info = scheduler.schedule_product(
            url,
            interval_hours=interval,
            use_playwright=use_playwright,
            save_db=save_db,
        )
        rprint(
            f"[green]✓ Job scheduled: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
        )
    except RedisUnavailableError as e:
        rprint(f"[red]✗ {e.message}[/red]")
        rprint("\n[yellow]Check that Redis is running:[/yellow]")
        rprint("  docker compose up -d redis")
        raise typer.Exit(code=1)
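
# Example job submissions (URLs are placeholders):
#
#   pricewatch enqueue "https://www.amazon.fr/dp/B0EXAMPLE" --no-db
#   pricewatch schedule "https://www.amazon.fr/dp/B0EXAMPLE" --interval 12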


if __name__ == "__main__":
    app()