This commit is contained in:
2026-01-14 07:03:38 +01:00
parent ecda149a4b
commit c91c0f1fc9
61 changed files with 4388 additions and 38 deletions

View File

@@ -13,20 +13,28 @@ import sys
from pathlib import Path
from typing import Optional
import redis
import typer
from rq import Connection, Worker
from alembic import command as alembic_command
from alembic.config import Config as AlembicConfig
from rich import print as rprint
from rich.console import Console
from rich.table import Table
from pricewatch.app.core import logging as app_logging
from pricewatch.app.core.config import get_config
from pricewatch.app.core.io import read_yaml_config, write_json_results
from pricewatch.app.core.logging import get_logger, set_level
from pricewatch.app.core.registry import get_registry, register_store
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
from pricewatch.app.db.connection import init_db
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore
from pricewatch.app.tasks.scheduler import ScrapingScheduler
# Créer l'application Typer
app = typer.Typer(
@@ -46,6 +54,75 @@ def setup_stores():
registry.register(CdiscountStore())
def get_alembic_config() -> AlembicConfig:
    """Build the Alembic configuration from the repository layout.

    Returns:
        An ``AlembicConfig`` pointing at the repo-root ``alembic.ini`` and
        the project's migrations directory, with ``sqlalchemy.url`` taken
        from the application config.

    Raises:
        typer.Exit: if ``alembic.ini`` is missing at the repository root.
    """
    repo_root = Path(__file__).resolve().parents[3]
    ini_file = repo_root / "alembic.ini"
    if not ini_file.exists():
        logger.error(f"alembic.ini introuvable: {ini_file}")
        raise typer.Exit(code=1)
    cfg = AlembicConfig(str(ini_file))
    cfg.set_main_option(
        "script_location",
        str(repo_root / "pricewatch" / "app" / "db" / "migrations"),
    )
    cfg.set_main_option("sqlalchemy.url", get_config().db.url)
    return cfg
@app.command("init-db")
def init_db_command():
    """Initialize the database (create every table).

    Exits with code 1 when initialization fails for any reason.
    """
    try:
        init_db(get_config())
    except Exception as exc:
        logger.error(f"Init DB echoue: {exc}")
        raise typer.Exit(code=1)
@app.command()
def migrate(
    message: str = typer.Argument(..., help="Message de migration"),
    autogenerate: bool = typer.Option(True, "--autogenerate/--no-autogenerate"),
):
    """Generate an Alembic migration revision.

    Exits with code 1 when revision generation fails.
    """
    try:
        alembic_command.revision(
            get_alembic_config(), message=message, autogenerate=autogenerate
        )
    except Exception as exc:
        logger.error(f"Migration echouee: {exc}")
        raise typer.Exit(code=1)
@app.command()
def upgrade(revision: str = typer.Argument("head", help="Revision cible")):
    """Apply Alembic migrations up to the target revision.

    Exits with code 1 when the upgrade fails.
    """
    try:
        alembic_command.upgrade(get_alembic_config(), revision)
    except Exception as exc:
        logger.error(f"Upgrade echoue: {exc}")
        raise typer.Exit(code=1)
@app.command()
def downgrade(revision: str = typer.Argument("-1", help="Revision cible")):
    """Roll back Alembic migrations down to the target revision.

    Exits with code 1 when the downgrade fails.
    """
    try:
        alembic_command.downgrade(get_alembic_config(), revision)
    except Exception as exc:
        logger.error(f"Downgrade echoue: {exc}")
        raise typer.Exit(code=1)
@app.command()
def run(
yaml: Path = typer.Option(
@@ -67,6 +144,11 @@ def run(
"-d",
help="Activer le mode debug",
),
save_db: Optional[bool] = typer.Option(
None,
"--save-db/--no-db",
help="Activer la persistence en base de donnees",
),
):
"""
Pipeline complet: scrape toutes les URLs du YAML et génère le JSON.
@@ -88,6 +170,12 @@ def run(
logger.error(f"Erreur lecture YAML: {e}")
raise typer.Exit(code=1)
app_config = get_config()
if save_db is None:
save_db = app_config.enable_db
pipeline = ScrapingPipeline(config=app_config)
logger.info(f"{len(config.urls)} URL(s) à scraper")
# Scraper chaque URL
@@ -158,6 +246,11 @@ def run(
snapshot = store.parse(html, canonical_url)
snapshot.debug.method = fetch_method
if save_db:
product_id = pipeline.process_snapshot(snapshot, save_to_db=True)
if product_id:
logger.info(f"DB: produit id={product_id}")
snapshots.append(snapshot)
status_emoji = "" if snapshot.is_complete() else ""
@@ -180,6 +273,8 @@ def run(
errors=[f"Parsing failed: {str(e)}"],
),
)
if save_db:
pipeline.process_snapshot(snapshot, save_to_db=True)
snapshots.append(snapshot)
else:
# Pas de HTML récupéré
@@ -194,6 +289,8 @@ def run(
errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
),
)
if save_db:
pipeline.process_snapshot(snapshot, save_to_db=True)
snapshots.append(snapshot)
# Écrire les résultats
@@ -359,5 +456,65 @@ def doctor():
rprint("\n[green]✓ PriceWatch est prêt![/green]")
@app.command()
def worker(
    queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
    with_scheduler: bool = typer.Option(
        True, "--with-scheduler/--no-scheduler", help="Activer le scheduler RQ"
    ),
):
    """Run an RQ worker consuming jobs from the given queue.

    Connects to Redis using the URL from the application config and blocks
    until the worker process is stopped.
    """
    config = get_config()
    connection = redis.from_url(config.redis.url)
    # Pass the connection explicitly instead of using the rq.Connection
    # context manager, which is deprecated and removed in RQ >= 2.0.
    worker_instance = Worker([queue], connection=connection)
    worker_instance.work(with_scheduler=with_scheduler)
@app.command()
def enqueue(
    url: str = typer.Argument(..., help="URL du produit a scraper"),
    queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
    save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"),
    use_playwright: Optional[bool] = typer.Option(
        None, "--playwright/--no-playwright", help="Forcer Playwright"
    ),
):
    """Enqueue an immediate scraping job for the given product URL."""
    job = ScrapingScheduler(get_config(), queue_name=queue).enqueue_immediate(
        url, use_playwright=use_playwright, save_db=save_db
    )
    rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
@app.command()
def schedule(
    url: str = typer.Argument(..., help="URL du produit a planifier"),
    interval: int = typer.Option(24, "--interval", help="Intervalle en heures"),
    queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
    save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"),
    use_playwright: Optional[bool] = typer.Option(
        None, "--playwright/--no-playwright", help="Forcer Playwright"
    ),
):
    """Schedule a recurring scraping job for the given product URL."""
    scheduler = ScrapingScheduler(get_config(), queue_name=queue)
    info = scheduler.schedule_product(
        url,
        interval_hours=interval,
        use_playwright=use_playwright,
        save_db=save_db,
    )
    rprint(
        f"[green]✓ Job planifie: {info.job_id} (next={info.next_run.isoformat()})[/green]"
    )
# Standard script entry point: delegate to the Typer application.
if __name__ == "__main__":
    app()