codex

2026-01-14 07:03:38 +01:00
parent ecda149a4b
commit c91c0f1fc9
61 changed files with 4388 additions and 38 deletions
@@ -0,0 +1,8 @@
+"""
+Module tasks pour les jobs RQ.
+"""
+
+from pricewatch.app.tasks.scrape import scrape_product
+from pricewatch.app.tasks.scheduler import ScrapingScheduler
+
+__all__ = ["scrape_product", "ScrapingScheduler"]
@@ -0,0 +1,75 @@
+"""
+Planification des jobs de scraping via RQ Scheduler.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+import redis
+from rq import Queue
+from rq_scheduler import Scheduler
+
+from pricewatch.app.core.config import AppConfig, get_config
+from pricewatch.app.core.logging import get_logger
+from pricewatch.app.tasks.scrape import scrape_product
+
+logger = get_logger("tasks.scheduler")
+
+
+@dataclass
+class ScheduledJobInfo:
+    """Infos de retour pour un job planifie."""
+
+    job_id: str
+    next_run: datetime
+
+
+class ScrapingScheduler:
+    """Scheduler pour les jobs de scraping avec RQ."""
+
+    def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
+        self.config = config or get_config()
+        self.redis = redis.from_url(self.config.redis.url)
+        self.queue = Queue(queue_name, connection=self.redis)
+        self.scheduler = Scheduler(queue=self.queue, connection=self.redis)
+
+    def enqueue_immediate(
+        self,
+        url: str,
+        use_playwright: Optional[bool] = None,
+        save_db: bool = True,
+    ):
+        """Enqueue un job immediat."""
+        job = self.queue.enqueue(
+            scrape_product,
+            url,
+            use_playwright=use_playwright,
+            save_db=save_db,
+        )
+        logger.info(f"Job enqueued: {job.id}")
+        return job
+
+    def schedule_product(
+        self,
+        url: str,
+        interval_hours: int = 24,
+        use_playwright: Optional[bool] = None,
+        save_db: bool = True,
+    ) -> ScheduledJobInfo:
+        """Planifie un scraping recurrent (intervalle en heures)."""
+        interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
+        next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)
+
+        job = self.scheduler.schedule(
+            scheduled_time=next_run,
+            func=scrape_product,
+            args=[url],
+            kwargs={"use_playwright": use_playwright, "save_db": save_db},
+            interval=interval_seconds,
+            repeat=None,
+        )
+        logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}")
+        return ScheduledJobInfo(job_id=job.id, next_run=next_run)
@@ -0,0 +1,160 @@
+"""
+Tache de scraping asynchrone pour RQ.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pricewatch.app.core.config import AppConfig, get_config
+from pricewatch.app.core.logging import get_logger
+from pricewatch.app.core.registry import get_registry
+from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
+from pricewatch.app.scraping.http_fetch import fetch_http
+from pricewatch.app.scraping.pipeline import ScrapingPipeline
+from pricewatch.app.scraping.pw_fetch import fetch_playwright
+from pricewatch.app.stores.aliexpress.store import AliexpressStore
+from pricewatch.app.stores.amazon.store import AmazonStore
+from pricewatch.app.stores.backmarket.store import BackmarketStore
+from pricewatch.app.stores.cdiscount.store import CdiscountStore
+
+logger = get_logger("tasks.scrape")
+
+
+def setup_stores() -> None:
+    """Enregistre les stores disponibles si besoin."""
+    registry = get_registry()
+    if registry.list_stores():
+        return
+    registry.register(AmazonStore())
+    registry.register(CdiscountStore())
+    registry.register(BackmarketStore())
+    registry.register(AliexpressStore())
+
+
+def scrape_product(
+    url: str,
+    use_playwright: Optional[bool] = None,
+    save_db: bool = True,
+    save_html: bool = False,
+    save_screenshot: bool = False,
+    headful: bool = False,
+    timeout_ms: Optional[int] = None,
+) -> dict[str, Any]:
+    """
+    Scrape un produit et persiste en base via ScrapingPipeline.
+
+    Retourne un dict avec success, product_id, snapshot, error.
+    """
+    config: AppConfig = get_config()
+    setup_stores()
+
+    if use_playwright is None:
+        use_playwright = config.default_use_playwright
+
+    if timeout_ms is None:
+        timeout_ms = config.default_playwright_timeout
+
+    registry = get_registry()
+    store = registry.detect_store(url)
+    if not store:
+        snapshot = ProductSnapshot(
+            source="unknown",
+            url=url,
+            debug=DebugInfo(
+                method=FetchMethod.HTTP,
+                status=DebugStatus.FAILED,
+                errors=["Aucun store detecte"],
+            ),
+        )
+        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
+        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
+
+    canonical_url = store.canonicalize(url)
+
+    html = None
+    fetch_method = FetchMethod.HTTP
+    fetch_error = None
+    duration_ms = None
+    html_size_bytes = None
+    pw_result = None
+
+    http_result = fetch_http(canonical_url)
+    duration_ms = http_result.duration_ms
+
+    if http_result.success:
+        html = http_result.html
+        fetch_method = FetchMethod.HTTP
+    elif use_playwright:
+        pw_result = fetch_playwright(
+            canonical_url,
+            headless=not headful,
+            timeout_ms=timeout_ms,
+            save_screenshot=save_screenshot,
+        )
+        duration_ms = pw_result.duration_ms
+
+        if pw_result.success:
+            html = pw_result.html
+            fetch_method = FetchMethod.PLAYWRIGHT
+        else:
+            fetch_error = pw_result.error
+    else:
+        fetch_error = http_result.error
+
+    if html:
+        html_size_bytes = len(html.encode("utf-8"))
+        if save_html:
+            from pricewatch.app.core.io import save_debug_html
+
+            ref = store.extract_reference(canonical_url) or "unknown"
+            save_debug_html(html, f"{store.store_id}_{ref}")
+
+        if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result:
+            from pricewatch.app.core.io import save_debug_screenshot
+
+            if pw_result and pw_result.screenshot:
+                ref = store.extract_reference(canonical_url) or "unknown"
+                save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
+
+        try:
+            snapshot = store.parse(html, canonical_url)
+            snapshot.debug.method = fetch_method
+            snapshot.debug.duration_ms = duration_ms
+            snapshot.debug.html_size_bytes = html_size_bytes
+            success = snapshot.debug.status != DebugStatus.FAILED
+        except Exception as exc:
+            snapshot = ProductSnapshot(
+                source=store.store_id,
+                url=canonical_url,
+                debug=DebugInfo(
+                    method=fetch_method,
+                    status=DebugStatus.FAILED,
+                    errors=[f"Parsing failed: {exc}"],
+                    duration_ms=duration_ms,
+                    html_size_bytes=html_size_bytes,
+                ),
+            )
+            success = False
+            fetch_error = str(exc)
+    else:
+        snapshot = ProductSnapshot(
+            source=store.store_id,
+            url=canonical_url,
+            debug=DebugInfo(
+                method=fetch_method,
+                status=DebugStatus.FAILED,
+                errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
+                duration_ms=duration_ms,
+            ),
+        )
+        success = False
+
+    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
+
+    return {
+        "success": success,
+        "product_id": product_id,
+        "snapshot": snapshot,
+        "error": fetch_error,
+    }