""" Tache de scraping asynchrone pour RQ. """ from __future__ import annotations import time from typing import Any, Optional from pricewatch.app.core.config import AppConfig, get_config from pricewatch.app.core.logging import get_logger from pricewatch.app.core.registry import get_registry from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot from pricewatch.app.scraping.http_fetch import fetch_http from pricewatch.app.scraping.pipeline import ScrapingPipeline from pricewatch.app.scraping.pw_fetch import fetch_playwright from pricewatch.app.stores.aliexpress.store import AliexpressStore from pricewatch.app.stores.amazon.store import AmazonStore from pricewatch.app.stores.backmarket.store import BackmarketStore from pricewatch.app.stores.cdiscount.store import CdiscountStore logger = get_logger("tasks.scrape") def setup_stores() -> None: """Enregistre les stores disponibles si besoin.""" registry = get_registry() if registry.list_stores(): return registry.register(AmazonStore()) registry.register(CdiscountStore()) registry.register(BackmarketStore()) registry.register(AliexpressStore()) def scrape_product( url: str, use_playwright: Optional[bool] = None, save_db: bool = True, save_html: bool = False, save_screenshot: bool = False, headful: bool = False, timeout_ms: Optional[int] = None, ) -> dict[str, Any]: """ Scrape un produit et persiste en base via ScrapingPipeline. Retourne un dict avec success, product_id, snapshot, error. """ job_start_time = time.time() logger.info(f"[JOB START] Scraping: {url}") config: AppConfig = get_config() setup_stores() if use_playwright is None: use_playwright = config.default_use_playwright if timeout_ms is None: timeout_ms = config.default_playwright_timeout registry = get_registry() store = registry.detect_store(url) if not store: elapsed_ms = int((time.time() - job_start_time) * 1000) logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)") snapshot = ProductSnapshot( source="unknown", url=url, debug=DebugInfo( method=FetchMethod.HTTP, status=DebugStatus.FAILED, errors=["Aucun store detecte"], ), ) ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db) return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"} logger.info(f"[STORE] Detecte: {store.store_id}") canonical_url = store.canonicalize(url) html = None fetch_method = FetchMethod.HTTP fetch_error = None duration_ms = None html_size_bytes = None pw_result = None logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}") http_result = fetch_http(canonical_url) duration_ms = http_result.duration_ms if http_result.success: html = http_result.html fetch_method = FetchMethod.HTTP logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})") elif use_playwright: logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright") pw_result = fetch_playwright( canonical_url, headless=not headful, timeout_ms=timeout_ms, save_screenshot=save_screenshot, ) duration_ms = pw_result.duration_ms if pw_result.success: html = pw_result.html fetch_method = FetchMethod.PLAYWRIGHT logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})") else: fetch_error = pw_result.error logger.warning(f"[FETCH] Playwright echoue: {fetch_error}") else: fetch_error = http_result.error logger.warning(f"[FETCH] HTTP echoue: {fetch_error}") if html: html_size_bytes = len(html.encode("utf-8")) if save_html: from pricewatch.app.core.io import save_debug_html ref = 
store.extract_reference(canonical_url) or "unknown" save_debug_html(html, f"{store.store_id}_{ref}") if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result: from pricewatch.app.core.io import save_debug_screenshot if pw_result and pw_result.screenshot: ref = store.extract_reference(canonical_url) or "unknown" save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}") try: logger.debug(f"[PARSE] Parsing avec {store.store_id}...") snapshot = store.parse(html, canonical_url) snapshot.debug.method = fetch_method snapshot.debug.duration_ms = duration_ms snapshot.debug.html_size_bytes = html_size_bytes success = snapshot.debug.status != DebugStatus.FAILED if success: logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}") else: logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}") except Exception as exc: logger.error(f"[PARSE] Exception: {exc}") snapshot = ProductSnapshot( source=store.store_id, url=canonical_url, debug=DebugInfo( method=fetch_method, status=DebugStatus.FAILED, errors=[f"Parsing failed: {exc}"], duration_ms=duration_ms, html_size_bytes=html_size_bytes, ), ) success = False fetch_error = str(exc) else: snapshot = ProductSnapshot( source=store.store_id, url=canonical_url, debug=DebugInfo( method=fetch_method, status=DebugStatus.FAILED, errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"], duration_ms=duration_ms, ), ) success = False product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db) # Log final du job elapsed_ms = int((time.time() - job_start_time) * 1000) if success: logger.info( f"[JOB OK] {store.store_id}/{snapshot.reference} " f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms" ) else: logger.warning( f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} " f"erreur={fetch_error} duree={elapsed_ms}ms" ) return { "success": success, "product_id": product_id, "snapshot": snapshot, "error": fetch_error, }
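

# Usage sketch: enqueueing this task with RQ from application code. This is a
# minimal illustration, not part of this module's API; the queue name
# ("scraping"), the Redis URL, and the product URL are assumptions for the
# example, not values defined elsewhere in this project.
#
#     from redis import Redis
#     from rq import Queue
#
#     from pricewatch.app.tasks.scrape import scrape_product
#
#     queue = Queue("scraping", connection=Redis.from_url("redis://localhost:6379/0"))
#     job = queue.enqueue(scrape_product, "https://www.amazon.fr/dp/B0EXAMPLE", save_html=True)
#     print(job.id)
#
# The worker process must be able to import this module so RQ can resolve
# scrape_product when it dequeues the job.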