194 lines
6.7 KiB
Python
194 lines
6.7 KiB
Python
"""
|
|
Tache de scraping asynchrone pour RQ.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from typing import Any, Optional
|
|
|
|
from pricewatch.app.core.config import AppConfig, get_config
|
|
from pricewatch.app.core.logging import get_logger
|
|
from pricewatch.app.core.registry import get_registry
|
|
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
|
|
from pricewatch.app.scraping.http_fetch import fetch_http
|
|
from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
|
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
|
from pricewatch.app.stores.aliexpress.store import AliexpressStore
|
|
from pricewatch.app.stores.amazon.store import AmazonStore
|
|
from pricewatch.app.stores.backmarket.store import BackmarketStore
|
|
from pricewatch.app.stores.cdiscount.store import CdiscountStore
|
|
|
|
logger = get_logger("tasks.scrape")
|
|
|
|
|
|
def setup_stores() -> None:
    """Register all supported stores in the global registry (idempotent).

    If the registry already contains at least one store, this is a no-op,
    which makes the function safe to call at the start of every job.
    """
    registry = get_registry()
    # Already populated by a previous job in this worker process: nothing to do.
    if registry.list_stores():
        return
    # Registration order is preserved from the original implementation.
    for store in (AmazonStore(), CdiscountStore(), BackmarketStore(), AliexpressStore()):
        registry.register(store)
|
|
|
|
|
|
def scrape_product(
    url: str,
    use_playwright: Optional[bool] = None,
    save_db: bool = True,
    save_html: bool = False,
    save_screenshot: bool = False,
    headful: bool = False,
    timeout_ms: Optional[int] = None,
) -> dict[str, Any]:
    """
    Scrape a product URL and persist the result through ScrapingPipeline.

    Intended to run as an RQ job. Fetch strategy: plain HTTP first, then a
    Playwright fallback when HTTP fails and ``use_playwright`` is enabled.
    Even failed attempts produce a ProductSnapshot that is handed to the
    pipeline, so failures are recorded too.

    Args:
        url: Product page URL to scrape.
        use_playwright: Enable the Playwright fallback; ``None`` defers to
            ``config.default_use_playwright``.
        save_db: Persist the snapshot to the database via the pipeline.
        save_html: Dump the fetched HTML to a debug file.
        save_screenshot: Ask Playwright for a screenshot and dump it.
        headful: Run the Playwright browser with a visible window.
        timeout_ms: Playwright navigation timeout; ``None`` defers to
            ``config.default_playwright_timeout``.

    Returns:
        Dict with keys ``success`` (bool), ``product_id`` (or ``None``),
        ``snapshot`` (ProductSnapshot), and ``error`` (fetch/parse error
        string, ``"store"`` when no store matched, or ``None``).
    """
    job_start_time = time.time()
    logger.info(f"[JOB START] Scraping: {url}")

    config: AppConfig = get_config()
    setup_stores()

    # Resolve per-call options against the application defaults.
    if use_playwright is None:
        use_playwright = config.default_use_playwright
    if timeout_ms is None:
        timeout_ms = config.default_playwright_timeout

    registry = get_registry()
    store = registry.detect_store(url)
    if not store:
        # No store matched: record a FAILED snapshot so the attempt is traceable.
        elapsed_ms = int((time.time() - job_start_time) * 1000)
        logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)")
        snapshot = ProductSnapshot(
            source="unknown",
            url=url,
            debug=DebugInfo(
                method=FetchMethod.HTTP,
                status=DebugStatus.FAILED,
                errors=["Aucun store detecte"],
            ),
        )
        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}

    logger.info(f"[STORE] Detecte: {store.store_id}")

    canonical_url = store.canonicalize(url)

    html = None
    fetch_method = FetchMethod.HTTP
    fetch_error = None
    duration_ms = None
    html_size_bytes = None
    pw_result = None

    # --- Fetch: HTTP first, Playwright only as fallback. ---
    logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}")
    http_result = fetch_http(canonical_url)
    duration_ms = http_result.duration_ms

    if http_result.success:
        html = http_result.html
        fetch_method = FetchMethod.HTTP
        logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})")
    elif use_playwright:
        logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright")
        pw_result = fetch_playwright(
            canonical_url,
            headless=not headful,
            timeout_ms=timeout_ms,
            save_screenshot=save_screenshot,
        )
        # Playwright duration replaces the HTTP one: it is the attempt that counts.
        duration_ms = pw_result.duration_ms

        if pw_result.success:
            html = pw_result.html
            fetch_method = FetchMethod.PLAYWRIGHT
            logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})")
        else:
            fetch_error = pw_result.error
            logger.warning(f"[FETCH] Playwright echoue: {fetch_error}")
    else:
        fetch_error = http_result.error
        logger.warning(f"[FETCH] HTTP echoue: {fetch_error}")

    if html:
        html_size_bytes = len(html.encode("utf-8"))

        # Debug artifacts (HTML dump / screenshot) share the same file stem.
        # extract_reference is a URL parser — computing it once here instead of
        # once per branch removes the duplicated call.
        if save_html or (save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result):
            ref = store.extract_reference(canonical_url) or "unknown"

        if save_html:
            # Imported lazily to keep worker startup light.
            from pricewatch.app.core.io import save_debug_html

            save_debug_html(html, f"{store.store_id}_{ref}")

        if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result:
            from pricewatch.app.core.io import save_debug_screenshot

            # pw_result is guaranteed non-None here; only the screenshot payload
            # itself may be missing (the old redundant re-check was dropped).
            if pw_result.screenshot:
                save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")

        # --- Parse the fetched HTML with the detected store. ---
        try:
            logger.debug(f"[PARSE] Parsing avec {store.store_id}...")
            snapshot = store.parse(html, canonical_url)
            snapshot.debug.method = fetch_method
            snapshot.debug.duration_ms = duration_ms
            snapshot.debug.html_size_bytes = html_size_bytes
            # A partial parse (non-FAILED status) still counts as success.
            success = snapshot.debug.status != DebugStatus.FAILED
            if success:
                logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}")
            else:
                logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}")
        except Exception as exc:
            # Parser crashed: build a FAILED snapshot so the attempt is still persisted.
            logger.error(f"[PARSE] Exception: {exc}")
            snapshot = ProductSnapshot(
                source=store.store_id,
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Parsing failed: {exc}"],
                    duration_ms=duration_ms,
                    html_size_bytes=html_size_bytes,
                ),
            )
            success = False
            fetch_error = str(exc)
    else:
        # Nothing fetched: persist a FAILED snapshot describing the fetch error.
        snapshot = ProductSnapshot(
            source=store.store_id,
            url=canonical_url,
            debug=DebugInfo(
                method=fetch_method,
                status=DebugStatus.FAILED,
                errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                duration_ms=duration_ms,
            ),
        )
        success = False

    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)

    # Final job log.
    elapsed_ms = int((time.time() - job_start_time) * 1000)
    if success:
        logger.info(
            f"[JOB OK] {store.store_id}/{snapshot.reference} "
            f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms"
        )
    else:
        logger.warning(
            f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
            f"erreur={fetch_error} duree={elapsed_ms}ms"
        )

    return {
        "success": success,
        "product_id": product_id,
        "snapshot": snapshot,
        "error": fetch_error,
    }
|