Commit d0b73b9319 (parent c91c0f1fc9)
Author: Gilles Soulier
Date: 2026-01-14 21:54:55 +01:00

140 changed files with 5822 additions and 161 deletions

pricewatch/app/tasks/scrape.py Executable file → Normal file

@@ -4,6 +4,7 @@ Asynchronous scraping task for RQ.
from __future__ import annotations
import time
from typing import Any, Optional
from pricewatch.app.core.config import AppConfig, get_config
@@ -46,6 +47,9 @@ def scrape_product(
    Returns a dict with success, product_id, snapshot, error.
    """
    job_start_time = time.time()
    logger.info(f"[JOB START] Scraping: {url}")

    config: AppConfig = get_config()
    setup_stores()
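
The timing pattern introduced by this hunk is the backbone of all the new logs: capture one start timestamp when the job begins, then derive integer milliseconds from it at every exit point. A minimal standalone sketch of the pattern (the logger setup and the placeholder body are illustrative, not part of this codebase):

import logging
import time

logger = logging.getLogger(__name__)

def timed_job(url: str) -> None:
    job_start_time = time.time()
    logger.info(f"[JOB START] Scraping: {url}")
    # ... fetch and parse work would happen here ...
    elapsed_ms = int((time.time() - job_start_time) * 1000)
    logger.info(f"[JOB OK] duration={elapsed_ms}ms")

Note that time.monotonic() is generally preferred for durations because it cannot jump with system clock adjustments; time.time(), as used in this diff, is fine for coarse millisecond logging.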
@@ -58,6 +62,8 @@ def scrape_product(
    registry = get_registry()
    store = registry.detect_store(url)
    if not store:
        elapsed_ms = int((time.time() - job_start_time) * 1000)
        logger.error(f"[JOB FAILED] No store detected for: {url} (duration={elapsed_ms}ms)")
        snapshot = ProductSnapshot(
            source="unknown",
            url=url,
@@ -70,6 +76,8 @@ def scrape_product(
        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}

    logger.info(f"[STORE] Detected: {store.store_id}")
    canonical_url = store.canonicalize(url)
    html = None
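
For readers unfamiliar with the registry, registry.detect_store(url) evidently maps a URL to a registered store adapter, and store.canonicalize(url) normalizes the URL before fetching. The project's actual API is not shown in this diff; a hypothetical sketch of what such a registry could look like (the Store protocol and the first-match rule are assumptions):

from typing import Optional, Protocol

class Store(Protocol):
    store_id: str
    def matches(self, url: str) -> bool: ...
    def canonicalize(self, url: str) -> str: ...

class StoreRegistry:
    def __init__(self) -> None:
        self._stores: list[Store] = []

    def register(self, store: Store) -> None:
        self._stores.append(store)

    def detect_store(self, url: str) -> Optional[Store]:
        # Return the first registered store whose matcher accepts the URL.
        return next((s for s in self._stores if s.matches(url)), None)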
@@ -79,13 +87,16 @@ def scrape_product(
    html_size_bytes = None
    pw_result = None

    logger.debug(f"[FETCH] HTTP attempt: {canonical_url}")
    http_result = fetch_http(canonical_url)
    duration_ms = http_result.duration_ms
    if http_result.success:
        html = http_result.html
        fetch_method = FetchMethod.HTTP
        logger.info(f"[FETCH] HTTP OK (duration={duration_ms}ms, size={len(html)})")
    elif use_playwright:
        logger.debug(f"[FETCH] HTTP failed ({http_result.error}), falling back to Playwright")
        pw_result = fetch_playwright(
            canonical_url,
            headless=not headful,
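
The fetch strategy is HTTP first, headless browser second: fetch_http is cheap, and fetch_playwright only runs when the plain request fails and use_playwright allows it. Condensed into one helper, under the assumption that both fetchers return an object with success, html, error, and duration_ms fields (the FetchResult dataclass below is illustrative, and fetch_http / fetch_playwright are the fetchers this module already imports):

from dataclasses import dataclass
from typing import Optional

@dataclass
class FetchResult:
    success: bool
    html: Optional[str] = None
    error: Optional[str] = None
    duration_ms: int = 0

def fetch_with_fallback(url: str, use_playwright: bool, headful: bool) -> FetchResult:
    # Try the cheap HTTP fetch first; escalate to a browser only on failure.
    result = fetch_http(url)
    if result.success or not use_playwright:
        return result
    return fetch_playwright(url, headless=not headful)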
@@ -97,10 +108,13 @@ def scrape_product(
        if pw_result.success:
            html = pw_result.html
            fetch_method = FetchMethod.PLAYWRIGHT
            logger.info(f"[FETCH] Playwright OK (duration={duration_ms}ms, size={len(html)})")
        else:
            fetch_error = pw_result.error
            logger.warning(f"[FETCH] Playwright failed: {fetch_error}")
    else:
        fetch_error = http_result.error
        logger.warning(f"[FETCH] HTTP failed: {fetch_error}")

    if html:
        html_size_bytes = len(html.encode("utf-8"))
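
One subtlety in these logs: len(html) counts characters, while html_size_bytes stores len(html.encode("utf-8")), which counts bytes; the two diverge on any non-ASCII page. A quick illustration:

html = "prix : 9,99\u202f\u20ac"      # narrow no-break space + euro sign
print(len(html))                  # 13 characters
print(len(html.encode("utf-8")))  # 17 bytes (U+202F and U+20AC are 3 bytes each in UTF-8)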
@@ -118,12 +132,18 @@ def scrape_product(
        save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")

    try:
        logger.debug(f"[PARSE] Parsing with {store.store_id}...")
        snapshot = store.parse(html, canonical_url)
        snapshot.debug.method = fetch_method
        snapshot.debug.duration_ms = duration_ms
        snapshot.debug.html_size_bytes = html_size_bytes
        success = snapshot.debug.status != DebugStatus.FAILED
        if success:
            logger.info(f"[PARSE] OK - title={bool(snapshot.title)}, price={snapshot.price}")
        else:
            logger.warning(f"[PARSE] Partial - status={snapshot.debug.status}")
    except Exception as exc:
        logger.error(f"[PARSE] Exception: {exc}")
        snapshot = ProductSnapshot(
            source=store.store_id,
            url=canonical_url,
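
The try/except around store.parse ensures the job always yields a snapshot, even when a parser crashes: the exception path builds a fallback ProductSnapshot so the pipeline records the failed attempt instead of losing it. A condensed sketch of that guard (the fallback construction beyond source and url is an assumption about the lines omitted from this hunk):

try:
    snapshot = store.parse(html, canonical_url)
    success = snapshot.debug.status != DebugStatus.FAILED
except Exception as exc:
    logger.error(f"[PARSE] Exception: {exc}")
    # A crashed parser still produces a snapshot for the pipeline,
    # so the failed attempt is recorded rather than silently dropped.
    snapshot = ProductSnapshot(source=store.store_id, url=canonical_url)
    success = False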
@@ -152,6 +172,19 @@ def scrape_product(
    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)

    # Final job log
    elapsed_ms = int((time.time() - job_start_time) * 1000)
    if success:
        logger.info(
            f"[JOB OK] {store.store_id}/{snapshot.reference} "
            f"product_id={product_id} price={snapshot.price} duration={elapsed_ms}ms"
        )
    else:
        logger.warning(
            f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
            f"error={fetch_error} duration={elapsed_ms}ms"
        )

    return {
        "success": success,
        "product_id": product_id,