2026-01-14 07:03:38 +01:00
parent ecda149a4b
commit c91c0f1fc9
61 changed files with 4388 additions and 38 deletions

pricewatch/app/tasks/scrape.py (new executable file, 160 lines)

@@ -0,0 +1,160 @@
"""
Tache de scraping asynchrone pour RQ.
"""
from __future__ import annotations
from typing import Any, Optional
from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.registry import get_registry
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.aliexpress.store import AliexpressStore
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.backmarket.store import BackmarketStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore
logger = get_logger("tasks.scrape")
def setup_stores() -> None:
    """Register the available stores if none are registered yet."""
    registry = get_registry()
    if registry.list_stores():
        return
    registry.register(AmazonStore())
    registry.register(CdiscountStore())
    registry.register(BackmarketStore())
    registry.register(AliexpressStore())

def scrape_product(
    url: str,
    use_playwright: Optional[bool] = None,
    save_db: bool = True,
    save_html: bool = False,
    save_screenshot: bool = False,
    headful: bool = False,
    timeout_ms: Optional[int] = None,
) -> dict[str, Any]:
    """
    Scrape a product and persist it to the database via ScrapingPipeline.

    Returns a dict with the keys success, product_id, snapshot and error.
    """
    config: AppConfig = get_config()
    setup_stores()

    # Fall back to the configured defaults when the caller does not override them.
    if use_playwright is None:
        use_playwright = config.default_use_playwright
    if timeout_ms is None:
        timeout_ms = config.default_playwright_timeout
    registry = get_registry()
    store = registry.detect_store(url)
    if not store:
        # No store matched this URL: persist a failed snapshot and stop here.
        snapshot = ProductSnapshot(
            source="unknown",
            url=url,
            debug=DebugInfo(
                method=FetchMethod.HTTP,
                status=DebugStatus.FAILED,
                errors=["No store detected"],
            ),
        )
        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
    canonical_url = store.canonicalize(url)

    html = None
    fetch_method = FetchMethod.HTTP
    fetch_error = None
    duration_ms = None
    html_size_bytes = None
    pw_result = None

    # Plain HTTP first; fall back to Playwright only when it is enabled.
    http_result = fetch_http(canonical_url)
    duration_ms = http_result.duration_ms
    if http_result.success:
        html = http_result.html
        fetch_method = FetchMethod.HTTP
    elif use_playwright:
        pw_result = fetch_playwright(
            canonical_url,
            headless=not headful,
            timeout_ms=timeout_ms,
            save_screenshot=save_screenshot,
        )
        duration_ms = pw_result.duration_ms
        if pw_result.success:
            html = pw_result.html
            fetch_method = FetchMethod.PLAYWRIGHT
        else:
            fetch_error = pw_result.error
    else:
        fetch_error = http_result.error
    if html:
        html_size_bytes = len(html.encode("utf-8"))

        # Optionally keep debug artifacts (raw HTML, Playwright screenshot).
        if save_html:
            from pricewatch.app.core.io import save_debug_html

            ref = store.extract_reference(canonical_url) or "unknown"
            save_debug_html(html, f"{store.store_id}_{ref}")
        if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result:
            from pricewatch.app.core.io import save_debug_screenshot

            if pw_result.screenshot:
                ref = store.extract_reference(canonical_url) or "unknown"
                save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
        try:
            snapshot = store.parse(html, canonical_url)
            snapshot.debug.method = fetch_method
            snapshot.debug.duration_ms = duration_ms
            snapshot.debug.html_size_bytes = html_size_bytes
            success = snapshot.debug.status != DebugStatus.FAILED
        except Exception as exc:
            snapshot = ProductSnapshot(
                source=store.store_id,
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Parsing failed: {exc}"],
                    duration_ms=duration_ms,
                    html_size_bytes=html_size_bytes,
                ),
            )
            success = False
            fetch_error = str(exc)
    else:
        snapshot = ProductSnapshot(
            source=store.store_id,
            url=canonical_url,
            debug=DebugInfo(
                method=fetch_method,
                status=DebugStatus.FAILED,
                errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                duration_ms=duration_ms,
            ),
        )
        success = False

    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
    return {
        "success": success,
        "product_id": product_id,
        "snapshot": snapshot,
        "error": fetch_error,
    }
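
The module is written as an RQ task, so a caller would normally enqueue scrape_product on a queue rather than call it inline. A minimal producer-side sketch, assuming a local Redis and a queue named "scraping" (both assumptions, not defined in this commit):

# Hypothetical producer: enqueue scrape_product as an RQ job.
# Assumed (not from this commit): Redis at localhost:6379 and a queue named "scraping".
from redis import Redis
from rq import Queue

from pricewatch.app.tasks.scrape import scrape_product

redis_conn = Redis.from_url("redis://localhost:6379/0")
queue = Queue("scraping", connection=redis_conn)

job = queue.enqueue(
    scrape_product,
    "https://www.amazon.fr/dp/B0EXAMPLE",  # placeholder URL, not a real product
    use_playwright=True,
    save_screenshot=False,
    job_timeout=300,  # RQ-level timeout in seconds, separate from timeout_ms
)
print(f"queued {job.id}")

A worker started with something like `rq worker scraping` against the same Redis would then execute the function, and the returned dict (success, product_id, snapshot, error) becomes the job result.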