before claude

This commit is contained in:
Gilles Soulier
2026-01-18 06:26:17 +01:00
parent dc19315e5d
commit 740c3d7516
60 changed files with 3815 additions and 354 deletions

View File

@@ -6,6 +6,7 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import hashlib
from typing import Optional
import redis
@@ -127,11 +128,13 @@ class ScrapingScheduler:
interval_hours: int = 24,
use_playwright: Optional[bool] = None,
save_db: bool = True,
job_id: Optional[str] = None,
) -> ScheduledJobInfo:
"""Planifie un scraping recurrent (intervalle en heures)."""
interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)
resolved_job_id = job_id or self._job_id_for_url(url)
job = self.scheduler.schedule(
scheduled_time=next_run,
func=scrape_product,
@@ -139,6 +142,13 @@ class ScrapingScheduler:
kwargs={"use_playwright": use_playwright, "save_db": save_db},
interval=interval_seconds,
repeat=None,
id=resolved_job_id,
)
logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}")
return ScheduledJobInfo(job_id=job.id, next_run=next_run)
@staticmethod
def _job_id_for_url(url: str) -> str:
    """Build a stable job id for a URL so the same URL always maps to one job.

    The URL is stripped of surrounding whitespace and lower-cased before
    hashing, so inputs that differ only in case or padding yield the same id.

    Args:
        url: URL of the page to scrape.

    Returns:
        The string ``"scrape_"`` followed by the hexadecimal SHA-1 digest of
        the normalized, UTF-8 encoded URL.
    """
    normalized_url = url.strip().lower()
    # SHA-1 serves only as a compact fingerprint here, not as a security measure.
    digest = hashlib.sha1(normalized_url.encode("utf-8"))
    return "scrape_" + digest.hexdigest()

View File

@@ -157,6 +157,36 @@ def scrape_product(
)
success = False
fetch_error = str(exc)
# Si captcha detecte via HTTP, forcer une tentative Playwright.
if (
fetch_method == FetchMethod.HTTP
and use_playwright
and snapshot.debug.errors
and any("captcha" in error.lower() for error in snapshot.debug.errors)
):
logger.info("[FETCH] Captcha detecte, tentative Playwright")
pw_result = fetch_playwright(
canonical_url,
headless=not headful,
timeout_ms=timeout_ms,
save_screenshot=save_screenshot,
)
if pw_result.success and pw_result.html:
try:
snapshot = store.parse(pw_result.html, canonical_url)
snapshot.debug.method = FetchMethod.PLAYWRIGHT
snapshot.debug.duration_ms = pw_result.duration_ms
snapshot.debug.html_size_bytes = len(pw_result.html.encode("utf-8"))
snapshot.add_note("Captcha detecte via HTTP, fallback Playwright")
success = snapshot.debug.status != DebugStatus.FAILED
except Exception as exc:
snapshot.add_note(f"Fallback Playwright echoue: {exc}")
logger.error(f"[PARSE] Exception fallback Playwright: {exc}")
fetch_error = str(exc)
else:
error = pw_result.error or "Erreur Playwright"
snapshot.add_note(f"Fallback Playwright echoue: {error}")
fetch_error = error
else:
snapshot = ProductSnapshot(
source=store.store_id,