before claude
This commit is contained in:
@@ -6,6 +6,7 @@ from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
from typing import Optional
|
||||
|
||||
import redis
|
||||
@@ -127,11 +128,13 @@ class ScrapingScheduler:
|
||||
interval_hours: int = 24,
|
||||
use_playwright: Optional[bool] = None,
|
||||
save_db: bool = True,
|
||||
job_id: Optional[str] = None,
|
||||
) -> ScheduledJobInfo:
|
||||
"""Planifie un scraping recurrent (intervalle en heures)."""
|
||||
interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
|
||||
next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)
|
||||
|
||||
resolved_job_id = job_id or self._job_id_for_url(url)
|
||||
job = self.scheduler.schedule(
|
||||
scheduled_time=next_run,
|
||||
func=scrape_product,
|
||||
@@ -139,6 +142,13 @@ class ScrapingScheduler:
|
||||
kwargs={"use_playwright": use_playwright, "save_db": save_db},
|
||||
interval=interval_seconds,
|
||||
repeat=None,
|
||||
id=resolved_job_id,
|
||||
)
|
||||
logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}")
|
||||
return ScheduledJobInfo(job_id=job.id, next_run=next_run)
|
||||
|
||||
@staticmethod
def _job_id_for_url(url: str) -> str:
    """Build a deterministic job id from a URL so duplicates are avoided.

    The URL is stripped of surrounding whitespace and lower-cased, then
    hashed with SHA-1; the hex digest is prefixed with ``scrape_``. Inputs
    that differ only by case or surrounding whitespace therefore yield the
    same id.
    """
    # Normalise first so equivalent spellings of a URL hash identically.
    normalized = url.strip().lower()
    digest = hashlib.sha1(normalized.encode("utf-8")).hexdigest()
    return "scrape_" + digest
|
||||
|
||||
@@ -157,6 +157,36 @@ def scrape_product(
|
||||
)
|
||||
success = False
|
||||
fetch_error = str(exc)
|
||||
# Si captcha detecte via HTTP, forcer une tentative Playwright.
|
||||
if (
|
||||
fetch_method == FetchMethod.HTTP
|
||||
and use_playwright
|
||||
and snapshot.debug.errors
|
||||
and any("captcha" in error.lower() for error in snapshot.debug.errors)
|
||||
):
|
||||
logger.info("[FETCH] Captcha detecte, tentative Playwright")
|
||||
pw_result = fetch_playwright(
|
||||
canonical_url,
|
||||
headless=not headful,
|
||||
timeout_ms=timeout_ms,
|
||||
save_screenshot=save_screenshot,
|
||||
)
|
||||
if pw_result.success and pw_result.html:
|
||||
try:
|
||||
snapshot = store.parse(pw_result.html, canonical_url)
|
||||
snapshot.debug.method = FetchMethod.PLAYWRIGHT
|
||||
snapshot.debug.duration_ms = pw_result.duration_ms
|
||||
snapshot.debug.html_size_bytes = len(pw_result.html.encode("utf-8"))
|
||||
snapshot.add_note("Captcha detecte via HTTP, fallback Playwright")
|
||||
success = snapshot.debug.status != DebugStatus.FAILED
|
||||
except Exception as exc:
|
||||
snapshot.add_note(f"Fallback Playwright echoue: {exc}")
|
||||
logger.error(f"[PARSE] Exception fallback Playwright: {exc}")
|
||||
fetch_error = str(exc)
|
||||
else:
|
||||
error = pw_result.error or "Erreur Playwright"
|
||||
snapshot.add_note(f"Fallback Playwright echoue: {error}")
|
||||
fetch_error = error
|
||||
else:
|
||||
snapshot = ProductSnapshot(
|
||||
source=store.store_id,
|
||||
|
||||
Reference in New Issue
Block a user