This commit is contained in:
Gilles Soulier
2026-01-14 21:54:55 +01:00
parent c91c0f1fc9
commit d0b73b9319
140 changed files with 5822 additions and 161 deletions

13
pricewatch/app/tasks/__init__.py Executable file → Normal file
View File

@@ -3,6 +3,15 @@ Module tasks pour les jobs RQ.
"""
from pricewatch.app.tasks.scrape import scrape_product
from pricewatch.app.tasks.scheduler import ScrapingScheduler
from pricewatch.app.tasks.scheduler import (
RedisUnavailableError,
ScrapingScheduler,
check_redis_connection,
)
__all__ = ["scrape_product", "ScrapingScheduler"]
__all__ = [
"scrape_product",
"ScrapingScheduler",
"RedisUnavailableError",
"check_redis_connection",
]

75
pricewatch/app/tasks/scheduler.py Executable file → Normal file
View File

@@ -9,6 +9,8 @@ from datetime import datetime, timedelta, timezone
from typing import Optional
import redis
from redis.exceptions import ConnectionError as RedisConnectionError
from redis.exceptions import RedisError, TimeoutError as RedisTimeoutError
from rq import Queue
from rq_scheduler import Scheduler
@@ -19,6 +21,15 @@ from pricewatch.app.tasks.scrape import scrape_product
logger = get_logger("tasks.scheduler")
class RedisUnavailableError(Exception):
    """Raised when the Redis backend cannot be reached.

    Attributes:
        message: Human-readable description of the failure.
        cause: The underlying low-level exception, when one exists.
    """

    def __init__(self, message: str = "Redis non disponible", cause: Optional[Exception] = None):
        super().__init__(message)
        self.message = message
        self.cause = cause
@dataclass
class ScheduledJobInfo:
"""Infos de retour pour un job planifie."""
@@ -27,14 +38,72 @@ class ScheduledJobInfo:
next_run: datetime
def check_redis_connection(redis_url: str) -> bool:
    """
    Check whether a Redis server is reachable.

    Args:
        redis_url: Redis connection URL, e.g. "redis://localhost:6379/0".

    Returns:
        True if the server answers PING, False otherwise (never raises).
    """
    conn = None
    try:
        conn = redis.from_url(redis_url)
        conn.ping()
        return True
    except (RedisConnectionError, RedisTimeoutError, RedisError) as e:
        # Boolean probe by design: log quietly and report unavailability.
        logger.debug(f"Redis ping echoue: {e}")
        return False
    finally:
        # Fix: the original leaked the client (and its connection pool)
        # created by from_url(); release it whether or not the ping succeeded.
        if conn is not None:
            try:
                conn.close()
            except RedisError:
                pass
class ScrapingScheduler:
    """Scheduler for scraping jobs with RQ."""

    def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
        # Resolve configuration, falling back to the process-wide config.
        self.config = config or get_config()
        # NOTE(review): this span comes from a diff rendering that lost the
        # +/- markers. The three eager assignments below (self.redis,
        # self.queue, self.scheduler) look like the REMOVED pre-commit code,
        # while the lazy `_`-prefixed attributes that follow look like the
        # ADDED replacement; as literally merged here they would clash with
        # the read-only properties of the same names. Confirm against the
        # actual post-commit file before relying on this block.
        self.redis = redis.from_url(self.config.redis.url)
        self.queue = Queue(queue_name, connection=self.redis)
        self.scheduler = Scheduler(queue=self.queue, connection=self.redis)
        # Lazy-connection state: populated on first access by _ensure_connected().
        self._queue_name = queue_name
        self._redis: Optional[redis.Redis] = None
        self._queue: Optional[Queue] = None
        self._scheduler: Optional[Scheduler] = None
def _ensure_connected(self) -> None:
"""Etablit la connexion Redis si necessaire, leve RedisUnavailableError si echec."""
if self._redis is not None:
return
try:
self._redis = redis.from_url(self.config.redis.url)
# Ping pour verifier la connexion
self._redis.ping()
self._queue = Queue(self._queue_name, connection=self._redis)
self._scheduler = Scheduler(queue=self._queue, connection=self._redis)
logger.debug(f"Connexion Redis etablie: {self.config.redis.url}")
except (RedisConnectionError, RedisTimeoutError) as e:
self._redis = None
msg = f"Impossible de se connecter a Redis ({self.config.redis.url}): {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
except RedisError as e:
self._redis = None
msg = f"Erreur Redis: {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
@property
def redis(self) -> redis.Redis:
"""Acces a la connexion Redis (lazy)."""
self._ensure_connected()
return self._redis # type: ignore
@property
def queue(self) -> Queue:
"""Acces a la queue RQ (lazy)."""
self._ensure_connected()
return self._queue # type: ignore
@property
def scheduler(self) -> Scheduler:
"""Acces au scheduler RQ (lazy)."""
self._ensure_connected()
return self._scheduler # type: ignore
def enqueue_immediate(
self,

33
pricewatch/app/tasks/scrape.py Executable file → Normal file
View File

@@ -4,6 +4,7 @@ Tache de scraping asynchrone pour RQ.
from __future__ import annotations
import time
from typing import Any, Optional
from pricewatch.app.core.config import AppConfig, get_config
@@ -46,6 +47,9 @@ def scrape_product(
Retourne un dict avec success, product_id, snapshot, error.
"""
job_start_time = time.time()
logger.info(f"[JOB START] Scraping: {url}")
config: AppConfig = get_config()
setup_stores()
@@ -58,6 +62,8 @@ def scrape_product(
registry = get_registry()
store = registry.detect_store(url)
if not store:
elapsed_ms = int((time.time() - job_start_time) * 1000)
logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)")
snapshot = ProductSnapshot(
source="unknown",
url=url,
@@ -70,6 +76,8 @@ def scrape_product(
ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
logger.info(f"[STORE] Detecte: {store.store_id}")
canonical_url = store.canonicalize(url)
html = None
@@ -79,13 +87,16 @@ def scrape_product(
html_size_bytes = None
pw_result = None
logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}")
http_result = fetch_http(canonical_url)
duration_ms = http_result.duration_ms
if http_result.success:
html = http_result.html
fetch_method = FetchMethod.HTTP
logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})")
elif use_playwright:
logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright")
pw_result = fetch_playwright(
canonical_url,
headless=not headful,
@@ -97,10 +108,13 @@ def scrape_product(
if pw_result.success:
html = pw_result.html
fetch_method = FetchMethod.PLAYWRIGHT
logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})")
else:
fetch_error = pw_result.error
logger.warning(f"[FETCH] Playwright echoue: {fetch_error}")
else:
fetch_error = http_result.error
logger.warning(f"[FETCH] HTTP echoue: {fetch_error}")
if html:
html_size_bytes = len(html.encode("utf-8"))
@@ -118,12 +132,18 @@ def scrape_product(
save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
try:
logger.debug(f"[PARSE] Parsing avec {store.store_id}...")
snapshot = store.parse(html, canonical_url)
snapshot.debug.method = fetch_method
snapshot.debug.duration_ms = duration_ms
snapshot.debug.html_size_bytes = html_size_bytes
success = snapshot.debug.status != DebugStatus.FAILED
if success:
logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}")
else:
logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}")
except Exception as exc:
logger.error(f"[PARSE] Exception: {exc}")
snapshot = ProductSnapshot(
source=store.store_id,
url=canonical_url,
@@ -152,6 +172,19 @@ def scrape_product(
product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
# Log final du job
elapsed_ms = int((time.time() - job_start_time) * 1000)
if success:
logger.info(
f"[JOB OK] {store.store_id}/{snapshot.reference} "
f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms"
)
else:
logger.warning(
f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
f"erreur={fetch_error} duree={elapsed_ms}ms"
)
return {
"success": success,
"product_id": product_id,