codex2
13	pricewatch/app/tasks/__init__.py	Executable file → Normal file
@@ -3,6 +3,15 @@ Tasks module for RQ jobs.
 """
 
 from pricewatch.app.tasks.scrape import scrape_product
-from pricewatch.app.tasks.scheduler import ScrapingScheduler
+from pricewatch.app.tasks.scheduler import (
+    RedisUnavailableError,
+    ScrapingScheduler,
+    check_redis_connection,
+)
 
-__all__ = ["scrape_product", "ScrapingScheduler"]
+__all__ = [
+    "scrape_product",
+    "ScrapingScheduler",
+    "RedisUnavailableError",
+    "check_redis_connection",
+]
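Note: with the widened public API above, callers can import the Redis helpers straight from pricewatch.app.tasks. A minimal usage sketch; the Redis URL is a placeholder and the guard-before-construct pattern is illustrative, not part of this commit:

from pricewatch.app.tasks import (
    RedisUnavailableError,
    ScrapingScheduler,
    check_redis_connection,
)

redis_url = "redis://localhost:6379/0"  # placeholder URL, not from this commit

# Cheap availability probe before building the scheduler.
if check_redis_connection(redis_url):
    scheduler = ScrapingScheduler()
else:
    print("Redis unreachable; scheduling disabled")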
BIN	pricewatch/app/tasks/__pycache__/__init__.cpython-313.pyc	Normal file (binary file not shown)
BIN	pricewatch/app/tasks/__pycache__/scheduler.cpython-313.pyc	Normal file (binary file not shown)
BIN	pricewatch/app/tasks/__pycache__/scrape.cpython-313.pyc	Normal file (binary file not shown)
75	pricewatch/app/tasks/scheduler.py	Executable file → Normal file
@@ -9,6 +9,8 @@ from datetime import datetime, timedelta, timezone
 from typing import Optional
 
 import redis
+from redis.exceptions import ConnectionError as RedisConnectionError
+from redis.exceptions import RedisError, TimeoutError as RedisTimeoutError
 from rq import Queue
 from rq_scheduler import Scheduler
 
@@ -19,6 +21,15 @@ from pricewatch.app.tasks.scrape import scrape_product
 logger = get_logger("tasks.scheduler")
 
 
+class RedisUnavailableError(Exception):
+    """Raised when Redis is not available."""
+
+    def __init__(self, message: str = "Redis unavailable", cause: Optional[Exception] = None):
+        self.message = message
+        self.cause = cause
+        super().__init__(self.message)
+
+
 @dataclass
 class ScheduledJobInfo:
     """Return info for a scheduled job."""
@@ -27,14 +38,72 @@ class ScheduledJobInfo:
     next_run: datetime
 
 
+def check_redis_connection(redis_url: str) -> bool:
+    """
+    Check whether Redis is reachable.
+
+    Returns:
+        True if Redis responds, False otherwise.
+    """
+    try:
+        conn = redis.from_url(redis_url)
+        conn.ping()
+        return True
+    except (RedisConnectionError, RedisTimeoutError, RedisError) as e:
+        logger.debug(f"Redis ping failed: {e}")
+        return False
+
+
 class ScrapingScheduler:
     """Scheduler for scraping jobs with RQ."""
 
     def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
         self.config = config or get_config()
-        self.redis = redis.from_url(self.config.redis.url)
-        self.queue = Queue(queue_name, connection=self.redis)
-        self.scheduler = Scheduler(queue=self.queue, connection=self.redis)
+        self._queue_name = queue_name
+        self._redis: Optional[redis.Redis] = None
+        self._queue: Optional[Queue] = None
+        self._scheduler: Optional[Scheduler] = None
+
+    def _ensure_connected(self) -> None:
+        """Establish the Redis connection if needed; raise RedisUnavailableError on failure."""
+        if self._redis is not None:
+            return
+
+        try:
+            self._redis = redis.from_url(self.config.redis.url)
+            # Ping to verify the connection
+            self._redis.ping()
+            self._queue = Queue(self._queue_name, connection=self._redis)
+            self._scheduler = Scheduler(queue=self._queue, connection=self._redis)
+            logger.debug(f"Redis connection established: {self.config.redis.url}")
+        except (RedisConnectionError, RedisTimeoutError) as e:
+            self._redis = None
+            msg = f"Cannot connect to Redis ({self.config.redis.url}): {e}"
+            logger.error(msg)
+            raise RedisUnavailableError(msg, cause=e) from e
+        except RedisError as e:
+            self._redis = None
+            msg = f"Redis error: {e}"
+            logger.error(msg)
+            raise RedisUnavailableError(msg, cause=e) from e
+
+    @property
+    def redis(self) -> redis.Redis:
+        """Access to the Redis connection (lazy)."""
+        self._ensure_connected()
+        return self._redis  # type: ignore
+
+    @property
+    def queue(self) -> Queue:
+        """Access to the RQ queue (lazy)."""
+        self._ensure_connected()
+        return self._queue  # type: ignore
+
+    @property
+    def scheduler(self) -> Scheduler:
+        """Access to the RQ scheduler (lazy)."""
+        self._ensure_connected()
+        return self._scheduler  # type: ignore
 
     def enqueue_immediate(
         self,
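Note: the lazy-connection rewrite above means ScrapingScheduler() no longer touches the network at construction time; the first access to .redis, .queue, or .scheduler triggers _ensure_connected() and may raise RedisUnavailableError. A hedged sketch of the caller-side consequence; the queue inspection via .count is an assumption about the installed rq API, not something this diff shows:

from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler

# Safe even when Redis is down: no connection is opened yet.
scheduler = ScrapingScheduler(queue_name="default")

try:
    # First property access connects to (and pings) Redis.
    pending = scheduler.queue.count  # rq.Queue.count; assumed rq API
    print(f"{pending} job(s) pending")
except RedisUnavailableError as exc:
    # .message and .cause come from the new exception class above.
    print(f"Scheduling unavailable: {exc.message} (cause: {exc.cause!r})")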
33	pricewatch/app/tasks/scrape.py	Executable file → Normal file
@@ -4,6 +4,7 @@ Asynchronous scraping task for RQ.
 
 from __future__ import annotations
 
+import time
 from typing import Any, Optional
 
 from pricewatch.app.core.config import AppConfig, get_config
@@ -46,6 +47,9 @@ def scrape_product(
 
     Returns a dict with success, product_id, snapshot, error.
     """
+    job_start_time = time.time()
+    logger.info(f"[JOB START] Scraping: {url}")
+
     config: AppConfig = get_config()
     setup_stores()
 
@@ -58,6 +62,8 @@
     registry = get_registry()
     store = registry.detect_store(url)
     if not store:
+        elapsed_ms = int((time.time() - job_start_time) * 1000)
+        logger.error(f"[JOB FAILED] No store detected for: {url} (duration={elapsed_ms}ms)")
         snapshot = ProductSnapshot(
             source="unknown",
             url=url,
@@ -70,6 +76,8 @@
         ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
         return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
 
+    logger.info(f"[STORE] Detected: {store.store_id}")
+
     canonical_url = store.canonicalize(url)
 
     html = None
@@ -79,13 +87,16 @@
     html_size_bytes = None
     pw_result = None
 
+    logger.debug(f"[FETCH] Trying HTTP: {canonical_url}")
     http_result = fetch_http(canonical_url)
     duration_ms = http_result.duration_ms
 
     if http_result.success:
         html = http_result.html
         fetch_method = FetchMethod.HTTP
+        logger.info(f"[FETCH] HTTP OK (duration={duration_ms}ms, size={len(html)})")
     elif use_playwright:
+        logger.debug(f"[FETCH] HTTP failed ({http_result.error}), falling back to Playwright")
         pw_result = fetch_playwright(
             canonical_url,
             headless=not headful,
@@ -97,10 +108,13 @@
         if pw_result.success:
             html = pw_result.html
             fetch_method = FetchMethod.PLAYWRIGHT
+            logger.info(f"[FETCH] Playwright OK (duration={duration_ms}ms, size={len(html)})")
         else:
            fetch_error = pw_result.error
+            logger.warning(f"[FETCH] Playwright failed: {fetch_error}")
     else:
         fetch_error = http_result.error
+        logger.warning(f"[FETCH] HTTP failed: {fetch_error}")
 
     if html:
         html_size_bytes = len(html.encode("utf-8"))
@@ -118,12 +132,18 @@
             save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
 
     try:
+        logger.debug(f"[PARSE] Parsing with {store.store_id}...")
         snapshot = store.parse(html, canonical_url)
         snapshot.debug.method = fetch_method
         snapshot.debug.duration_ms = duration_ms
         snapshot.debug.html_size_bytes = html_size_bytes
         success = snapshot.debug.status != DebugStatus.FAILED
+        if success:
+            logger.info(f"[PARSE] OK - title={bool(snapshot.title)}, price={snapshot.price}")
+        else:
+            logger.warning(f"[PARSE] Partial - status={snapshot.debug.status}")
     except Exception as exc:
+        logger.error(f"[PARSE] Exception: {exc}")
         snapshot = ProductSnapshot(
             source=store.store_id,
             url=canonical_url,
@@ -152,6 +172,19 @@
 
     product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
 
+    # Final job log
+    elapsed_ms = int((time.time() - job_start_time) * 1000)
+    if success:
+        logger.info(
+            f"[JOB OK] {store.store_id}/{snapshot.reference} "
+            f"product_id={product_id} price={snapshot.price} duration={elapsed_ms}ms"
+        )
+    else:
+        logger.warning(
+            f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
+            f"error={fetch_error} duration={elapsed_ms}ms"
+        )
+
     return {
         "success": success,
         "product_id": product_id,
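Note: scrape_product now returns a timing-instrumented result as a plain dict, so an RQ worker or a direct caller can branch on it without reading the logs. A minimal sketch; the URL is a placeholder, and calling with only a URL assumes the keyword parameters shown in this diff (use_playwright, headful, save_db) have defaults:

from pricewatch.app.tasks import scrape_product

# Placeholder URL; any URL a registered store can detect would do.
result = scrape_product("https://www.example-store.com/product/123")

if result["success"]:
    snapshot = result["snapshot"]
    print(f"Saved product_id={result['product_id']} price={snapshot.price}")
else:
    # "error" carries a short failure code, e.g. "store" when no
    # store matched the URL.
    print(f"Scrape failed: {result['error']}")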