Files
scrap/pricewatch/app/tasks/scheduler.py
Gilles Soulier d0b73b9319 codex2
2026-01-14 21:54:55 +01:00

145 lines
4.6 KiB
Python

"""
Planification des jobs de scraping via RQ Scheduler.
"""
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional
import redis
from redis.exceptions import ConnectionError as RedisConnectionError
from redis.exceptions import RedisError, TimeoutError as RedisTimeoutError
from rq import Queue
from rq_scheduler import Scheduler
from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.tasks.scrape import scrape_product
logger = get_logger("tasks.scheduler")
class RedisUnavailableError(Exception):
"""Exception levee quand Redis n'est pas disponible."""
def __init__(self, message: str = "Redis non disponible", cause: Optional[Exception] = None):
self.message = message
self.cause = cause
super().__init__(self.message)
@dataclass
class ScheduledJobInfo:
"""Infos de retour pour un job planifie."""
job_id: str
next_run: datetime
def check_redis_connection(redis_url: str) -> bool:
"""
Verifie si Redis est accessible.
Returns:
True si Redis repond, False sinon.
"""
try:
conn = redis.from_url(redis_url)
conn.ping()
return True
except (RedisConnectionError, RedisTimeoutError, RedisError) as e:
logger.debug(f"Redis ping echoue: {e}")
return False
class ScrapingScheduler:
"""Scheduler pour les jobs de scraping avec RQ."""
def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
self.config = config or get_config()
self._queue_name = queue_name
self._redis: Optional[redis.Redis] = None
self._queue: Optional[Queue] = None
self._scheduler: Optional[Scheduler] = None
def _ensure_connected(self) -> None:
"""Etablit la connexion Redis si necessaire, leve RedisUnavailableError si echec."""
if self._redis is not None:
return
try:
self._redis = redis.from_url(self.config.redis.url)
# Ping pour verifier la connexion
self._redis.ping()
self._queue = Queue(self._queue_name, connection=self._redis)
self._scheduler = Scheduler(queue=self._queue, connection=self._redis)
logger.debug(f"Connexion Redis etablie: {self.config.redis.url}")
except (RedisConnectionError, RedisTimeoutError) as e:
self._redis = None
msg = f"Impossible de se connecter a Redis ({self.config.redis.url}): {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
except RedisError as e:
self._redis = None
msg = f"Erreur Redis: {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
@property
def redis(self) -> redis.Redis:
"""Acces a la connexion Redis (lazy)."""
self._ensure_connected()
return self._redis # type: ignore
@property
def queue(self) -> Queue:
"""Acces a la queue RQ (lazy)."""
self._ensure_connected()
return self._queue # type: ignore
@property
def scheduler(self) -> Scheduler:
"""Acces au scheduler RQ (lazy)."""
self._ensure_connected()
return self._scheduler # type: ignore
def enqueue_immediate(
self,
url: str,
use_playwright: Optional[bool] = None,
save_db: bool = True,
):
"""Enqueue un job immediat."""
job = self.queue.enqueue(
scrape_product,
url,
use_playwright=use_playwright,
save_db=save_db,
)
logger.info(f"Job enqueued: {job.id}")
return job
def schedule_product(
self,
url: str,
interval_hours: int = 24,
use_playwright: Optional[bool] = None,
save_db: bool = True,
) -> ScheduledJobInfo:
"""Planifie un scraping recurrent (intervalle en heures)."""
interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)
job = self.scheduler.schedule(
scheduled_time=next_run,
func=scrape_product,
args=[url],
kwargs={"use_playwright": use_playwright, "save_db": save_db},
interval=interval_seconds,
repeat=None,
)
logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}")
return ScheduledJobInfo(job_id=job.id, next_run=next_run)