This commit is contained in:
Gilles Soulier
2026-01-14 21:54:55 +01:00
parent c91c0f1fc9
commit d0b73b9319
140 changed files with 5822 additions and 161 deletions

13
pricewatch/app/tasks/__init__.py Executable file → Normal file
View File

@@ -3,6 +3,15 @@ Module tasks pour les jobs RQ.
"""
from pricewatch.app.tasks.scrape import scrape_product
from pricewatch.app.tasks.scheduler import ScrapingScheduler
from pricewatch.app.tasks.scheduler import (
RedisUnavailableError,
ScrapingScheduler,
check_redis_connection,
)
__all__ = ["scrape_product", "ScrapingScheduler"]
__all__ = [
"scrape_product",
"ScrapingScheduler",
"RedisUnavailableError",
"check_redis_connection",
]

75
pricewatch/app/tasks/scheduler.py Executable file → Normal file
View File

@@ -9,6 +9,8 @@ from datetime import datetime, timedelta, timezone
from typing import Optional
import redis
from redis.exceptions import ConnectionError as RedisConnectionError
from redis.exceptions import RedisError, TimeoutError as RedisTimeoutError
from rq import Queue
from rq_scheduler import Scheduler
@@ -19,6 +21,15 @@ from pricewatch.app.tasks.scrape import scrape_product
logger = get_logger("tasks.scheduler")
class RedisUnavailableError(Exception):
    """Raised when the Redis backend cannot be reached.

    Attributes:
        message: Human-readable description of the failure.
        cause: The underlying low-level exception, when one exists.
    """

    def __init__(self, message: str = "Redis non disponible", cause: Optional[Exception] = None):
        super().__init__(message)
        self.message = message
        self.cause = cause
@dataclass
class ScheduledJobInfo:
"""Infos de retour pour un job planifie."""
@@ -27,14 +38,72 @@ class ScheduledJobInfo:
next_run: datetime
def check_redis_connection(redis_url: str) -> bool:
    """
    Check whether a Redis server is reachable.

    Args:
        redis_url: Redis connection URL, e.g. "redis://localhost:6379/0".

    Returns:
        True if the server answers PING, False otherwise (never raises).
    """
    conn = None
    try:
        conn = redis.from_url(redis_url)
        conn.ping()
        return True
    except (RedisConnectionError, RedisTimeoutError, RedisError) as e:
        # Boolean probe by design: log quietly and report unavailability.
        logger.debug(f"Redis ping echoue: {e}")
        return False
    finally:
        # Fix: the original leaked the client (and its connection pool)
        # created by from_url(); release it whether or not the ping succeeded.
        if conn is not None:
            try:
                conn.close()
            except RedisError:
                pass
class ScrapingScheduler:
    """Scheduler for scraping jobs with RQ."""

    def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
        # Resolve configuration, falling back to the process-wide config.
        self.config = config or get_config()
        # NOTE(review): this span comes from a diff rendering that lost the
        # +/- markers. The three eager assignments below (self.redis,
        # self.queue, self.scheduler) look like the REMOVED pre-commit code,
        # while the lazy `_`-prefixed attributes that follow look like the
        # ADDED replacement; as literally merged here they would clash with
        # the read-only properties of the same names. Confirm against the
        # actual post-commit file before relying on this block.
        self.redis = redis.from_url(self.config.redis.url)
        self.queue = Queue(queue_name, connection=self.redis)
        self.scheduler = Scheduler(queue=self.queue, connection=self.redis)
        # Lazy-connection state: populated on first access by _ensure_connected().
        self._queue_name = queue_name
        self._redis: Optional[redis.Redis] = None
        self._queue: Optional[Queue] = None
        self._scheduler: Optional[Scheduler] = None
def _ensure_connected(self) -> None:
"""Etablit la connexion Redis si necessaire, leve RedisUnavailableError si echec."""
if self._redis is not None:
return
try:
self._redis = redis.from_url(self.config.redis.url)
# Ping pour verifier la connexion
self._redis.ping()
self._queue = Queue(self._queue_name, connection=self._redis)
self._scheduler = Scheduler(queue=self._queue, connection=self._redis)
logger.debug(f"Connexion Redis etablie: {self.config.redis.url}")
except (RedisConnectionError, RedisTimeoutError) as e:
self._redis = None
msg = f"Impossible de se connecter a Redis ({self.config.redis.url}): {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
except RedisError as e:
self._redis = None
msg = f"Erreur Redis: {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
@property
def redis(self) -> redis.Redis:
"""Acces a la connexion Redis (lazy)."""
self._ensure_connected()
return self._redis # type: ignore
@property
def queue(self) -> Queue:
"""Acces a la queue RQ (lazy)."""
self._ensure_connected()
return self._queue # type: ignore
@property
def scheduler(self) -> Scheduler:
"""Acces au scheduler RQ (lazy)."""
self._ensure_connected()
return self._scheduler # type: ignore
def enqueue_immediate(
self,

33
pricewatch/app/tasks/scrape.py Executable file → Normal file
View File

@@ -4,6 +4,7 @@ Tache de scraping asynchrone pour RQ.
from __future__ import annotations
import time
from typing import Any, Optional
from pricewatch.app.core.config import AppConfig, get_config
@@ -46,6 +47,9 @@ def scrape_product(
Retourne un dict avec success, product_id, snapshot, error.
"""
job_start_time = time.time()
logger.info(f"[JOB START] Scraping: {url}")
config: AppConfig = get_config()
setup_stores()
@@ -58,6 +62,8 @@ def scrape_product(
registry = get_registry()
store = registry.detect_store(url)
if not store:
elapsed_ms = int((time.time() - job_start_time) * 1000)
logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)")
snapshot = ProductSnapshot(
source="unknown",
url=url,
@@ -70,6 +76,8 @@ def scrape_product(
ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
logger.info(f"[STORE] Detecte: {store.store_id}")
canonical_url = store.canonicalize(url)
html = None
@@ -79,13 +87,16 @@ def scrape_product(
html_size_bytes = None
pw_result = None
logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}")
http_result = fetch_http(canonical_url)
duration_ms = http_result.duration_ms
if http_result.success:
html = http_result.html
fetch_method = FetchMethod.HTTP
logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})")
elif use_playwright:
logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright")
pw_result = fetch_playwright(
canonical_url,
headless=not headful,
@@ -97,10 +108,13 @@ def scrape_product(
if pw_result.success:
html = pw_result.html
fetch_method = FetchMethod.PLAYWRIGHT
logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})")
else:
fetch_error = pw_result.error
logger.warning(f"[FETCH] Playwright echoue: {fetch_error}")
else:
fetch_error = http_result.error
logger.warning(f"[FETCH] HTTP echoue: {fetch_error}")
if html:
html_size_bytes = len(html.encode("utf-8"))
@@ -118,12 +132,18 @@ def scrape_product(
save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
try:
logger.debug(f"[PARSE] Parsing avec {store.store_id}...")
snapshot = store.parse(html, canonical_url)
snapshot.debug.method = fetch_method
snapshot.debug.duration_ms = duration_ms
snapshot.debug.html_size_bytes = html_size_bytes
success = snapshot.debug.status != DebugStatus.FAILED
if success:
logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}")
else:
logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}")
except Exception as exc:
logger.error(f"[PARSE] Exception: {exc}")
snapshot = ProductSnapshot(
source=store.store_id,
url=canonical_url,
@@ -152,6 +172,19 @@ def scrape_product(
product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
# Log final du job
elapsed_ms = int((time.time() - job_start_time) * 1000)
if success:
logger.info(
f"[JOB OK] {store.store_id}/{snapshot.reference} "
f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms"
)
else:
logger.warning(
f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
f"erreur={fetch_error} duree={elapsed_ms}ms"
)
return {
"success": success,
"product_id": product_id,