feat: improve SPA scraping and increase test coverage

- Add SPA support for Playwright with wait_for_network_idle and extra_wait_ms
- Add BaseStore.get_spa_config() and requires_playwright() methods
- Implement AliExpress SPA config with JSON price extraction patterns
- Fix Amazon price parsing to prioritize whole+fraction combination
- Fix AliExpress regex patterns (remove double backslashes)
- Add CLI tests: detect, doctor, fetch, parse, run commands
- Add API tests: auth, logs, products, scraping_logs, webhooks

Tests: 417 passed, 85% coverage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 14:46:55 +01:00
parent cf7c415e22
commit 152c2724fc
14 changed files with 1307 additions and 22 deletions
--- a/pricewatch/app/scraping/pw_fetch.py
+++ b/pricewatch/app/scraping/pw_fetch.py
@@ -45,6 +45,8 @@ def fetch_playwright(
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
+    wait_for_network_idle: bool = False,
+    extra_wait_ms: int = 0,
 ) -> PlaywrightFetchResult:
    """
    Récupère une page avec Playwright.
@@ -55,6 +57,8 @@ def fetch_playwright(
        timeout_ms: Timeout en millisecondes
        save_screenshot: Prendre un screenshot
        wait_for_selector: Attendre un sélecteur CSS avant de récupérer
+        wait_for_network_idle: Attendre que le réseau soit inactif (pour SPA)
+        extra_wait_ms: Délai supplémentaire après chargement (pour JS lent)

    Returns:
        PlaywrightFetchResult avec HTML, screenshot (optionnel), ou erreur
@@ -65,6 +69,8 @@ def fetch_playwright(
    - Headful disponible pour debug visuel
    - Screenshot optionnel pour diagnostiquer les échecs
    - wait_for_selector permet d'attendre le chargement dynamique
+    - wait_for_network_idle utile pour les SPA qui chargent via AJAX
+    - extra_wait_ms pour les sites avec JS lent après DOM ready
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
@@ -101,7 +107,8 @@ def fetch_playwright(

        # Naviguer vers la page
        logger.debug(f"[Playwright] Navigation vers {url}")
-        response = page.goto(url, wait_until="domcontentloaded")
+        wait_until = "networkidle" if wait_for_network_idle else "domcontentloaded"
+        response = page.goto(url, wait_until=wait_until)

        if not response:
            raise Exception("Pas de réponse du serveur")
@@ -116,6 +123,11 @@ def fetch_playwright(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

+        # Délai supplémentaire pour JS lent (SPA)
+        if extra_wait_ms > 0:
+            logger.debug(f"[Playwright] Attente supplémentaire: {extra_wait_ms}ms")
+            page.wait_for_timeout(extra_wait_ms)
+
        # Récupérer le HTML
        html = page.content()

--- a/pricewatch/app/stores/aliexpress/store.py
+++ b/pricewatch/app/stores/aliexpress/store.py
@@ -29,13 +29,39 @@ logger = get_logger("stores.aliexpress")


 class AliexpressStore(BaseStore):
-    """Store pour AliExpress.com (marketplace chinois)."""
+    """Store pour AliExpress.com (marketplace chinois).
+
+    AliExpress est une SPA (Single Page Application) qui charge
+    le contenu via JavaScript/AJAX. Nécessite Playwright avec
+    attente du chargement dynamique.
+    """

    def __init__(self):
        """Initialise le store AliExpress avec ses sélecteurs."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="aliexpress", selectors_path=selectors_path)

+    def get_spa_config(self) -> dict:
+        """
+        Configuration SPA pour AliExpress.
+
+        AliExpress charge les données produit (prix, titre) via AJAX.
+        Il faut attendre que le réseau soit inactif ET ajouter un délai
+        pour laisser le JS terminer le rendu.
+
+        Returns:
+            Configuration Playwright pour SPA
+        """
+        return {
+            "wait_for_network_idle": True,
+            "wait_for_selector": "h1",  # Titre du produit
+            "extra_wait_ms": 2000,  # 2s pour le rendu JS
+        }
+
+    def requires_playwright(self) -> bool:
+        """AliExpress nécessite Playwright pour le rendu SPA."""
+        return True
+
    def match(self, url: str) -> float:
        """
        Détecte si l'URL est AliExpress.
@@ -206,28 +232,71 @@ class AliexpressStore(BaseStore):
        Extrait le prix.

        AliExpress n'a PAS de sélecteur CSS stable pour le prix.
-        On utilise regex sur le HTML brut.
+        Stratégie multi-niveaux:
+        1. Chercher dans les données JSON embarquées
+        2. Chercher dans les spans avec classes contenant "price"
+        3. Regex sur le HTML brut
+        4. Meta tags og:price
        """
-        # Pattern 1: Prix avant € (ex: "136,69 €")
-        match = re.search(r"([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)\\s*€", html)
+        # Priorité 1: Extraire depuis JSON embarqué (skuActivityAmount, formattedActivityPrice)
+        json_patterns = [
+            r'"skuActivityAmount"\s*:\s*\{\s*"value"\s*:\s*(\d+(?:\.\d+)?)',  # {"value": 123.45}
+            r'"formattedActivityPrice"\s*:\s*"([0-9,.\s]+)\s*€"',  # "123,45 €"
+            r'"formattedActivityPrice"\s*:\s*"€\s*([0-9,.\s]+)"',  # "€ 123.45"
+            r'"minPrice"\s*:\s*"([0-9,.\s]+)"',  # "minPrice": "123.45"
+            r'"price"\s*:\s*"([0-9,.\s]+)"',  # "price": "123.45"
+            r'"activityAmount"\s*:\s*\{\s*"value"\s*:\s*(\d+(?:\.\d+)?)',  # activityAmount.value
+        ]
+        for pattern in json_patterns:
+            match = re.search(pattern, html)
+            if match:
+                price = parse_price_text(match.group(1))
+                if price is not None and price > 0:
+                    debug.notes.append(f"Prix extrait depuis JSON: {price}")
+                    return price
+
+        # Priorité 2: Chercher dans les spans/divs avec classes contenant "price"
+        price_selectors = [
+            'span[class*="price--current"]',
+            'span[class*="price--sale"]',
+            'div[class*="price--current"]',
+            'span[class*="product-price"]',
+            'span[class*="Price_Price"]',
+            'div[class*="es--wrap"]',  # Structure AliExpress spécifique
+        ]
+        for selector in price_selectors:
+            elements = soup.select(selector)
+            for elem in elements:
+                text = elem.get_text(strip=True)
+                # Chercher un prix dans le texte
+                price_match = re.search(r'(\d+[,.\s]*\d*)\s*€|€\s*(\d+[,.\s]*\d*)', text)
+                if price_match:
+                    price_str = price_match.group(1) or price_match.group(2)
+                    price = parse_price_text(price_str)
+                    if price is not None and price > 0:
+                        debug.notes.append(f"Prix extrait depuis sélecteur {selector}")
+                        return price
+
+        # Priorité 3: Prix avant € (ex: "136,69€" ou "136,69 €")
+        match = re.search(r'(\d+[,.\s\u00a0\u202f\u2009]*\d*)\s*€', html)
        if match:
            price = parse_price_text(match.group(1))
-            if price is not None:
+            if price is not None and price > 0:
                return price

-        # Pattern 2: € avant prix (ex: "€ 136.69")
-        match = re.search(r"€\\s*([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)", html)
+        # Priorité 4: € avant prix (ex: "€136.69" ou "€ 136.69")
+        match = re.search(r'€\s*(\d+[,.\s\u00a0\u202f\u2009]*\d*)', html)
        if match:
            price = parse_price_text(match.group(1))
-            if price is not None:
+            if price is not None and price > 0:
                return price

-        # Pattern 3: Chercher dans meta tags (moins fiable)
+        # Priorité 5: Chercher dans meta tags (moins fiable)
        og_price = soup.find("meta", property="og:price:amount")
        if og_price:
            price_str = og_price.get("content", "")
            price = parse_price_text(price_str)
-            if price is not None:
+            if price is not None and price > 0:
                return price

        debug.errors.append("Prix non trouvé")
@@ -235,7 +304,7 @@ class AliexpressStore(BaseStore):

    def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
        """Extrait le prix conseille si present."""
-        match = re.search(r"originalPrice\"\\s*:\\s*\"([0-9\\s.,]+)\"", html)
+        match = re.search(r'originalPrice"\s*:\s*"([0-9\s.,]+)"', html)
        if match:
            price = parse_price_text(match.group(1))
            if price is not None:
--- a/pricewatch/app/stores/amazon/store.py
+++ b/pricewatch/app/stores/amazon/store.py
@@ -215,6 +215,19 @@ class AmazonStore(BaseStore):

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extrait le prix."""
+        # Priorité 1: combiner les spans séparés a-price-whole et a-price-fraction
+        # C'est le format le plus courant sur Amazon pour les prix avec centimes séparés
+        whole = soup.select_one("span.a-price-whole")
+        fraction = soup.select_one("span.a-price-fraction")
+        if whole and fraction:
+            whole_text = whole.get_text(strip=True).rstrip(",.")
+            fraction_text = fraction.get_text(strip=True)
+            if whole_text and fraction_text:
+                price = parse_price_text(f"{whole_text}.{fraction_text}")
+                if price is not None:
+                    return price
+
+        # Priorité 2: essayer les sélecteurs (incluant a-price-whole seul avec prix complet)
        selectors = self.get_selector("price", [])
        if isinstance(selectors, str):
            selectors = [selectors]
@@ -227,16 +240,6 @@ class AmazonStore(BaseStore):
                if price is not None:
                    return price

-        # Fallback: chercher les spans séparés a-price-whole et a-price-fraction
-        whole = soup.select_one("span.a-price-whole")
-        fraction = soup.select_one("span.a-price-fraction")
-        if whole and fraction:
-            whole_text = whole.get_text(strip=True)
-            fraction_text = fraction.get_text(strip=True)
-            price = parse_price_text(f"{whole_text}.{fraction_text}")
-            if price is not None:
-                return price
-
        debug.errors.append("Prix non trouvé")
        return None

--- a/pricewatch/app/stores/base.py
+++ b/pricewatch/app/stores/base.py
@@ -152,5 +152,32 @@ class BaseStore(ABC):
        """
        return self.selectors.get(key, default)

+    def get_spa_config(self) -> Optional[dict]:
+        """
+        Retourne la configuration SPA pour Playwright si ce store est une SPA.
+
+        Returns:
+            dict avec les options Playwright ou None si pas une SPA:
+            - wait_for_selector: Sélecteur CSS à attendre avant scraping
+            - wait_for_network_idle: Attendre que le réseau soit inactif
+            - extra_wait_ms: Délai supplémentaire après chargement
+
+        Par défaut retourne None (pas de config SPA spécifique).
+        Les stores SPA doivent surcharger cette méthode.
+        """
+        return None
+
+    def requires_playwright(self) -> bool:
+        """
+        Indique si ce store nécessite obligatoirement Playwright.
+
+        Returns:
+            True si Playwright est requis, False sinon
+
+        Par défaut False. Les stores avec anti-bot agressif ou
+        rendu SPA obligatoire doivent surcharger cette méthode.
+        """
+        return False
+
    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} id={self.store_id}>"