chore: sync project files

2026-01-13 19:49:04 +01:00
parent 53f8227941
commit ecda149a4b
149 changed files with 65272 additions and 1 deletions
@@ -0,0 +1,5 @@
+"""Store AliExpress."""
+
+from pricewatch.app.stores.aliexpress.store import AliexpressStore
+
+__all__ = ["AliexpressStore"]
@@ -0,0 +1,163 @@
+# Fixtures AliExpress
+
+Ce dossier contient des fichiers HTML réels capturés depuis AliExpress pour les tests.
+
+## ⚠️ Note importante sur AliExpress
+
+AliExpress utilise un **rendu client-side (SPA React/Vue)**:
+- HTTP simple retourne **HTML minimal** (75KB sans contenu)
+- **Playwright est OBLIGATOIRE** avec attente (~3-5s)
+- Attendre le sélecteur `.product-title` pour obtenir les données
+- Données chargées via **AJAX** après le render initial
+
+## Spécificité AliExpress
+
+AliExpress est un **marketplace chinois** avec des particularités:
+- **Pas de JSON-LD** schema.org
+- **Prix**: Extrait par **regex** (aucun sélecteur CSS stable)
+- **Images**: Extraites depuis `window._d_c_.DCData.imagePathList` (JSON embarqué)
+- **Classes CSS**: Générées aléatoirement (hachées) → **TRÈS instables**
+- **SKU**: ID numérique long (16 chiffres dans l'exemple ci-dessous) depuis l'URL
+
+## Fichiers
+
+### aliexpress_1005007187023722.html
+- **Produit**: Samsung serveur DDR4 mémoire Ram ECC
+- **SKU**: 1005007187023722
+- **URL**: https://fr.aliexpress.com/item/1005007187023722.html
+- **Taille**: 378 KB (rendu complet)
+- **Date capture**: 2026-01-13
+- **Méthode**: Playwright avec wait_for_selector='.product-title'
+- **Prix capturé**: 136,69 EUR
+- **Usage**: Test complet parsing produit électronique
+
+## Structure HTML AliExpress
+
+### JSON-LD Schema.org ✗
+AliExpress **n'utilise PAS** JSON-LD (contrairement à Backmarket).
+
+### Données embarquées ✓
+AliExpress embarque les données dans des variables JavaScript:
+
+```javascript
+window._d_c_.DCData = {
+  "imagePathList": ["https://ae01.alicdn.com/kf/..."],
+  "summImagePathList": ["https://ae01.alicdn.com/kf/..."],
+  "i18nMap": {...},
+  "extParams": {...}
+}
+```
+
+### Sélecteurs identifiés
+
+#### Titre
+```css
+h1                         /* Apparaît après AJAX */
+meta[property="og:title"]  /* Fallback dans meta tags */
+```
+Le h1 n'existe PAS dans le HTML initial, il est ajouté dynamiquement.
+
+#### Prix
+⚠️ **AUCUN SÉLECTEUR CSS STABLE** - Utiliser regex:
+```regex
+([0-9]+[.,][0-9]{2})\s*€    /* Prix avant € */
+€\s*([0-9]+[.,][0-9]{2})    /* € avant prix */
+```
+
+#### Images
+Priorité: **window._d_c_.DCData.imagePathList**
+Fallback: `meta[property="og:image"]`
+
+URLs CDN: `https://ae01.alicdn.com/kf/...`
+
+#### SKU
+Extraction depuis l'URL:
+```regex
+/item/(\d+)\.html
+```
+Exemple: `/item/1005007187023722.html` → SKU = "1005007187023722"
+
+#### Stock
+Chercher bouton "Add to cart" / "Ajouter au panier"
+```css
+button[class*='add-to-cart']
+```
+
+## Comparaison avec autres stores
+
+| Aspect | Amazon | Cdiscount | Backmarket | **AliExpress** |
+|--------|--------|-----------|------------|----------------|
+| **Anti-bot** | Faible | Fort | Fort | Moyen |
+| **Méthode** | HTTP OK | Playwright | Playwright | **Playwright** |
+| **JSON-LD** | Partiel | ✗ Non | ✓ Oui | **✗ Non** |
+| **Sélecteurs** | Stables (IDs) | Instables | Stables | **Très instables** |
+| **SKU format** | `/dp/{ASIN}` | `/f-{cat}-{SKU}` | `/p/{slug}` | **/item/{ID}.html** |
+| **Prix extraction** | CSS | CSS/Regex | JSON-LD | **Regex uniquement** |
+| **Rendu** | Server-side | Server-side | Server-side | **Client-side (SPA)** |
+| **Particularité** | - | Prix dynamiques | Reconditionné | **SPA React/Vue** |
+
+## Utilisation dans les tests
+
+```python
+@pytest.fixture
+def aliexpress_fixture_samsung():
+    fixture_path = Path(__file__).parent.parent.parent / \
+        "pricewatch/app/stores/aliexpress/fixtures/aliexpress_1005007187023722.html"
+    with open(fixture_path, "r", encoding="utf-8") as f:
+        return f.read()
+
+def test_parse_real_fixture(store, aliexpress_fixture_samsung):
+    url = "https://fr.aliexpress.com/item/1005007187023722.html"
+    snapshot = store.parse(aliexpress_fixture_samsung, url)
+
+    assert snapshot.title.startswith("Samsung serveur DDR4")
+    assert snapshot.price == 136.69
+    assert snapshot.reference == "1005007187023722"
+    assert snapshot.currency == "EUR"
+    assert len(snapshot.images) >= 6
+```
+
+## Points d'attention pour les tests
+
+1. **HTML volumineux** - 378KB pour une page (SPA chargée)
+2. **Prix instable** - Peut changer selon promo/devise
+3. **Ne pas tester le prix exact sur une page live** - Tester le format et la présence ; seul le prix de la fixture statique (136,69 EUR) est déterministe
+4. **Images multiples** - Toujours 6+ images par produit
+5. **Titre long** - Souvent 100-150 caractères
+6. **Stock variable** - Peut changer rapidement
+
+## Comment capturer une nouvelle fixture
+
+```python
+from pricewatch.app.scraping.pw_fetch import fetch_playwright
+
+url = "https://fr.aliexpress.com/item/..."
+result = fetch_playwright(
+    url,
+    headless=True,
+    timeout_ms=15000,
+    wait_for_selector=".product-title"  # IMPORTANT!
+)
+
+if result.success:
+    with open("fixture.html", "w", encoding="utf-8") as f:
+        f.write(result.html)
+```
+
+⚠️ **N'utilisez PAS** `fetch_http()` - il retourne un HTML minimal (75KB)!
+⚠️ **Utilisez TOUJOURS** `wait_for_selector=".product-title"` avec Playwright!
+
+## Avantages de AliExpress
+
+✓ **HTTP fonctionne** → Pas d'anti-bot fort (mais le HTML retourné est minimal, sans données produit)
+✓ **Données embarquées** → DCData JSON avec images
+✓ **SKU simple** → ID numérique depuis URL
+
+## Inconvénients
+
+✗ **SPA client-side** → Playwright obligatoire avec wait (~3-5s)
+✗ **Pas de JSON-LD** → Extraction moins fiable
+✗ **Prix par regex** → Fragile, peut casser
+✗ **Classes CSS instables** → Générées aléatoirement (hachées)
+✗ **Temps de chargement** → 3-5s avec Playwright + wait
+✗ **Specs mal structurées** → Souvent dans des onglets/modals
@@ -0,0 +1,79 @@
+# Sélecteurs CSS/XPath pour AliExpress.com
+# Mis à jour le 2026-01-13 après analyse du HTML réel
+
+# ⚠️ IMPORTANT: AliExpress utilise un rendu client-side (SPA React/Vue)
+# - HTTP fonctionne mais retourne un HTML minimal (75KB)
+# - Playwright OBLIGATOIRE pour obtenir le contenu rendu
+# - Attendre le sélecteur '.product-title' ou ajouter un délai (~3-5s)
+# - Les données sont chargées dynamiquement via AJAX
+
+# ⚠️ Extraction prioritaire:
+# 1. Titre: h1 ou meta[property="og:title"]
+# 2. Prix: Regex dans le HTML (aucun sélecteur stable)
+# 3. Images: window._d_c_.DCData.imagePathList (JSON embarqué)
+# 4. SKU: Depuis l'URL /item/{ID}.html
+
+# Titre du produit
+# Le h1 apparaît après chargement AJAX
+title:
+  - "h1"
+  - "meta[property='og:title']"  # Fallback dans meta tags
+
+# Prix principal
+# ⚠️ AUCUN SÉLECTEUR STABLE - Utiliser regex sur le HTML
+# Pattern: ([0-9]+[.,][0-9]{2})\s*€ ou €\s*([0-9]+[.,][0-9]{2})
+price:
+  - "span[class*='price']"
+  - "div[class*='price']"
+  - "span.product-price"
+  # Ces sélecteurs ne fonctionnent PAS - prix extrait par regex
+
+# Devise
+# Toujours EUR pour fr.aliexpress.com
+currency:
+  - "meta[property='og:price:currency']"
+  # Fallback: détecter depuis l'URL (fr = EUR)
+
+# Images produit
+# ⚠️ Les images sont dans window._d_c_.DCData.imagePathList
+# Format: https://ae01.alicdn.com/kf/{hash}.jpg
+images:
+  - "img[alt]"
+  # Extraction depuis DCData JSON plus fiable
+
+# Catégorie / breadcrumb
+category:
+  - "nav[aria-label='breadcrumb'] a"
+  - ".breadcrumb a"
+
+# Caractéristiques techniques
+# Peuvent être dans des onglets ou sections dépliables
+specs_table:
+  - "div[class*='specification']"
+  - "div[class*='properties']"
+  - "dl"
+
+# SKU / référence produit
+# Extraction depuis l'URL plus fiable
+# URL pattern: /item/{ID}.html
+# SKU = ID numérique (ex. 1005007187023722, soit 16 chiffres)
+sku:
+  - "meta[property='product:retailer_item_id']"
+  - "span[data-spm-anchor-id]"
+
+# Stock / Disponibilité
+stock_status:
+  - "button[class*='add-to-cart']"
+  - "button[class*='addtocart']"
+  - "div[class*='availability']"
+
+# Notes importantes:
+# 1. ⚠️ Playwright OBLIGATOIRE avec wait - HTML minimal sinon
+# 2. Attendre le sélecteur '.product-title' avant de parser
+# 3. Prix: REGEX obligatoire - aucun sélecteur CSS stable
+# 4. Images: Extraire depuis window._d_c_.DCData (JSON)
+# 5. SKU: Extraire depuis URL /item/{ID}.html → ID = SKU
+# 6. Devise: EUR pour France (fr.aliexpress.com)
+# 7. Classes CSS générées aléatoirement (hachées) - TRÈS INSTABLES
+# 8. Pas de JSON-LD schema.org disponible
+# 9. Temps de chargement: ~3-5s avec Playwright + wait
@@ -0,0 +1,350 @@
+"""
+Store AliExpress - Parsing de produits AliExpress.com.
+
+Supporte l'extraction de: titre, prix, SKU, images, etc.
+Spécificité: Rendu client-side (SPA) - nécessite Playwright avec attente.
+"""
+
+import json
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+
+from pricewatch.app.core.logging import get_logger
+from pricewatch.app.core.schema import (
+    DebugInfo,
+    DebugStatus,
+    FetchMethod,
+    ProductSnapshot,
+    StockStatus,
+)
+from pricewatch.app.stores.base import BaseStore
+
+# Module-level logger shared by every method of the AliExpress store.
+logger = get_logger("stores.aliexpress")
+
+
+class AliexpressStore(BaseStore):
+    """Store pour AliExpress.com (marketplace chinois)."""
+
+    def __init__(self):
+        """Initialise le store AliExpress avec ses sélecteurs."""
+        selectors_path = Path(__file__).parent / "selectors.yml"
+        super().__init__(store_id="aliexpress", selectors_path=selectors_path)
+
+    def match(self, url: str) -> float:
+        """
+        Détecte si l'URL est AliExpress.
+
+        Returns:
+            0.9 pour aliexpress.com/aliexpress.fr
+            0.0 sinon
+        """
+        if not url:
+            return 0.0
+
+        url_lower = url.lower()
+
+        if "aliexpress.com" in url_lower or "aliexpress.fr" in url_lower:
+            # Vérifier que c'est bien une page produit
+            if "/item/" in url_lower:
+                return 0.9
+            else:
+                return 0.5  # C'est AliExpress mais pas une page produit
+
+        return 0.0
+
+    def canonicalize(self, url: str) -> str:
+        """
+        Normalise l'URL AliExpress.
+
+        Les URLs AliExpress ont généralement la forme:
+        https://fr.aliexpress.com/item/{ID}.html?params...
+
+        On garde juste: https://fr.aliexpress.com/item/{ID}.html
+        """
+        if not url:
+            return url
+
+        parsed = urlparse(url)
+
+        # Extraire le path de base (sans query params)
+        path = parsed.path
+
+        # Garder seulement /item/{ID}.html
+        match = re.search(r"(/item/\d+\.html)", path)
+        if match:
+            clean_path = match.group(1)
+            return f"{parsed.scheme}://{parsed.netloc}{clean_path}"
+
+        # Si le pattern ne matche pas, retirer juste query params
+        return f"{parsed.scheme}://{parsed.netloc}{path}"
+
+    def extract_reference(self, url: str) -> Optional[str]:
+        """
+        Extrait le SKU (Product ID) depuis l'URL.
+
+        Format typique: /item/{ID}.html
+        Exemple: /item/1005007187023722.html → "1005007187023722"
+        """
+        if not url:
+            return None
+
+        # Pattern: /item/{ID}.html
+        match = re.search(r"/item/(\d+)\.html", url, re.IGNORECASE)
+        if match:
+            return match.group(1)
+
+        return None
+
+    def parse(self, html: str, url: str) -> ProductSnapshot:
+        """
+        Parse le HTML AliExpress vers ProductSnapshot.
+
+        AliExpress utilise un rendu client-side (SPA), donc:
+        - Extraction prioritaire depuis meta tags (og:title, og:image)
+        - Prix extrait par regex (pas de sélecteur stable)
+        - Images extraites depuis window._d_c_.DCData JSON
+        """
+        soup = BeautifulSoup(html, "lxml")
+
+        debug_info = DebugInfo(
+            method=FetchMethod.HTTP,  # Sera mis à jour par l'appelant
+            status=DebugStatus.SUCCESS,
+            errors=[],
+            notes=[],
+        )
+
+        # Extraction des champs
+        title = self._extract_title(soup, debug_info)
+        price = self._extract_price(html, soup, debug_info)
+        currency = self._extract_currency(url, soup, debug_info)
+        stock_status = self._extract_stock(soup, debug_info)
+        images = self._extract_images(html, soup, debug_info)
+        category = self._extract_category(soup, debug_info)
+        specs = self._extract_specs(soup, debug_info)
+        reference = self.extract_reference(url)
+
+        # Note sur le rendu client-side
+        if len(html) < 200000:  # HTML trop petit = pas de rendu complet
+            debug_info.notes.append(
+                "HTML court (<200KB) - possiblement non rendu. Utiliser Playwright avec wait."
+            )
+
+        # Déterminer le statut final
+        if not title or price is None:
+            debug_info.status = DebugStatus.PARTIAL
+            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")
+
+        snapshot = ProductSnapshot(
+            source=self.store_id,
+            url=self.canonicalize(url),
+            fetched_at=datetime.now(),
+            title=title,
+            price=price,
+            currency=currency,
+            shipping_cost=None,
+            stock_status=stock_status,
+            reference=reference,
+            category=category,
+            images=images,
+            specs=specs,
+            debug=debug_info,
+        )
+
+        logger.info(
+            f"[AliExpress] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
+            f"title={bool(title)}, price={price is not None}"
+        )
+
+        return snapshot
+
+    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Extrait le titre du produit."""
+        # Priorité 1: h1 (apparaît après rendu AJAX)
+        h1 = soup.find("h1")
+        if h1:
+            title = h1.get_text(strip=True)
+            if title and len(title) > 10:  # Titre valide
+                return title
+
+        # Priorité 2: og:title (dans meta tags)
+        og_title = soup.find("meta", property="og:title")
+        if og_title:
+            title = og_title.get("content", "")
+            if title:
+                # Nettoyer " - AliExpress" à la fin
+                title = re.sub(r"\s*-\s*AliExpress.*$", "", title)
+                return title.strip()
+
+        debug.errors.append("Titre non trouvé")
+        return None
+
+    def _extract_price(
+        self, html: str, soup: BeautifulSoup, debug: DebugInfo
+    ) -> Optional[float]:
+        """
+        Extrait le prix.
+
+        AliExpress n'a PAS de sélecteur CSS stable pour le prix.
+        On utilise regex sur le HTML brut.
+        """
+        # Pattern 1: Prix avant € (ex: "136,69 €")
+        match = re.search(r"([0-9]+[.,][0-9]{2})\s*€", html)
+        if match:
+            price_str = match.group(1).replace(",", ".")
+            try:
+                return float(price_str)
+            except ValueError:
+                pass
+
+        # Pattern 2: € avant prix (ex: "€ 136.69")
+        match = re.search(r"€\s*([0-9]+[.,][0-9]{2})", html)
+        if match:
+            price_str = match.group(1).replace(",", ".")
+            try:
+                return float(price_str)
+            except ValueError:
+                pass
+
+        # Pattern 3: Chercher dans meta tags (moins fiable)
+        og_price = soup.find("meta", property="og:price:amount")
+        if og_price:
+            price_str = og_price.get("content", "")
+            try:
+                return float(price_str)
+            except ValueError:
+                pass
+
+        debug.errors.append("Prix non trouvé")
+        return None
+
+    def _extract_currency(
+        self, url: str, soup: BeautifulSoup, debug: DebugInfo
+    ) -> str:
+        """Extrait la devise."""
+        # Priorité 1: og:price:currency
+        og_currency = soup.find("meta", property="og:price:currency")
+        if og_currency:
+            currency = og_currency.get("content", "")
+            if currency:
+                return currency.upper()
+
+        # Priorité 2: Détecter depuis l'URL
+        if "fr.aliexpress" in url.lower():
+            return "EUR"
+        elif "aliexpress.com" in url.lower():
+            return "USD"
+
+        # Défaut
+        return "EUR"
+
+    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
+        """Extrait le statut de stock."""
+        # Chercher le bouton "Add to cart" / "Ajouter au panier"
+        buttons = soup.find_all("button")
+        for btn in buttons:
+            text = btn.get_text(strip=True).lower()
+            if any(
+                keyword in text
+                for keyword in ["add to cart", "ajouter", "buy now", "acheter"]
+            ):
+                # Bouton trouvé et pas disabled
+                if not btn.get("disabled"):
+                    return StockStatus.IN_STOCK
+
+        # Fallback: chercher texte indiquant la disponibilité
+        text_lower = soup.get_text().lower()
+        if "out of stock" in text_lower or "rupture" in text_lower:
+            return StockStatus.OUT_OF_STOCK
+
+        return StockStatus.UNKNOWN
+
+    def _extract_images(
+        self, html: str, soup: BeautifulSoup, debug: DebugInfo
+    ) -> list[str]:
+        """
+        Extrait les URLs d'images.
+
+        Priorité: window._d_c_.DCData.imagePathList (JSON embarqué)
+        """
+        images = []
+
+        # Priorité 1: Extraire depuis DCData JSON
+        match = re.search(
+            r"window\._d_c_\.DCData\s*=\s*(\{[^;]*\});", html, re.DOTALL
+        )
+        if match:
+            try:
+                data = json.loads(match.group(1))
+                if "imagePathList" in data:
+                    image_list = data["imagePathList"]
+                    if isinstance(image_list, list):
+                        images.extend(image_list)
+                        debug.notes.append(
+                            f"Images extraites depuis DCData: {len(images)}"
+                        )
+            except (json.JSONDecodeError, KeyError):
+                pass
+
+        # Priorité 2: og:image
+        if not images:
+            og_image = soup.find("meta", property="og:image")
+            if og_image:
+                img_url = og_image.get("content", "")
+                if img_url:
+                    images.append(img_url)
+
+        # Priorité 3: Chercher dans les <img> avec alicdn.com
+        if not images:
+            img_elems = soup.find_all("img", src=True)
+            for img in img_elems:
+                src = img.get("src", "")
+                if "alicdn.com" in src and not any(
+                    x in src for x in ["logo", "icon", "avatar"]
+                ):
+                    if src not in images:
+                        images.append(src)
+
+        return images
+
+    def _extract_category(
+        self, soup: BeautifulSoup, debug: DebugInfo
+    ) -> Optional[str]:
+        """Extrait la catégorie depuis le breadcrumb."""
+        selectors = self.get_selector("category", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            elements = soup.select(selector)
+            if elements:
+                # Prendre le dernier élément du breadcrumb
+                categories = [
+                    elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)
+                ]
+                if categories:
+                    return categories[-1]
+
+        return None
+
+    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
+        """Extrait les caractéristiques techniques."""
+        specs = {}
+
+        # Chercher les dl (definition lists)
+        dls = soup.find_all("dl")
+        for dl in dls:
+            dts = dl.find_all("dt")
+            dds = dl.find_all("dd")
+
+            for dt, dd in zip(dts, dds):
+                key = dt.get_text(strip=True)
+                value = dd.get_text(strip=True)
+                if key and value:
+                    specs[key] = value
+
+        return specs