chore: sync project files

2026-01-13 19:49:04 +01:00
parent 53f8227941
commit ecda149a4b
149 changed files with 65272 additions and 1 deletions
--- a/pricewatch/app/stores/amazon/__init__.py
+++ b/pricewatch/app/stores/amazon/__init__.py
--- a/pricewatch/app/stores/amazon/__pycache__/__init__.cpython-313.pyc
+++ b/pricewatch/app/stores/amazon/__pycache__/__init__.cpython-313.pyc
--- a/pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc
+++ b/pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc
--- a/pricewatch/app/stores/amazon/fixtures/README.md
+++ b/pricewatch/app/stores/amazon/fixtures/README.md
@@ -0,0 +1,54 @@
+# Fixtures Amazon
+
+Ce dossier contient des fichiers HTML réels capturés depuis Amazon.fr pour les tests.
+
+## Fichiers
+
+### amazon_B0D4DX8PH3.html
+- **Produit**: elago MS1 Station de Charge Compatible avec Le Chargeur MagSafe
+- **ASIN**: B0D4DX8PH3
+- **URL**: https://www.amazon.fr/dp/B0D4DX8PH3
+- **Taille**: ~2.4 MB
+- **Lignes**: 11151
+- **Date capture**: 2026-01-13
+- **Usage**: Test complet parsing avec images, specs, prix
+
+### amazon_B0F6MWNJ6J.html
+- **Produit**: Baseus Docking Station, Nomos Air 12 in 1
+- **ASIN**: B0F6MWNJ6J
+- **URL**: https://www.amazon.fr/dp/B0F6MWNJ6J
+- **Taille**: ~2.3 MB
+- **Lignes**: 11168
+- **Date capture**: 2026-01-13
+- **Usage**: Test complet parsing produit tech complexe
+
+### captcha.html
+- **Contenu**: Page captcha Amazon
+- **Taille**: 5.1 KB
+- **Lignes**: 115
+- **Usage**: Test détection captcha et gestion erreurs
+
+## Utilisation
+
+Les tests utilisent ces fixtures avec pytest:
+
+```python
+@pytest.fixture
+def amazon_fixture_b0d4dx8ph3():
+    fixture_path = Path(__file__).parent.parent / "pricewatch/app/stores/amazon/fixtures/amazon_B0D4DX8PH3.html"
+    with open(fixture_path, "r", encoding="utf-8") as f:
+        return f.read()
+
+def test_parse_real_fixture(store, amazon_fixture_b0d4dx8ph3):
+    url = "https://www.amazon.fr/dp/B0D4DX8PH3"
+    snapshot = store.parse(amazon_fixture_b0d4dx8ph3, url)
+    assert snapshot.reference == "B0D4DX8PH3"
+    assert snapshot.price is not None
+    # ...
+```
+
+## Notes
+
+- Ces fichiers sont de vraies pages HTML capturées ; ils peuvent donc contenir beaucoup de JavaScript et de métadonnées
+- Les tests doivent se concentrer sur l'extraction des données essentielles (titre, prix, ASIN, stock)
+- Ne pas tester les données qui peuvent changer (prix exact, nombre d'avis, etc.) mais plutôt le format
--- a/pricewatch/app/stores/amazon/fixtures/amazon_B0D4DX8PH3.html
+++ b/pricewatch/app/stores/amazon/fixtures/amazon_B0D4DX8PH3.html
--- a/pricewatch/app/stores/amazon/fixtures/amazon_B0F6MWNJ6J.html
+++ b/pricewatch/app/stores/amazon/fixtures/amazon_B0F6MWNJ6J.html
--- a/pricewatch/app/stores/amazon/fixtures/captcha.html
+++ b/pricewatch/app/stores/amazon/fixtures/captcha.html
@@ -0,0 +1,115 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]> <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
+<!--[if IE 7]>    <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
+<!--[if IE 8]>    <html lang="fr" class="a-no-js a-lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!-->
+<html class="a-no-js" lang="fr"><!--<![endif]--><head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+<meta charset="utf-8">
+<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+<title dir="ltr">Amazon.fr</title>
+<meta name="viewport" content="width=device-width">
+<link rel="stylesheet" href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css">
+<script>
+
+if (true === true) {
+    var ue_t0 = (+ new Date()),
+        ue_csm = window,
+        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
+        ue_furl = "fls-eu.amazon.fr",
+        ue_mid = "A13V1IB3VIYZZH",
+        ue_sid = (document.cookie.match(/session-id=([0-9-]+)/) || [])[1],
+        ue_sn = "opfcaptcha.amazon.fr",
+        ue_id = 'V1R3HCVDQ573ZEMZKZQD';
+}
+</script>
+</head>
+<body>
+
+<!--
+        To discuss automated access to Amazon data please contact api-services-support@amazon.com.
+        For information about migrating to our APIs refer to our Marketplace APIs at https://developer.amazonservices.fr/ref=rm_c_sv, or our Product Advertising API at https://partenaires.amazon.fr/gp/advertising/api/detail/main.html/ref=rm_c_ac for advertising use cases.
+-->
+
+<!--
+Correios.DoNotSend
+-->
+
+<div class="a-container a-padding-double-large" style="min-width:350px;padding:44px 0 !important">
+
+    <div class="a-row a-spacing-double-large" style="width: 350px; margin: 0 auto">
+
+        <div class="a-row a-spacing-medium a-text-center"><i class="a-icon a-logo" alt="Logo d'Amazon"></i></div>
+
+        <div class="a-box a-alert a-alert-info a-spacing-base">
+            <div class="a-box-inner">
+                <i class="a-icon a-icon-alert" alt="Icône d'alerte"></i>
+                <h4>Cliquez sur le bouton ci-dessous pour continuer vos achats</h4>
+                </div>
+            </div>
+
+            <div class="a-section">
+
+                <div class="a-box a-color-offset-background">
+                    <div class="a-box-inner a-padding-extra-large">
+
+                        <form method="get" action="/errors/validateCaptcha" name="">
+                            <input type=hidden name="amzn" value="2W5U2H7MWJXqdgImnmg0CQ==" /><input type=hidden name="amzn-r" value="&#047;dp&#047;B0DFWRHZ7L" />
+                            <input type=hidden name="field-keywords" value="ELFGJB" />
+                            <div class="a-section a-spacing-extra-large">
+
+                                <div class="a-row">
+                                    <span class="a-button a-button-primary a-span12">
+                                        <span class="a-button-inner">
+                                            <button type="submit" class="a-button-text" alt="Continuer les achats">Continuer les achats</button>
+                                        </span>
+                                    </span>
+                                </div>
+
+                            </div>
+                        </form>
+
+                    </div>
+                </div>
+
+            </div>
+
+        </div>
+
+        <div class="a-divider a-divider-section"><div class="a-divider-inner"></div></div>
+
+        <div class="a-text-center a-spacing-small a-size-mini">
+            <a href="https://www.amazon.fr/gp/help/customer/display.html/ref=footer_cou?ie=UTF8&nodeId=548524">Conditions générales de vente</a>
+            <span class="a-letter-space"></span>
+            <span class="a-letter-space"></span>
+            <span class="a-letter-space"></span>
+            <span class="a-letter-space"></span>
+            <a href="https://www.amazon.fr/gp/help/customer/display.html/ref=footer_privacy?ie=UTF8&nodeId=3329781">Vos informations personnelles</a>
+        </div>
+
+        <div class="a-text-center a-size-mini a-color-base">
+          &copy; 1996-2025, Amazon.com, Inc. ou ses filiales.
+          <script>
+           if (true === true) {
+             document.write('<img src="https://fls-eu.amaz'+'on.fr/'+'1/oc-csi/1/OP/requestId=V1R3HCVDQ573ZEMZKZQD&js=1" alt=""/>');
+           };
+          </script>
+          <noscript>
+            <img src="https://fls-eu.amazon.fr/1/oc-csi/1/OP/requestId=V1R3HCVDQ573ZEMZKZQD&js=0" alt=""/>
+          </noscript>
+        </div>
+    </div>
+    <script>
+    if (true === true) {
+        var head = document.getElementsByTagName('head')[0],
+            prefix = "https://images-eu.ssl-images-amazon.com/images/G/01/csminstrumentation/",
+            elem = document.createElement("script");
+        elem.src = prefix + "csm-captcha-instrumentation.min.js";
+        head.appendChild(elem);
+
+        elem = document.createElement("script");
+        elem.src = prefix + "rd-script-6d68177fa6061598e9509dc4b5bdd08d.js";
+        head.appendChild(elem);
+    }
+    </script>
+</body></html>
--- a/pricewatch/app/stores/amazon/selectors.yml
+++ b/pricewatch/app/stores/amazon/selectors.yml
@@ -0,0 +1,69 @@
+# CSS/XPath selectors for Amazon
+# These selectors must be adjusted as the site evolves
+
+# Product title
+title:
+  - "#productTitle"
+  - "#title"
+  - "h1.product-title"
+
+# Main price
+price:
+  - "span.a-price-whole"
+  - ".a-price .a-offscreen"
+  - "#priceblock_ourprice"
+  - "#priceblock_dealprice"
+  - ".a-price-range .a-price .a-offscreen"
+
+# Currency (usually carried by the symbol element)
+currency:
+  - "span.a-price-symbol"
+  - ".a-price-symbol"
+
+# Shipping cost
+shipping_cost:
+  - "#ourprice_shippingmessage"
+  - "#price-shipping-message"
+  - "#deliveryMessageMirId"
+
+# Stock status
+stock_status:
+  - "#availability span"
+  - "#availability"
+  - ".a-declarative .a-size-medium"
+
+# Product images
+images:
+  - "#landingImage"
+  - "#imgBlkFront"
+  - ".a-dynamic-image"
+  - "#main-image"
+
+# Category / breadcrumb
+category:
+  - "#wayfinding-breadcrumbs_feature_div"
+  - ".a-breadcrumb"
+
+# Technical characteristics (specs table)
+specs_table:
+  - "#productDetails_techSpec_section_1"
+  - "#productDetails_detailBullets_sections1"
+  - ".prodDetTable"
+  - "#product-specification-table"
+
+# ASIN (sometimes found in the page metadata)
+# NOTE(review): `:contains()` is a non-standard pseudo-class; recent soupsieve
+# releases appear to deprecate it in favour of `:-soup-contains()` — confirm
+# against the pinned soupsieve version before changing the selectors.
+asin:
+  - "input[name='ASIN']"
+  - "th:contains('ASIN') + td"
+
+# Captcha / robot-check indicators
+captcha_indicators:
+  - "form[action*='validateCaptcha']"
+  - "p.a-last:contains('Sorry')"
+  - "img[alt*='captcha']"
+
+# Parsing notes:
+# - Amazon changes its selectors frequently
+# - Several fallbacks are provided for each field
+# - Parsing must try every selector, in order
+# - On failure, mark the field as null in ProductSnapshot
--- a/pricewatch/app/stores/amazon/store.py
+++ b/pricewatch/app/stores/amazon/store.py
@@ -0,0 +1,330 @@
+"""
+Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
+
+Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
+"""
+
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+
+from pricewatch.app.core.logging import get_logger
+from pricewatch.app.core.schema import (
+    DebugInfo,
+    DebugStatus,
+    FetchMethod,
+    ProductSnapshot,
+    StockStatus,
+)
+from pricewatch.app.stores.base import BaseStore
+
+logger = get_logger("stores.amazon")
+
+
+class AmazonStore(BaseStore):
+    """Store pour Amazon.fr et Amazon.com."""
+
+    def __init__(self):
+        """Initialise le store Amazon avec ses sélecteurs."""
+        selectors_path = Path(__file__).parent / "selectors.yml"
+        super().__init__(store_id="amazon", selectors_path=selectors_path)
+
+    def match(self, url: str) -> float:
+        """
+        Détecte si l'URL est Amazon.
+
+        Returns:
+            0.9 pour amazon.fr
+            0.8 pour amazon.com et autres domaines amazon
+            0.0 sinon
+        """
+        if not url:
+            return 0.0
+
+        url_lower = url.lower()
+
+        if "amazon.fr" in url_lower:
+            return 0.9
+        elif "amazon.com" in url_lower or "amazon.co" in url_lower:
+            return 0.8
+        elif "amazon." in url_lower:
+            return 0.7
+
+        return 0.0
+
+    def canonicalize(self, url: str) -> str:
+        """
+        Normalise l'URL Amazon vers /dp/{ASIN}.
+
+        Exemples:
+            https://www.amazon.fr/product-name/dp/B08N5WRWNW/ref=...
+            → https://www.amazon.fr/dp/B08N5WRWNW
+
+        Justification: L'ASIN est l'identifiant unique, le reste est superflu.
+        """
+        if not url:
+            return url
+
+        # Extraire l'ASIN
+        asin = self.extract_reference(url)
+        if not asin:
+            # Si pas d'ASIN trouvé, retourner l'URL sans query params
+            parsed = urlparse(url)
+            return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+
+        # Reconstruire l'URL canonique
+        parsed = urlparse(url)
+        return f"{parsed.scheme}://{parsed.netloc}/dp/{asin}"
+
+    def extract_reference(self, url: str) -> Optional[str]:
+        """
+        Extrait l'ASIN depuis l'URL.
+
+        L'ASIN est généralement après /dp/ ou /gp/product/.
+        L'ASIN doit avoir exactement 10 caractères alphanumériques.
+
+        Exemples:
+            /dp/B08N5WRWNW → B08N5WRWNW
+            /gp/product/B08N5WRWNW → B08N5WRWNW
+        """
+        if not url:
+            return None
+
+        # Pattern: /dp/{ASIN} ou /gp/product/{ASIN}
+        # L'ASIN doit être suivi de /, ?, #, ou fin de string
+        match = re.search(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:/|\?|#|$)", url)
+        if match:
+            return match.group(1)
+
+        return None
+
+    def parse(self, html: str, url: str) -> ProductSnapshot:
+        """
+        Parse le HTML Amazon vers ProductSnapshot.
+
+        Utilise BeautifulSoup et les sélecteurs du fichier YAML.
+        """
+        soup = BeautifulSoup(html, "lxml")
+
+        debug_info = DebugInfo(
+            method=FetchMethod.HTTP,  # Sera mis à jour par l'appelant
+            status=DebugStatus.SUCCESS,
+            errors=[],
+            notes=[],
+        )
+
+        # Vérifier si captcha/robot check
+        if self._detect_captcha(soup):
+            debug_info.errors.append("Captcha ou robot check détecté")
+            debug_info.status = DebugStatus.FAILED
+            logger.warning(f"[Amazon] Captcha détecté pour: {url}")
+
+        # Extraction des champs
+        title = self._extract_title(soup, debug_info)
+        price = self._extract_price(soup, debug_info)
+        currency = self._extract_currency(soup, debug_info)
+        stock_status = self._extract_stock(soup, debug_info)
+        images = self._extract_images(soup, debug_info)
+        category = self._extract_category(soup, debug_info)
+        specs = self._extract_specs(soup, debug_info)
+        reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
+
+        # Déterminer le statut final (ne pas écraser FAILED)
+        if debug_info.status != DebugStatus.FAILED:
+            if not title or price is None:
+                debug_info.status = DebugStatus.PARTIAL
+                debug_info.notes.append("Parsing incomplet: titre ou prix manquant")
+
+        snapshot = ProductSnapshot(
+            source=self.store_id,
+            url=self.canonicalize(url),
+            fetched_at=datetime.now(),
+            title=title,
+            price=price,
+            currency=currency or "EUR",
+            shipping_cost=None,  # Difficile à extraire
+            stock_status=stock_status,
+            reference=reference,
+            category=category,
+            images=images,
+            specs=specs,
+            debug=debug_info,
+        )
+
+        logger.info(
+            f"[Amazon] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
+            f"title={bool(title)}, price={price is not None}"
+        )
+
+        return snapshot
+
+    def _detect_captcha(self, soup: BeautifulSoup) -> bool:
+        """Détecte si la page contient un captcha/robot check."""
+        captcha_selectors = self.get_selector("captcha_indicators", [])
+        if isinstance(captcha_selectors, str):
+            captcha_selectors = [captcha_selectors]
+
+        for selector in captcha_selectors:
+            if soup.select(selector):
+                return True
+
+        # Vérifier dans le texte
+        text = soup.get_text().lower()
+        if "captcha" in text or "robot check" in text or "sorry" in text:
+            return True
+
+        return False
+
+    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Extrait le titre du produit."""
+        selectors = self.get_selector("title", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element:
+                title = element.get_text(strip=True)
+                if title:
+                    return title
+
+        debug.errors.append("Titre non trouvé")
+        return None
+
+    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
+        """Extrait le prix."""
+        selectors = self.get_selector("price", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            elements = soup.select(selector)
+            for element in elements:
+                text = element.get_text(strip=True)
+                # Extraire nombre (format: "299,99" ou "299.99")
+                match = re.search(r"(\d+)[.,](\d+)", text)
+                if match:
+                    price_str = f"{match.group(1)}.{match.group(2)}"
+                    try:
+                        return float(price_str)
+                    except ValueError:
+                        continue
+
+        debug.errors.append("Prix non trouvé")
+        return None
+
+    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Extrait la devise."""
+        selectors = self.get_selector("currency", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element:
+                symbol = element.get_text(strip=True)
+                # Mapper symboles vers codes ISO
+                currency_map = {"€": "EUR", "$": "USD", "£": "GBP"}
+                return currency_map.get(symbol, "EUR")
+
+        # Défaut basé sur le domaine
+        return "EUR"
+
+    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
+        """Extrait le statut de stock."""
+        selectors = self.get_selector("stock_status", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element:
+                text = element.get_text(strip=True).lower()
+                if "en stock" in text or "available" in text or "in stock" in text:
+                    return StockStatus.IN_STOCK
+                elif (
+                    "rupture" in text
+                    or "indisponible" in text
+                    or "out of stock" in text
+                ):
+                    return StockStatus.OUT_OF_STOCK
+
+        return StockStatus.UNKNOWN
+
+    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
+        """Extrait les URLs d'images."""
+        images = []
+        selectors = self.get_selector("images", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            elements = soup.select(selector)
+            for element in elements:
+                # Attribut src ou data-src
+                url = element.get("src") or element.get("data-src")
+                if url and url.startswith("http"):
+                    images.append(url)
+
+        return list(set(images))  # Dédupliquer
+
+    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Extrait la catégorie depuis les breadcrumbs."""
+        selectors = self.get_selector("category", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element:
+                # Prendre le dernier élément du breadcrumb
+                links = element.select("a")
+                if links:
+                    return links[-1].get_text(strip=True)
+
+        return None
+
+    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
+        """Extrait les caractéristiques techniques."""
+        specs = {}
+        selectors = self.get_selector("specs_table", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            table = soup.select_one(selector)
+            if table:
+                # Parser table <th>/<td>
+                rows = table.select("tr")
+                for row in rows:
+                    th = row.select_one("th")
+                    td = row.select_one("td")
+                    if th and td:
+                        key = th.get_text(strip=True)
+                        value = td.get_text(strip=True)
+                        if key and value:
+                            specs[key] = value
+
+        return specs
+
+    def _extract_asin_from_html(self, soup: BeautifulSoup) -> Optional[str]:
+        """Extrait l'ASIN depuis le HTML (fallback)."""
+        selectors = self.get_selector("asin", [])
+        if isinstance(selectors, str):
+            selectors = [selectors]
+
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element:
+                # Input avec attribut value
+                if element.name == "input":
+                    return element.get("value")
+                # TD dans une table
+                else:
+                    return element.get_text(strip=True)
+
+        return None