codex2

2026-01-14 21:54:55 +01:00
parent c91c0f1fc9
commit d0b73b9319
140 changed files with 5822 additions and 161 deletions
@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
    StockStatus,
 )
 from pricewatch.app.stores.base import BaseStore
+from pricewatch.app.stores.price_parser import parse_price_text

 logger = get_logger("stores.aliexpress")

@@ -126,6 +127,8 @@ class AliexpressStore(BaseStore):
        images = self._extract_images(html, soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
+        description = self._extract_description(soup, debug_info)
+        msrp = self._extract_msrp(html, debug_info)
        reference = self.extract_reference(url)

        # Note sur le rendu client-side
@@ -150,8 +153,10 @@ class AliexpressStore(BaseStore):
            stock_status=stock_status,
            reference=reference,
            category=category,
+            description=description,
            images=images,
            specs=specs,
+            msrp=msrp,
            debug=debug_info,
        )

@@ -183,6 +188,17 @@ class AliexpressStore(BaseStore):
        debug.errors.append("Titre non trouvé")
        return None

+    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Return the product description taken from the page's meta tags.
+
+        ``og:description`` is tried first, then ``<meta name="description">``.
+        The second tag is used both when the first one is absent and when its
+        ``content`` is missing, empty or whitespace-only.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The stripped description, or ``None`` when neither tag carries text.
+        """
+        queries = (
+            {"property": "og:description"},
+            {"attrs": {"name": "description"}},
+        )
+        for query in queries:
+            meta = soup.find("meta", **query)
+            content = meta.get("content") if meta is not None else None
+            # An empty first tag must not hide a usable second one.
+            if isinstance(content, str) and content.strip():
+                return content.strip()
+        return None
+
    def _extract_price(
        self, html: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> Optional[float]:
@@ -193,35 +209,39 @@ class AliexpressStore(BaseStore):
        On utilise regex sur le HTML brut.
        """
        # Pattern 1: Prix avant € (ex: "136,69 €")
-        match = re.search(r"([0-9]+[.,][0-9]{2})\s*€", html)
+        match = re.search(r"([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)\\s*€", html)
        if match:
-            price_str = match.group(1).replace(",", ".")
-            try:
-                return float(price_str)
-            except ValueError:
-                pass
+            price = parse_price_text(match.group(1))
+            if price is not None:
+                return price

        # Pattern 2: € avant prix (ex: "€ 136.69")
-        match = re.search(r"€\s*([0-9]+[.,][0-9]{2})", html)
+        match = re.search(r"€\\s*([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)", html)
        if match:
-            price_str = match.group(1).replace(",", ".")
-            try:
-                return float(price_str)
-            except ValueError:
-                pass
+            price = parse_price_text(match.group(1))
+            if price is not None:
+                return price

        # Pattern 3: Chercher dans meta tags (moins fiable)
        og_price = soup.find("meta", property="og:price:amount")
        if og_price:
            price_str = og_price.get("content", "")
-            try:
-                return float(price_str)
-            except ValueError:
-                pass
+            price = parse_price_text(price_str)
+            if price is not None:
+                return price

        debug.errors.append("Prix non trouvé")
        return None

+    def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
+        """Return the list price (MSRP) found in the raw HTML, if present.
+
+        Searches the page source for the first ``originalPrice": "<amount>"``
+        pair and parses the quoted amount with ``parse_price_text``.
+
+        Args:
+            html: Raw page source.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The parsed amount, or ``None`` when no pair is found or the amount
+            cannot be parsed.
+        """
+        # Single backslashes on purpose: in a raw string a doubled backslash
+        # matches a literal backslash, so ``\\s`` would demand a backslash
+        # character in the HTML instead of optional whitespace.
+        match = re.search(r'originalPrice"\s*:\s*"([0-9\s.,]+)"', html)
+        if match is None:
+            return None
+        return parse_price_text(match.group(1))
+
    def _extract_currency(
        self, url: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> str:
@@ -54,12 +54,12 @@ specs_table:
 # ASIN (parfois dans les métadonnées)
 asin:
  - "input[name='ASIN']"
-  - "th:contains('ASIN') + td"
+  - "th:-soup-contains('ASIN') + td"

 # Messages captcha / robot check
 captcha_indicators:
  - "form[action*='validateCaptcha']"
-  - "p.a-last:contains('Sorry')"
+  - "p.a-last:-soup-contains('Sorry')"
  - "img[alt*='captcha']"

 # Notes pour le parsing:
@@ -4,7 +4,9 @@ Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
 Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
 """

+import json
 import re
+from html import unescape
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
@@ -21,6 +23,7 @@ from pricewatch.app.core.schema import (
    StockStatus,
 )
 from pricewatch.app.stores.base import BaseStore
+from pricewatch.app.stores.price_parser import parse_price_text

 logger = get_logger("stores.amazon")

@@ -131,6 +134,8 @@ class AmazonStore(BaseStore):
        images = self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
+        description = self._extract_description(soup, debug_info)
+        msrp = self._extract_msrp(soup, debug_info)
        reference = self.extract_reference(url) or self._extract_asin_from_html(soup)

        # Déterminer le statut final (ne pas écraser FAILED)
@@ -150,8 +155,10 @@ class AmazonStore(BaseStore):
            stock_status=stock_status,
            reference=reference,
            category=category,
+            description=description,
            images=images,
            specs=specs,
+            msrp=msrp,
            debug=debug_info,
        )

@@ -195,6 +202,17 @@ class AmazonStore(BaseStore):
        debug.errors.append("Titre non trouvé")
        return None

+    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Return the product description taken from the page's meta tags.
+
+        ``og:description`` is tried first, then ``<meta name="description">``.
+        The second tag is used both when the first one is absent and when its
+        ``content`` is missing, empty or whitespace-only.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The stripped description, or ``None`` when neither tag carries text.
+        """
+        queries = (
+            {"property": "og:description"},
+            {"attrs": {"name": "description"}},
+        )
+        for query in queries:
+            meta = soup.find("meta", **query)
+            content = meta.get("content") if meta is not None else None
+            # An empty first tag must not hide a usable second one.
+            if isinstance(content, str) and content.strip():
+                return content.strip()
+        return None
+
    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extrait le prix."""
        selectors = self.get_selector("price", [])
@@ -205,14 +223,9 @@ class AmazonStore(BaseStore):
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text(strip=True)
-                # Extraire nombre (format: "299,99" ou "299.99")
-                match = re.search(r"(\d+)[.,](\d+)", text)
-                if match:
-                    price_str = f"{match.group(1)}.{match.group(2)}"
-                    try:
-                        return float(price_str)
-                    except ValueError:
-                        continue
+                price = parse_price_text(text)
+                if price is not None:
+                    return price

        # Fallback: chercher les spans séparés a-price-whole et a-price-fraction
        whole = soup.select_one("span.a-price-whole")
@@ -220,15 +233,24 @@ class AmazonStore(BaseStore):
        if whole and fraction:
            whole_text = whole.get_text(strip=True)
            fraction_text = fraction.get_text(strip=True)
-            try:
-                price_str = f"{whole_text}.{fraction_text}"
-                return float(price_str)
-            except ValueError:
-                pass
+            price = parse_price_text(f"{whole_text}.{fraction_text}")
+            if price is not None:
+                return price

        debug.errors.append("Prix non trouvé")
        return None

+    def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
+        """Return the list price (MSRP) shown as a struck-through price.
+
+        The element is looked up with ``span.priceBlockStrikePriceString``
+        first and ``span.a-text-price span.a-offscreen`` second; its text is
+        handed to ``parse_price_text``.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The parsed amount, or ``None`` when no element matches or its text
+            is not a price.
+        """
+        node = soup.select_one("span.priceBlockStrikePriceString")
+        if not node:
+            node = soup.select_one("span.a-text-price span.a-offscreen")
+        if not node:
+            return None
+        return parse_price_text(node.get_text(strip=True))
+
    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extrait la devise."""
        selectors = self.get_selector("currency", [])
@@ -270,6 +292,7 @@ class AmazonStore(BaseStore):
    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Extrait les URLs d'images."""
        images = []
+        seen = set()
        selectors = self.get_selector("images", [])
        if isinstance(selectors, str):
            selectors = [selectors]
@@ -278,19 +301,57 @@ class AmazonStore(BaseStore):
            elements = soup.select(selector)
            for element in elements:
                # Attribut src ou data-src
-                url = element.get("src") or element.get("data-src")
+                url = element.get("src") or element.get("data-src") or element.get("data-old-hires")
                if url and url.startswith("http"):
-                    images.append(url)
+                    if self._is_product_image(url) and url not in seen:
+                        images.append(url)
+                        seen.add(url)
+                dynamic = element.get("data-a-dynamic-image")
+                if dynamic:
+                    urls = self._extract_dynamic_images(dynamic)
+                    for dyn_url in urls:
+                        if self._is_product_image(dyn_url) and dyn_url not in seen:
+                            images.append(dyn_url)
+                            seen.add(dyn_url)

        # Fallback: chercher tous les img tags si aucune image trouvée
        if not images:
            all_imgs = soup.find_all("img")
            for img in all_imgs:
                url = img.get("src") or img.get("data-src")
-                if url and url.startswith("http"):
-                    images.append(url)
+                if url and url.startswith("http") and self._is_product_image(url):
+                    if url not in seen:
+                        images.append(url)
+                        seen.add(url)

-        return list(set(images))  # Dédupliquer
+        return images
+
+    def _extract_dynamic_images(self, raw: str) -> list[str]:
+        """Return the image URLs listed in a ``data-a-dynamic-image`` value.
+
+        The value is decoded as JSON after HTML-unescaping. Only a JSON object
+        is used: each key starting with ``http`` is a URL, and the first item
+        of its list value is taken as the image size (0 when the value is not
+        a non-empty list or the first item is not a number).
+
+        Args:
+            raw: Attribute value as read from the element.
+
+        Returns:
+            URLs ordered from largest to smallest size; URLs of equal size keep
+            the order they have in the JSON object. An empty list is returned
+            when the value cannot be decoded or is not an object.
+        """
+        try:
+            data = json.loads(unescape(raw))
+        except (TypeError, ValueError):
+            # json.JSONDecodeError is a ValueError; TypeError covers a None value.
+            return []
+        if not isinstance(data, dict):
+            return []
+
+        def size_of(dims: object) -> float:
+            first = dims[0] if isinstance(dims, list) and dims else 0
+            # Strings, nulls or booleans here would make the sort compare
+            # unrelated types and raise; rank such entries last instead.
+            if isinstance(first, bool) or not isinstance(first, (int, float)):
+                return 0
+            return first
+
+        candidates = [
+            (size_of(dims), url)
+            for url, dims in data.items()
+            if isinstance(url, str) and url.startswith("http")
+        ]
+        # Sort on the size only: sorted() is stable, including with reverse=True.
+        ranked = sorted(candidates, key=lambda item: item[0], reverse=True)
+        return [url for _, url in ranked]
+
+    def _is_product_image(self, url: str) -> bool:
+        """Tell whether *url* passes the basic logo/sprite filter.
+
+        A URL is rejected when it contains ``prime_logo`` or ``sprite``,
+        compared case-insensitively; every other URL is accepted.
+        """
+        haystack = url.lower()
+        return not any(marker in haystack for marker in ("prime_logo", "sprite"))

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extrait la catégorie depuis les breadcrumbs."""
@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
    StockStatus,
 )
 from pricewatch.app.stores.base import BaseStore
+from pricewatch.app.stores.price_parser import parse_price_text

 logger = get_logger("stores.backmarket")

@@ -116,6 +117,8 @@ class BackmarketStore(BaseStore):
        images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
+        description = self._extract_description(soup, debug_info)
+        msrp = self._extract_msrp(soup, debug_info)
        reference = self.extract_reference(url)

        # Spécifique Backmarket: condition (état du reconditionné)
@@ -140,8 +143,10 @@ class BackmarketStore(BaseStore):
            stock_status=stock_status,
            reference=reference,
            category=category,
+            description=description,
            images=images,
            specs=specs,
+            msrp=msrp,
            debug=debug_info,
        )

@@ -213,6 +218,17 @@ class BackmarketStore(BaseStore):
        debug.errors.append("Titre non trouvé")
        return None

+    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Return the product description taken from the page's meta tags.
+
+        ``og:description`` is tried first, then ``<meta name="description">``.
+        The second tag is used both when the first one is absent and when its
+        ``content`` is missing, empty or whitespace-only.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The stripped description, or ``None`` when neither tag carries text.
+        """
+        queries = (
+            {"property": "og:description"},
+            {"attrs": {"name": "description"}},
+        )
+        for query in queries:
+            meta = soup.find("meta", **query)
+            content = meta.get("content") if meta is not None else None
+            # An empty first tag must not hide a usable second one.
+            if isinstance(content, str) and content.strip():
+                return content.strip()
+        return None
+
    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extrait le prix."""
        selectors = self.get_selector("price", [])
@@ -225,20 +241,29 @@ class BackmarketStore(BaseStore):
                # Attribut content (schema.org) ou texte
                price_text = element.get("content") or element.get_text(strip=True)

-                # Extraire nombre (format: "299,99" ou "299.99" ou "299")
-                match = re.search(r"(\d+)[.,]?(\d*)", price_text)
-                if match:
-                    integer_part = match.group(1)
-                    decimal_part = match.group(2) or "00"
-                    price_str = f"{integer_part}.{decimal_part}"
-                    try:
-                        return float(price_str)
-                    except ValueError:
-                        continue
+                price = parse_price_text(price_text)
+                if price is not None:
+                    return price

        debug.errors.append("Prix non trouvé")
        return None

+    def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
+        """Return the list price (MSRP) from the first usable "old price" element.
+
+        A fixed list of CSS selectors is tried in order; for each one only the
+        first matching element is considered, and the first text that
+        ``parse_price_text`` accepts wins.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The parsed amount, or ``None`` when no selector yields a price.
+        """
+        for css in (".price--old", ".price--striked", ".price__old", "del"):
+            node = soup.select_one(css)
+            if not node:
+                continue
+            amount = parse_price_text(node.get_text(strip=True))
+            if amount is not None:
+                return amount
+        return None
+
    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extrait la devise."""
        selectors = self.get_selector("currency", [])
@@ -4,6 +4,7 @@ Store Cdiscount - Parsing de produits Cdiscount.com.
 Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
 """

+import json
 import re
 from datetime import datetime
 from pathlib import Path
@@ -21,6 +22,7 @@ from pricewatch.app.core.schema import (
    StockStatus,
 )
 from pricewatch.app.stores.base import BaseStore
+from pricewatch.app.stores.price_parser import parse_price_text

 logger = get_logger("stores.cdiscount")

@@ -112,6 +114,8 @@ class CdiscountStore(BaseStore):
        images = self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
+        description = self._extract_description(soup, debug_info)
+        msrp = self._extract_msrp(soup, debug_info)
        reference = self.extract_reference(url) or self._extract_sku_from_html(soup)

        # Déterminer le statut final
@@ -130,8 +134,10 @@ class CdiscountStore(BaseStore):
            stock_status=stock_status,
            reference=reference,
            category=category,
+            description=description,
            images=images,
            specs=specs,
+            msrp=msrp,
            debug=debug_info,
        )

@@ -158,6 +164,21 @@ class CdiscountStore(BaseStore):
        debug.errors.append("Titre non trouvé")
        return None

+    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+        """Return the product description from meta tags or JSON-LD.
+
+        Sources are tried in this order, and the first non-blank text wins:
+        ``og:description``, ``<meta name="description">``, then the
+        ``description`` field of the object returned by ``_find_product_ld``.
+        A meta tag whose ``content`` is missing, empty or whitespace-only is
+        skipped rather than ending the search.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The stripped description, or ``None`` when no source carries text.
+        """
+        queries = (
+            {"property": "og:description"},
+            {"attrs": {"name": "description"}},
+        )
+        for query in queries:
+            meta = soup.find("meta", **query)
+            content = meta.get("content") if meta is not None else None
+            # An empty first tag must not hide a usable second one.
+            if isinstance(content, str) and content.strip():
+                return content.strip()
+
+        product_ld = self._find_product_ld(soup)
+        desc_ld = product_ld.get("description") if product_ld else None
+        if isinstance(desc_ld, str) and desc_ld.strip():
+            return desc_ld.strip()
+        return None
+
    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extrait le prix."""
        selectors = self.get_selector("price", [])
@@ -170,20 +191,29 @@ class CdiscountStore(BaseStore):
                # Attribut content (schema.org) ou texte
                price_text = element.get("content") or element.get_text(strip=True)

-                # Extraire nombre (format: "299,99" ou "299.99")
-                match = re.search(r"(\d+)[.,]?(\d*)", price_text)
-                if match:
-                    integer_part = match.group(1)
-                    decimal_part = match.group(2) or "00"
-                    price_str = f"{integer_part}.{decimal_part}"
-                    try:
-                        return float(price_str)
-                    except ValueError:
-                        continue
+                price = parse_price_text(price_text)
+                if price is not None:
+                    return price

        debug.errors.append("Prix non trouvé")
        return None

+    def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
+        """Return the list price (MSRP) from the first usable struck-price element.
+
+        A fixed list of CSS selectors is tried in order; for each one only the
+        first matching element is considered, and the first text that
+        ``parse_price_text`` accepts wins.
+
+        Args:
+            soup: Parsed product page.
+            debug: Debug collector; this method does not write to it.
+
+        Returns:
+            The parsed amount, or ``None`` when no selector yields a price.
+        """
+        struck_price_css = (
+            ".jsStrikePrice",
+            ".price__old",
+            ".c-price__strike",
+            ".price-strike",
+        )
+        for css in struck_price_css:
+            node = soup.select_one(css)
+            if not node:
+                continue
+            amount = parse_price_text(node.get_text(strip=True))
+            if amount is not None:
+                return amount
+        return None
+
    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extrait la devise."""
        selectors = self.get_selector("currency", [])
@@ -249,7 +279,14 @@ class CdiscountStore(BaseStore):
                        url = f"https:{url}"
                    images.append(url)

-        return list(set(images))  # Dédupliquer
+        ld_images = self._extract_ld_images(self._find_product_ld(soup))
+        for url in ld_images:
+            if url and url not in images:
+                if url.startswith("//"):
+                    url = f"https:{url}"
+                images.append(url)
+
+        return list(dict.fromkeys(images))  # Préserver l’ordre

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extrait la catégorie depuis les breadcrumbs."""
@@ -275,6 +312,53 @@ class CdiscountStore(BaseStore):

        return None

+    def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
+        """Decode every ``application/ld+json`` script and return its objects.
+
+        A script holding a JSON array contributes each array item; any other
+        payload contributes itself. When an item is an object with an
+        ``@graph`` member, the nodes of that graph are appended right after
+        the item so that callers scanning the list also see wrapped nodes.
+        Scripts that are empty or not valid JSON are skipped.
+
+        Args:
+            soup: Parsed product page.
+
+        Returns:
+            The decoded entries in document order. Items are returned as
+            decoded and are not checked to be dicts.
+        """
+        entries = []
+        for script in soup.find_all("script", type="application/ld+json"):
+            raw = script.string or script.text
+            if not raw:
+                continue
+            try:
+                payload = json.loads(raw.strip())
+            except (json.JSONDecodeError, TypeError):
+                # Best effort: one malformed script must not hide the others.
+                continue
+            top_level = payload if isinstance(payload, list) else [payload]
+            for node in top_level:
+                entries.append(node)
+                graph = node.get("@graph") if isinstance(node, dict) else None
+                if isinstance(graph, dict):
+                    graph = [graph]
+                if isinstance(graph, list):
+                    entries.extend(graph)
+        return entries
+
+    def _find_product_ld(self, soup: BeautifulSoup) -> dict:
+        """Return the first JSON-LD entry whose type mentions "product".
+
+        Entries come from ``_extract_json_ld_entries``. The type is read from
+        ``@type`` (or ``type`` when ``@type`` is missing or empty) and may be
+        a single string or a list of strings; the match is a case-insensitive
+        substring test on each type name.
+
+        Args:
+            soup: Parsed product page.
+
+        Returns:
+            The matching entry, or an empty dict when none matches.
+        """
+        for entry in self._extract_json_ld_entries(soup):
+            if not isinstance(entry, dict):
+                continue
+            declared = entry.get("@type") or entry.get("type")
+            # JSON-LD allows a node to declare several types as a list.
+            names = [declared] if isinstance(declared, str) else declared
+            if not isinstance(names, list):
+                continue
+            if any(isinstance(name, str) and "product" in name.lower() for name in names):
+                return entry
+        return {}
+
+    def _extract_ld_images(self, product_ld: dict) -> list[str]:
+        """Return the image URLs listed in a JSON-LD product object.
+
+        Reads ``image`` (or ``images`` when ``image`` is missing or empty).
+        The value may be a single URL string, a single image object, or a
+        list mixing both. For an image object the URL is taken from ``url``,
+        falling back to ``contentUrl``.
+
+        Args:
+            product_ld: Decoded JSON-LD product object; may be empty.
+
+        Returns:
+            The URLs in the order they are listed; an empty list when nothing
+            usable is present.
+        """
+        if not product_ld:
+            return []
+        images = product_ld.get("image") or product_ld.get("images")
+        if not images:
+            return []
+        # A lone string or a lone image object is one image, not a collection:
+        # iterating a dict would yield its keys and report them as URLs.
+        if isinstance(images, (str, dict)):
+            images = [images]
+        if not isinstance(images, list):
+            return []
+
+        extracted = []
+        for item in images:
+            if isinstance(item, dict):
+                item = item.get("url") or item.get("contentUrl")
+            if isinstance(item, str):
+                extracted.append(item)
+        return extracted
+
    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extrait les caractéristiques techniques."""
        specs = {}
@@ -298,6 +382,19 @@ class CdiscountStore(BaseStore):
                            if key and value:
                                specs[key] = value

+        product_ld = self._find_product_ld(soup)
+        additional = product_ld.get("additionalProperty") if product_ld else None
+        if isinstance(additional, dict):
+            additional = [additional]
+        if isinstance(additional, list):
+            for item in additional:
+                if not isinstance(item, dict):
+                    continue
+                key = item.get("name") or item.get("propertyID")
+                value = item.get("value") or item.get("valueReference")
+                if key and value:
+                    specs[key] = value
+
        return specs

    def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:
@@ -0,0 +1,48 @@
+"""
+Helpers pour parser des prix avec separateurs de milliers.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+
+def parse_price_text(text: str) -> Optional[float]:
+    """
+    Parse a human-readable price into a float.
+
+    Only the first number found in the text is parsed, so a second amount
+    separated by a currency symbol or a word (for example a struck-through
+    price next to the current one) is ignored instead of being glued onto
+    the first.
+
+    Separator rules, applied to that first number:
+      - whitespace (including no-break and thin spaces) and apostrophes are
+        grouping characters and are dropped;
+      - when both "," and "." occur, the one appearing last is the decimal
+        mark and the other is a thousands separator;
+      - when one kind occurs several times and every group after the first
+        has exactly three digits ("1.234.567"), all are thousands separators;
+      - otherwise the last separator is the decimal mark. A single separator
+        followed by three digits ("1.299") is therefore still read as a
+        decimal value, since the text alone cannot tell the two apart.
+
+    "12€99" is read as 12.99. Two amounts separated only by whitespace are
+    still merged into one number.
+
+    Args:
+        text: Raw price text, possibly with currency symbols and labels.
+
+    Returns:
+        The parsed amount, or None when the text is empty, is not a string,
+        or contains no parsable number.
+    """
+    if not text or not isinstance(text, str):
+        return None
+
+    # "12€99" -> "12,99". The cents are limited to one or two digits that are
+    # not followed by more of a number, so "299,99€299,99€" stays two amounts.
+    text = re.sub(r"(\d)\s*€\s*(\d{1,2})(?![.,]?\d)", r"\1,\2", text)
+
+    # First run of digits joined by separators; trailing punctuation such as
+    # the final "." of a sentence is left out because no digit follows it.
+    token = re.search(r"[.,]?\d+(?:[\s.,'’]+\d+)*", text)
+    if token is None:
+        return None
+    cleaned = "".join(ch for ch in token.group() if ch.isdigit() or ch in ".,")
+
+    if "," in cleaned and "." in cleaned:
+        if cleaned.rfind(",") > cleaned.rfind("."):
+            cleaned = cleaned.replace(".", "").replace(",", ".")
+        else:
+            cleaned = cleaned.replace(",", "")
+    elif "," in cleaned or "." in cleaned:
+        separator = "," if "," in cleaned else "."
+        head, *groups = cleaned.split(separator)
+        if len(groups) > 1 and head and all(len(group) == 3 for group in groups):
+            # Repeated separator with three-digit groups: thousands only.
+            cleaned = head + "".join(groups)
+        else:
+            integer = head + "".join(groups[:-1])
+            cleaned = f"{integer}.{groups[-1]}"
+
+    try:
+        return float(cleaned)
+    except ValueError:
+        return None