"""
Cdiscount store - parsing of Cdiscount.com product pages.

Supports extraction of: title, price, SKU, images, specs, etc.
"""

import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import (
    DebugInfo,
    DebugStatus,
    FetchMethod,
    ProductSnapshot,
    StockStatus,
)
from pricewatch.app.stores.base import BaseStore

logger = get_logger("stores.cdiscount")

# Registered domain used by match(); sub-domains (www., m., ...) are accepted.
_CDISCOUNT_DOMAIN = "cdiscount.com"

# Detects an explicit URL scheme ("https://", "http://", ...).
_SCHEME_RE = re.compile(r"[a-z][a-z0-9+.\-]*://")

# Reference patterns: "/f-{ID}-{SKU}.html" first, then anything after "/f-".
_REFERENCE_RE = re.compile(r"/f-(\d+-[\w-]+)\.html")
_REFERENCE_FALLBACK_RE = re.compile(r"/f-([\w-]+)")

# Price token:
#   int   -> "1 299" (groups of three separated by a plain, no-break, thin or
#            narrow no-break space) or a plain digit run
#   cents -> two digits right after a euro sign ("299€99")
#   rest  -> one or more "." / "," separated digit groups ("1.299,99")
_PRICE_RE = re.compile(
    r"(?P<int>\d{1,3}(?:[ \u00a0\u2009\u202f]\d{3})+(?!\d)|\d+)"
    r"(?:€(?P<cents>\d{2})(?!\d)|(?P<rest>(?:[.,]\d+)+))?"
)


class CdiscountStore(BaseStore):
    """Store implementation for Cdiscount.com."""

    def __init__(self):
        """Initialise the store with the selectors file located next to this module."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="cdiscount", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """
        Tell whether the URL belongs to Cdiscount.

        The decision is made on the host name only, so a foreign URL that
        merely mentions "cdiscount.com" in its path or query is rejected.

        Returns:
            0.9 for cdiscount.com and its sub-domains
            0.0 otherwise
        """
        if not url:
            return 0.0

        candidate = url.strip().lower()
        if not _SCHEME_RE.match(candidate):
            # Scheme-less input ("www.cdiscount.com/..."): prefix "//" so that
            # urlparse treats the leading part as the network location.
            candidate = "//" + candidate.lstrip("/")

        try:
            host = urlparse(candidate).hostname or ""
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
            return 0.0

        if host == _CDISCOUNT_DOMAIN or host.endswith("." + _CDISCOUNT_DOMAIN):
            return 0.9
        return 0.0

    def canonicalize(self, url: str) -> str:
        """
        Normalise a Cdiscount URL.

        Cdiscount URLs usually look like:
        https://www.cdiscount.com/category/product-name/f-{ID}-{SKU}.html

        The full URL is kept, minus path parameters, query string and fragment.
        Falsy input is returned unchanged.
        """
        if not url:
            return url

        parsed = urlparse(url)
        # geturl() only emits "scheme://" when a scheme is present, so a
        # scheme-less input no longer turns into "://host/path".
        return parsed._replace(params="", query="", fragment="").geturl()

    def extract_reference(self, url: str) -> Optional[str]:
        """
        Extract the SKU from the URL.

        Typical format: /f-{ID}-{SKU}.html
        Example: /f-1070123-example.html -> "1070123-example"

        Returns:
            The reference string, or None when the URL is empty or has no
            "/f-" segment.
        """
        if not url:
            return None

        # Preferred pattern first, then the looser "/f-..." fallback.
        for pattern in (_REFERENCE_RE, _REFERENCE_FALLBACK_RE):
            found = pattern.search(url)
            if found:
                return found.group(1)

        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse Cdiscount HTML into a ProductSnapshot.

        Uses BeautifulSoup (lxml parser) and the selectors returned by
        get_selector(). The debug status is downgraded to PARTIAL when the
        title or the price is missing.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Will be updated by the caller
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Field extraction
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(soup, debug_info)
        currency = self._extract_currency(soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        # The URL is the primary source for the reference; HTML is the fallback.
        reference = self.extract_reference(url) or self._extract_sku_from_html(soup)

        # Determine the final status
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            # NOTE(review): naive local timestamp, kept as-is because other
            # code may compare it with naive datetimes - confirm before
            # switching to a timezone-aware value.
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency or "EUR",
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            images=images,
            specs=specs,
            debug=debug_info,
        )

        logger.info(
            f"[Cdiscount] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot

    def _selector_list(self, name: str) -> list[str]:
        """Return the selectors configured under *name* as a list (possibly empty)."""
        selectors = self.get_selector(name, [])
        if not selectors:
            return []
        if isinstance(selectors, str):
            return [selectors]
        return list(selectors)

    @staticmethod
    def _parse_price_text(text: str) -> Optional[float]:
        """
        Convert a displayed price into a float.

        Handles "299,99", "299.99", "1 299,99 €" (space / no-break space as
        thousands separator), "1.299,99", "1,299.99" and "299€99". When
        several "." / "," separators are present the last one is the decimal
        mark. A lone separator is always read as the decimal mark, so "1.299"
        yields 1.299.

        Returns:
            The parsed price, or None when the text contains no number.
        """
        if not text:
            return None

        found = _PRICE_RE.search(text)
        if found is None:
            return None

        # Drop the spaces that separate thousands groups.
        integer_part = re.sub(r"\D", "", found.group("int"))
        cents = found.group("cents")
        rest = found.group("rest")

        if cents:
            decimal_part = cents
        elif rest:
            # ".299,99" -> ["299", "99"]: last group is the decimal part,
            # the preceding ones are thousands groups.
            *thousands, decimal_part = rest.replace(",", ".").split(".")[1:]
            integer_part += "".join(thousands)
        else:
            decimal_part = "0"

        try:
            return float(f"{integer_part}.{decimal_part}")
        except ValueError:
            return None

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Return the product title, or None (recording an error in *debug*)."""
        for selector in self._selector_list("title"):
            element = soup.select_one(selector)
            if element is None:
                continue
            title = element.get_text(strip=True)
            if title:
                return title

        debug.errors.append("Titre non trouvé")
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Return the product price, or None (recording an error in *debug*)."""
        for selector in self._selector_list("price"):
            for element in soup.select(selector):
                # "content" attribute (schema.org) first, visible text otherwise
                price_text = element.get("content") or element.get_text(strip=True)
                price = self._parse_price_text(price_text)
                if price is not None:
                    return price

        debug.errors.append("Prix non trouvé")
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Return the upper-cased currency from a "content" attribute, "EUR" by default."""
        for selector in self._selector_list("currency"):
            element = soup.select_one(selector)
            if element is None:
                continue
            currency = element.get("content")
            if currency:
                return currency.upper()

        # EUR is the default for Cdiscount
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """
        Return the stock status.

        Looks at the "href" attribute, the "content" attribute and the text of
        the first element matched by each selector; UNKNOWN when none of them
        carries a recognised marker.
        """
        for selector in self._selector_list("stock_status"):
            element = soup.select_one(selector)
            if element is None:
                continue

            # schema.org availability may sit in <link href> or <meta content>.
            href = element.get("href") or ""
            content = element.get("content") or ""
            text = element.get_text(strip=True)
            combined = f"{href} {content} {text}".lower()

            if "instock" in combined or "en stock" in combined:
                return StockStatus.IN_STOCK
            if (
                "outofstock" in combined
                or "rupture" in combined
                or "indisponible" in combined
            ):
                return StockStatus.OUT_OF_STOCK

        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """
        Return the absolute image URLs, de-duplicated in page order.

        Reads "src", then "data-src", then "content". Protocol-relative URLs
        ("//host/...") are upgraded to https; anything that is not an
        http(s) URL is skipped.
        """
        images = []

        for selector in self._selector_list("images"):
            for element in soup.select(selector):
                raw = (
                    element.get("src")
                    or element.get("data-src")
                    or element.get("content")
                )
                if not raw:
                    continue

                url = raw.strip()
                if url.startswith("//"):
                    # Normalise // to https://
                    url = f"https:{url}"
                if url.lower().startswith(("http://", "https://")):
                    images.append(url)

        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(images))

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Return the category taken from the breadcrumbs, or None."""
        for selector in self._selector_list("category"):
            element = soup.select_one(selector)
            if element is None:
                continue

            # Take the last breadcrumb link that actually has a label
            labels = [
                label
                for label in (link.get_text(strip=True) for link in element.select("a"))
                if label
            ]
            if labels:
                return labels[-1]

            # Fallback on the full text: split on ">" and take the last
            # non-empty segment
            segments = [
                segment.strip()
                for segment in element.get_text(strip=True).split(">")
                if segment.strip()
            ]
            if segments:
                return segments[-1]

        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """
        Return the technical specs as a key -> value mapping.

        Only text lines shaped like "Key: Value" (or "Key : Value") are
        recognised; a later occurrence of a key overwrites an earlier one.
        """
        specs = {}

        for selector in self._selector_list("specs_table"):
            container = soup.select_one(selector)
            if container is None:
                continue

            # One text line per node (rows are often divs or li elements)
            for line in container.get_text(separator="\n").split("\n"):
                key, separator, value = line.partition(":")
                if not separator:
                    continue
                key = key.strip()
                value = value.strip()
                if key and value:
                    specs[key] = value

        return specs

    def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:
        """Return the SKU found in the HTML (fallback when the URL has none)."""
        for selector in self._selector_list("sku"):
            element = soup.select_one(selector)
            if element is None:
                continue
            # "content" attribute first, visible text otherwise
            sku = element.get("content") or element.get_text(strip=True)
            if sku:
                return sku

        return None