""" Store Cdiscount - Parsing de produits Cdiscount.com. Supporte l'extraction de: titre, prix, SKU, images, specs, etc. """ import json import re from datetime import datetime from pathlib import Path from typing import Optional from urllib.parse import urlparse from bs4 import BeautifulSoup from pricewatch.app.core.logging import get_logger from pricewatch.app.core.schema import ( DebugInfo, DebugStatus, FetchMethod, ProductSnapshot, StockStatus, ) from pricewatch.app.stores.base import BaseStore from pricewatch.app.stores.price_parser import parse_price_text logger = get_logger("stores.cdiscount") class CdiscountStore(BaseStore): """Store pour Cdiscount.com.""" def __init__(self): """Initialise le store Cdiscount avec ses sélecteurs.""" selectors_path = Path(__file__).parent / "selectors.yml" super().__init__(store_id="cdiscount", selectors_path=selectors_path) def match(self, url: str) -> float: """ Détecte si l'URL est Cdiscount. Returns: 0.9 pour cdiscount.com 0.0 sinon """ if not url: return 0.0 url_lower = url.lower() if "cdiscount.com" in url_lower: return 0.9 return 0.0 def canonicalize(self, url: str) -> str: """ Normalise l'URL Cdiscount. Les URLs Cdiscount ont généralement la forme: https://www.cdiscount.com/category/product-name/f-{ID}-{SKU}.html On garde l'URL complète sans query params. """ if not url: return url parsed = urlparse(url) # Retirer query params et fragment return f"{parsed.scheme}://{parsed.netloc}{parsed.path}" def extract_reference(self, url: str) -> Optional[str]: """ Extrait le SKU depuis l'URL. Format typique: /f-{ID}-{SKU}.html Exemple: /f-1070123-example.html → "1070123-example" """ if not url: return None # Pattern: /f-{ID}-{SKU}.html match = re.search(r"/f-(\d+-[\w-]+)\.html", url) if match: return match.group(1) # Fallback: extraire après /f- match = re.search(r"/f-([\w-]+)", url) if match: return match.group(1) return None def parse(self, html: str, url: str) -> ProductSnapshot: """ Parse le HTML Cdiscount vers ProductSnapshot. Utilise BeautifulSoup et les sélecteurs du fichier YAML. """ soup = BeautifulSoup(html, "lxml") debug_info = DebugInfo( method=FetchMethod.HTTP, # Sera mis à jour par l'appelant status=DebugStatus.SUCCESS, errors=[], notes=[], ) # Extraction des champs title = self._extract_title(soup, debug_info) price = self._extract_price(soup, debug_info) currency = self._extract_currency(soup, debug_info) stock_status = self._extract_stock(soup, debug_info) images = self._extract_images(soup, debug_info) category = self._extract_category(soup, debug_info, url) specs = self._extract_specs(soup, debug_info) description = self._extract_description(soup, debug_info) msrp = self._extract_msrp(soup, debug_info) reference = self.extract_reference(url) or self._extract_sku_from_html(soup) # Déterminer le statut final if not title or price is None: debug_info.status = DebugStatus.PARTIAL debug_info.notes.append("Parsing incomplet: titre ou prix manquant") snapshot = ProductSnapshot( source=self.store_id, url=self.canonicalize(url), fetched_at=datetime.now(), title=title, price=price, currency=currency or "EUR", shipping_cost=None, stock_status=stock_status, reference=reference, category=category, description=description, images=images, specs=specs, msrp=msrp, debug=debug_info, ) logger.info( f"[Cdiscount] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: " f"title={bool(title)}, price={price is not None}" ) return snapshot def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]: """Extrait le titre du produit.""" selectors = self.get_selector("title", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: element = soup.select_one(selector) if element: title = element.get_text(strip=True) if title: return title debug.errors.append("Titre non trouvé") return None def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]: """Extrait la description (meta tags).""" meta = soup.find("meta", property="og:description") or soup.find( "meta", attrs={"name": "description"} ) if meta: description = meta.get("content", "").strip() if description: return description product_ld = self._find_product_ld(soup) desc_ld = product_ld.get("description") if product_ld else None if isinstance(desc_ld, str) and desc_ld.strip(): return desc_ld.strip() return None def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]: """Extrait le prix (DOM puis JSON-LD).""" selectors = self.get_selector("price", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: elements = soup.select(selector) for element in elements: price_text = element.get("content") or element.get_text(strip=True) price = parse_price_text(price_text) if price is not None: return price price = self._extract_price_from_json_ld(soup) if price is not None: return price debug.errors.append("Prix non trouvé") return None def _extract_price_from_json_ld(self, soup: BeautifulSoup) -> Optional[float]: """Extrait le prix depuis les scripts JSON-LD.""" product_ld = self._find_product_ld(soup) offers = product_ld.get("offers") if isinstance(offers, list): offers = offers[0] if offers else None if isinstance(offers, dict): price = offers.get("price") if isinstance(price, str): return parse_price_text(price) if isinstance(price, (int, float)): # convert to float but maintain decimals return float(price) return None def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]: """Extrait le prix conseille.""" selectors = [ ".jsStrikePrice", ".price__old", ".c-price__strike", ".price-strike", "div[data-e2e='strikedPrice']", "div.SecondaryPrice-price", ] for selector in selectors: element = soup.select_one(selector) if element: price = parse_price_text(element.get_text(strip=True)) if price is not None: return price # Fallback: JSON-LD (offers price + promotions) product_ld = self._find_product_ld(soup) offer = product_ld.get("offers") if isinstance(offer, dict): price = offer.get("price") if isinstance(price, str): candidate = parse_price_text(price) elif isinstance(price, (int, float)): candidate = float(price) else: candidate = None if candidate is not None: return candidate return None def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]: """Extrait la devise.""" selectors = self.get_selector("currency", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: element = soup.select_one(selector) if element: # Attribut content currency = element.get("content") if currency: return currency.upper() # Défaut EUR pour Cdiscount return "EUR" def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus: """Extrait le statut de stock.""" selectors = self.get_selector("stock_status", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: element = soup.select_one(selector) if element: # Attribut href (schema.org) ou texte href = element.get("href", "").lower() text = element.get_text(strip=True).lower() combined = href + " " + text if "instock" in combined or "en stock" in combined: return StockStatus.IN_STOCK elif ( "outofstock" in combined or "rupture" in combined or "indisponible" in combined ): return StockStatus.OUT_OF_STOCK return StockStatus.UNKNOWN def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]: """Extrait les URLs d'images.""" images = [] selectors = self.get_selector("images", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: elements = soup.select(selector) for element in elements: # Attribut src, data-src, ou itemprop url = ( element.get("src") or element.get("data-src") or element.get("content") ) if url and ("http" in url or url.startswith("//")): # Normaliser // vers https:// if url.startswith("//"): url = f"https:{url}" images.append(url) ld_images = self._extract_ld_images(self._find_product_ld(soup)) for url in ld_images: if url and url not in images: if url.startswith("//"): url = f"https:{url}" images.append(url) return list(dict.fromkeys(images)) # Préserver l’ordre def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo, url: str) -> Optional[str]: """Extrait la catégorie depuis les breadcrumbs.""" selectors = self.get_selector("category", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: element = soup.select_one(selector) if element: # Prendre le dernier élément du breadcrumb links = element.select("a") if links: return links[-1].get_text(strip=True) # Fallback sur le texte complet text = element.get_text(strip=True) if text: # Séparer par > et prendre le dernier parts = [p.strip() for p in text.split(">")] if parts: return parts[-1] if title := self._extract_category_from_breadcrumbs(soup): return title return self._extract_category_from_url(url) def _extract_category_from_breadcrumbs(self, soup: BeautifulSoup) -> Optional[str]: """Cherche un breadcrumb via JSON-LD (BreadcrumbList) et retourne l'avant-dernier item.""" entries = self._extract_json_ld_entries(soup) for entry in entries: if not isinstance(entry, dict): continue if entry.get("@type") != "BreadcrumbList": continue items = entry.get("itemListElement", []) if not isinstance(items, list): continue positions = [ element.get("position") for element in items if isinstance(element, dict) and isinstance(element.get("position"), int) ] max_pos = max(positions) if positions else None for element in reversed(items): if not isinstance(element, dict): continue position = element.get("position") if max_pos is not None and position == max_pos: continue item = element.get("item", {}) name = item.get("name") if name and isinstance(name, str): title = name.strip() if title: return title return None def _extract_category_from_url(self, url: str) -> Optional[str]: """Déduit la catégorie via l'URL /informatique/.../f-...""" if not url: return None parsed = urlparse(url) segments = [seg for seg in parsed.path.split("/") if seg] breadcrumb = [] for segment in segments: if segment.startswith("f-") or segment.startswith("p-"): break breadcrumb.append(segment) if breadcrumb: return breadcrumb[-1].replace("-", " ").title() return None def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]: """Parse les scripts JSON-LD et retourne les objets.""" entries = [] scripts = soup.find_all("script", type="application/ld+json") for script in scripts: raw = script.string or script.text if not raw: continue try: payload = json.loads(raw.strip()) except (json.JSONDecodeError, TypeError): continue if isinstance(payload, list): entries.extend(payload) else: entries.append(payload) return entries def _find_product_ld(self, soup: BeautifulSoup) -> dict: """Retourne l’objet Product JSON-LD si présent.""" for entry in self._extract_json_ld_entries(soup): if not isinstance(entry, dict): continue type_field = entry.get("@type") or entry.get("type") if isinstance(type_field, str) and "product" in type_field.lower(): return entry return {} def _extract_ld_images(self, product_ld: dict) -> list[str]: """Récupère les images listées dans le JSON-LD.""" if not product_ld: return [] images = product_ld.get("image") or product_ld.get("images") if not images: return [] if isinstance(images, str): images = [images] extracted = [] for item in images: if isinstance(item, str): extracted.append(item) elif isinstance(item, dict): url = item.get("url") if isinstance(url, str): extracted.append(url) return extracted def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]: """Extrait les caractéristiques techniques.""" specs = {} selectors = self.get_selector("specs_table", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: container = soup.select_one(selector) if container: # Parser les lignes (souvent des divs ou des li) # Chercher des paires clé: valeur lines = container.get_text(separator="\n").split("\n") for line in lines: # Format "Clé: Valeur" ou "Clé : Valeur" if ":" in line: parts = line.split(":", 1) if len(parts) == 2: key = parts[0].strip() value = parts[1].strip() if key and value: specs[key] = value product_ld = self._find_product_ld(soup) additional = product_ld.get("additionalProperty") if product_ld else None if isinstance(additional, dict): additional = [additional] if isinstance(additional, list): for item in additional: if not isinstance(item, dict): continue key = item.get("name") or item.get("propertyID") value = item.get("value") or item.get("valueReference") if key and value: specs[key] = value return specs def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]: """Extrait le SKU depuis le HTML (fallback).""" selectors = self.get_selector("sku", []) if isinstance(selectors, str): selectors = [selectors] for selector in selectors: element = soup.select_one(selector) if element: # Attribut content ou itemprop sku = element.get("content") or element.get_text(strip=True) if sku: return sku return None