Files
scrap/pricewatch/app/stores/cdiscount/store.py
2026-01-13 19:49:04 +01:00

318 lines
10 KiB
Python
Executable File

"""
Store Cdiscount - Parsing de produits Cdiscount.com.
Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
"""
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import (
DebugInfo,
DebugStatus,
FetchMethod,
ProductSnapshot,
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
# Module-level logger used by CdiscountStore (see the parsing summary in `parse`).
logger = get_logger("stores.cdiscount")
class CdiscountStore(BaseStore):
    """Store implementation for Cdiscount.com product pages.

    Extracts title, price, currency, stock status, images, category,
    specifications and reference from a product page, using the CSS
    selectors declared in the ``selectors.yml`` file located next to this
    module.
    """

    # Typical product URL tail: /f-{ID}-{SKU}.html
    _REFERENCE_RE = re.compile(r"/f-(\d+-[\w-]+)\.html")
    # Looser fallback: anything following "/f-"
    _REFERENCE_FALLBACK_RE = re.compile(r"/f-([\w-]+)")

    # Phrases that contain "en stock" but actually mean the opposite.
    _NEGATED_IN_STOCK_MARKERS = ("plus en stock", "pas en stock")

    def __init__(self):
        """Initialise the store with the selectors file shipped beside this module."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="cdiscount", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """Tell whether ``url`` points to Cdiscount.

        The decision is made on the host name (``cdiscount.com`` or any of
        its sub-domains), not on a substring of the whole URL, so a URL that
        merely mentions the domain in its path or query string is rejected.

        Args:
            url: URL to test; a missing scheme is tolerated.

        Returns:
            0.9 for a cdiscount.com host, 0.0 otherwise.
        """
        if not url:
            return 0.0

        candidate = url.strip().lower()
        # urlparse only recognises the host when the URL has a scheme or
        # starts with "//"; add the latter for scheme-less input.
        if "://" not in candidate and not candidate.startswith("//"):
            candidate = f"//{candidate}"

        try:
            host = urlparse(candidate).hostname or ""
        except ValueError:
            # Malformed authority (e.g. unbalanced IPv6 brackets).
            return 0.0

        host = host.rstrip(".")
        if host == "cdiscount.com" or host.endswith(".cdiscount.com"):
            return 0.9
        return 0.0

    def canonicalize(self, url: str) -> str:
        """Normalise a Cdiscount URL by dropping query parameters and fragment.

        Cdiscount URLs usually look like
        ``https://www.cdiscount.com/category/product-name/f-{ID}-{SKU}.html``.

        Args:
            url: URL to normalise.

        Returns:
            ``scheme://netloc/path`` for an absolute URL. A URL lacking a
            scheme or host is returned with only its query string and
            fragment removed. Empty input is returned unchanged.
        """
        if not url:
            return url

        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            # Rebuilding from empty components would yield "://path".
            return url.split("#", 1)[0].split("?", 1)[0]

        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    def extract_reference(self, url: str) -> Optional[str]:
        """Extract the product reference from the URL.

        Typical format: ``/f-{ID}-{SKU}.html``.
        Example: ``/f-1070123-example.html`` -> ``"1070123-example"``.

        Args:
            url: Product URL.

        Returns:
            The reference, or ``None`` when the URL is empty or has no
            ``/f-`` segment.
        """
        if not url:
            return None

        found = self._REFERENCE_RE.search(url) or self._REFERENCE_FALLBACK_RE.search(url)
        return found.group(1) if found else None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """Parse Cdiscount HTML into a ``ProductSnapshot``.

        Uses BeautifulSoup (lxml parser) and the selectors from the YAML file.
        The debug status is downgraded to ``PARTIAL`` when the title or the
        price could not be extracted.

        Args:
            html: Raw HTML of the product page.
            url: URL the page was fetched from.

        Returns:
            The populated snapshot, including debug information.
        """
        soup = BeautifulSoup(html, "lxml")
        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Placeholder; the original note says the caller updates it
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Field extraction
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(soup, debug_info)
        currency = self._extract_currency(soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        # The URL is the primary source for the reference; the HTML is a fallback.
        reference = self.extract_reference(url) or self._extract_sku_from_html(soup)

        # Final status
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency or "EUR",
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            images=images,
            specs=specs,
            debug=debug_info,
        )
        logger.info(
            f"[Cdiscount] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )
        return snapshot

    def _selectors_for(self, key: str) -> list[str]:
        """Return the selectors configured under ``key`` as a list.

        A single string is wrapped in a list and a missing/empty value
        yields an empty list, so callers can always iterate over the result.
        """
        configured = self.get_selector(key, [])
        if not configured:
            return []
        if isinstance(configured, str):
            return [configured]
        return list(configured)

    @staticmethod
    def _parse_price_text(text) -> Optional[float]:
        """Convert a price string to a float, or ``None`` if it holds no number.

        Handles the formats seen on French shops:
        ``"299,99"``, ``"299.99"``, ``"1 299,99 €"`` (whitespace, including
        no-break spaces, as thousands separator), ``"299€99"`` (euro sign as
        decimal mark) and ``"1.299,99"`` / ``"1,299.99"`` (mixed separators).
        A lone separator is always read as the decimal mark.
        """
        if not text or not isinstance(text, str):
            return None

        # "1 299,99" -> "1299,99": drop a whitespace sitting between a digit
        # and a group of exactly three digits.
        cleaned = re.sub(r"(?<=\d)\s(?=\d{3}(?!\d))", "", text)
        # "299€99" -> "299,99": euro sign used in place of the decimal mark.
        cleaned = re.sub(r"(?<=\d)€(?=\d{2}(?!\d))", ",", cleaned)

        found = re.search(r"\d+(?:[.,]\d+)*", cleaned)
        if not found:
            return None

        groups = re.split(r"[.,]", found.group(0))
        if len(groups) == 1:
            return float(groups[0])

        *leading, last = groups
        if len(groups) == 2 or len(last) <= 2:
            # The last separator is the decimal mark; earlier ones are
            # thousands separators.
            return float(f"{''.join(leading)}.{last}")
        # Several separators and a three-digit tail: thousands groups only.
        return float("".join(groups))

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Return the product title, recording an error in ``debug`` if absent."""
        for selector in self._selectors_for("title"):
            element = soup.select_one(selector)
            if element is None:
                continue
            title = element.get_text(strip=True)
            if title:
                return title

        debug.errors.append("Titre non trouvé")
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Return the product price, recording an error in ``debug`` if absent.

        For every matching element the ``content`` attribute (schema.org) is
        tried first, then the visible text.
        """
        for selector in self._selectors_for("price"):
            for element in soup.select(selector):
                for raw in (element.get("content"), element.get_text(strip=True)):
                    price = self._parse_price_text(raw)
                    if price is not None:
                        return price

        debug.errors.append("Prix non trouvé")
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Return the upper-cased currency code, defaulting to ``"EUR"``.

        The code is read from the ``content`` attribute of the first matching
        element that has one. ``debug`` is accepted for signature parity with
        the other extractors and is not modified.
        """
        for selector in self._selectors_for("currency"):
            element = soup.select_one(selector)
            if element is None:
                continue
            currency = element.get("content")
            if currency and currency.strip():
                return currency.strip().upper()

        # Default for Cdiscount
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Return the stock status derived from the configured elements.

        The ``href`` and ``content`` attributes (schema.org availability) and
        the visible text are inspected. Negated phrases such as
        "plus en stock" are checked first because they contain "en stock".
        ``debug`` is accepted for signature parity and is not modified.
        """
        for selector in self._selectors_for("stock_status"):
            element = soup.select_one(selector)
            if element is None:
                continue

            href = str(element.get("href") or "")
            content = str(element.get("content") or "")
            text = element.get_text(strip=True)
            combined = f"{href} {content} {text}".lower()

            if any(marker in combined for marker in self._NEGATED_IN_STOCK_MARKERS):
                return StockStatus.OUT_OF_STOCK
            if "instock" in combined or "en stock" in combined:
                return StockStatus.IN_STOCK
            if (
                "outofstock" in combined
                or "rupture" in combined
                or "indisponible" in combined
            ):
                return StockStatus.OUT_OF_STOCK

        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Return absolute image URLs, de-duplicated in document order.

        The URL is taken from ``src``, ``data-src`` or ``content``.
        Protocol-relative URLs (``//host/...``) are upgraded to ``https``.
        ``debug`` is accepted for signature parity and is not modified.
        """
        images: list[str] = []
        for selector in self._selectors_for("images"):
            for element in soup.select(selector):
                url = (
                    element.get("src")
                    or element.get("data-src")
                    or element.get("content")
                )
                if not url:
                    continue
                url = url.strip()
                if url.startswith("//"):
                    url = f"https:{url}"
                if url.lower().startswith(("http://", "https://")):
                    images.append(url)

        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(images))

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Return the category taken from the breadcrumb, or ``None``.

        The last non-empty link of the breadcrumb is preferred; otherwise the
        last non-empty ``>``-separated segment of its text is used.
        ``debug`` is accepted for signature parity and is not modified.
        """
        for selector in self._selectors_for("category"):
            element = soup.select_one(selector)
            if element is None:
                continue

            labels = [link.get_text(strip=True) for link in element.select("a")]
            labels = [label for label in labels if label]
            if labels:
                return labels[-1]

            # Fallback on the full text, split on ">"
            segments = [part.strip() for part in element.get_text(strip=True).split(">")]
            segments = [segment for segment in segments if segment]
            if segments:
                return segments[-1]

        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Return the technical specifications as a ``{key: value}`` mapping.

        Each configured container is flattened to text lines and every line
        shaped like ``"Key: Value"`` (split on the first colon) contributes
        an entry. ``debug`` is accepted for signature parity and is not
        modified.
        """
        specs: dict[str, str] = {}
        for selector in self._selectors_for("specs_table"):
            container = soup.select_one(selector)
            if container is None:
                continue

            for line in container.get_text(separator="\n").split("\n"):
                key, colon, value = line.partition(":")
                key, value = key.strip(), value.strip()
                if colon and key and value:
                    specs[key] = value

        return specs

    def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:
        """Return the SKU found in the HTML (fallback when the URL has none).

        The ``content`` attribute is preferred over the element text.
        """
        for selector in self._selectors_for("sku"):
            element = soup.select_one(selector)
            if element is None:
                continue
            sku = element.get("content") or element.get_text(strip=True)
            if sku:
                return sku

        return None