This commit is contained in:
Gilles Soulier
2026-01-14 21:54:55 +01:00
parent c91c0f1fc9
commit d0b73b9319
140 changed files with 5822 additions and 161 deletions

View File

@@ -4,6 +4,7 @@ Store Cdiscount - Parsing de produits Cdiscount.com.
Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
"""
import json
import re
from datetime import datetime
from pathlib import Path
@@ -21,6 +22,7 @@ from pricewatch.app.core.schema import (
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.cdiscount")
@@ -112,6 +114,8 @@ class CdiscountStore(BaseStore):
images = self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
reference = self.extract_reference(url) or self._extract_sku_from_html(soup)
# Déterminer le statut final
@@ -130,8 +134,10 @@ class CdiscountStore(BaseStore):
stock_status=stock_status,
reference=reference,
category=category,
description=description,
images=images,
specs=specs,
msrp=msrp,
debug=debug_info,
)
@@ -158,6 +164,21 @@ class CdiscountStore(BaseStore):
debug.errors.append("Titre non trouvé")
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Extract the product description.

    Sources are tried in priority order and the first non-blank value wins:

    1. ``<meta property="og:description">``
    2. ``<meta name="description">``
    3. the ``description`` field of the Product JSON-LD object

    Each meta tag is checked independently, so an ``og:description`` tag
    that is present but blank no longer hides a usable
    ``<meta name="description">``.

    Args:
        soup: Parsed product page.
        debug: Debug collector; not written to by this method.

    Returns:
        The stripped description, or ``None`` when every source is missing
        or blank.
    """
    meta_filters = (
        {"property": "og:description"},
        {"name": "description"},
    )
    for attrs in meta_filters:
        meta = soup.find("meta", attrs=attrs)
        if meta is None:
            continue
        # A tag without a ``content`` attribute is treated like a blank one.
        content = (meta.get("content") or "").strip()
        if content:
            return content

    # Last resort: structured data embedded in the page.
    product_ld = self._find_product_ld(soup)
    desc_ld = product_ld.get("description") if product_ld else None
    if isinstance(desc_ld, str) and desc_ld.strip():
        return desc_ld.strip()
    return None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix."""
selectors = self.get_selector("price", [])
@@ -170,20 +191,29 @@ class CdiscountStore(BaseStore):
# Attribut content (schema.org) ou texte
price_text = element.get("content") or element.get_text(strip=True)
# Extraire nombre (format: "299,99" ou "299.99")
match = re.search(r"(\d+)[.,]?(\d*)", price_text)
if match:
integer_part = match.group(1)
decimal_part = match.group(2) or "00"
price_str = f"{integer_part}.{decimal_part}"
try:
return float(price_str)
except ValueError:
continue
price = parse_price_text(price_text)
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Extract the suggested retail price.

    A fixed list of CSS selectors is tried in order. For each one, the text
    of the first matching element is handed to ``parse_price_text``; the
    first selector that yields a non-``None`` price wins.

    Args:
        soup: Parsed product page.
        debug: Debug collector; not written to by this method.

    Returns:
        The parsed price, or ``None`` when no selector produced one.
    """
    strike_selectors = (
        ".jsStrikePrice",
        ".price__old",
        ".c-price__strike",
        ".price-strike",
    )
    for css in strike_selectors:
        node = soup.select_one(css)
        if not node:
            continue
        amount = parse_price_text(node.get_text(strip=True))
        if amount is None:
            # Element found but its text is not a price: try the next selector.
            continue
        return amount
    return None
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la devise."""
selectors = self.get_selector("currency", [])
@@ -249,7 +279,14 @@ class CdiscountStore(BaseStore):
url = f"https:{url}"
images.append(url)
return list(set(images)) # Dédupliquer
ld_images = self._extract_ld_images(self._find_product_ld(soup))
for url in ld_images:
if url and url not in images:
if url.startswith("//"):
url = f"https:{url}"
images.append(url)
return list(dict.fromkeys(images)) # Préserver lordre
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la catégorie depuis les breadcrumbs."""
@@ -275,6 +312,53 @@ class CdiscountStore(BaseStore):
return None
def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
    """Parse every JSON-LD script of the page and return its objects.

    Each ``<script type="application/ld+json">`` is decoded with
    ``json.loads``. A top-level array contributes each of its items, any
    other payload contributes itself. Only JSON objects (``dict``) are
    returned, so the result matches the ``list[dict]`` annotation. When an
    object carries a JSON-LD ``@graph`` array, the objects in that array are
    appended right after their container so that nodes nested in a graph
    (for example a Product) are visible to callers.

    Args:
        soup: Parsed product page.

    Returns:
        The decoded JSON-LD objects in document order; empty when the page
        has no usable JSON-LD.
    """
    entries: list[dict] = []
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string or script.text
        if not raw:
            continue
        try:
            payload = json.loads(raw.strip())
        except (json.JSONDecodeError, TypeError):
            # Best effort: a script that cannot be decoded is skipped.
            continue

        candidates = payload if isinstance(payload, list) else [payload]
        for node in candidates:
            if not isinstance(node, dict):
                continue
            entries.append(node)
            graph = node.get("@graph")
            if isinstance(graph, list):
                entries.extend(item for item in graph if isinstance(item, dict))
    return entries
def _find_product_ld(self, soup: BeautifulSoup) -> dict:
"""Retourne lobjet Product JSON-LD si présent."""
for entry in self._extract_json_ld_entries(soup):
if not isinstance(entry, dict):
continue
type_field = entry.get("@type") or entry.get("type")
if isinstance(type_field, str) and "product" in type_field.lower():
return entry
return {}
def _extract_ld_images(self, product_ld: dict) -> list[str]:
"""Récupère les images listées dans le JSON-LD."""
if not product_ld:
return []
images = product_ld.get("image") or product_ld.get("images")
if not images:
return []
if isinstance(images, str):
images = [images]
extracted = []
for item in images:
if isinstance(item, str):
extracted.append(item)
elif isinstance(item, dict):
url = item.get("url")
if isinstance(url, str):
extracted.append(url)
return extracted
def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
"""Extrait les caractéristiques techniques."""
specs = {}
@@ -298,6 +382,19 @@ class CdiscountStore(BaseStore):
if key and value:
specs[key] = value
product_ld = self._find_product_ld(soup)
additional = product_ld.get("additionalProperty") if product_ld else None
if isinstance(additional, dict):
additional = [additional]
if isinstance(additional, list):
for item in additional:
if not isinstance(item, dict):
continue
key = item.get("name") or item.get("propertyID")
value = item.get("value") or item.get("valueReference")
if key and value:
specs[key] = value
return specs
def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]: