before claude
This commit is contained in:
Binary file not shown.
BIN
pricewatch/app/stores/cdiscount/__pycache__/store.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/stores/cdiscount/__pycache__/store.cpython-313.pyc
Executable file → Normal file
Binary file not shown.
@@ -112,7 +112,7 @@ class CdiscountStore(BaseStore):
|
||||
currency = self._extract_currency(soup, debug_info)
|
||||
stock_status = self._extract_stock(soup, debug_info)
|
||||
images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info, url)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
@@ -180,7 +180,7 @@ class CdiscountStore(BaseStore):
|
||||
return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
"""Extrait le prix (DOM puis JSON-LD)."""
|
||||
selectors = self.get_selector("price", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -188,16 +188,33 @@ class CdiscountStore(BaseStore):
|
||||
for selector in selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Attribut content (schema.org) ou texte
|
||||
price_text = element.get("content") or element.get_text(strip=True)
|
||||
|
||||
price = parse_price_text(price_text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
price = self._extract_price_from_json_ld(soup)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_price_from_json_ld(self, soup: BeautifulSoup) -> Optional[float]:
|
||||
"""Extrait le prix depuis les scripts JSON-LD."""
|
||||
product_ld = self._find_product_ld(soup)
|
||||
offers = product_ld.get("offers")
|
||||
if isinstance(offers, list):
|
||||
offers = offers[0] if offers else None
|
||||
if isinstance(offers, dict):
|
||||
price = offers.get("price")
|
||||
if isinstance(price, str):
|
||||
return parse_price_text(price)
|
||||
if isinstance(price, (int, float)):
|
||||
# convert to float but maintain decimals
|
||||
return float(price)
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix conseille."""
|
||||
selectors = [
|
||||
@@ -205,6 +222,8 @@ class CdiscountStore(BaseStore):
|
||||
".price__old",
|
||||
".c-price__strike",
|
||||
".price-strike",
|
||||
"div[data-e2e='strikedPrice']",
|
||||
"div.SecondaryPrice-price",
|
||||
]
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
@@ -212,6 +231,19 @@ class CdiscountStore(BaseStore):
|
||||
price = parse_price_text(element.get_text(strip=True))
|
||||
if price is not None:
|
||||
return price
|
||||
# Fallback: JSON-LD (offers price + promotions)
|
||||
product_ld = self._find_product_ld(soup)
|
||||
offer = product_ld.get("offers")
|
||||
if isinstance(offer, dict):
|
||||
price = offer.get("price")
|
||||
if isinstance(price, str):
|
||||
candidate = parse_price_text(price)
|
||||
elif isinstance(price, (int, float)):
|
||||
candidate = float(price)
|
||||
else:
|
||||
candidate = None
|
||||
if candidate is not None:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
@@ -288,7 +320,7 @@ class CdiscountStore(BaseStore):
|
||||
|
||||
return list(dict.fromkeys(images)) # Préserver l’ordre
|
||||
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo, url: str) -> Optional[str]:
|
||||
"""Extrait la catégorie depuis les breadcrumbs."""
|
||||
selectors = self.get_selector("category", [])
|
||||
if isinstance(selectors, str):
|
||||
@@ -310,6 +342,54 @@ class CdiscountStore(BaseStore):
|
||||
if parts:
|
||||
return parts[-1]
|
||||
|
||||
if title := self._extract_category_from_breadcrumbs(soup):
|
||||
return title
|
||||
return self._extract_category_from_url(url)
|
||||
|
||||
def _extract_category_from_breadcrumbs(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Cherche un breadcrumb via JSON-LD (BreadcrumbList) et retourne l'avant-dernier item."""
|
||||
entries = self._extract_json_ld_entries(soup)
|
||||
for entry in entries:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
if entry.get("@type") != "BreadcrumbList":
|
||||
continue
|
||||
items = entry.get("itemListElement", [])
|
||||
if not isinstance(items, list):
|
||||
continue
|
||||
positions = [
|
||||
element.get("position")
|
||||
for element in items
|
||||
if isinstance(element, dict) and isinstance(element.get("position"), int)
|
||||
]
|
||||
max_pos = max(positions) if positions else None
|
||||
for element in reversed(items):
|
||||
if not isinstance(element, dict):
|
||||
continue
|
||||
position = element.get("position")
|
||||
if max_pos is not None and position == max_pos:
|
||||
continue
|
||||
item = element.get("item", {})
|
||||
name = item.get("name")
|
||||
if name and isinstance(name, str):
|
||||
title = name.strip()
|
||||
if title:
|
||||
return title
|
||||
return None
|
||||
|
||||
def _extract_category_from_url(self, url: str) -> Optional[str]:
|
||||
"""Déduit la catégorie via l'URL /informatique/.../f-..."""
|
||||
if not url:
|
||||
return None
|
||||
parsed = urlparse(url)
|
||||
segments = [seg for seg in parsed.path.split("/") if seg]
|
||||
breadcrumb = []
|
||||
for segment in segments:
|
||||
if segment.startswith("f-") or segment.startswith("p-"):
|
||||
break
|
||||
breadcrumb.append(segment)
|
||||
if breadcrumb:
|
||||
return breadcrumb[-1].replace("-", " ").title()
|
||||
return None
|
||||
|
||||
def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
|
||||
|
||||
@@ -17,6 +17,18 @@ def parse_price_text(text: str) -> Optional[float]:
|
||||
if not text:
|
||||
return None
|
||||
|
||||
euro_suffix = re.search(r"([0-9 .,]+)\s*€\s*(\d{2})\b", text)
|
||||
if euro_suffix:
|
||||
integer_part = euro_suffix.group(1)
|
||||
decimal_part = euro_suffix.group(2)
|
||||
integer_clean = re.sub(r"[^\d]", "", integer_part)
|
||||
if integer_clean:
|
||||
cleaned_decimal = f"{integer_clean}.{decimal_part}"
|
||||
try:
|
||||
return float(cleaned_decimal)
|
||||
except ValueError:
|
||||
pass
|
||||
# Fallback to original replacement if suffix logic fails
|
||||
text = re.sub(r"(\d)\s*€\s*(\d)", r"\1,\2", text)
|
||||
cleaned = text.replace("\u00a0", " ").replace("\u202f", " ").replace("\u2009", " ")
|
||||
cleaned = "".join(ch for ch in cleaned if ch.isdigit() or ch in ".,")
|
||||
|
||||
Reference in New Issue
Block a user