before claude

This commit is contained in:
Gilles Soulier
2026-01-17 13:40:26 +01:00
parent d0b73b9319
commit cf7c415e22
35 changed files with 3411 additions and 221 deletions

View File

@@ -21,31 +21,32 @@ from sqlalchemy import and_, desc, func
from sqlalchemy.orm import Session
from pricewatch.app.api.schemas import (
BackendLogEntry,
EnqueueRequest,
EnqueueResponse,
HealthStatus,
PriceHistoryOut,
PriceHistoryCreate,
PriceHistoryOut,
PriceHistoryUpdate,
ProductOut,
ProductCreate,
ProductHistoryPoint,
ProductOut,
ProductUpdate,
ScheduleRequest,
ScheduleResponse,
ScrapingLogOut,
ScrapingLogCreate,
ScrapingLogUpdate,
ScrapePreviewRequest,
ScrapePreviewResponse,
ScrapeCommitRequest,
ScrapeCommitResponse,
VersionResponse,
BackendLogEntry,
ScrapePreviewRequest,
ScrapePreviewResponse,
ScrapingLogCreate,
ScrapingLogOut,
ScrapingLogUpdate,
UvicornLogEntry,
WebhookOut,
VersionResponse,
WebhookCreate,
WebhookUpdate,
WebhookOut,
WebhookTestResponse,
WebhookUpdate,
)
from pricewatch.app.core.config import get_config
from pricewatch.app.core.logging import get_logger
@@ -794,6 +795,9 @@ def _read_uvicorn_lines(limit: int = 200) -> list[str]:
return []
PRODUCT_HISTORY_LIMIT = 12
def _product_to_out(session: Session, product: Product) -> ProductOut:
"""Helper pour mapper Product + dernier prix."""
latest = (
@@ -810,6 +814,18 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
discount_amount = float(product.msrp) - float(latest.price)
if product.msrp > 0:
discount_percent = (discount_amount / float(product.msrp)) * 100
history_rows = (
session.query(PriceHistory)
.filter(PriceHistory.product_id == product.id, PriceHistory.price != None)
.order_by(desc(PriceHistory.fetched_at))
.limit(PRODUCT_HISTORY_LIMIT)
.all()
)
history_points = [
ProductHistoryPoint(price=float(row.price), fetched_at=row.fetched_at)
for row in reversed(history_rows)
if row.price is not None
]
return ProductOut(
id=product.id,
source=product.source,
@@ -832,6 +848,7 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
specs=specs,
discount_amount=discount_amount,
discount_percent=discount_percent,
history=history_points,
)

View File

@@ -13,6 +13,11 @@ class HealthStatus(BaseModel):
redis: bool
class ProductHistoryPoint(BaseModel):
    """A single (price, timestamp) sample in a product's price history."""

    # Price value of this sample.
    price: float
    # When this sample was fetched.
    fetched_at: datetime
class ProductOut(BaseModel):
id: int
source: str
@@ -33,6 +38,7 @@ class ProductOut(BaseModel):
specs: dict[str, str] = {}
discount_amount: Optional[float] = None
discount_percent: Optional[float] = None
history: list[ProductHistoryPoint] = Field(default_factory=list)
class ProductCreate(BaseModel):

Binary file not shown.

View File

@@ -112,7 +112,7 @@ class CdiscountStore(BaseStore):
currency = self._extract_currency(soup, debug_info)
stock_status = self._extract_stock(soup, debug_info)
images = self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
category = self._extract_category(soup, debug_info, url)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
@@ -180,7 +180,7 @@ class CdiscountStore(BaseStore):
return None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix."""
"""Extrait le prix (DOM puis JSON-LD)."""
selectors = self.get_selector("price", [])
if isinstance(selectors, str):
selectors = [selectors]
@@ -188,16 +188,33 @@ class CdiscountStore(BaseStore):
for selector in selectors:
elements = soup.select(selector)
for element in elements:
# Attribut content (schema.org) ou texte
price_text = element.get("content") or element.get_text(strip=True)
price = parse_price_text(price_text)
if price is not None:
return price
price = self._extract_price_from_json_ld(soup)
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_price_from_json_ld(self, soup: BeautifulSoup) -> Optional[float]:
"""Extrait le prix depuis les scripts JSON-LD."""
product_ld = self._find_product_ld(soup)
offers = product_ld.get("offers")
if isinstance(offers, list):
offers = offers[0] if offers else None
if isinstance(offers, dict):
price = offers.get("price")
if isinstance(price, str):
return parse_price_text(price)
if isinstance(price, (int, float)):
# convert to float but maintain decimals
return float(price)
return None
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix conseille."""
selectors = [
@@ -205,6 +222,8 @@ class CdiscountStore(BaseStore):
".price__old",
".c-price__strike",
".price-strike",
"div[data-e2e='strikedPrice']",
"div.SecondaryPrice-price",
]
for selector in selectors:
element = soup.select_one(selector)
@@ -212,6 +231,19 @@ class CdiscountStore(BaseStore):
price = parse_price_text(element.get_text(strip=True))
if price is not None:
return price
# Fallback: JSON-LD (offers price + promotions)
product_ld = self._find_product_ld(soup)
offer = product_ld.get("offers")
if isinstance(offer, dict):
price = offer.get("price")
if isinstance(price, str):
candidate = parse_price_text(price)
elif isinstance(price, (int, float)):
candidate = float(price)
else:
candidate = None
if candidate is not None:
return candidate
return None
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
@@ -288,7 +320,7 @@ class CdiscountStore(BaseStore):
return list(dict.fromkeys(images)) # Préserver lordre
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo, url: str) -> Optional[str]:
"""Extrait la catégorie depuis les breadcrumbs."""
selectors = self.get_selector("category", [])
if isinstance(selectors, str):
@@ -310,6 +342,54 @@ class CdiscountStore(BaseStore):
if parts:
return parts[-1]
if title := self._extract_category_from_breadcrumbs(soup):
return title
return self._extract_category_from_url(url)
def _extract_category_from_breadcrumbs(self, soup: BeautifulSoup) -> Optional[str]:
"""Cherche un breadcrumb via JSON-LD (BreadcrumbList) et retourne l'avant-dernier item."""
entries = self._extract_json_ld_entries(soup)
for entry in entries:
if not isinstance(entry, dict):
continue
if entry.get("@type") != "BreadcrumbList":
continue
items = entry.get("itemListElement", [])
if not isinstance(items, list):
continue
positions = [
element.get("position")
for element in items
if isinstance(element, dict) and isinstance(element.get("position"), int)
]
max_pos = max(positions) if positions else None
for element in reversed(items):
if not isinstance(element, dict):
continue
position = element.get("position")
if max_pos is not None and position == max_pos:
continue
item = element.get("item", {})
name = item.get("name")
if name and isinstance(name, str):
title = name.strip()
if title:
return title
return None
def _extract_category_from_url(self, url: str) -> Optional[str]:
"""Déduit la catégorie via l'URL /informatique/.../f-..."""
if not url:
return None
parsed = urlparse(url)
segments = [seg for seg in parsed.path.split("/") if seg]
breadcrumb = []
for segment in segments:
if segment.startswith("f-") or segment.startswith("p-"):
break
breadcrumb.append(segment)
if breadcrumb:
return breadcrumb[-1].replace("-", " ").title()
return None
def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:

View File

@@ -17,6 +17,18 @@ def parse_price_text(text: str) -> Optional[float]:
if not text:
return None
euro_suffix = re.search(r"([0-9 .,]+)\s*€\s*(\d{2})\b", text)
if euro_suffix:
integer_part = euro_suffix.group(1)
decimal_part = euro_suffix.group(2)
integer_clean = re.sub(r"[^\d]", "", integer_part)
if integer_clean:
cleaned_decimal = f"{integer_clean}.{decimal_part}"
try:
return float(cleaned_decimal)
except ValueError:
pass
# Fallback to original replacement if suffix logic fails
text = re.sub(r"(\d)\s*€\s*(\d)", r"\1,\2", text)
cleaned = text.replace("\u00a0", " ").replace("\u202f", " ").replace("\u2009", " ")
cleaned = "".join(ch for ch in cleaned if ch.isdigit() or ch in ".,")