before claude

2026-01-17 13:40:26 +01:00
parent d0b73b9319
commit cf7c415e22
35 changed files with 3411 additions and 221 deletions
@@ -21,31 +21,32 @@ from sqlalchemy import and_, desc, func
 from sqlalchemy.orm import Session

 from pricewatch.app.api.schemas import (
+    BackendLogEntry,
    EnqueueRequest,
    EnqueueResponse,
    HealthStatus,
-    PriceHistoryOut,
    PriceHistoryCreate,
+    PriceHistoryOut,
    PriceHistoryUpdate,
-    ProductOut,
    ProductCreate,
+    ProductHistoryPoint,
+    ProductOut,
    ProductUpdate,
    ScheduleRequest,
    ScheduleResponse,
-    ScrapingLogOut,
-    ScrapingLogCreate,
-    ScrapingLogUpdate,
-    ScrapePreviewRequest,
-    ScrapePreviewResponse,
    ScrapeCommitRequest,
    ScrapeCommitResponse,
-    VersionResponse,
-    BackendLogEntry,
+    ScrapePreviewRequest,
+    ScrapePreviewResponse,
+    ScrapingLogCreate,
+    ScrapingLogOut,
+    ScrapingLogUpdate,
    UvicornLogEntry,
-    WebhookOut,
+    VersionResponse,
    WebhookCreate,
-    WebhookUpdate,
+    WebhookOut,
    WebhookTestResponse,
+    WebhookUpdate,
 )
 from pricewatch.app.core.config import get_config
 from pricewatch.app.core.logging import get_logger
@@ -794,6 +795,9 @@ def _read_uvicorn_lines(limit: int = 200) -> list[str]:
        return []


+# Maximum number of most-recent priced PriceHistory rows that
+# _product_to_out loads into ProductOut.history.
+PRODUCT_HISTORY_LIMIT = 12
+
+
 def _product_to_out(session: Session, product: Product) -> ProductOut:
    """Helper pour mapper Product + dernier prix."""
    latest = (
@@ -810,6 +814,18 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
        discount_amount = float(product.msrp) - float(latest.price)
        if product.msrp > 0:
            discount_percent = (discount_amount / float(product.msrp)) * 100
+    history_rows = (
+        session.query(PriceHistory)
+        .filter(PriceHistory.product_id == product.id, PriceHistory.price != None)
+        .order_by(desc(PriceHistory.fetched_at))
+        .limit(PRODUCT_HISTORY_LIMIT)
+        .all()
+    )
+    history_points = [
+        ProductHistoryPoint(price=float(row.price), fetched_at=row.fetched_at)
+        for row in reversed(history_rows)
+        if row.price is not None
+    ]
    return ProductOut(
        id=product.id,
        source=product.source,
@@ -832,6 +848,7 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
        specs=specs,
        discount_amount=discount_amount,
        discount_percent=discount_percent,
+        history=history_points,
    )


@@ -13,6 +13,11 @@ class HealthStatus(BaseModel):
    redis: bool


+class ProductHistoryPoint(BaseModel):
+    """Single price observation exposed in the ``history`` list of ``ProductOut``."""
+    # Price of the observation, converted to float by the API layer.
+    price: float
+    # Timestamp at which this price was fetched.
+    fetched_at: datetime
+
+
 class ProductOut(BaseModel):
    id: int
    source: str
@@ -33,6 +38,7 @@ class ProductOut(BaseModel):
    specs: dict[str, str] = {}
    discount_amount: Optional[float] = None
    discount_percent: Optional[float] = None
+    history: list[ProductHistoryPoint] = Field(default_factory=list)


 class ProductCreate(BaseModel):
@@ -112,7 +112,7 @@ class CdiscountStore(BaseStore):
        currency = self._extract_currency(soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(soup, debug_info)
-        category = self._extract_category(soup, debug_info)
+        category = self._extract_category(soup, debug_info, url)
        specs = self._extract_specs(soup, debug_info)
        description = self._extract_description(soup, debug_info)
        msrp = self._extract_msrp(soup, debug_info)
@@ -180,7 +180,7 @@ class CdiscountStore(BaseStore):
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
-        """Extrait le prix."""
+        """Extrait le prix (DOM puis JSON-LD)."""
        selectors = self.get_selector("price", [])
        if isinstance(selectors, str):
            selectors = [selectors]
@@ -188,16 +188,33 @@ class CdiscountStore(BaseStore):
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
-                # Attribut content (schema.org) ou texte
                price_text = element.get("content") or element.get_text(strip=True)
-
                price = parse_price_text(price_text)
                if price is not None:
                    return price

+        price = self._extract_price_from_json_ld(soup)
+        if price is not None:
+            return price
+
        debug.errors.append("Prix non trouvé")
        return None

+    def _extract_price_from_json_ld(self, soup: BeautifulSoup) -> Optional[float]:
+        """Extract the price from the product's JSON-LD data.
+
+        Reads ``offers`` from the object returned by ``_find_product_ld``.
+        When ``offers`` is a list only its first entry is considered.
+
+        Args:
+            soup: Parsed product page.
+
+        Returns:
+            The price as a float, or None when no usable ``offers.price``
+            value is present.
+        """
+        product_ld = self._find_product_ld(soup)
+        # NOTE(review): _find_product_ld is not visible here; guard against a
+        # falsy result (None / empty dict) instead of crashing on ``.get``.
+        if not product_ld:
+            return None
+        offers = product_ld.get("offers")
+        if isinstance(offers, list):
+            offers = offers[0] if offers else None
+        if not isinstance(offers, dict):
+            return None
+        price = offers.get("price")
+        if isinstance(price, str):
+            return parse_price_text(price)
+        # bool is a subclass of int: a JSON true/false must not become 1.0/0.0.
+        if isinstance(price, (int, float)) and not isinstance(price, bool):
+            return float(price)
+        return None
+
    def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extrait le prix conseille."""
        selectors = [
@@ -205,6 +222,8 @@ class CdiscountStore(BaseStore):
            ".price__old",
            ".c-price__strike",
            ".price-strike",
+            "div[data-e2e='strikedPrice']",
+            "div.SecondaryPrice-price",
        ]
        for selector in selectors:
            element = soup.select_one(selector)
@@ -212,6 +231,19 @@ class CdiscountStore(BaseStore):
                price = parse_price_text(element.get_text(strip=True))
                if price is not None:
                    return price
+        # Fallback: JSON-LD (offers price + promotions)
+        product_ld = self._find_product_ld(soup)
+        offer = product_ld.get("offers")
+        if isinstance(offer, dict):
+            price = offer.get("price")
+            if isinstance(price, str):
+                candidate = parse_price_text(price)
+            elif isinstance(price, (int, float)):
+                candidate = float(price)
+            else:
+                candidate = None
+            if candidate is not None:
+                return candidate
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
@@ -288,7 +320,7 @@ class CdiscountStore(BaseStore):

        return list(dict.fromkeys(images))  # Préserver l’ordre

-    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
+    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo, url: str) -> Optional[str]:
        """Extrait la catégorie depuis les breadcrumbs."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
@@ -310,6 +342,54 @@ class CdiscountStore(BaseStore):
                    if parts:
                        return parts[-1]

+        if title := self._extract_category_from_breadcrumbs(soup):
+            return title
+        return self._extract_category_from_url(url)
+
+    def _extract_category_from_breadcrumbs(self, soup: BeautifulSoup) -> Optional[str]:
+        """Return a category name taken from a JSON-LD ``BreadcrumbList``.
+
+        For each ``BreadcrumbList`` entry, the crumbs are walked from the end.
+        The crumb carrying the highest integer ``position`` is skipped, so the
+        second-to-last crumb is normally the one returned.
+
+        The crumb name is read from ``item.name`` when ``item`` is an object.
+        Because ``item`` may also be a plain URL string, the name is otherwise
+        taken from the ``name`` key of the list element itself.
+
+        Args:
+            soup: Parsed product page.
+
+        Returns:
+            The stripped crumb name, or None when no breadcrumb yields a
+            non-empty name.
+        """
+        for entry in self._extract_json_ld_entries(soup):
+            if not isinstance(entry, dict) or entry.get("@type") != "BreadcrumbList":
+                continue
+            items = entry.get("itemListElement", [])
+            if not isinstance(items, list):
+                continue
+            elements = [element for element in items if isinstance(element, dict)]
+            positions = [
+                element["position"]
+                for element in elements
+                if isinstance(element.get("position"), int)
+            ]
+            max_pos = max(positions) if positions else None
+            for element in reversed(elements):
+                # Skip the last crumb (highest position) when positions exist.
+                if max_pos is not None and element.get("position") == max_pos:
+                    continue
+                item = element.get("item")
+                # ``item`` can be a URL string; only dicts carry a nested name.
+                name = item.get("name") if isinstance(item, dict) else None
+                if not isinstance(name, str) or not name.strip():
+                    name = element.get("name")
+                if isinstance(name, str) and name.strip():
+                    return name.strip()
+        return None
+
+    def _extract_category_from_url(self, url: str) -> Optional[str]:
+        """Derive a category label from a URL path such as /informatique/.../f-...
+
+        Path segments are collected in order until one starting with ``f-`` or
+        ``p-`` is met. The last collected segment is returned with hyphens
+        replaced by spaces and title-cased.
+
+        Returns None when ``url`` is empty or no segment precedes the
+        ``f-``/``p-`` segment.
+        """
+        if not url:
+            return None
+        parsed = urlparse(url)
+        # Drop the empty strings produced by leading, trailing or doubled slashes.
+        segments = [seg for seg in parsed.path.split("/") if seg]
+        breadcrumb = []
+        for segment in segments:
+            # NOTE(review): f-/p- presumably marks the product part of the URL; confirm.
+            if segment.startswith("f-") or segment.startswith("p-"):
+                break
+            breadcrumb.append(segment)
+        if breadcrumb:
+            return breadcrumb[-1].replace("-", " ").title()
        return None

    def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
@@ -17,6 +17,18 @@ def parse_price_text(text: str) -> Optional[float]:
    if not text:
        return None

+    euro_suffix = re.search(r"([0-9 .,]+)\s*€\s*(\d{2})\b", text)
+    if euro_suffix:
+        integer_part = euro_suffix.group(1)
+        decimal_part = euro_suffix.group(2)
+        integer_clean = re.sub(r"[^\d]", "", integer_part)
+        if integer_clean:
+            cleaned_decimal = f"{integer_clean}.{decimal_part}"
+            try:
+                return float(cleaned_decimal)
+            except ValueError:
+                pass
+        # Fallback to original replacement if suffix logic fails
    text = re.sub(r"(\d)\s*€\s*(\d)", r"\1,\2", text)
    cleaned = text.replace("\u00a0", " ").replace("\u202f", " ").replace("\u2009", " ")
    cleaned = "".join(ch for ch in cleaned if ch.isdigit() or ch in ".,")