Files
scrap/pricewatch/app/stores/backmarket/store.py
Gilles Soulier d0b73b9319 codex2
2026-01-14 21:54:55 +01:00

384 lines
13 KiB
Python
Executable File

"""
Store Backmarket - Parsing de produits Backmarket.fr.
Supporte l'extraction de: titre, prix, SKU, images, condition (état), etc.
Spécificité: Backmarket vend du reconditionné, donc prix variable selon condition.
"""
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import (
DebugInfo,
DebugStatus,
FetchMethod,
ProductSnapshot,
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.backmarket")
class BackmarketStore(BaseStore):
    """Store for Backmarket.fr (refurbished products)."""

    def __init__(self):
        """Initialize the Backmarket store with its selectors file."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="backmarket", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """
        Detect whether the URL belongs to Backmarket.

        Returns:
            0.9 for backmarket.fr, 0.8 for backmarket.com (other countries),
            0.0 otherwise.
        """
        if not url:
            return 0.0
        url_lower = url.lower()
        if "backmarket.fr" in url_lower:
            return 0.9
        elif "backmarket.com" in url_lower:
            return 0.8  # .com serves non-French countries
        return 0.0

    def canonicalize(self, url: str) -> str:
        """
        Normalize a Backmarket URL.

        Backmarket product URLs generally look like:
            https://www.backmarket.fr/fr-fr/p/{slug}
        The canonical form keeps scheme, host and path, and strips the
        query string and fragment.
        """
        if not url:
            return url
        parsed = urlparse(url)
        # Drop query params and fragment
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    def extract_reference(self, url: str) -> Optional[str]:
        """
        Extract the SKU (slug) from the URL.

        Typical format: /fr-fr/p/{slug}
        Example: /fr-fr/p/iphone-15-pro -> "iphone-15-pro"
        """
        if not url:
            return None
        # Pattern: /p/{slug} (the locale prefix varies: /fr-fr/p/, /en-us/p/, ...)
        match = re.search(r"/p/([a-z0-9-]+)", url, re.IGNORECASE)
        if match:
            return match.group(1)
        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse Backmarket HTML into a ProductSnapshot.

        JSON-LD schema.org data is used first (most reliable), then
        BeautifulSoup with the configured CSS selectors as a fallback.
        """
        soup = BeautifulSoup(html, "lxml")
        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Updated by the caller
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )
        # Preferred extraction from JSON-LD
        json_ld_data = self._extract_json_ld(soup)
        # Field extraction
        title = json_ld_data.get("name") or self._extract_title(soup, debug_info)
        # Explicit None check: a 0.0 price is falsy, so `or` would silently
        # discard a value that JSON-LD actually provided.
        price = json_ld_data.get("price")
        if price is None:
            price = self._extract_price(soup, debug_info)
        currency = (
            json_ld_data.get("priceCurrency") or self._extract_currency(soup, debug_info) or "EUR"
        )
        stock_status = self._extract_stock(soup, debug_info)
        images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        description = self._extract_description(soup, debug_info)
        msrp = self._extract_msrp(soup, debug_info)
        reference = self.extract_reference(url)
        # Backmarket specific: condition (grade of the refurbished unit)
        condition = self._extract_condition(soup, debug_info)
        if condition:
            specs["Condition"] = condition
            debug_info.notes.append(f"Produit reconditionné: {condition}")
        # Determine the final status
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")
        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            description=description,
            images=images,
            specs=specs,
            msrp=msrp,
            debug=debug_info,
        )
        logger.info(
            f"[Backmarket] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )
        return snapshot

    def _extract_json_ld(self, soup: BeautifulSoup) -> dict:
        """
        Extract product data from JSON-LD schema.org blocks.

        Backmarket exposes a schema.org Product node; it is the most reliable
        source. Handles a top-level dict, a top-level list, and an ``@graph``
        wrapper; ``@type`` given as a string or a list; ``offers`` given as a
        dict (Offer/AggregateOffer) or a list of offers.
        """
        json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
        for script in json_ld_scripts:
            raw = script.string
            if not raw:
                # Empty <script> tags yield None; json.loads(None) raises
                # TypeError, which the except below would not catch.
                continue
            try:
                data = json.loads(raw)
            except (json.JSONDecodeError, AttributeError):
                continue
            # Normalize the payload to a flat list of candidate nodes.
            if isinstance(data, dict) and isinstance(data.get("@graph"), list):
                candidates = data["@graph"]
            elif isinstance(data, list):
                candidates = data
            else:
                candidates = [data]
            for node in candidates:
                if not isinstance(node, dict):
                    continue
                node_type = node.get("@type")
                types = node_type if isinstance(node_type, list) else [node_type]
                if "Product" not in types:
                    continue
                result = {
                    "name": node.get("name"),
                    "priceCurrency": None,
                    "price": None,
                    "images": [],
                }
                # Price from offers (dict, or list of offers per grade)
                offers = node.get("offers", {})
                if isinstance(offers, list):
                    offers = next((o for o in offers if isinstance(o, dict)), {})
                if isinstance(offers, dict):
                    # AggregateOffer exposes lowPrice instead of price.
                    result["price"] = offers.get("price", offers.get("lowPrice"))
                    result["priceCurrency"] = offers.get("priceCurrency")
                    # Convert to float if the value is a string
                    if isinstance(result["price"], str):
                        try:
                            result["price"] = float(result["price"])
                        except ValueError:
                            result["price"] = None
                # Images: schema.org allows a single URL or a list
                image_data = node.get("image")
                if isinstance(image_data, str):
                    result["images"] = [image_data]
                elif isinstance(image_data, list):
                    result["images"] = image_data
                return result
        return {}

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title via the configured selectors."""
        selectors = self.get_selector("title", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                title = element.get_text(strip=True)
                if title:
                    return title
        debug.errors.append("Titre non trouvé")
        return None

    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the description from meta tags (og:description, then name=description)."""
        meta = soup.find("meta", property="og:description") or soup.find(
            "meta", attrs={"name": "description"}
        )
        if meta:
            description = meta.get("content", "").strip()
            if description:
                return description
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the price via the configured selectors."""
        selectors = self.get_selector("price", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # `content` attribute (schema.org microdata) or visible text
                price_text = element.get("content") or element.get_text(strip=True)
                price = parse_price_text(price_text)
                if price is not None:
                    return price
        debug.errors.append("Prix non trouvé")
        return None

    def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the list price (MSRP), usually shown struck-through."""
        selectors = [
            ".price--old",
            ".price--striked",
            ".price__old",
            "del",
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                price = parse_price_text(element.get_text(strip=True))
                if price is not None:
                    return price
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the currency code; falls back to EUR (Backmarket France)."""
        selectors = self.get_selector("currency", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # `content` attribute (schema.org microdata)
                currency = element.get("content")
                if currency:
                    return currency.upper()
        # Default to EUR for Backmarket France
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Extract the stock status."""
        # Look for the "Add to cart" button. Use has_attr: a boolean HTML
        # attribute like `disabled` parses as "" in bs4, which is falsy, so
        # `get("disabled")` would wrongly report a disabled button as in stock.
        add_to_cart = soup.find("button", attrs={"data-test": "add-to-cart"})
        if add_to_cart and not add_to_cart.has_attr("disabled"):
            return StockStatus.IN_STOCK
        # Fallback: look for availability wording
        selectors = self.get_selector("stock_status", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(strip=True).lower()
                if "en stock" in text or "disponible" in text or "ajouter" in text:
                    return StockStatus.IN_STOCK
                elif (
                    "rupture" in text
                    or "indisponible" in text
                    or "épuisé" in text
                ):
                    return StockStatus.OUT_OF_STOCK
        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Extract image URLs (absolute http(s) only, de-duplicated, order kept)."""
        images = []
        selectors = self.get_selector("images", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # src or data-src (lazy-loaded images)
                img_url = element.get("src") or element.get("data-src")
                if img_url and img_url.startswith("http"):
                    # Avoid duplicates
                    if img_url not in images:
                        images.append(img_url)
        return images

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the category from the breadcrumb."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                # Keep the last breadcrumb entry (most specific category)
                categories = [elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)]
                if categories:
                    return categories[-1]
        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract technical specifications from <dl> definition lists."""
        specs = {}
        dls = soup.find_all("dl")
        for dl in dls:
            dts = dl.find_all("dt")
            dds = dl.find_all("dd")
            # Pair each <dt> term with its <dd> value
            for dt, dd in zip(dts, dds):
                key = dt.get_text(strip=True)
                value = dd.get_text(strip=True)
                if key and value:
                    specs[key] = value
        return specs

    def _extract_condition(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """
        Extract the condition/grade of the refurbished product.

        Backmarket specific grades: Correct, Bon, Très bon, Excellent, etc.
        """
        selectors = self.get_selector("condition", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text(strip=True)
                # Look for Backmarket grade keywords
                if any(grade in text for grade in ["Correct", "Bon", "Très bon", "Excellent", "Comme neuf"]):
                    return text
        return None