"""
Store AliExpress - Parsing de produits AliExpress.com.
Supporte l'extraction de: titre, prix, SKU, images, etc.
Spécificité: Rendu client-side (SPA) - nécessite Playwright avec attente.
"""
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import (
    DebugInfo,
    DebugStatus,
    FetchMethod,
    ProductSnapshot,
    StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text

logger = get_logger("stores.aliexpress")


class AliexpressStore(BaseStore):
    """Store for AliExpress.com (Chinese marketplace)."""

    def __init__(self):
        """Initialize the AliExpress store with its selectors."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="aliexpress", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """
        Detect whether the URL is an AliExpress URL.

        Returns:
            0.9 for an aliexpress.com/aliexpress.fr product page
            0.5 for an AliExpress URL that is not a product page
            0.0 otherwise
        """
        if not url:
            return 0.0
        url_lower = url.lower()
        if "aliexpress.com" in url_lower or "aliexpress.fr" in url_lower:
            # Check that this is actually a product page
            if "/item/" in url_lower:
                return 0.9
            else:
                return 0.5  # It is AliExpress, but not a product page
        return 0.0
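
    # Illustrative scores, using the one real product ID from this file's
    # docstrings; the other two URLs are hypothetical:
    #   match("https://fr.aliexpress.com/item/1005007187023722.html") -> 0.9
    #   match("https://fr.aliexpress.com/category/some-category")     -> 0.5
    #   match("https://example.com/item/123.html")                    -> 0.0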

    def canonicalize(self, url: str) -> str:
        """
        Normalize an AliExpress URL.

        AliExpress URLs generally have the form:
            https://fr.aliexpress.com/item/{ID}.html?params...
        We keep just: https://fr.aliexpress.com/item/{ID}.html
        """
        if not url:
            return url
        parsed = urlparse(url)
        # Extract the base path (without query params)
        path = parsed.path
        # Keep only /item/{ID}.html
        match = re.search(r"(/item/\d+\.html)", path)
        if match:
            clean_path = match.group(1)
            return f"{parsed.scheme}://{parsed.netloc}{clean_path}"
        # If the pattern does not match, just drop the query params
        return f"{parsed.scheme}://{parsed.netloc}{path}"
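
    # A sketch of the expected normalization; the query parameters shown are
    # invented for illustration:
    #   canonicalize("https://fr.aliexpress.com/item/1005007187023722.html?spm=a2g0o&gatewayAdapt=glo2fra")
    #   -> "https://fr.aliexpress.com/item/1005007187023722.html"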

    def extract_reference(self, url: str) -> Optional[str]:
        """
        Extract the SKU (product ID) from the URL.

        Typical format: /item/{ID}.html
        Example: /item/1005007187023722.html -> "1005007187023722"
        """
        if not url:
            return None
        # Pattern: /item/{ID}.html
        match = re.search(r"/item/(\d+)\.html", url, re.IGNORECASE)
        if match:
            return match.group(1)
        return None
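
    # The regex stops at ".html", so a trailing query string (invented here)
    # does not leak into the reference:
    #   extract_reference("https://fr.aliexpress.com/item/1005007187023722.html?spm=a2g0o")
    #   -> "1005007187023722"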

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse AliExpress HTML into a ProductSnapshot.

        AliExpress uses client-side rendering (SPA), so:
        - Extraction relies primarily on meta tags (og:title, og:image)
        - The price is extracted via regex (no stable CSS selector)
        - Images are extracted from the window._d_c_.DCData JSON
        """
        soup = BeautifulSoup(html, "lxml")
        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Will be updated by the caller
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Field extraction
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(html, soup, debug_info)
        currency = self._extract_currency(url, soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(html, soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        description = self._extract_description(soup, debug_info)
        msrp = self._extract_msrp(html, debug_info)
        reference = self.extract_reference(url)

        # Note about client-side rendering
        if len(html) < 200000:  # HTML too small = probably not fully rendered
            debug_info.notes.append(
                "Short HTML (<200KB) - possibly not rendered. Use Playwright with a wait."
            )

        # Determine the final status
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Incomplete parsing: missing title or price")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            description=description,
            images=images,
            specs=specs,
            msrp=msrp,
            debug=debug_info,
        )
        logger.info(
            f"[AliExpress] Parsing {'complete' if snapshot.is_complete() else 'partial'}: "
            f"title={bool(title)}, price={price is not None}"
        )
        return snapshot
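
    # A minimal usage sketch, assuming the caller already obtained rendered
    # HTML (per the module docstring, via Playwright with a wait; the fetch
    # step itself lives outside this class):
    #   store = AliexpressStore()
    #   snapshot = store.parse(html, "https://fr.aliexpress.com/item/1005007187023722.html")
    #   if snapshot.debug.status == DebugStatus.PARTIAL:
    #       ...  # re-fetch with a longer wait before trusting the data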

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title."""
        # Priority 1: h1 (appears after AJAX rendering)
        h1 = soup.find("h1")
        if h1:
            title = h1.get_text(strip=True)
            if title and len(title) > 10:  # Valid title
                return title
        # Priority 2: og:title (in meta tags)
        og_title = soup.find("meta", property="og:title")
        if og_title:
            title = og_title.get("content", "")
            if title:
                # Strip a trailing " - AliExpress"
                title = re.sub(r"\s*-\s*AliExpress.*$", "", title)
                return title.strip()
        debug.errors.append("Title not found")
        return None
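
    # Example of the og:title cleanup (product name invented for illustration):
    #   content="USB C Cable 2m Fast Charging - AliExpress 44"
    #   -> "USB C Cable 2m Fast Charging"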

    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the description (meta tags)."""
        meta = soup.find("meta", property="og:description") or soup.find(
            "meta", attrs={"name": "description"}
        )
        if meta:
            description = meta.get("content", "").strip()
            if description:
                return description
        return None

    def _extract_price(
        self, html: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> Optional[float]:
        """
        Extract the price.

        AliExpress has NO stable CSS selector for the price,
        so we run regexes over the raw HTML.
        """
        # Pattern 1: price before € (e.g. "136,69 €")
        match = re.search(r"([0-9][0-9\s.,\u00a0\u202f\u2009]*)\s*€", html)
        if match:
            price = parse_price_text(match.group(1))
            if price is not None:
                return price
        # Pattern 2: € before price (e.g. "€ 136.69")
        match = re.search(r"€\s*([0-9][0-9\s.,\u00a0\u202f\u2009]*)", html)
        if match:
            price = parse_price_text(match.group(1))
            if price is not None:
                return price
        # Pattern 3: look in the meta tags (less reliable)
        og_price = soup.find("meta", property="og:price:amount")
        if og_price:
            price_str = og_price.get("content", "")
            price = parse_price_text(price_str)
            if price is not None:
                return price
        debug.errors.append("Price not found")
        return None

    def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
        """Extract the list price (MSRP) if present."""
        match = re.search(r"originalPrice\"\s*:\s*\"([0-9\s.,]+)\"", html)
        if match:
            price = parse_price_text(match.group(1))
            if price is not None:
                return price
        return None
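
    # The JSON fragment targeted by the regex looks roughly like this (the
    # field name comes from the pattern above; the surrounding object and the
    # value are invented):
    #   ... "originalPrice": "159,99" ...  -> msrp = 159.99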

    def _extract_currency(
        self, url: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> str:
        """Extract the currency."""
        # Priority 1: og:price:currency
        og_currency = soup.find("meta", property="og:price:currency")
        if og_currency:
            currency = og_currency.get("content", "")
            if currency:
                return currency.upper()
        # Priority 2: detect from the URL
        if "fr.aliexpress" in url.lower():
            return "EUR"
        elif "aliexpress.com" in url.lower():
            return "USD"
        # Default
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Extract the stock status."""
        # Look for the "Add to cart" / "Ajouter au panier" button
        buttons = soup.find_all("button")
        for btn in buttons:
            text = btn.get_text(strip=True).lower()
            if any(
                keyword in text
                for keyword in ["add to cart", "ajouter", "buy now", "acheter"]
            ):
                # Button found and not disabled
                if not btn.get("disabled"):
                    return StockStatus.IN_STOCK
        # Fallback: look for text indicating availability
        text_lower = soup.get_text().lower()
        if "out of stock" in text_lower or "rupture" in text_lower:
            return StockStatus.OUT_OF_STOCK
        return StockStatus.UNKNOWN
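
    # Rough decision table (markup invented for illustration):
    #   <button>Add to cart</button>                -> IN_STOCK
    #   <button disabled>Ajouter au panier</button> -> skipped, falls through
    #   page text containing "out of stock"         -> OUT_OF_STOCK
    #   none of the above                           -> UNKNOWN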

    def _extract_images(
        self, html: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> list[str]:
        """
        Extract image URLs.

        Priority: window._d_c_.DCData.imagePathList (embedded JSON)
        """
        images = []
        # Priority 1: extract from the DCData JSON
        match = re.search(
            r"window\._d_c_\.DCData\s*=\s*(\{[^;]*\});", html, re.DOTALL
        )
        if match:
            try:
                data = json.loads(match.group(1))
                if "imagePathList" in data:
                    image_list = data["imagePathList"]
                    if isinstance(image_list, list):
                        images.extend(image_list)
                        debug.notes.append(
                            f"Images extracted from DCData: {len(images)}"
                        )
            except (json.JSONDecodeError, KeyError):
                pass
        # Priority 2: og:image
        if not images:
            og_image = soup.find("meta", property="og:image")
            if og_image:
                img_url = og_image.get("content", "")
                if img_url:
                    images.append(img_url)
        # Priority 3: look for <img> tags served from alicdn.com
        if not images:
            img_elems = soup.find_all("img", src=True)
            for img in img_elems:
                src = img.get("src", "")
                if "alicdn.com" in src and not any(
                    x in src for x in ["logo", "icon", "avatar"]
                ):
                    if src not in images:
                        images.append(src)
        return images
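
    # Shape of the DCData blob this targets (key name from the code above; the
    # image URL and its alicdn subdomain are invented):
    #   window._d_c_.DCData = {"imagePathList": ["https://ae01.alicdn.com/kf/abc.jpg"]};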

    def _extract_category(
        self, soup: BeautifulSoup, debug: DebugInfo
    ) -> Optional[str]:
        """Extract the category from the breadcrumb."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                # Take the last element of the breadcrumb
                categories = [
                    elem.get_text(strip=True)
                    for elem in elements
                    if elem.get_text(strip=True)
                ]
                if categories:
                    return categories[-1]
        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract the technical specifications."""
        specs = {}
        # Look for <dl> (definition lists)
        dls = soup.find_all("dl")
        for dl in dls:
            dts = dl.find_all("dt")
            dds = dl.find_all("dd")
            for dt, dd in zip(dts, dds):
                key = dt.get_text(strip=True)
                value = dd.get_text(strip=True)
                if key and value:
                    specs[key] = value
        return specs
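

# A minimal offline smoke test of the URL-level helpers; assumes only that
# AliexpressStore() can be constructed (it loads the selectors.yml next to
# this file). The product ID reuses the example from the docstrings above.
if __name__ == "__main__":
    store = AliexpressStore()
    url = "https://fr.aliexpress.com/item/1005007187023722.html?spm=test"
    assert store.match(url) == 0.9
    assert store.canonicalize(url) == "https://fr.aliexpress.com/item/1005007187023722.html"
    assert store.extract_reference(url) == "1005007187023722"
    print("URL helpers OK")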