# suivi_produit/backend/app/scraper/amazon/parser.py

from __future__ import annotations

import re
from typing import Any

from bs4 import BeautifulSoup
from loguru import logger
from playwright.sync_api import Page

from backend.app.scraper.normalize import (
    parse_price_fr,
    parse_rating_count,
    parse_rating_value,
    parse_stock_status,
)


def detect_blocked(html: str) -> bool:
    """Return True when the HTML looks like a block / captcha page.

    The check is a case-insensitive substring search for a few marker
    words; the HTML is not parsed.
    """
    haystack = html.lower()
    markers = (
        "captcha",
        "robot",
        "saisissez les caractères",
        "vérification",
    )
    return any(marker in haystack for marker in markers)


def _safe_text(page: Page, selector: str) -> str | None:
    try:
        locator = page.locator(selector)
        if locator.count() == 0:
            return None
        value = locator.first.inner_text().strip()
        return value or None
    except Exception:
        return None


def _safe_attr(page: Page, selector: str, attr: str) -> str | None:
    try:
        locator = page.locator(selector)
        if locator.count() == 0:
            return None
        return locator.first.get_attribute(attr)
    except Exception:
        return None


def _extract_asin_from_url(url: str) -> str | None:
    match = re.search(r"/dp/([A-Z0-9]{10})", url)
    if match:
        return match.group(1)
    return None


def _safe_text_soup(soup: BeautifulSoup, selector: str) -> str | None:
    node = soup.select_one(selector)
    if not node:
        return None
    value = node.get_text(strip=True)
    return value or None


def _safe_attr_soup(soup: BeautifulSoup, selector: str, attr: str) -> str | None:
    node = soup.select_one(selector)
    if not node:
        return None
    return node.get(attr)


def _has_selector_soup(soup: BeautifulSoup, selector: str) -> bool:
    return soup.select_one(selector) is not None


def _compose_price_from_parts(whole: str | None, fraction: str | None, symbol: str | None) -> str | None:
    if not whole:
        return None
    whole_digits = re.sub(r"[^\d]", "", whole)
    if not whole_digits:
        return None
    fraction_digits = re.sub(r"[^\d]", "", fraction or "")
    if not fraction_digits:
        fraction_digits = "00"
    fraction_digits = fraction_digits[:2].ljust(2, "0")
    symbol = (symbol or "€").strip()
    return f"{whole_digits},{fraction_digits} {symbol}"


def _extract_lowest_30d_text_soup(soup: BeautifulSoup) -> str | None:
    containers = []
    container = soup.select_one("#priceBadging_feature_div")
    if container:
        containers.append(container)
    containers.extend(soup.select(".basisPrice"))
    for node in containers:
        text = node.get_text(" ", strip=True)
        if text and re.search(r"prix.+(30|trente).+jour", text.lower()):
            price_node = node.select_one(".a-offscreen")
            if price_node:
                price_text = price_node.get_text(" ", strip=True)
                if price_text:
                    return price_text
            return text
    return None


def _extract_about_bullets(soup: BeautifulSoup) -> list[str] | None:
    container = soup.select_one("#feature-bullets")
    if not container:
        return None
    items = []
    for node in container.select("ul li span.a-list-item"):
        text = node.get_text(" ", strip=True)
        if text:
            items.append(text)
    return items or None


def _extract_description(soup: BeautifulSoup) -> str | None:
    node = soup.select_one("#productDescription")
    if not node:
        return None
    text = node.get_text(" ", strip=True)
    return text or None


def _extract_table_kv(table) -> dict[str, str]:
    data: dict[str, str] = {}
    for row in table.select("tr"):
        key = row.select_one("th")
        value = row.select_one("td")
        if not key or not value:
            continue
        key_text = key.get_text(" ", strip=True)
        value_text = value.get_text(" ", strip=True)
        if key_text and value_text:
            data[key_text] = value_text
    return data


def _extract_tables_from_selector(soup: BeautifulSoup, selector: str) -> list:
    section = soup.select_one(selector)
    if not section:
        return []
    if section.name == "table":
        return [section]
    return section.select("table")


def _extract_carateristique(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value tables of the product's specification sections.

    The sections are visited in a fixed order and their tables merged into
    one dict, so a label found again in a later table overrides the earlier
    value. Returns None when no pair was found.
    """
    merged: dict[str, str] = {}
    for selector in (
        "[data-csa-c-content-id='voyager-expander-btn']",
        "#productDetails_techSpec_section_1",
        "#productDetails_techSpec_section_2",
    ):
        for table in _extract_tables_from_selector(soup, selector):
            merged.update(_extract_table_kv(table))
    return merged or None


def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value tables of the product-details sections.

    Tables located inside the ``voyager-expander-btn`` container are left
    out, and a table reachable through several selectors is only read once.
    Returns None when no pair was found.
    """
    expander = soup.select_one("[data-csa-c-content-id='voyager-expander-btn']")
    # One set holds both the excluded tables and the ones already processed.
    skipped = set(expander.select("table")) if expander else set()

    collected: dict[str, str] = {}
    for selector in (
        "#productDetails_techSpec_section_1",
        "#productDetails_detailBullets_sections1",
        "#productDetails_detailBullets_sections2",
        "#productDetails",
    ):
        for table in _extract_tables_from_selector(soup, selector):
            if table in skipped:
                continue
            skipped.add(table)
            collected.update(_extract_table_kv(table))
    return collected or None


def _parse_percent(text: str | None) -> int | None:
    if not text:
        return None
    match = re.search(r"(-?\d+)", text.replace("\u00a0", " "))
    if not match:
        return None
    try:
        return int(match.group(1))
    except ValueError:
        return None


def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
    """Extract product fields from the static HTML of an Amazon product page.

    Args:
        html: Raw HTML of the product page.
        url: URL of the page; stored in the result and used as a fallback
            source for the ASIN when the page has no ``input#ASIN``.

    Returns:
        A dict with the keys ``url``, ``asin``, ``titre``,
        ``url_image_principale``, ``prix_actuel``, ``prix_conseille``,
        ``prix_min_30j``, ``prix_conseille_reduction``,
        ``prix_min_30j_reduction``, ``etat_stock``, ``en_stock``, ``note``,
        ``nombre_avis``, ``choix_amazon``, ``offre_limitee``, ``prime``,
        ``exclusivite_amazon``, ``a_propos``, ``description``,
        ``carateristique`` and ``details``. Badge fields are True or None
        (never False).

    A warning is logged when ``titre``, ``prix_actuel`` or ``note`` is falsy.
    """
    soup = BeautifulSoup(html, "html.parser")
    title = _safe_text_soup(soup, "#productTitle")

    # Main image, falling back on the image wrapper.
    image_main_url = (
        _safe_attr_soup(soup, "#landingImage", "src")
        or _safe_attr_soup(soup, "#imgTagWrapperId img", "src")
    )

    # Current price: each source is only queried when the previous ones failed.
    price_text = (
        _safe_text_soup(soup, "#corePriceDisplay_desktop_feature_div .a-offscreen")
        or _safe_text_soup(soup, "#priceblock_ourprice")
        or _safe_text_soup(soup, "#priceblock_dealprice")
        or _compose_price_from_parts(
            _safe_text_soup(soup, ".a-price .a-price-whole"),
            _safe_text_soup(soup, ".a-price .a-price-fraction"),
            _safe_text_soup(soup, ".a-price .a-price-symbol"),
        )
        or _safe_attr_soup(soup, "#twister-plus-price-data-price", "value")
    )

    # List price (srpPriceBlock = "Prix conseillé : XXX €"); the last source
    # is the struck-through price in corePriceDisplay.
    price_list_text = (
        _safe_text_soup(soup, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen")
        or _safe_text_soup(soup, ".srpPriceBlock .a-offscreen")
        or _safe_text_soup(soup, "#priceblock_strikeprice")
        or _safe_text_soup(
            soup, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen"
        )
    )

    # Stock status.
    stock_text = _safe_text_soup(soup, "#availability span") or _safe_text_soup(soup, "#availability")
    in_stock, stock_text = parse_stock_status(stock_text)

    # Rating.
    rating_text = _safe_text_soup(soup, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text_soup(soup, "#acrCustomerReviewText")

    # Badges.
    amazon_choice = _safe_text_soup(soup, "#acBadge_feature_div")
    limited_time_deal = _safe_text_soup(soup, "#dealBadge_feature_div")
    # Any one of these selectors is enough to flag the product as Prime.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(_has_selector_soup(soup, selector) for selector in prime_selectors)
    amazon_exclusive = "Exclusivité Amazon" if "Exclusivité Amazon" in soup.get_text() else None

    # Lowest price over 30 days (basisPrice carrying a "30 jours" mention).
    lowest_30d_text = _extract_lowest_30d_text_soup(soup)
    lowest_30d_price = parse_price_fr(lowest_30d_text) if lowest_30d_text else None

    # A list price equal to the 30-day minimum is a detection error: the
    # struck-through price in corePriceDisplay was in fact the 30-day minimum.
    price_list_value = parse_price_fr(price_list_text)
    if (
        price_list_value is not None
        and lowest_30d_price is not None
        and price_list_value == lowest_30d_price
    ):
        price_list_value = None

    # Discounts.
    reduction_savings_text = _safe_text_soup(
        soup, "#corePriceDisplay_desktop_feature_div .savingsPercentage"
    )
    reduction_conseille_text = _safe_text_soup(soup, ".srpSavingsPercentageBlock")

    # Attribute each percentage to the price it refers to:
    # - with a 30-day minimum, savingsPercentage is relative to that minimum;
    # - with a list price, srpSavingsPercentageBlock is relative to the list price.
    reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
    reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
    # Without srpSavingsPercentageBlock and without a 30-day minimum,
    # savingsPercentage is taken as the discount on the list price.
    if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
        reduction_conseille = _parse_percent(reduction_savings_text)

    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": price_list_value,
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)]
    if missing:
        logger.warning("Champs manquants (html): {}", ", ".join(missing))

    return data


def extract_product_data(page: Page, url: str) -> dict[str, Any]:
    """Extract product fields from a live Playwright page of an Amazon product.

    Most fields are read through the page's locators; the "about" bullets,
    description, specifications and details are read from a BeautifulSoup
    parse of ``page.content()``.

    Args:
        page: Playwright page already showing the product.
        url: URL of the page; stored in the result and used as a fallback
            source for the ASIN when the page has no ``input#ASIN``.

    Returns:
        A dict with the keys ``url``, ``asin``, ``titre``,
        ``url_image_principale``, ``prix_actuel``, ``prix_conseille``,
        ``prix_min_30j``, ``prix_conseille_reduction``,
        ``prix_min_30j_reduction``, ``etat_stock``, ``en_stock``, ``note``,
        ``nombre_avis``, ``choix_amazon``, ``offre_limitee``, ``prime``,
        ``exclusivite_amazon``, ``a_propos``, ``description``,
        ``carateristique`` and ``details``. Badge fields are True or None
        (never False).

    Note:
        Lookups made through ``_safe_text`` / ``_safe_attr`` never raise, but
        the direct ``page.locator(...)`` calls below are not guarded, so an
        error raised by the page there propagates to the caller.
    """
    # Title.
    title = _safe_text(page, "#productTitle")

    # Main image, falling back on the image wrapper.
    image_main_url = (
        _safe_attr(page, "#landingImage", "src")
        or _safe_attr(page, "#imgTagWrapperId img", "src")
    )

    # Current price: each source is only queried when the previous ones failed.
    price_text = (
        _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-offscreen")
        or _safe_text(page, "#priceblock_ourprice")
        or _safe_text(page, "#priceblock_dealprice")
        or _compose_price_from_parts(
            _safe_text(page, ".a-price .a-price-whole"),
            _safe_text(page, ".a-price .a-price-fraction"),
            _safe_text(page, ".a-price .a-price-symbol"),
        )
        or _safe_attr(page, "#twister-plus-price-data-price", "value")
    )

    # List price (srpPriceBlock = "Prix conseillé : XXX €"); the last source
    # is the struck-through price in corePriceDisplay.
    price_list_text = (
        _safe_text(page, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen")
        or _safe_text(page, ".srpPriceBlock .a-offscreen")
        or _safe_text(page, "#priceblock_strikeprice")
        or _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen")
    )

    # Stock status.
    stock_text = _safe_text(page, "#availability span") or _safe_text(page, "#availability")
    in_stock, stock_text = parse_stock_status(stock_text)

    # Rating.
    rating_text = _safe_text(page, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text(page, "#acrCustomerReviewText")

    # Badges.
    amazon_choice = _safe_text(page, "#acBadge_feature_div")
    limited_time_deal = _safe_text(page, "#dealBadge_feature_div")
    # Any one of these selectors is enough to flag the product as Prime.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(page.locator(selector).count() > 0 for selector in prime_selectors)

    amazon_exclusive = _safe_text(page, "text=Exclusivité Amazon")

    # Lowest price over 30 days: look in basisPrice, then corePriceDisplay,
    # then priceBadging, keeping only blocks that carry a "30 jours" mention.
    thirty_day_pattern = r"prix.+(30|trente).+jour"
    lowest_30d_text = None
    lowest_30d_price = None
    if page.locator(".basisPrice").count() > 0:
        basis_text = page.locator(".basisPrice").first.inner_text()
        if basis_text and re.search(thirty_day_pattern, basis_text.lower()):
            lowest_30d_text = _safe_text(page, ".basisPrice .a-price .a-offscreen") or basis_text
            lowest_30d_price = parse_price_fr(lowest_30d_text)
    # Fallback on corePriceDisplay when it mentions the 30 days.
    if lowest_30d_price is None and page.locator("#corePriceDisplay_desktop_feature_div .a-text-price").count() > 0:
        core_text = page.locator("#corePriceDisplay_desktop_feature_div").first.inner_text()
        if core_text and re.search(thirty_day_pattern, core_text.lower()):
            lowest_30d_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price .a-offscreen")
            lowest_30d_price = parse_price_fr(lowest_30d_text)
    # NOTE(review): this test is truthiness-based while the one above uses
    # "is None"; kept as-is because it differs only for a zero price —
    # confirm which behaviour is intended.
    if not lowest_30d_price and page.locator("#priceBadging_feature_div").count() > 0:
        badging_text = page.locator("#priceBadging_feature_div").first.inner_text()
        if badging_text and re.search(thirty_day_pattern, badging_text.lower()):
            lowest_30d_text = _safe_text(page, "#priceBadging_feature_div .a-offscreen") or badging_text
            lowest_30d_price = parse_price_fr(lowest_30d_text)

    # A list price equal to the 30-day minimum is a detection error.
    price_list_value = parse_price_fr(price_list_text)
    if (
        price_list_value is not None
        and lowest_30d_price is not None
        and price_list_value == lowest_30d_price
    ):
        price_list_value = None

    # Discounts:
    # - savingsPercentage in corePriceDisplay is relative to the 30-day
    #   minimum (when there is one);
    # - srpSavingsPercentageBlock is relative to the list price.
    reduction_savings_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .savingsPercentage")
    reduction_conseille_text = _safe_text(page, ".srpSavingsPercentageBlock")

    reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
    reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
    # Without srpSavingsPercentageBlock and without a 30-day minimum,
    # savingsPercentage is taken as the discount on the list price.
    if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
        reduction_conseille = _parse_percent(reduction_savings_text)

    asin = _safe_attr(page, "input#ASIN", "value") or _extract_asin_from_url(url)

    # The structured sections are parsed from a snapshot of the page HTML.
    soup = BeautifulSoup(page.content(), "html.parser")
    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": price_list_value,
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    return data