460 lines
18 KiB
Python
460 lines
18 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any
|
|
|
|
from bs4 import BeautifulSoup
|
|
from loguru import logger
|
|
from playwright.sync_api import Page
|
|
|
|
from backend.app.scraper.normalize import (
|
|
parse_price_fr,
|
|
parse_rating_count,
|
|
parse_rating_value,
|
|
parse_stock_status,
|
|
)
|
|
|
|
|
|
def detect_blocked(html: str) -> bool:
    """Tell whether the page HTML looks like a block / captcha page.

    The check is a plain, case-insensitive substring search for a handful of
    marker words; the HTML is not parsed.

    NOTE(review): the "robot" marker is a bare substring, so it also matches
    any page whose markup or text merely contains that word — confirm this
    breadth is intended.
    """
    haystack = html.lower()
    markers = ("captcha", "robot", "saisissez les caractères", "vérification")
    return any(marker in haystack for marker in markers)
|
|
|
|
|
|
def _safe_text(page: Page, selector: str) -> str | None:
    """Return the stripped inner text of the first element matching *selector*.

    Returns None when nothing matches, when the text is empty once stripped,
    or when any exception is raised while querying the page (the lookup is
    deliberately best-effort).
    """
    try:
        matches = page.locator(selector)
        text = matches.first.inner_text().strip() if matches.count() != 0 else ""
    except Exception:
        # Any failure while querying is reported as "no value".
        return None
    return text or None
|
|
|
|
|
|
def _safe_attr(page: Page, selector: str, attr: str) -> str | None:
    """Return attribute *attr* of the first element matching *selector*.

    Returns None when nothing matches or when any exception is raised while
    querying the page (best-effort lookup). The attribute value is returned
    exactly as the locator reports it, without stripping.
    """
    try:
        matches = page.locator(selector)
        return matches.first.get_attribute(attr) if matches.count() != 0 else None
    except Exception:
        # Any failure while querying is reported as "no value".
        return None
|
|
|
|
|
|
def _extract_asin_from_url(url: str) -> str | None:
|
|
match = re.search(r"/dp/([A-Z0-9]{10})", url)
|
|
if match:
|
|
return match.group(1)
|
|
return None
|
|
|
|
|
|
def _safe_text_soup(soup: BeautifulSoup, selector: str) -> str | None:
    """Return the stripped text of the first node matching *selector*.

    Returns None when no node matches or when the node's text is empty.
    """
    element = soup.select_one(selector)
    text = element.get_text(strip=True) if element else ""
    return text or None
|
|
|
|
|
|
def _safe_attr_soup(soup: BeautifulSoup, selector: str, attr: str) -> str | None:
    """Return attribute *attr* of the first node matching *selector*.

    Returns None when no node matches; otherwise whatever the node's
    ``get(attr)`` yields (None when the attribute is absent).
    """
    element = soup.select_one(selector)
    return element.get(attr) if element else None
|
|
|
|
|
|
def _has_selector_soup(soup: BeautifulSoup, selector: str) -> bool:
    """Tell whether at least one node in *soup* matches *selector*."""
    first_match = soup.select_one(selector)
    return first_match is not None
|
|
|
|
|
|
def _compose_price_from_parts(whole: str | None, fraction: str | None, symbol: str | None) -> str | None:
|
|
if not whole:
|
|
return None
|
|
whole_digits = re.sub(r"[^\d]", "", whole)
|
|
if not whole_digits:
|
|
return None
|
|
fraction_digits = re.sub(r"[^\d]", "", fraction or "")
|
|
if not fraction_digits:
|
|
fraction_digits = "00"
|
|
fraction_digits = fraction_digits[:2].ljust(2, "0")
|
|
symbol = (symbol or "€").strip()
|
|
return f"{whole_digits},{fraction_digits} {symbol}"
|
|
|
|
|
|
def _extract_lowest_30d_text_soup(soup: BeautifulSoup) -> str | None:
    """Find the text carrying the "lowest price over 30 days" mention.

    Candidates are the ``#priceBadging_feature_div`` node (when present)
    followed by every ``.basisPrice`` node. For the first candidate whose
    lower-cased text matches the "prix ... 30|trente ... jour" pattern, the
    text of its first ``.a-offscreen`` child is returned when non-empty,
    otherwise the candidate's whole text. Returns None when no candidate
    matches.
    """
    badging = soup.select_one("#priceBadging_feature_div")
    candidates = ([badging] if badging else []) + list(soup.select(".basisPrice"))
    for block in candidates:
        label = block.get_text(" ", strip=True)
        if not label or not re.search(r"prix.+(30|trente).+jour", label.lower()):
            continue
        offscreen = block.select_one(".a-offscreen")
        amount = offscreen.get_text(" ", strip=True) if offscreen else ""
        # Prefer the dedicated price node; fall back to the full label text.
        return amount or label
    return None
|
|
|
|
|
|
def _extract_about_bullets(soup: BeautifulSoup) -> list[str] | None:
    """Collect the non-empty bullet texts found under ``#feature-bullets``.

    Returns None when the container is missing or yields no non-empty text.
    """
    section = soup.select_one("#feature-bullets")
    if not section:
        return None
    texts = (span.get_text(" ", strip=True) for span in section.select("ul li span.a-list-item"))
    bullets = [text for text in texts if text]
    return bullets or None
|
|
|
|
|
|
def _extract_description(soup: BeautifulSoup) -> str | None:
    """Return the space-joined text of ``#productDescription``.

    Returns None when the node is missing or its text is empty.
    """
    block = soup.select_one("#productDescription")
    if not block:
        return None
    return block.get_text(" ", strip=True) or None
|
|
|
|
|
|
def _extract_table_kv(table) -> dict[str, str]:
    """Turn the rows of *table* into a ``{header text: cell text}`` mapping.

    For each ``tr``, the first ``th`` provides the key and the first ``td``
    the value. Rows lacking either cell, or whose key or value text is
    empty, are skipped. A later row with the same key overwrites the
    earlier one.
    """
    pairs: dict[str, str] = {}
    for tr in table.select("tr"):
        header_cell, value_cell = tr.select_one("th"), tr.select_one("td")
        if header_cell and value_cell:
            label = header_cell.get_text(" ", strip=True)
            content = value_cell.get_text(" ", strip=True)
            if label and content:
                pairs[label] = content
    return pairs
|
|
|
|
|
|
def _extract_tables_from_selector(soup: BeautifulSoup, selector: str) -> list:
    """Return the tables designated by *selector*.

    If the first matching node is itself a ``table`` it is returned alone;
    otherwise every ``table`` below that node is returned. An empty list is
    returned when nothing matches.
    """
    root = soup.select_one(selector)
    if not root:
        return []
    return [root] if root.name == "table" else root.select("table")
|
|
|
|
|
|
def _extract_carateristique(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value tables of the characteristics sections.

    The sections are visited in a fixed order; when the same key appears in
    several tables the last one visited wins. Returns None when nothing was
    collected.
    """
    merged: dict[str, str] = {}
    for css in (
        "[data-csa-c-content-id='voyager-expander-btn']",
        "#productDetails_techSpec_section_1",
        "#productDetails_techSpec_section_2",
    ):
        for table in _extract_tables_from_selector(soup, css):
            merged.update(_extract_table_kv(table))
    return merged or None
|
|
|
|
|
|
def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value tables of the product-details sections.

    Tables located inside the ``voyager-expander-btn`` container are left
    out, and a table reached through several selectors is processed only
    once (both decided by set membership of the table node). Returns None
    when nothing was collected.
    """
    expander = soup.select_one("[data-csa-c-content-id='voyager-expander-btn']")
    # Start with the expander's tables so they are skipped, then grow the set
    # with every table already processed.
    skipped = set(expander.select("table")) if expander else set()
    collected: dict[str, str] = {}
    for css in (
        "#productDetails_techSpec_section_1",
        "#productDetails_detailBullets_sections1",
        "#productDetails_detailBullets_sections2",
        "#productDetails",
    ):
        for table in _extract_tables_from_selector(soup, css):
            if table in skipped:
                continue
            skipped.add(table)
            collected.update(_extract_table_kv(table))
    return collected or None
|
|
|
|
|
|
def _parse_percent(text: str | None) -> int | None:
|
|
if not text:
|
|
return None
|
|
match = re.search(r"(-?\d+)", text.replace("\u00a0", " "))
|
|
if not match:
|
|
return None
|
|
try:
|
|
return int(match.group(1))
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
    """Extract product fields from the raw HTML of a product page.

    The markup is parsed with BeautifulSoup's ``html.parser`` and each field
    is looked up through CSS selectors, trying fallbacks in a fixed order.
    Fields that cannot be found are returned as None.

    Args:
        html: Markup of the product page.
        url: URL of the page; stored as-is in the result and used as the
            fallback source for the ASIN.

    Returns:
        A dict with the keys ``url``, ``asin``, ``titre``,
        ``url_image_principale``, ``prix_actuel``, ``prix_conseille``,
        ``prix_min_30j``, ``prix_conseille_reduction``,
        ``prix_min_30j_reduction``, ``etat_stock``, ``en_stock``, ``note``,
        ``nombre_avis``, ``choix_amazon``, ``offre_limitee``, ``prime``,
        ``exclusivite_amazon``, ``a_propos``, ``description``,
        ``carateristique`` and ``details``.

    A warning is logged when ``titre``, ``prix_actuel`` or ``note`` is falsy
    in the result.
    """
    soup = BeautifulSoup(html, "html.parser")

    def first_text(*selectors: str) -> str | None:
        # Text of the first selector that yields a non-empty value, else None.
        for selector in selectors:
            found = _safe_text_soup(soup, selector)
            if found:
                return found
        return None

    title = _safe_text_soup(soup, "#productTitle")

    # Main image.
    image_main_url = _safe_attr_soup(soup, "#landingImage", "src") or _safe_attr_soup(
        soup, "#imgTagWrapperId img", "src"
    )

    # Current price: dedicated nodes first, then the split whole/fraction
    # parts, then the hidden twister input.
    price_text = first_text(
        "#corePriceDisplay_desktop_feature_div .a-offscreen",
        "#priceblock_ourprice",
        "#priceblock_dealprice",
    )
    if not price_text:
        price_text = _compose_price_from_parts(
            _safe_text_soup(soup, ".a-price .a-price-whole"),
            _safe_text_soup(soup, ".a-price .a-price-fraction"),
            _safe_text_soup(soup, ".a-price .a-price-symbol"),
        )
    if not price_text:
        price_text = _safe_attr_soup(soup, "#twister-plus-price-data-price", "value")

    # List price (srpPriceBlock = "Prix conseillé : XXX €"); the struck
    # price in corePriceDisplay is only the last-resort fallback.
    price_list_text = first_text(
        ".srpPriceBlock .srpPriceBlockAUI .a-offscreen",
        ".srpPriceBlock .a-offscreen",
        "#priceblock_strikeprice",
        "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen",
    )

    # Stock.
    in_stock, stock_text = parse_stock_status(first_text("#availability span", "#availability"))

    # Rating.
    rating_text = _safe_text_soup(soup, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text_soup(soup, "#acrCustomerReviewText")

    # Badges.
    amazon_choice = _safe_text_soup(soup, "#acBadge_feature_div")
    limited_time_deal = _safe_text_soup(soup, "#dealBadge_feature_div")
    # Each selector is listed once; the original chain tested
    # "#priceBadging_feature_div #prime-badge" twice.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(_has_selector_soup(soup, selector) for selector in prime_selectors)
    amazon_exclusive = True if "Exclusivité Amazon" in soup.get_text() else None

    # Lowest price over 30 days (block carrying a "30 jours" mention).
    lowest_30d_text = _extract_lowest_30d_text_soup(soup)
    lowest_30d_price = parse_price_fr(lowest_30d_text) if lowest_30d_text else None

    # A list price equal to the 30-day minimum is a detection error: the
    # struck price in corePriceDisplay was actually the 30-day minimum.
    price_list_value = parse_price_fr(price_list_text)
    if (
        price_list_value is not None
        and lowest_30d_price is not None
        and price_list_value == lowest_30d_price
    ):
        price_list_value = None

    # Discounts:
    # - savingsPercentage is relative to the 30-day minimum when one exists;
    # - srpSavingsPercentageBlock is relative to the list price.
    reduction_savings_text = _safe_text_soup(
        soup, "#corePriceDisplay_desktop_feature_div .savingsPercentage"
    )
    reduction_conseille_text = _safe_text_soup(soup, ".srpSavingsPercentageBlock")
    reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
    reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
    # No srpSavingsPercentageBlock, but a list price without a 30-day
    # minimum: savingsPercentage then applies to the list price.
    if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
        reduction_conseille = _parse_percent(reduction_savings_text)

    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": price_list_value,
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": amazon_exclusive,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)]
    if missing:
        logger.warning("Champs manquants (html): {}", ", ".join(missing))

    return data
|
|
|
|
|
|
def extract_product_data(page: Page, url: str) -> dict[str, Any]:
    """Extract product fields from a live Playwright page.

    Scalar fields are read through page locators; the structured sections
    (bullets, description, characteristics, details) are parsed from
    ``page.content()`` with BeautifulSoup. Fields that cannot be found are
    returned as None.

    Args:
        page: Playwright page already showing the product.
        url: URL of the page; stored as-is in the result and used as the
            fallback source for the ASIN.

    Returns:
        A dict with the keys ``url``, ``asin``, ``titre``,
        ``url_image_principale``, ``prix_actuel``, ``prix_conseille``,
        ``prix_min_30j``, ``prix_conseille_reduction``,
        ``prix_min_30j_reduction``, ``etat_stock``, ``en_stock``, ``note``,
        ``nombre_avis``, ``choix_amazon``, ``offre_limitee``, ``prime``,
        ``exclusivite_amazon``, ``a_propos``, ``description``,
        ``carateristique`` and ``details``.

    Text reads go through ``_safe_text`` and so never raise; the bare
    ``locator(...).count()`` presence checks and ``page.content()`` are not
    guarded and may still propagate Playwright errors.
    """

    def first_text(*selectors: str) -> str | None:
        # Text of the first selector that yields a non-empty value, else None.
        for selector in selectors:
            found = _safe_text(page, selector)
            if found:
                return found
        return None

    def mentions_30_days(text: str | None) -> bool:
        # True when the text carries the "prix ... 30|trente ... jour" mention.
        return bool(text) and re.search(r"prix.+(30|trente).+jour", text.lower()) is not None

    # Title.
    title = _safe_text(page, "#productTitle")

    # Main image.
    image_main_url = _safe_attr(page, "#landingImage", "src") or _safe_attr(
        page, "#imgTagWrapperId img", "src"
    )

    # Current price: dedicated nodes first, then the split whole/fraction
    # parts, then the hidden twister input.
    price_text = first_text(
        "#corePriceDisplay_desktop_feature_div .a-offscreen",
        "#priceblock_ourprice",
        "#priceblock_dealprice",
    )
    if not price_text:
        price_text = _compose_price_from_parts(
            _safe_text(page, ".a-price .a-price-whole"),
            _safe_text(page, ".a-price .a-price-fraction"),
            _safe_text(page, ".a-price .a-price-symbol"),
        )
    if not price_text:
        price_text = _safe_attr(page, "#twister-plus-price-data-price", "value")

    # List price (srpPriceBlock = "Prix conseillé : XXX €"); the struck
    # price in corePriceDisplay is only the last-resort fallback.
    price_list_text = first_text(
        ".srpPriceBlock .srpPriceBlockAUI .a-offscreen",
        ".srpPriceBlock .a-offscreen",
        "#priceblock_strikeprice",
        "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen",
    )

    # Stock.
    in_stock, stock_text = parse_stock_status(first_text("#availability span", "#availability"))

    # Rating.
    rating_text = _safe_text(page, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text(page, "#acrCustomerReviewText")

    # Badges.
    amazon_choice = _safe_text(page, "#acBadge_feature_div")
    limited_time_deal = _safe_text(page, "#dealBadge_feature_div")
    # Each selector is listed once; the original chain tested
    # "#priceBadging_feature_div #prime-badge" twice.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(page.locator(selector).count() > 0 for selector in prime_selectors)

    amazon_exclusive = _safe_text(page, "text=Exclusivité Amazon")

    # Lowest price over 30 days: basisPrice, then corePriceDisplay, then
    # priceBadging, each only when it carries the "30 jours" mention.
    # Block texts are read through _safe_text so that a failing read leaves
    # the field empty instead of aborting the whole extraction.
    lowest_30d_text = None
    lowest_30d_price = None
    basis_text = _safe_text(page, ".basisPrice")
    if mentions_30_days(basis_text):
        lowest_30d_text = _safe_text(page, ".basisPrice .a-price .a-offscreen") or basis_text
        lowest_30d_price = parse_price_fr(lowest_30d_text)
    if (
        lowest_30d_price is None
        and page.locator("#corePriceDisplay_desktop_feature_div .a-text-price").count() > 0
    ):
        if mentions_30_days(_safe_text(page, "#corePriceDisplay_desktop_feature_div")):
            lowest_30d_text = _safe_text(
                page, "#corePriceDisplay_desktop_feature_div .a-text-price .a-offscreen"
            )
            lowest_30d_price = parse_price_fr(lowest_30d_text)
    # As in the original, any falsy price (not only None) triggers this
    # last fallback.
    if not lowest_30d_price:
        badging_text = _safe_text(page, "#priceBadging_feature_div")
        if mentions_30_days(badging_text):
            lowest_30d_text = _safe_text(page, "#priceBadging_feature_div .a-offscreen") or badging_text
            lowest_30d_price = parse_price_fr(lowest_30d_text)

    # A list price equal to the 30-day minimum is a detection error.
    price_list_value = parse_price_fr(price_list_text)
    if (
        price_list_value is not None
        and lowest_30d_price is not None
        and price_list_value == lowest_30d_price
    ):
        price_list_value = None

    # Discounts:
    # - savingsPercentage in corePriceDisplay is relative to the 30-day
    #   minimum when one exists;
    # - srpSavingsPercentageBlock is relative to the list price.
    reduction_savings_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .savingsPercentage")
    reduction_conseille_text = _safe_text(page, ".srpSavingsPercentageBlock")
    reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
    reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
    # No srpSavingsPercentageBlock, but a list price without a 30-day
    # minimum: savingsPercentage then applies to the list price.
    if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
        reduction_conseille = _parse_percent(reduction_savings_text)

    asin = _safe_attr(page, "input#ASIN", "value") or _extract_asin_from_url(url)

    # Structured sections are parsed from a snapshot of the page markup.
    soup = BeautifulSoup(page.content(), "html.parser")
    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": price_list_value,
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    return data
|