# File: suivi_produit/backend/app/scraper/amazon/parser.py
# Snapshot: 2026-01-18 19:21:51 +01:00 (460 lines, 18 KiB, Python)

from __future__ import annotations
import re
from typing import Any
from bs4 import BeautifulSoup
from loguru import logger
from playwright.sync_api import Page
from backend.app.scraper.normalize import (
parse_price_fr,
parse_rating_count,
parse_rating_value,
parse_stock_status,
)
def detect_blocked(html: str) -> bool:
    """Tell whether *html* looks like an anti-bot / captcha page.

    The check is a plain case-insensitive substring search for a few marker
    words; it returns ``True`` as soon as one of them is present.

    NOTE(review): the markers are broad ("robot" would also match a
    ``robots`` meta tag or a product title containing that word) -- confirm
    that such false positives are acceptable to callers.
    """
    haystack = html.lower()
    markers = ("captcha", "robot", "saisissez les caractères", "vérification")
    return any(marker in haystack for marker in markers)
def _safe_text(page: Page, selector: str) -> str | None:
    """Return the stripped inner text of the first element matching *selector*.

    Best-effort helper: it never raises. ``None`` is returned when nothing
    matches, when the text is empty after stripping, or when Playwright raises
    while querying the page (the error is then logged at debug level).

    Args:
        page: Playwright page to query.
        selector: Selector handed to ``page.locator``.

    Returns:
        The non-empty stripped text, or ``None``.
    """
    try:
        locator = page.locator(selector)
        if locator.count() == 0:
            return None
        value = locator.first.inner_text().strip()
    except Exception as exc:
        # Deliberately broad: one failing selector must not abort the whole
        # scrape, but leave a trace instead of hiding the error completely.
        logger.debug("Lecture du texte impossible pour {}: {}", selector, exc)
        return None
    return value or None
def _safe_attr(page: Page, selector: str, attr: str) -> str | None:
    """Return attribute *attr* of the first element matching *selector*.

    Best-effort helper: it never raises. ``None`` is returned when nothing
    matches or when Playwright raises while querying the page (the error is
    then logged at debug level). Otherwise the result of
    ``get_attribute`` is returned unchanged.

    Args:
        page: Playwright page to query.
        selector: Selector handed to ``page.locator``.
        attr: Name of the attribute to read.

    Returns:
        The attribute value as returned by Playwright, or ``None``.
    """
    try:
        locator = page.locator(selector)
        if locator.count() == 0:
            return None
        return locator.first.get_attribute(attr)
    except Exception as exc:
        # Deliberately broad: one failing selector must not abort the whole
        # scrape, but leave a trace instead of hiding the error completely.
        logger.debug("Lecture de l'attribut {} impossible pour {}: {}", attr, selector, exc)
        return None
def _extract_asin_from_url(url: str) -> str | None:
match = re.search(r"/dp/([A-Z0-9]{10})", url)
if match:
return match.group(1)
return None
def _safe_text_soup(soup: BeautifulSoup, selector: str) -> str | None:
    """Return the stripped text of the first node matching *selector*.

    ``None`` is returned when no node matches or when its text is empty.
    """
    element = soup.select_one(selector)
    if element:
        return element.get_text(strip=True) or None
    return None
def _safe_attr_soup(soup: BeautifulSoup, selector: str, attr: str) -> str | None:
    """Return attribute *attr* of the first node matching *selector*.

    ``None`` is returned when no node matches; otherwise the result of the
    node's ``get(attr)`` is passed through unchanged.
    """
    element = soup.select_one(selector)
    return element.get(attr) if element else None
def _has_selector_soup(soup: BeautifulSoup, selector: str) -> bool:
    """Return ``True`` when at least one node in *soup* matches *selector*."""
    first_match = soup.select_one(selector)
    return first_match is not None
def _compose_price_from_parts(whole: str | None, fraction: str | None, symbol: str | None) -> str | None:
if not whole:
return None
whole_digits = re.sub(r"[^\d]", "", whole)
if not whole_digits:
return None
fraction_digits = re.sub(r"[^\d]", "", fraction or "")
if not fraction_digits:
fraction_digits = "00"
fraction_digits = fraction_digits[:2].ljust(2, "0")
symbol = (symbol or "").strip()
return f"{whole_digits},{fraction_digits} {symbol}"
def _extract_lowest_30d_text_soup(soup: BeautifulSoup) -> str | None:
    """Find the text carrying the "lowest price over 30 days" mention.

    Candidates are ``#priceBadging_feature_div`` (if present) followed by every
    ``.basisPrice`` node. The first candidate whose text mentions a price over
    30 ("30" or "trente") days wins: its ``.a-offscreen`` text is returned when
    non-empty, otherwise the candidate's whole text.

    Returns:
        The raw price text, or ``None`` when no candidate matches.
    """
    candidates = []
    badging = soup.select_one("#priceBadging_feature_div")
    if badging:
        candidates.append(badging)
    candidates += soup.select(".basisPrice")

    for candidate in candidates:
        label = candidate.get_text(" ", strip=True)
        if not label or not re.search(r"prix.+(30|trente).+jour", label.lower()):
            continue
        offscreen = candidate.select_one(".a-offscreen")
        amount = offscreen.get_text(" ", strip=True) if offscreen else None
        # Fall back to the full label when no dedicated price node is usable.
        return amount or label
    return None
def _extract_about_bullets(soup: BeautifulSoup) -> list[str] | None:
    """Collect the non-empty bullet texts found under ``#feature-bullets``.

    Returns:
        The list of bullet texts, or ``None`` when the section is missing or
        contains no non-empty bullet.
    """
    section = soup.select_one("#feature-bullets")
    if not section:
        return None
    texts = (item.get_text(" ", strip=True) for item in section.select("ul li span.a-list-item"))
    bullets = [text for text in texts if text]
    return bullets or None
def _extract_description(soup: BeautifulSoup) -> str | None:
    """Return the text of ``#productDescription``, or ``None`` if absent/empty."""
    block = soup.select_one("#productDescription")
    if block:
        return block.get_text(" ", strip=True) or None
    return None
def _extract_table_kv(table) -> dict[str, str]:
    """Turn the ``th``/``td`` rows of *table* into a ``{header: value}`` dict.

    Rows lacking a ``th`` or a ``td``, or whose header or value text is empty,
    are ignored. A header seen several times keeps the last value.
    """
    pairs: dict[str, str] = {}
    for tr in table.select("tr"):
        header_cell, value_cell = tr.select_one("th"), tr.select_one("td")
        if header_cell and value_cell:
            label = header_cell.get_text(" ", strip=True)
            content = value_cell.get_text(" ", strip=True)
            if label and content:
                pairs[label] = content
    return pairs
def _extract_tables_from_selector(soup: BeautifulSoup, selector: str) -> list:
    """Return the tables reachable from the first node matching *selector*.

    If that node is itself a ``table`` it is returned alone; otherwise all of
    its descendant tables are returned. No match yields an empty list.
    """
    root = soup.select_one(selector)
    if not root:
        return []
    return [root] if root.name == "table" else root.select("table")
def _extract_carateristique(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value tables of the characteristics sections.

    Sections are visited in a fixed order; later tables override earlier ones
    on duplicate keys.

    Returns:
        The merged mapping, or ``None`` when nothing was extracted.
    """
    collected: dict[str, str] = {}
    for css in (
        "[data-csa-c-content-id='voyager-expander-btn']",
        "#productDetails_techSpec_section_1",
        "#productDetails_techSpec_section_2",
    ):
        for tbl in _extract_tables_from_selector(soup, css):
            collected.update(_extract_table_kv(tbl))
    return collected or None
def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value tables of the product-details sections.

    Tables located inside the ``voyager-expander-btn`` container are skipped,
    and a table already processed through an earlier selector is not processed
    again. Later tables override earlier ones on duplicate keys.

    Returns:
        The merged mapping, or ``None`` when nothing was extracted.
    """
    expander = soup.select_one("[data-csa-c-content-id='voyager-expander-btn']")
    excluded = set(expander.select("table")) if expander else set()

    merged: dict[str, str] = {}
    visited = set()
    for css in (
        "#productDetails_techSpec_section_1",
        "#productDetails_detailBullets_sections1",
        "#productDetails_detailBullets_sections2",
        "#productDetails",
    ):
        for tbl in _extract_tables_from_selector(soup, css):
            if tbl not in excluded and tbl not in visited:
                visited.add(tbl)
                merged.update(_extract_table_kv(tbl))
    return merged or None
def _parse_percent(text: str | None) -> int | None:
if not text:
return None
match = re.search(r"(-?\d+)", text.replace("\u00a0", " "))
if not match:
return None
try:
return int(match.group(1))
except ValueError:
return None
def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
    """Extract product fields from the raw HTML of an Amazon product page.

    The markup is parsed with BeautifulSoup (``html.parser``). Each field is
    looked up through a cascade of CSS selectors, the first non-empty hit
    winning, and the raw strings are converted by the ``parse_*`` helpers
    imported from ``backend.app.scraper.normalize``.

    A warning is logged when ``titre``, ``prix_actuel`` or ``note`` ends up
    empty.

    Args:
        html: Full HTML source of the product page.
        url: URL of the page; stored as-is and used as a fallback source for
            the ASIN.

    Returns:
        A dict with the keys ``url``, ``asin``, ``titre``,
        ``url_image_principale``, ``prix_actuel``, ``prix_conseille``,
        ``prix_min_30j``, ``prix_conseille_reduction``,
        ``prix_min_30j_reduction``, ``etat_stock``, ``en_stock``, ``note``,
        ``nombre_avis``, ``choix_amazon``, ``offre_limitee``, ``prime``,
        ``exclusivite_amazon``, ``a_propos``, ``description``,
        ``carateristique`` and ``details``. Badge flags are ``True`` or
        ``None`` (never ``False``).
    """
    soup = BeautifulSoup(html, "html.parser")

    title = _safe_text_soup(soup, "#productTitle")

    # Main image.
    image_main_url = _safe_attr_soup(soup, "#landingImage", "src")
    if not image_main_url:
        image_main_url = _safe_attr_soup(soup, "#imgTagWrapperId img", "src")

    # Current price: try each known location in turn.
    price_text = _safe_text_soup(soup, "#corePriceDisplay_desktop_feature_div .a-offscreen")
    if not price_text:
        price_text = _safe_text_soup(soup, "#priceblock_ourprice")
    if not price_text:
        price_text = _safe_text_soup(soup, "#priceblock_dealprice")
    if not price_text:
        # Price rendered as separate whole / fraction / symbol spans.
        whole = _safe_text_soup(soup, ".a-price .a-price-whole")
        fraction = _safe_text_soup(soup, ".a-price .a-price-fraction")
        symbol = _safe_text_soup(soup, ".a-price .a-price-symbol")
        price_text = _compose_price_from_parts(whole, fraction, symbol)
    if not price_text:
        price_text = _safe_attr_soup(soup, "#twister-plus-price-data-price", "value")

    # Recommended (list) price: srpPriceBlock holds "Prix conseillé : XXX €".
    price_list_text = _safe_text_soup(soup, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen")
    if not price_list_text:
        price_list_text = _safe_text_soup(soup, ".srpPriceBlock .a-offscreen")
    if not price_list_text:
        price_list_text = _safe_text_soup(soup, "#priceblock_strikeprice")
    # Fall back to the struck-through price in corePriceDisplay when there is
    # no srpPriceBlock.
    if not price_list_text:
        price_list_text = _safe_text_soup(
            soup, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen"
        )

    # Stock.
    stock_text = _safe_text_soup(soup, "#availability span")
    if not stock_text:
        stock_text = _safe_text_soup(soup, "#availability")
    in_stock, stock_text = parse_stock_status(stock_text)

    # Rating.
    rating_text = _safe_text_soup(soup, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text_soup(soup, "#acrCustomerReviewText")

    # Badges.
    amazon_choice = _safe_text_soup(soup, "#acBadge_feature_div")
    limited_time_deal = _safe_text_soup(soup, "#dealBadge_feature_div")

    # Prime badge: any one of these selectors is enough. The list is checked
    # in order and contains each selector only once.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(_has_selector_soup(soup, selector) for selector in prime_selectors)

    amazon_exclusive = "Exclusivité Amazon" if "Exclusivité Amazon" in soup.get_text() else None

    # Lowest price over 30 days (basisPrice carrying a "30 jours" mention).
    lowest_30d_text = _extract_lowest_30d_text_soup(soup)
    lowest_30d_price = None
    if lowest_30d_text:
        lowest_30d_price = parse_price_fr(lowest_30d_text)

    # When the recommended price equals the 30-day minimum it is a detection
    # error: the struck-through price in corePriceDisplay was actually the
    # 30-day minimum, not the recommended price.
    price_list_value = parse_price_fr(price_list_text)
    if price_list_value is not None and lowest_30d_price is not None and price_list_value == lowest_30d_price:
        price_list_value = None

    # Discounts.
    reduction_savings_text = _safe_text_soup(
        soup, "#corePriceDisplay_desktop_feature_div .savingsPercentage"
    )
    reduction_conseille_text = _safe_text_soup(soup, ".srpSavingsPercentageBlock")
    # Attribute each percentage to the right reference price:
    # - with a 30-day minimum, savingsPercentage is the discount against it;
    # - with a recommended price, srpSavingsPercentageBlock is the discount
    #   against the recommended price.
    reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
    reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
    # No srpSavingsPercentageBlock, but a recommended price without any 30-day
    # minimum: savingsPercentage then refers to the recommended price.
    if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
        reduction_conseille = _parse_percent(reduction_savings_text)

    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": price_list_value,
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)]
    if missing:
        logger.warning("Champs manquants (html): {}", ", ".join(missing))
    return data
def extract_product_data(page: Page, url: str) -> dict[str, Any]:
    """Extract product fields from a live Playwright page of an Amazon product.

    Scalar fields are read through Playwright locators, each via a cascade of
    selectors where the first non-empty hit wins. The bullet list, the
    description and the spec/detail tables are read from a BeautifulSoup parse
    of ``page.content()``. Raw strings are converted by the ``parse_*``
    helpers imported from ``backend.app.scraper.normalize``.

    Args:
        page: Playwright page already showing the product.
        url: URL of the page; stored as-is and used as a fallback source for
            the ASIN.

    Returns:
        A dict with the keys ``url``, ``asin``, ``titre``,
        ``url_image_principale``, ``prix_actuel``, ``prix_conseille``,
        ``prix_min_30j``, ``prix_conseille_reduction``,
        ``prix_min_30j_reduction``, ``etat_stock``, ``en_stock``, ``note``,
        ``nombre_avis``, ``choix_amazon``, ``offre_limitee``, ``prime``,
        ``exclusivite_amazon``, ``a_propos``, ``description``,
        ``carateristique`` and ``details``. Badge flags are ``True`` or
        ``None`` (never ``False``).
    """
    # Title.
    title = _safe_text(page, "#productTitle")

    # Main image.
    image_main_url = _safe_attr(page, "#landingImage", "src")
    if not image_main_url:
        image_main_url = _safe_attr(page, "#imgTagWrapperId img", "src")

    # Current price: try each known location in turn.
    price_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-offscreen")
    if not price_text:
        price_text = _safe_text(page, "#priceblock_ourprice")
    if not price_text:
        price_text = _safe_text(page, "#priceblock_dealprice")
    if not price_text:
        # Price rendered as separate whole / fraction / symbol spans.
        whole = _safe_text(page, ".a-price .a-price-whole")
        fraction = _safe_text(page, ".a-price .a-price-fraction")
        symbol = _safe_text(page, ".a-price .a-price-symbol")
        price_text = _compose_price_from_parts(whole, fraction, symbol)
    if not price_text:
        price_text = _safe_attr(page, "#twister-plus-price-data-price", "value")

    # Recommended (list) price: srpPriceBlock holds "Prix conseillé : XXX €".
    price_list_text = _safe_text(page, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen")
    if not price_list_text:
        price_list_text = _safe_text(page, ".srpPriceBlock .a-offscreen")
    if not price_list_text:
        price_list_text = _safe_text(page, "#priceblock_strikeprice")
    # Fall back to the struck-through price in corePriceDisplay when there is
    # no srpPriceBlock.
    if not price_list_text:
        price_list_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen")

    # Stock.
    stock_text = _safe_text(page, "#availability span")
    if not stock_text:
        stock_text = _safe_text(page, "#availability")
    in_stock, stock_text = parse_stock_status(stock_text)

    # Rating.
    rating_text = _safe_text(page, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text(page, "#acrCustomerReviewText")

    # Badges.
    amazon_choice = _safe_text(page, "#acBadge_feature_div")
    limited_time_deal = _safe_text(page, "#dealBadge_feature_div")

    # Prime badge: any one of these selectors is enough. The list is checked
    # in order and contains each selector only once.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(page.locator(selector).count() > 0 for selector in prime_selectors)

    amazon_exclusive = _safe_text(page, "text=Exclusivité Amazon")

    # Lowest price over 30 days (basisPrice or corePriceDisplay carrying a
    # "30 jours" mention).
    lowest_30d_text = None
    lowest_30d_price = None
    if page.locator(".basisPrice").count() > 0:
        basis_text = page.locator(".basisPrice").first.inner_text()
        if basis_text and re.search(r"prix.+(30|trente).+jour", basis_text.lower()):
            lowest_30d_text = _safe_text(page, ".basisPrice .a-price .a-offscreen") or basis_text
            lowest_30d_price = parse_price_fr(lowest_30d_text)
    # Fall back to corePriceDisplay when it carries the 30-day mention.
    if lowest_30d_price is None and page.locator("#corePriceDisplay_desktop_feature_div .a-text-price").count() > 0:
        core_text = page.locator("#corePriceDisplay_desktop_feature_div").first.inner_text()
        if core_text and re.search(r"prix.+(30|trente).+jour", core_text.lower()):
            lowest_30d_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price .a-offscreen")
            lowest_30d_price = parse_price_fr(lowest_30d_text)
    # NOTE(review): this guard uses truthiness while the previous one uses
    # ``is None``, so a parsed price equal to 0 would re-trigger the lookup --
    # confirm whether that is intended.
    if not lowest_30d_price and page.locator("#priceBadging_feature_div").count() > 0:
        badging_text = page.locator("#priceBadging_feature_div").first.inner_text()
        if badging_text and re.search(r"prix.+(30|trente).+jour", badging_text.lower()):
            lowest_30d_text = _safe_text(page, "#priceBadging_feature_div .a-offscreen") or badging_text
            lowest_30d_price = parse_price_fr(lowest_30d_text)

    # When the recommended price equals the 30-day minimum it is a detection
    # error, so the recommended price is discarded.
    price_list_value = parse_price_fr(price_list_text)
    if price_list_value is not None and lowest_30d_price is not None and price_list_value == lowest_30d_price:
        price_list_value = None

    # Discounts:
    # - savingsPercentage in corePriceDisplay is the discount against the
    #   30-day minimum (when present);
    # - srpSavingsPercentageBlock is the discount against the recommended price.
    reduction_savings_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .savingsPercentage")
    reduction_conseille_text = _safe_text(page, ".srpSavingsPercentageBlock")
    # Attribute each percentage according to which reference price exists.
    reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
    reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
    # No srpSavingsPercentageBlock, but a recommended price without any 30-day
    # minimum: savingsPercentage then refers to the recommended price.
    if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
        reduction_conseille = _parse_percent(reduction_savings_text)

    asin = _safe_attr(page, "input#ASIN", "value") or _extract_asin_from_url(url)

    # Structured sections are parsed from a static snapshot of the page.
    soup = BeautifulSoup(page.content(), "html.parser")
    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": price_list_value,
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }
    return data