1er
This commit is contained in:
438
backend/app/scraper/amazon/parser.py
Normal file
438
backend/app/scraper/amazon/parser.py
Normal file
@@ -0,0 +1,438 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from playwright.sync_api import Page
|
||||
|
||||
from backend.app.scraper.normalize import (
|
||||
parse_price_fr,
|
||||
parse_rating_count,
|
||||
parse_rating_value,
|
||||
parse_stock_status,
|
||||
)
|
||||
|
||||
|
||||
def detect_blocked(html: str) -> bool:
    """Return True when the markup looks like an anti-bot / captcha page.

    This is a plain keyword heuristic applied to the lower-cased HTML.

    Args:
        html: Raw HTML of the fetched page.

    Returns:
        True if one of the block markers is present, False otherwise
        (including for empty input).
    """
    if not html:
        return False
    lowered = html.lower()
    if "captcha" in lowered:
        return True
    # Match "robot" only when it is not embedded in a longer word: a bare
    # substring test also fires on the ordinary ``<meta name="robots" ...>``
    # tag and would flag regular pages as blocked.
    # NOTE(review): a page whose own text contains the word "robot" still
    # triggers this marker, as it did before.
    if re.search(r"(?<![a-z])robot(?![a-z])", lowered):
        return True
    # French wording used on the character-entry / verification screens.
    return "saisissez les caractères" in lowered or "vérification" in lowered
|
||||
|
||||
|
||||
def _safe_text(page: Page, selector: str) -> str | None:
|
||||
try:
|
||||
locator = page.locator(selector)
|
||||
if locator.count() == 0:
|
||||
return None
|
||||
value = locator.first.inner_text().strip()
|
||||
return value or None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _safe_attr(page: Page, selector: str, attr: str) -> str | None:
|
||||
try:
|
||||
locator = page.locator(selector)
|
||||
if locator.count() == 0:
|
||||
return None
|
||||
return locator.first.get_attribute(attr)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _extract_asin_from_url(url: str) -> str | None:
|
||||
match = re.search(r"/dp/([A-Z0-9]{10})", url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def _safe_text_soup(soup: BeautifulSoup, selector: str) -> str | None:
|
||||
node = soup.select_one(selector)
|
||||
if not node:
|
||||
return None
|
||||
value = node.get_text(strip=True)
|
||||
return value or None
|
||||
|
||||
|
||||
def _safe_attr_soup(soup: BeautifulSoup, selector: str, attr: str) -> str | None:
|
||||
node = soup.select_one(selector)
|
||||
if not node:
|
||||
return None
|
||||
return node.get(attr)
|
||||
|
||||
|
||||
def _has_selector_soup(soup: BeautifulSoup, selector: str) -> bool:
|
||||
return soup.select_one(selector) is not None
|
||||
|
||||
|
||||
def _compose_price_from_parts(whole: str | None, fraction: str | None, symbol: str | None) -> str | None:
|
||||
if not whole:
|
||||
return None
|
||||
whole_digits = re.sub(r"[^\d]", "", whole)
|
||||
if not whole_digits:
|
||||
return None
|
||||
fraction_digits = re.sub(r"[^\d]", "", fraction or "")
|
||||
if not fraction_digits:
|
||||
fraction_digits = "00"
|
||||
fraction_digits = fraction_digits[:2].ljust(2, "0")
|
||||
symbol = (symbol or "€").strip()
|
||||
return f"{whole_digits},{fraction_digits} {symbol}"
|
||||
|
||||
|
||||
def _extract_lowest_30d_text_soup(soup: BeautifulSoup) -> str | None:
|
||||
containers = []
|
||||
container = soup.select_one("#priceBadging_feature_div")
|
||||
if container:
|
||||
containers.append(container)
|
||||
containers.extend(soup.select(".basisPrice"))
|
||||
for node in containers:
|
||||
text = node.get_text(" ", strip=True)
|
||||
if text and re.search(r"prix.+(30|trente).+jour", text.lower()):
|
||||
price_node = node.select_one(".a-offscreen")
|
||||
if price_node:
|
||||
price_text = price_node.get_text(" ", strip=True)
|
||||
if price_text:
|
||||
return price_text
|
||||
return text
|
||||
return None
|
||||
|
||||
|
||||
def _extract_about_bullets(soup: BeautifulSoup) -> list[str] | None:
|
||||
container = soup.select_one("#feature-bullets")
|
||||
if not container:
|
||||
return None
|
||||
items = []
|
||||
for node in container.select("ul li span.a-list-item"):
|
||||
text = node.get_text(" ", strip=True)
|
||||
if text:
|
||||
items.append(text)
|
||||
return items or None
|
||||
|
||||
|
||||
def _extract_description(soup: BeautifulSoup) -> str | None:
|
||||
node = soup.select_one("#productDescription")
|
||||
if not node:
|
||||
return None
|
||||
text = node.get_text(" ", strip=True)
|
||||
return text or None
|
||||
|
||||
|
||||
def _extract_table_kv(table) -> dict[str, str]:
|
||||
data: dict[str, str] = {}
|
||||
for row in table.select("tr"):
|
||||
key = row.select_one("th")
|
||||
value = row.select_one("td")
|
||||
if not key or not value:
|
||||
continue
|
||||
key_text = key.get_text(" ", strip=True)
|
||||
value_text = value.get_text(" ", strip=True)
|
||||
if key_text and value_text:
|
||||
data[key_text] = value_text
|
||||
return data
|
||||
|
||||
|
||||
def _extract_tables_from_selector(soup: BeautifulSoup, selector: str) -> list:
|
||||
section = soup.select_one(selector)
|
||||
if not section:
|
||||
return []
|
||||
if section.name == "table":
|
||||
return [section]
|
||||
return section.select("table")
|
||||
|
||||
|
||||
def _extract_carateristique(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value rows of the specification tables.

    Tables are read from the expander block and the two tech-spec sections,
    in that order; later tables overwrite duplicate keys. Returns None when
    nothing was collected.
    """
    merged: dict[str, str] = {}
    for selector in (
        "[data-csa-c-content-id='voyager-expander-btn']",
        "#productDetails_techSpec_section_1",
        "#productDetails_techSpec_section_2",
    ):
        for table in _extract_tables_from_selector(soup, selector):
            merged.update(_extract_table_kv(table))
    return merged or None
|
||||
|
||||
|
||||
def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None:
    """Merge the key/value rows of the product-details tables.

    Tables located inside the expander block are left out, and each table is
    processed at most once even when several selectors reach it. Returns None
    when nothing was collected.
    """
    expander = soup.select_one("[data-csa-c-content-id='voyager-expander-btn']")
    # One set holds both the expander's tables and the tables already handled:
    # membership in either means "skip".
    skipped = set(expander.select("table")) if expander else set()
    merged: dict[str, str] = {}
    for selector in (
        "#productDetails_techSpec_section_1",
        "#productDetails_detailBullets_sections1",
        "#productDetails_detailBullets_sections2",
        "#productDetails",
    ):
        for table in _extract_tables_from_selector(soup, selector):
            if table in skipped:
                continue
            skipped.add(table)
            merged.update(_extract_table_kv(table))
    return merged or None
|
||||
|
||||
|
||||
def _parse_percent(text: str | None) -> int | None:
|
||||
if not text:
|
||||
return None
|
||||
match = re.search(r"(-?\d+)", text.replace("\u00a0", " "))
|
||||
if not match:
|
||||
return None
|
||||
try:
|
||||
return int(match.group(1))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
    """Parse a product page from raw HTML and return its fields as a dict.

    Each field is looked up through an ordered list of CSS selectors; the
    first selector yielding a non-empty value wins. A warning is logged when
    the title, current price or rating could not be extracted.

    Args:
        html: Raw HTML of the product page.
        url: URL of the page; stored as-is and used as a fallback ASIN source.

    Returns:
        Dict keyed by the field names built at the end of this function
        ("url", "asin", "titre", "prix_actuel", ...). Fields that could not
        be extracted are None.
    """
    soup = BeautifulSoup(html, "html.parser")

    def first_text(*selectors: str) -> str | None:
        # First non-empty text among the selectors, tried in order.
        for selector in selectors:
            found = _safe_text_soup(soup, selector)
            if found:
                return found
        return None

    title = _safe_text_soup(soup, "#productTitle")

    # Main image
    image_main_url = _safe_attr_soup(soup, "#landingImage", "src") or _safe_attr_soup(
        soup, "#imgTagWrapperId img", "src"
    )

    # Current price: plain-text selectors first, then the split
    # whole/fraction/symbol markup, then the hidden form value.
    price_text = first_text(
        "#corePriceDisplay_desktop_feature_div .a-offscreen",
        "#priceblock_ourprice",
        "#priceblock_dealprice",
    )
    if not price_text:
        price_text = _compose_price_from_parts(
            _safe_text_soup(soup, ".a-price .a-price-whole"),
            _safe_text_soup(soup, ".a-price .a-price-fraction"),
            _safe_text_soup(soup, ".a-price .a-price-symbol"),
        )
    if not price_text:
        price_text = _safe_attr_soup(soup, "#twister-plus-price-data-price", "value")

    # Struck-through / list price
    srp_selectors = (".srpPriceBlock .a-offscreen", ".srpPriceBlockAUI .a-offscreen")
    price_list_text = first_text(
        "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen",
        "#priceblock_strikeprice",
        *srp_selectors,
    )

    # Stock
    stock_text = first_text("#availability span", "#availability")
    in_stock, stock_text = parse_stock_status(stock_text)

    # Rating
    rating_text = _safe_text_soup(soup, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text_soup(soup, "#acrCustomerReviewText")

    # Badges
    amazon_choice = _safe_text_soup(soup, "#acBadge_feature_div")
    limited_time_deal = _safe_text_soup(soup, "#dealBadge_feature_div")
    # Each selector appears once: the original chain tested
    # "#priceBadging_feature_div #prime-badge" twice, the second being dead.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(_has_selector_soup(soup, selector) for selector in prime_selectors)
    amazon_exclusive = "Exclusivité Amazon" if "Exclusivité Amazon" in soup.get_text() else None

    # Lowest price over the last 30 days
    lowest_30d_text = _extract_lowest_30d_text_soup(soup)
    lowest_30d_price = parse_price_fr(lowest_30d_text) if lowest_30d_text else None
    # When the struck-through price is actually the 30-day low, it is not a
    # list price: drop it and retry with the dedicated list-price blocks.
    if lowest_30d_price is not None and parse_price_fr(price_list_text) == lowest_30d_price:
        price_list_text = None
    if not price_list_text:
        price_list_text = first_text(*srp_selectors)

    # Discount percentages
    reduction_min_30j = _parse_percent(
        _safe_text_soup(soup, "#corePriceDisplay_desktop_feature_div .savingsPercentage")
    )
    reduction_conseille = _parse_percent(_safe_text_soup(soup, ".srpSavingsPercentageBlock"))

    # Long-form content
    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": parse_price_fr(price_list_text),
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        # Badge flags are True when present and None (not False) otherwise.
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)]
    if missing:
        logger.warning("Champs manquants (html): {}", ", ".join(missing))

    return data
|
||||
|
||||
|
||||
def extract_product_data(page: Page, url: str) -> dict[str, Any]:
    """Extract product fields from a live Playwright page.

    Each field is looked up through an ordered list of selectors; the first
    selector yielding a non-empty value wins. The long-form sections (bullets,
    description, specification and details tables) are parsed with
    BeautifulSoup from ``page.content()``.

    Args:
        page: Playwright page already showing the product.
        url: URL of the page; stored as-is and used as a fallback ASIN source.

    Returns:
        Dict keyed by the field names built at the end of this function
        ("url", "asin", "titre", "prix_actuel", ...). Fields that could not
        be extracted are None.
    """

    def first_text(*selectors: str) -> str | None:
        # First non-empty text among the selectors, tried in order.
        for selector in selectors:
            found = _safe_text(page, selector)
            if found:
                return found
        return None

    # Title
    title = _safe_text(page, "#productTitle")

    # Main image
    image_main_url = _safe_attr(page, "#landingImage", "src") or _safe_attr(
        page, "#imgTagWrapperId img", "src"
    )

    # Current price: plain-text selectors first, then the split
    # whole/fraction/symbol markup, then the hidden form value.
    price_text = first_text(
        "#corePriceDisplay_desktop_feature_div .a-offscreen",
        "#priceblock_ourprice",
        "#priceblock_dealprice",
    )
    if not price_text:
        price_text = _compose_price_from_parts(
            _safe_text(page, ".a-price .a-price-whole"),
            _safe_text(page, ".a-price .a-price-fraction"),
            _safe_text(page, ".a-price .a-price-symbol"),
        )
    if not price_text:
        price_text = _safe_attr(page, "#twister-plus-price-data-price", "value")

    # Struck-through / list price
    srp_selectors = (".srpPriceBlock .a-offscreen", ".srpPriceBlockAUI .a-offscreen")
    price_list_text = first_text(
        "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen",
        "#priceblock_strikeprice",
        *srp_selectors,
    )

    # Stock
    stock_text = first_text("#availability span", "#availability")
    in_stock, stock_text = parse_stock_status(stock_text)

    # Rating
    rating_text = _safe_text(page, "#acrPopover .a-icon-alt")
    rating_count_text = _safe_text(page, "#acrCustomerReviewText")

    # Badges
    amazon_choice = _safe_text(page, "#acBadge_feature_div")
    limited_time_deal = _safe_text(page, "#dealBadge_feature_div")
    # Each selector appears once: the original chain tested
    # "#priceBadging_feature_div #prime-badge" twice, the second being dead.
    prime_selectors = (
        "#primeBadge",
        "#priceBadging_feature_div #prime-badge",
        "#priceBadging_feature_div i.a-icon-prime",
        "#corePriceDisplay_desktop_feature_div i.a-icon-prime",
        "i#prime-badge",
        "i.a-icon-prime[aria-label*='prime']",
    )
    prime_eligible = any(page.locator(selector).count() > 0 for selector in prime_selectors)

    amazon_exclusive = _safe_text(page, "text=Exclusivité Amazon")

    # Lowest price over the last 30 days: take the first container whose text
    # carries the "prix ... 30/trente ... jour" wording, preferring its
    # ``.a-offscreen`` amount over the whole label.
    lowest_30d_text = None
    for container_selector in (".basisPrice", "#priceBadging_feature_div"):
        container = page.locator(container_selector)
        if container.count() == 0:
            continue
        label = container.first.inner_text()
        if label and re.search(r"prix.+(30|trente).+jour", label.lower()):
            lowest_30d_text = _safe_text(page, f"{container_selector} .a-offscreen") or label
            break
    # The container text was validated above, so the extracted text is parsed
    # directly. Re-checking the wording on the extracted text (as the code used
    # to) always rejected a bare amount such as "129,99 €", making the
    # ``.a-offscreen`` lookup useless.
    lowest_30d_price = parse_price_fr(lowest_30d_text) if lowest_30d_text else None
    # When the struck-through price is actually the 30-day low, it is not a
    # list price: drop it and retry with the dedicated list-price blocks.
    if lowest_30d_price is not None and parse_price_fr(price_list_text) == lowest_30d_price:
        price_list_text = None
    if not price_list_text:
        price_list_text = first_text(*srp_selectors)

    # Discount percentages
    reduction_min_30j = _parse_percent(
        _safe_text(page, "#corePriceDisplay_desktop_feature_div .savingsPercentage")
    )
    reduction_conseille = _parse_percent(_safe_text(page, ".srpSavingsPercentageBlock"))

    asin = _safe_attr(page, "input#ASIN", "value") or _extract_asin_from_url(url)

    # Long-form content is parsed from a static snapshot of the page.
    soup = BeautifulSoup(page.content(), "html.parser")
    a_propos = _extract_about_bullets(soup)
    description = _extract_description(soup)
    carateristique = _extract_carateristique(soup)
    details = _extract_details(soup)

    data = {
        "url": url,
        "asin": asin,
        "titre": title,
        "url_image_principale": image_main_url,
        "prix_actuel": parse_price_fr(price_text),
        "prix_conseille": parse_price_fr(price_list_text),
        "prix_min_30j": lowest_30d_price,
        "prix_conseille_reduction": reduction_conseille,
        "prix_min_30j_reduction": reduction_min_30j,
        "etat_stock": stock_text,
        "en_stock": in_stock,
        "note": parse_rating_value(rating_text),
        "nombre_avis": parse_rating_count(rating_count_text),
        # Badge flags are True when present and None (not False) otherwise.
        "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None,
        "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None,
        "prime": True if prime_eligible else None,
        "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None,
        "a_propos": a_propos,
        "description": description,
        "carateristique": carateristique,
        "details": details,
    }

    return data
|
||||
Reference in New Issue
Block a user