from __future__ import annotations import re from typing import Any from bs4 import BeautifulSoup from loguru import logger from playwright.sync_api import Page from backend.app.scraper.normalize import ( parse_price_fr, parse_rating_count, parse_rating_value, parse_stock_status, ) def detect_blocked(html: str) -> bool: # détection simple des blocages / captcha lowered = html.lower() if "captcha" in lowered or "robot" in lowered: return True if "saisissez les caractères" in lowered or "vérification" in lowered: return True return False def _safe_text(page: Page, selector: str) -> str | None: try: locator = page.locator(selector) if locator.count() == 0: return None value = locator.first.inner_text().strip() return value or None except Exception: return None def _safe_attr(page: Page, selector: str, attr: str) -> str | None: try: locator = page.locator(selector) if locator.count() == 0: return None return locator.first.get_attribute(attr) except Exception: return None def _extract_asin_from_url(url: str) -> str | None: match = re.search(r"/dp/([A-Z0-9]{10})", url) if match: return match.group(1) return None def _safe_text_soup(soup: BeautifulSoup, selector: str) -> str | None: node = soup.select_one(selector) if not node: return None value = node.get_text(strip=True) return value or None def _safe_attr_soup(soup: BeautifulSoup, selector: str, attr: str) -> str | None: node = soup.select_one(selector) if not node: return None return node.get(attr) def _has_selector_soup(soup: BeautifulSoup, selector: str) -> bool: return soup.select_one(selector) is not None def _compose_price_from_parts(whole: str | None, fraction: str | None, symbol: str | None) -> str | None: if not whole: return None whole_digits = re.sub(r"[^\d]", "", whole) if not whole_digits: return None fraction_digits = re.sub(r"[^\d]", "", fraction or "") if not fraction_digits: fraction_digits = "00" fraction_digits = fraction_digits[:2].ljust(2, "0") symbol = (symbol or "€").strip() return f"{whole_digits},{fraction_digits} {symbol}" def _extract_lowest_30d_text_soup(soup: BeautifulSoup) -> str | None: containers = [] container = soup.select_one("#priceBadging_feature_div") if container: containers.append(container) containers.extend(soup.select(".basisPrice")) for node in containers: text = node.get_text(" ", strip=True) if text and re.search(r"prix.+(30|trente).+jour", text.lower()): price_node = node.select_one(".a-offscreen") if price_node: price_text = price_node.get_text(" ", strip=True) if price_text: return price_text return text return None def _extract_about_bullets(soup: BeautifulSoup) -> list[str] | None: container = soup.select_one("#feature-bullets") if not container: return None items = [] for node in container.select("ul li span.a-list-item"): text = node.get_text(" ", strip=True) if text: items.append(text) return items or None def _extract_description(soup: BeautifulSoup) -> str | None: node = soup.select_one("#productDescription") if not node: return None text = node.get_text(" ", strip=True) return text or None def _extract_table_kv(table) -> dict[str, str]: data: dict[str, str] = {} for row in table.select("tr"): key = row.select_one("th") value = row.select_one("td") if not key or not value: continue key_text = key.get_text(" ", strip=True) value_text = value.get_text(" ", strip=True) if key_text and value_text: data[key_text] = value_text return data def _extract_tables_from_selector(soup: BeautifulSoup, selector: str) -> list: section = soup.select_one(selector) if not section: return [] if section.name == "table": return [section] return section.select("table") def _extract_carateristique(soup: BeautifulSoup) -> dict[str, str] | None: selectors = [ "[data-csa-c-content-id='voyager-expander-btn']", "#productDetails_techSpec_section_1", "#productDetails_techSpec_section_2", ] specs: dict[str, str] = {} for selector in selectors: tables = _extract_tables_from_selector(soup, selector) for table in tables: specs.update(_extract_table_kv(table)) return specs or None def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None: container = soup.select_one("[data-csa-c-content-id='voyager-expander-btn']") carateristique_tables = set(container.select("table")) if container else set() selectors = [ "#productDetails_techSpec_section_1", "#productDetails_detailBullets_sections1", "#productDetails_detailBullets_sections2", "#productDetails", ] details: dict[str, str] = {} seen_tables = set() for selector in selectors: for table in _extract_tables_from_selector(soup, selector): if table in carateristique_tables or table in seen_tables: continue seen_tables.add(table) details.update(_extract_table_kv(table)) return details or None def _parse_percent(text: str | None) -> int | None: if not text: return None match = re.search(r"(-?\d+)", text.replace("\u00a0", " ")) if not match: return None try: return int(match.group(1)) except ValueError: return None def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]: soup = BeautifulSoup(html, "html.parser") title = _safe_text_soup(soup, "#productTitle") image_main_url = _safe_attr_soup(soup, "#landingImage", "src") if not image_main_url: image_main_url = _safe_attr_soup(soup, "#imgTagWrapperId img", "src") price_text = _safe_text_soup(soup, "#corePriceDisplay_desktop_feature_div .a-offscreen") if not price_text: price_text = _safe_text_soup(soup, "#priceblock_ourprice") if not price_text: price_text = _safe_text_soup(soup, "#priceblock_dealprice") if not price_text: whole = _safe_text_soup(soup, ".a-price .a-price-whole") fraction = _safe_text_soup(soup, ".a-price .a-price-fraction") symbol = _safe_text_soup(soup, ".a-price .a-price-symbol") price_text = _compose_price_from_parts(whole, fraction, symbol) if not price_text: price_text = _safe_attr_soup(soup, "#twister-plus-price-data-price", "value") # prix conseillé (srpPriceBlock = "Prix conseillé : XXX €") price_list_text = _safe_text_soup(soup, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen") if not price_list_text: price_list_text = _safe_text_soup(soup, ".srpPriceBlock .a-offscreen") if not price_list_text: price_list_text = _safe_text_soup(soup, "#priceblock_strikeprice") # fallback sur corePriceDisplay (prix barré) si pas de srpPriceBlock if not price_list_text: price_list_text = _safe_text_soup( soup, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen" ) stock_text = _safe_text_soup(soup, "#availability span") if not stock_text: stock_text = _safe_text_soup(soup, "#availability") in_stock, stock_text = parse_stock_status(stock_text) rating_text = _safe_text_soup(soup, "#acrPopover .a-icon-alt") rating_count_text = _safe_text_soup(soup, "#acrCustomerReviewText") amazon_choice = _safe_text_soup(soup, "#acBadge_feature_div") limited_time_deal = _safe_text_soup(soup, "#dealBadge_feature_div") prime_eligible = None if _has_selector_soup(soup, "#primeBadge"): prime_eligible = True elif _has_selector_soup(soup, "#priceBadging_feature_div #prime-badge"): prime_eligible = True elif _has_selector_soup(soup, "#priceBadging_feature_div i.a-icon-prime"): prime_eligible = True elif _has_selector_soup(soup, "#corePriceDisplay_desktop_feature_div i.a-icon-prime"): prime_eligible = True elif _has_selector_soup(soup, "#priceBadging_feature_div #prime-badge"): prime_eligible = True elif _has_selector_soup(soup, "i#prime-badge"): prime_eligible = True elif _has_selector_soup(soup, "i.a-icon-prime[aria-label*='prime']"): prime_eligible = True amazon_exclusive = "Exclusivité Amazon" if "Exclusivité Amazon" in soup.get_text() else None # prix plus bas 30 jours (basisPrice avec mention "30 jours") lowest_30d_text = _extract_lowest_30d_text_soup(soup) lowest_30d_price = None if lowest_30d_text: lowest_30d_price = parse_price_fr(lowest_30d_text) # si le prix conseillé == prix min 30j, c'est une erreur de détection # (le prix barré dans corePriceDisplay est en fait le prix min 30j, pas le conseillé) price_list_value = parse_price_fr(price_list_text) if price_list_value is not None and lowest_30d_price is not None and price_list_value == lowest_30d_price: price_list_text = None price_list_value = None # réductions reduction_savings_text = _safe_text_soup( soup, "#corePriceDisplay_desktop_feature_div .savingsPercentage" ) reduction_conseille_text = _safe_text_soup(soup, ".srpSavingsPercentageBlock") # attribuer correctement les réductions selon ce qui est présent # - si prix min 30j présent, savingsPercentage = réduction par rapport au min 30j # - si prix conseillé présent (srpPriceBlock), srpSavingsPercentageBlock = réduction par rapport au conseillé reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None # si pas de srpSavingsPercentageBlock mais un savingsPercentage et un prix conseillé (sans min 30j) if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None: reduction_conseille = _parse_percent(reduction_savings_text) a_propos = _extract_about_bullets(soup) description = _extract_description(soup) carateristique = _extract_carateristique(soup) details = _extract_details(soup) asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url) data = { "url": url, "asin": asin, "titre": title, "url_image_principale": image_main_url, "prix_actuel": parse_price_fr(price_text), "prix_conseille": price_list_value, "prix_min_30j": lowest_30d_price, "prix_conseille_reduction": reduction_conseille, "prix_min_30j_reduction": reduction_min_30j, "etat_stock": stock_text, "en_stock": in_stock, "note": parse_rating_value(rating_text), "nombre_avis": parse_rating_count(rating_count_text), "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None, "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None, "prime": True if prime_eligible else None, "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None, "a_propos": a_propos, "description": description, "carateristique": carateristique, "details": details, } missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)] if missing: logger.warning("Champs manquants (html): {}", ", ".join(missing)) return data def extract_product_data(page: Page, url: str) -> dict[str, Any]: # champ titre title = _safe_text(page, "#productTitle") # image principale image_main_url = _safe_attr(page, "#landingImage", "src") if not image_main_url: image_main_url = _safe_attr(page, "#imgTagWrapperId img", "src") # prix actuel price_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-offscreen") if not price_text: price_text = _safe_text(page, "#priceblock_ourprice") if not price_text: price_text = _safe_text(page, "#priceblock_dealprice") if not price_text: whole = _safe_text(page, ".a-price .a-price-whole") fraction = _safe_text(page, ".a-price .a-price-fraction") symbol = _safe_text(page, ".a-price .a-price-symbol") price_text = _compose_price_from_parts(whole, fraction, symbol) if not price_text: price_text = _safe_attr(page, "#twister-plus-price-data-price", "value") # prix conseillé (srpPriceBlock = "Prix conseillé : XXX €") price_list_text = _safe_text(page, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen") if not price_list_text: price_list_text = _safe_text(page, ".srpPriceBlock .a-offscreen") if not price_list_text: price_list_text = _safe_text(page, "#priceblock_strikeprice") # fallback sur corePriceDisplay (prix barré) si pas de srpPriceBlock if not price_list_text: price_list_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen") # stock stock_text = _safe_text(page, "#availability span") if not stock_text: stock_text = _safe_text(page, "#availability") in_stock, stock_text = parse_stock_status(stock_text) # rating rating_text = _safe_text(page, "#acrPopover .a-icon-alt") rating_count_text = _safe_text(page, "#acrCustomerReviewText") # badges amazon_choice = _safe_text(page, "#acBadge_feature_div") limited_time_deal = _safe_text(page, "#dealBadge_feature_div") prime_eligible = None if page.locator("#primeBadge").count() > 0: prime_eligible = True elif page.locator("#priceBadging_feature_div #prime-badge").count() > 0: prime_eligible = True elif page.locator("#priceBadging_feature_div i.a-icon-prime").count() > 0: prime_eligible = True elif page.locator("#corePriceDisplay_desktop_feature_div i.a-icon-prime").count() > 0: prime_eligible = True elif page.locator("#priceBadging_feature_div #prime-badge").count() > 0: prime_eligible = True elif page.locator("i#prime-badge").count() > 0: prime_eligible = True elif page.locator("i.a-icon-prime[aria-label*='prime']").count() > 0: prime_eligible = True amazon_exclusive = _safe_text(page, "text=Exclusivité Amazon") # prix plus bas 30 jours (basisPrice ou corePriceDisplay avec mention "30 jours") lowest_30d_text = None lowest_30d_price = None if page.locator(".basisPrice").count() > 0: basis_text = page.locator(".basisPrice").first.inner_text() if basis_text and re.search(r"prix.+(30|trente).+jour", basis_text.lower()): lowest_30d_text = _safe_text(page, ".basisPrice .a-price .a-offscreen") or basis_text lowest_30d_price = parse_price_fr(lowest_30d_text) # fallback sur corePriceDisplay si contient mention 30 jours if lowest_30d_price is None and page.locator("#corePriceDisplay_desktop_feature_div .a-text-price").count() > 0: core_text = page.locator("#corePriceDisplay_desktop_feature_div").first.inner_text() if core_text and re.search(r"prix.+(30|trente).+jour", core_text.lower()): lowest_30d_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price .a-offscreen") lowest_30d_price = parse_price_fr(lowest_30d_text) if not lowest_30d_price and page.locator("#priceBadging_feature_div").count() > 0: badging_text = page.locator("#priceBadging_feature_div").first.inner_text() if badging_text and re.search(r"prix.+(30|trente).+jour", badging_text.lower()): lowest_30d_text = _safe_text(page, "#priceBadging_feature_div .a-offscreen") or badging_text lowest_30d_price = parse_price_fr(lowest_30d_text) # si le prix conseillé == prix min 30j, c'est une erreur de détection price_list_value = parse_price_fr(price_list_text) if price_list_value is not None and lowest_30d_price is not None and price_list_value == lowest_30d_price: price_list_text = None price_list_value = None # réductions # savingsPercentage dans corePriceDisplay = réduction par rapport au prix min 30j (si présent) # srpSavingsPercentageBlock = réduction par rapport au prix conseillé reduction_savings_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .savingsPercentage") reduction_conseille_text = _safe_text(page, ".srpSavingsPercentageBlock") # attribuer correctement les réductions selon ce qui est présent reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None # si pas de srpSavingsPercentageBlock mais un savingsPercentage et un prix conseillé (sans min 30j) if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None: reduction_conseille = _parse_percent(reduction_savings_text) asin = _safe_attr(page, "input#ASIN", "value") or _extract_asin_from_url(url) soup = BeautifulSoup(page.content(), "html.parser") a_propos = _extract_about_bullets(soup) description = _extract_description(soup) carateristique = _extract_carateristique(soup) details = _extract_details(soup) data = { "url": url, "asin": asin, "titre": title, "url_image_principale": image_main_url, "prix_actuel": parse_price_fr(price_text), "prix_conseille": price_list_value, "prix_min_30j": lowest_30d_price, "prix_conseille_reduction": reduction_conseille, "prix_min_30j_reduction": reduction_min_30j, "etat_stock": stock_text, "en_stock": in_stock, "note": parse_rating_value(rating_text), "nombre_avis": parse_rating_count(rating_count_text), "choix_amazon": bool(amazon_choice) if amazon_choice is not None else None, "offre_limitee": bool(limited_time_deal) if limited_time_deal is not None else None, "prime": True if prime_eligible else None, "exclusivite_amazon": bool(amazon_exclusive) if amazon_exclusive is not None else None, "a_propos": a_propos, "description": description, "carateristique": carateristique, "details": details, } return data