This commit is contained in:
2026-01-25 14:48:26 +01:00
parent 5c3e6b84a4
commit c56a4632a2
958 changed files with 1149102 additions and 123 deletions

View File

@@ -181,6 +181,23 @@ def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None:
return details or None
def _extract_category(soup: BeautifulSoup) -> str | None:
    """Extract the category path from the Amazon breadcrumb trail.

    Looks up the ``#wayfinding-breadcrumbs_feature_div`` container and
    collects the stripped text of every link matched by
    ``ul li span.a-list-item a`` inside it.

    Args:
        soup: Parsed product page.

    Returns:
        The non-empty link texts joined with " > ", or ``None`` when the
        container is absent, contains no matching links, or every link
        text is empty.
    """
    container = soup.select_one("#wayfinding-breadcrumbs_feature_div")
    # Only query for links when the breadcrumb container was found.
    anchors = container.select("ul li span.a-list-item a") if container else None
    if not anchors:
        return None

    # Keep only links that carry visible text; bind the text once per link.
    labels = [label for anchor in anchors if (label := anchor.get_text(strip=True))]

    # Full path from root category to leaf, or None when nothing usable remains.
    return " > ".join(labels) if labels else None
def _parse_percent(text: str | None) -> int | None:
if not text:
return None
@@ -286,6 +303,7 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
description = _extract_description(soup)
carateristique = _extract_carateristique(soup)
details = _extract_details(soup)
categorie_amazon = _extract_category(soup)
asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url)
@@ -311,6 +329,7 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
"description": description,
"carateristique": carateristique,
"details": details,
"categorie_amazon": categorie_amazon,
}
missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)]
@@ -431,6 +450,7 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
description = _extract_description(soup)
carateristique = _extract_carateristique(soup)
details = _extract_details(soup)
categorie_amazon = _extract_category(soup)
data = {
"url": url,
@@ -454,6 +474,7 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
"description": description,
"carateristique": carateristique,
"details": details,
"categorie_amazon": categorie_amazon,
}
return data