last
This commit is contained in:
@@ -181,6 +181,23 @@ def _extract_details(soup: BeautifulSoup) -> dict[str, str] | None:
|
||||
return details or None
|
||||
|
||||
|
||||
def _extract_category(soup: BeautifulSoup) -> str | None:
    """Extract the category path from the Amazon breadcrumb trail.

    Looks up the ``#wayfinding-breadcrumbs_feature_div`` container and
    collects the text of every link matching
    ``ul li span.a-list-item a`` inside it.

    Args:
        soup: Parsed product page.

    Returns:
        The category labels joined with ``" > "`` in document order, or
        ``None`` when the breadcrumb container is absent or yields no
        non-empty label.
    """
    breadcrumb = soup.select_one("#wayfinding-breadcrumbs_feature_div")
    if breadcrumb is None:
        return None

    categories: list[str] = []
    for link in breadcrumb.select("ul li span.a-list-item a"):
        # ``strip=True`` only trims the ends of each text node, so join the
        # nodes with a space and collapse any internal whitespace runs
        # (newlines, indentation) to keep each label on a single line.
        label = " ".join(link.get_text(separator=" ", strip=True).split())
        if label:
            categories.append(label)

    # An empty breadcrumb (no links, or only blank links) means "no category".
    return " > ".join(categories) if categories else None
|
||||
|
||||
|
||||
def _parse_percent(text: str | None) -> int | None:
|
||||
if not text:
|
||||
return None
|
||||
@@ -286,6 +303,7 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
|
||||
description = _extract_description(soup)
|
||||
carateristique = _extract_carateristique(soup)
|
||||
details = _extract_details(soup)
|
||||
categorie_amazon = _extract_category(soup)
|
||||
|
||||
asin = _safe_attr_soup(soup, "input#ASIN", "value") or _extract_asin_from_url(url)
|
||||
|
||||
@@ -311,6 +329,7 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
|
||||
"description": description,
|
||||
"carateristique": carateristique,
|
||||
"details": details,
|
||||
"categorie_amazon": categorie_amazon,
|
||||
}
|
||||
|
||||
missing = [key for key in ("titre", "prix_actuel", "note") if not data.get(key)]
|
||||
@@ -431,6 +450,7 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
|
||||
description = _extract_description(soup)
|
||||
carateristique = _extract_carateristique(soup)
|
||||
details = _extract_details(soup)
|
||||
categorie_amazon = _extract_category(soup)
|
||||
|
||||
data = {
|
||||
"url": url,
|
||||
@@ -454,6 +474,7 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
|
||||
"description": description,
|
||||
"carateristique": carateristique,
|
||||
"details": details,
|
||||
"categorie_amazon": categorie_amazon,
|
||||
}
|
||||
|
||||
return data
|
||||
|
||||
Reference in New Issue
Block a user