before claude

This commit is contained in:
Gilles Soulier
2026-01-18 06:26:17 +01:00
parent dc19315e5d
commit 740c3d7516
60 changed files with 3815 additions and 354 deletions

View File

@@ -15,6 +15,13 @@ price:
- "#priceblock_dealprice"
- ".a-price-range .a-price .a-offscreen"
# Texte de réduction explicite
discount_text:
- "#regularprice_savings"
- "#dealprice_savings"
- "#savingsPercentage"
- "span.savingsPercentage"
# Devise (généralement dans le symbole)
currency:
- "span.a-price-symbol"
@@ -32,6 +39,24 @@ stock_status:
- "#availability"
- ".a-declarative .a-size-medium"
# Note moyenne
rating_value:
- "#acrPopover"
- "#averageCustomerReviews .a-icon-alt"
- "#averageCustomerReviews span.a-icon-alt"
# Nombre d'évaluations
rating_count:
- "#acrCustomerReviewText"
- "#acrCustomerReviewLink"
# Badge Choix d'Amazon
amazon_choice:
- "#acBadge_feature_div"
- "#acBadge_feature_div .ac-badge"
- "#acBadge_feature_div .ac-badge-rectangle"
- "#acBadge_feature_div .ac-badge-rectangle-icon"
# Images produit
images:
- "#landingImage"
@@ -44,6 +69,13 @@ category:
- "#wayfinding-breadcrumbs_feature_div"
- ".a-breadcrumb"
# Description (détails de l'article)
description:
- "#detailBullets_feature_div"
- "#detailBulletsWrapper_feature_div"
- "#productDetails_detailBullets_sections1"
- "#feature-bullets"
# Caractéristiques techniques (table specs)
specs_table:
- "#productDetails_techSpec_section_1"

View File

@@ -130,13 +130,19 @@ class AmazonStore(BaseStore):
title = self._extract_title(soup, debug_info)
price = self._extract_price(soup, debug_info)
currency = self._extract_currency(soup, debug_info)
stock_status = self._extract_stock(soup, debug_info)
images = self._extract_images(soup, debug_info)
stock_status, stock_text, in_stock = self._extract_stock_details(soup, debug_info)
main_image, gallery_images, images = self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
rating_value = self._extract_rating_value(soup, debug_info)
rating_count = self._extract_rating_count(soup, debug_info)
amazon_choice, amazon_choice_label = self._extract_amazon_choice(soup, debug_info)
discount_text = self._extract_discount_text(soup, debug_info)
model_number, model_name = self._extract_model_details(specs)
asin = reference
# Déterminer le statut final (ne pas écraser FAILED)
if debug_info.status != DebugStatus.FAILED:
@@ -153,12 +159,24 @@ class AmazonStore(BaseStore):
currency=currency or "EUR",
shipping_cost=None, # Difficile à extraire
stock_status=stock_status,
stock_text=stock_text,
in_stock=in_stock,
reference=reference,
asin=asin,
category=category,
description=description,
images=images,
main_image=main_image,
gallery_images=gallery_images,
specs=specs,
msrp=msrp,
rating_value=rating_value,
rating_count=rating_count,
amazon_choice=amazon_choice,
amazon_choice_label=amazon_choice_label,
discount_text=discount_text,
model_number=model_number,
model_name=model_name,
debug=debug_info,
)
@@ -203,14 +221,26 @@ class AmazonStore(BaseStore):
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la description (meta tags)."""
meta = soup.find("meta", property="og:description") or soup.find(
"meta", attrs={"name": "description"}
)
if meta:
description = meta.get("content", "").strip()
if description:
return description
"""Extrait la description depuis les détails de l'article."""
selectors = self.get_selector("description", [])
if isinstance(selectors, str):
selectors = [selectors]
for selector in selectors:
element = soup.select_one(selector)
if not element:
continue
items = [
item.get_text(" ", strip=True)
for item in element.select("li")
if item.get_text(strip=True)
]
if items:
return "\n".join(items)
text = " ".join(element.stripped_strings)
if text:
return text
return None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
@@ -271,8 +301,10 @@ class AmazonStore(BaseStore):
# Défaut basé sur le domaine
return "EUR"
def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
"""Extrait le statut de stock."""
def _extract_stock_details(
self, soup: BeautifulSoup, debug: DebugInfo
) -> tuple[StockStatus, Optional[str], Optional[bool]]:
"""Extrait le statut de stock avec texte brut."""
selectors = self.get_selector("stock_status", [])
if isinstance(selectors, str):
selectors = [selectors]
@@ -280,22 +312,27 @@ class AmazonStore(BaseStore):
for selector in selectors:
element = soup.select_one(selector)
if element:
text = element.get_text(strip=True).lower()
if "en stock" in text or "available" in text or "in stock" in text:
return StockStatus.IN_STOCK
text = element.get_text(strip=True)
normalized = text.lower()
if "en stock" in normalized or "available" in normalized or "in stock" in normalized:
return StockStatus.IN_STOCK, text, True
elif (
"rupture" in text
or "indisponible" in text
or "out of stock" in text
"rupture" in normalized
or "indisponible" in normalized
or "out of stock" in normalized
):
return StockStatus.OUT_OF_STOCK
return StockStatus.OUT_OF_STOCK, text, False
return StockStatus.UNKNOWN
return StockStatus.UNKNOWN, None, None
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
"""Extrait les URLs d'images."""
images = []
seen = set()
def _extract_images(
self, soup: BeautifulSoup, debug: DebugInfo
) -> tuple[Optional[str], list[str], list[str]]:
"""Extrait l'image principale et la galerie."""
images: list[str] = []
seen: set[str] = set()
main_image: Optional[str] = None
max_gallery = 15
selectors = self.get_selector("images", [])
if isinstance(selectors, str):
selectors = [selectors]
@@ -309,6 +346,8 @@ class AmazonStore(BaseStore):
if self._is_product_image(url) and url not in seen:
images.append(url)
seen.add(url)
if main_image is None:
main_image = url
dynamic = element.get("data-a-dynamic-image")
if dynamic:
urls = self._extract_dynamic_images(dynamic)
@@ -316,6 +355,8 @@ class AmazonStore(BaseStore):
if self._is_product_image(dyn_url) and dyn_url not in seen:
images.append(dyn_url)
seen.add(dyn_url)
if main_image is None:
main_image = dyn_url
# Fallback: chercher tous les img tags si aucune image trouvée
if not images:
@@ -326,8 +367,15 @@ class AmazonStore(BaseStore):
if url not in seen:
images.append(url)
seen.add(url)
if main_image is None:
main_image = url
return images
if main_image is None and images:
main_image = images[0]
gallery_images = [url for url in images if url != main_image]
gallery_images = gallery_images[:max_gallery]
final_images = [main_image] + gallery_images if main_image else gallery_images
return main_image, gallery_images, final_images
def _extract_dynamic_images(self, raw: str) -> list[str]:
"""Extrait les URLs du JSON data-a-dynamic-image."""
@@ -393,8 +441,111 @@ class AmazonStore(BaseStore):
if key and value:
specs[key] = value
# Détails de l'article sous forme de liste
detail_list = soup.select("#detailBullets_feature_div li")
for item in detail_list:
text = item.get_text(" ", strip=True)
if ":" not in text:
continue
key, value = text.split(":", 1)
key = key.strip()
value = value.strip()
if key and value and key not in specs:
specs[key] = value
return specs
def _extract_rating_value(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait la note moyenne."""
selectors = self.get_selector("rating_value", [])
if isinstance(selectors, str):
selectors = [selectors]
for selector in selectors:
element = soup.select_one(selector)
if not element:
continue
text = element.get_text(" ", strip=True) or element.get("title", "").strip()
match = re.search(r"([\d.,]+)", text)
if match:
value = match.group(1).replace(",", ".")
try:
return float(value)
except ValueError:
continue
return None
def _extract_rating_count(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[int]:
"""Extrait le nombre d'évaluations."""
selectors = self.get_selector("rating_count", [])
if isinstance(selectors, str):
selectors = [selectors]
for selector in selectors:
element = soup.select_one(selector)
if not element:
continue
text = element.get_text(" ", strip=True)
match = re.search(r"([\d\s\u202f\u00a0]+)", text)
if match:
numeric = re.sub(r"[^\d]", "", match.group(1))
if numeric:
return int(numeric)
return None
def _extract_amazon_choice(
self, soup: BeautifulSoup, debug: DebugInfo
) -> tuple[Optional[bool], Optional[str]]:
"""Extrait le badge Choix d'Amazon."""
selectors = self.get_selector("amazon_choice", [])
if isinstance(selectors, str):
selectors = [selectors]
for selector in selectors:
element = soup.select_one(selector)
if element:
label_candidates = [
element.get_text(" ", strip=True),
element.get("aria-label", "").strip(),
element.get("title", "").strip(),
element.get("data-a-badge-label", "").strip(),
]
label = next((item for item in label_candidates if item), "")
normalized = label.lower()
if "choix d'amazon" in normalized or "amazon's choice" in normalized:
return True, label
if label:
return True, label
return True, None
return None, None
def _extract_discount_text(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait le texte de réduction explicite."""
selectors = self.get_selector("discount_text", [])
if isinstance(selectors, str):
selectors = [selectors]
for selector in selectors:
element = soup.select_one(selector)
if not element:
continue
text = element.get_text(" ", strip=True)
if text:
return text
return None
def _extract_model_details(self, specs: dict[str, str]) -> tuple[Optional[str], Optional[str]]:
"""Extrait le numero et le nom du modele depuis les specs."""
model_number = None
model_name = None
for key, value in specs.items():
normalized = key.lower()
if "numéro du modèle de l'article" in normalized or "numero du modele de l'article" in normalized:
model_number = value
if "nom du modèle" in normalized or "nom du modele" in normalized:
model_name = value
return model_number, model_name
def _extract_asin_from_html(self, soup: BeautifulSoup) -> Optional[str]:
"""Extrait l'ASIN depuis le HTML (fallback)."""
selectors = self.get_selector("asin", [])