before claude
This commit is contained in:
Binary file not shown.
@@ -15,6 +15,13 @@ price:
|
||||
- "#priceblock_dealprice"
|
||||
- ".a-price-range .a-price .a-offscreen"
|
||||
|
||||
# Texte de réduction explicite
|
||||
discount_text:
|
||||
- "#regularprice_savings"
|
||||
- "#dealprice_savings"
|
||||
- "#savingsPercentage"
|
||||
- "span.savingsPercentage"
|
||||
|
||||
# Devise (généralement dans le symbole)
|
||||
currency:
|
||||
- "span.a-price-symbol"
|
||||
@@ -32,6 +39,24 @@ stock_status:
|
||||
- "#availability"
|
||||
- ".a-declarative .a-size-medium"
|
||||
|
||||
# Note moyenne
|
||||
rating_value:
|
||||
- "#acrPopover"
|
||||
- "#averageCustomerReviews .a-icon-alt"
|
||||
- "#averageCustomerReviews span.a-icon-alt"
|
||||
|
||||
# Nombre d'évaluations
|
||||
rating_count:
|
||||
- "#acrCustomerReviewText"
|
||||
- "#acrCustomerReviewLink"
|
||||
|
||||
# Badge Choix d'Amazon
|
||||
amazon_choice:
|
||||
- "#acBadge_feature_div"
|
||||
- "#acBadge_feature_div .ac-badge"
|
||||
- "#acBadge_feature_div .ac-badge-rectangle"
|
||||
- "#acBadge_feature_div .ac-badge-rectangle-icon"
|
||||
|
||||
# Images produit
|
||||
images:
|
||||
- "#landingImage"
|
||||
@@ -44,6 +69,13 @@ category:
|
||||
- "#wayfinding-breadcrumbs_feature_div"
|
||||
- ".a-breadcrumb"
|
||||
|
||||
# Description (détails de l'article)
|
||||
description:
|
||||
- "#detailBullets_feature_div"
|
||||
- "#detailBulletsWrapper_feature_div"
|
||||
- "#productDetails_detailBullets_sections1"
|
||||
- "#feature-bullets"
|
||||
|
||||
# Caractéristiques techniques (table specs)
|
||||
specs_table:
|
||||
- "#productDetails_techSpec_section_1"
|
||||
|
||||
@@ -130,13 +130,19 @@ class AmazonStore(BaseStore):
|
||||
title = self._extract_title(soup, debug_info)
|
||||
price = self._extract_price(soup, debug_info)
|
||||
currency = self._extract_currency(soup, debug_info)
|
||||
stock_status = self._extract_stock(soup, debug_info)
|
||||
images = self._extract_images(soup, debug_info)
|
||||
stock_status, stock_text, in_stock = self._extract_stock_details(soup, debug_info)
|
||||
main_image, gallery_images, images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
|
||||
rating_value = self._extract_rating_value(soup, debug_info)
|
||||
rating_count = self._extract_rating_count(soup, debug_info)
|
||||
amazon_choice, amazon_choice_label = self._extract_amazon_choice(soup, debug_info)
|
||||
discount_text = self._extract_discount_text(soup, debug_info)
|
||||
model_number, model_name = self._extract_model_details(specs)
|
||||
asin = reference
|
||||
|
||||
# Déterminer le statut final (ne pas écraser FAILED)
|
||||
if debug_info.status != DebugStatus.FAILED:
|
||||
@@ -153,12 +159,24 @@ class AmazonStore(BaseStore):
|
||||
currency=currency or "EUR",
|
||||
shipping_cost=None, # Difficile à extraire
|
||||
stock_status=stock_status,
|
||||
stock_text=stock_text,
|
||||
in_stock=in_stock,
|
||||
reference=reference,
|
||||
asin=asin,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
main_image=main_image,
|
||||
gallery_images=gallery_images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
rating_value=rating_value,
|
||||
rating_count=rating_count,
|
||||
amazon_choice=amazon_choice,
|
||||
amazon_choice_label=amazon_choice_label,
|
||||
discount_text=discount_text,
|
||||
model_number=model_number,
|
||||
model_name=model_name,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -203,14 +221,26 @@ class AmazonStore(BaseStore):
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la description (meta tags)."""
|
||||
meta = soup.find("meta", property="og:description") or soup.find(
|
||||
"meta", attrs={"name": "description"}
|
||||
)
|
||||
if meta:
|
||||
description = meta.get("content", "").strip()
|
||||
if description:
|
||||
return description
|
||||
"""Extrait la description depuis les détails de l'article."""
|
||||
selectors = self.get_selector("description", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if not element:
|
||||
continue
|
||||
items = [
|
||||
item.get_text(" ", strip=True)
|
||||
for item in element.select("li")
|
||||
if item.get_text(strip=True)
|
||||
]
|
||||
if items:
|
||||
return "\n".join(items)
|
||||
text = " ".join(element.stripped_strings)
|
||||
if text:
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
@@ -271,8 +301,10 @@ class AmazonStore(BaseStore):
|
||||
# Défaut basé sur le domaine
|
||||
return "EUR"
|
||||
|
||||
def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
|
||||
"""Extrait le statut de stock."""
|
||||
def _extract_stock_details(
|
||||
self, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> tuple[StockStatus, Optional[str], Optional[bool]]:
|
||||
"""Extrait le statut de stock avec texte brut."""
|
||||
selectors = self.get_selector("stock_status", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -280,22 +312,27 @@ class AmazonStore(BaseStore):
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True).lower()
|
||||
if "en stock" in text or "available" in text or "in stock" in text:
|
||||
return StockStatus.IN_STOCK
|
||||
text = element.get_text(strip=True)
|
||||
normalized = text.lower()
|
||||
if "en stock" in normalized or "available" in normalized or "in stock" in normalized:
|
||||
return StockStatus.IN_STOCK, text, True
|
||||
elif (
|
||||
"rupture" in text
|
||||
or "indisponible" in text
|
||||
or "out of stock" in text
|
||||
"rupture" in normalized
|
||||
or "indisponible" in normalized
|
||||
or "out of stock" in normalized
|
||||
):
|
||||
return StockStatus.OUT_OF_STOCK
|
||||
return StockStatus.OUT_OF_STOCK, text, False
|
||||
|
||||
return StockStatus.UNKNOWN
|
||||
return StockStatus.UNKNOWN, None, None
|
||||
|
||||
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
|
||||
"""Extrait les URLs d'images."""
|
||||
images = []
|
||||
seen = set()
|
||||
def _extract_images(
|
||||
self, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> tuple[Optional[str], list[str], list[str]]:
|
||||
"""Extrait l'image principale et la galerie."""
|
||||
images: list[str] = []
|
||||
seen: set[str] = set()
|
||||
main_image: Optional[str] = None
|
||||
max_gallery = 15
|
||||
selectors = self.get_selector("images", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -309,6 +346,8 @@ class AmazonStore(BaseStore):
|
||||
if self._is_product_image(url) and url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
if main_image is None:
|
||||
main_image = url
|
||||
dynamic = element.get("data-a-dynamic-image")
|
||||
if dynamic:
|
||||
urls = self._extract_dynamic_images(dynamic)
|
||||
@@ -316,6 +355,8 @@ class AmazonStore(BaseStore):
|
||||
if self._is_product_image(dyn_url) and dyn_url not in seen:
|
||||
images.append(dyn_url)
|
||||
seen.add(dyn_url)
|
||||
if main_image is None:
|
||||
main_image = dyn_url
|
||||
|
||||
# Fallback: chercher tous les img tags si aucune image trouvée
|
||||
if not images:
|
||||
@@ -326,8 +367,15 @@ class AmazonStore(BaseStore):
|
||||
if url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
if main_image is None:
|
||||
main_image = url
|
||||
|
||||
return images
|
||||
if main_image is None and images:
|
||||
main_image = images[0]
|
||||
gallery_images = [url for url in images if url != main_image]
|
||||
gallery_images = gallery_images[:max_gallery]
|
||||
final_images = [main_image] + gallery_images if main_image else gallery_images
|
||||
return main_image, gallery_images, final_images
|
||||
|
||||
def _extract_dynamic_images(self, raw: str) -> list[str]:
|
||||
"""Extrait les URLs du JSON data-a-dynamic-image."""
|
||||
@@ -393,8 +441,111 @@ class AmazonStore(BaseStore):
|
||||
if key and value:
|
||||
specs[key] = value
|
||||
|
||||
# Détails de l'article sous forme de liste
|
||||
detail_list = soup.select("#detailBullets_feature_div li")
|
||||
for item in detail_list:
|
||||
text = item.get_text(" ", strip=True)
|
||||
if ":" not in text:
|
||||
continue
|
||||
key, value = text.split(":", 1)
|
||||
key = key.strip()
|
||||
value = value.strip()
|
||||
if key and value and key not in specs:
|
||||
specs[key] = value
|
||||
|
||||
return specs
|
||||
|
||||
def _extract_rating_value(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Extract the average customer rating.

    The selectors configured under ``rating_value`` are tried in order. For
    each matching element the visible text is used, falling back to the
    ``title`` attribute when the element has no text. The first decimal
    number found (``4.5`` or ``4,5``) is returned as a float.

    Args:
        soup: Parsed product page.
        debug: Debug accumulator (not used by this extractor).

    Returns:
        The rating as a float, or ``None`` when no selector yields a number.
    """
    selectors = self.get_selector("rating_value", [])
    if isinstance(selectors, str):
        selectors = [selectors]

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue
        text = element.get_text(" ", strip=True) or element.get("title", "").strip()
        # Require at least one digit so that stray punctuation preceding the
        # number (e.g. "Rated. 4.5 out of 5") cannot be picked up as the match
        # and make the whole selector fail.
        match = re.search(r"\d+(?:[.,]\d+)?", text)
        if match:
            # The comma is the decimal separator on French pages.
            return float(match.group(0).replace(",", "."))
    return None
|
||||
|
||||
def _extract_rating_count(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[int]:
    """Extract the number of customer ratings.

    The selectors configured under ``rating_count`` are tried in order. The
    first run of digits in the element text is taken, allowing spaces,
    non-breaking spaces, ``,`` and ``.`` as thousands separators, and is
    returned as an integer.

    Args:
        soup: Parsed product page.
        debug: Debug accumulator (not used by this extractor).

    Returns:
        The rating count, or ``None`` when no selector yields a number.
    """
    selectors = self.get_selector("rating_count", [])
    if isinstance(selectors, str):
        selectors = [selectors]

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue
        text = element.get_text(" ", strip=True)
        # The match must start on a digit: otherwise the first whitespace
        # between two words would be matched and the real number missed.
        # Separators are tolerated inside the run so "1,234" or "1 234"
        # is read as 1234 rather than truncated to 1.
        match = re.search(r"\d[\d\s\u202f\u00a0.,]*", text)
        if match:
            numeric = re.sub(r"[^\d]", "", match.group(0))
            if numeric:
                return int(numeric)
    return None
|
||||
|
||||
def _extract_amazon_choice(
    self, soup: BeautifulSoup, debug: DebugInfo
) -> tuple[Optional[bool], Optional[str]]:
    """Detect the "Amazon's Choice" badge and its label.

    The selectors configured under ``amazon_choice`` are tried in order. As
    soon as one matches an element, the badge is reported as present; the
    label is the first non-empty value among the element text and its
    ``aria-label``, ``title`` and ``data-a-badge-label`` attributes.

    Args:
        soup: Parsed product page.
        debug: Debug accumulator (not used by this extractor).

    Returns:
        ``(True, label)`` when a selector matches and a label is found,
        ``(True, None)`` when it matches without any label, and
        ``(None, None)`` when no selector matches.
    """
    badge_selectors = self.get_selector("amazon_choice", [])
    if isinstance(badge_selectors, str):
        badge_selectors = [badge_selectors]

    for css in badge_selectors:
        badge = soup.select_one(css)
        if not badge:
            continue
        candidates = (
            badge.get_text(" ", strip=True),
            badge.get("aria-label", "").strip(),
            badge.get("title", "").strip(),
            badge.get("data-a-badge-label", "").strip(),
        )
        for candidate in candidates:
            if candidate:
                return True, candidate
        # NOTE(review): a matched element with no label still counts as a
        # badge; confirm the container is never rendered empty on pages
        # without the badge.
        return True, None
    return None, None
|
||||
|
||||
def _extract_discount_text(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Extract the explicit discount text.

    The selectors configured under ``discount_text`` are tried in order and
    the text of the first matching element that is not empty is returned.

    Args:
        soup: Parsed product page.
        debug: Debug accumulator (not used by this extractor).

    Returns:
        The discount text, or ``None`` when no selector yields any text.
    """
    candidates = self.get_selector("discount_text", [])
    if isinstance(candidates, str):
        candidates = [candidates]

    for css in candidates:
        node = soup.select_one(css)
        if node:
            label = node.get_text(" ", strip=True)
            if label:
                return label
    return None
|
||||
|
||||
def _extract_model_details(self, specs: dict[str, str]) -> tuple[Optional[str], Optional[str]]:
|
||||
"""Extrait le numero et le nom du modele depuis les specs."""
|
||||
model_number = None
|
||||
model_name = None
|
||||
for key, value in specs.items():
|
||||
normalized = key.lower()
|
||||
if "numéro du modèle de l'article" in normalized or "numero du modele de l'article" in normalized:
|
||||
model_number = value
|
||||
if "nom du modèle" in normalized or "nom du modele" in normalized:
|
||||
model_name = value
|
||||
return model_number, model_name
|
||||
|
||||
def _extract_asin_from_html(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extrait l'ASIN depuis le HTML (fallback)."""
|
||||
selectors = self.get_selector("asin", [])
|
||||
|
||||
Reference in New Issue
Block a user