This commit is contained in:
Gilles Soulier
2026-01-14 21:54:55 +01:00
parent c91c0f1fc9
commit d0b73b9319
140 changed files with 5822 additions and 161 deletions

View File

View File

@@ -54,12 +54,12 @@ specs_table:
# ASIN (parfois dans les métadonnées)
asin:
- "input[name='ASIN']"
- "th:contains('ASIN') + td"
- "th:-soup-contains('ASIN') + td"
# Messages captcha / robot check
captcha_indicators:
- "form[action*='validateCaptcha']"
- "p.a-last:contains('Sorry')"
- "p.a-last:-soup-contains('Sorry')"
- "img[alt*='captcha']"
# Notes pour le parsing:

View File

@@ -4,7 +4,9 @@ Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
"""
import json
import re
from html import unescape
from datetime import datetime
from pathlib import Path
from typing import Optional
@@ -21,6 +23,7 @@ from pricewatch.app.core.schema import (
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.amazon")
@@ -131,6 +134,8 @@ class AmazonStore(BaseStore):
images = self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
# Déterminer le statut final (ne pas écraser FAILED)
@@ -150,8 +155,10 @@ class AmazonStore(BaseStore):
stock_status=stock_status,
reference=reference,
category=category,
description=description,
images=images,
specs=specs,
msrp=msrp,
debug=debug_info,
)
@@ -195,6 +202,17 @@ class AmazonStore(BaseStore):
debug.errors.append("Titre non trouvé")
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Extract the product description from meta tags.

    Prefers the ``og:description`` meta tag; falls back to the plain
    ``name="description"`` tag only when the former is absent. Returns
    None when no tag is found or its content is empty/whitespace.
    """
    tag = soup.find("meta", property="og:description")
    if tag is None:
        tag = soup.find("meta", attrs={"name": "description"})
    if tag is None:
        return None
    content = tag.get("content", "").strip()
    return content or None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix."""
selectors = self.get_selector("price", [])
@@ -205,14 +223,9 @@ class AmazonStore(BaseStore):
elements = soup.select(selector)
for element in elements:
text = element.get_text(strip=True)
# Extraire nombre (format: "299,99" ou "299.99")
match = re.search(r"(\d+)[.,](\d+)", text)
if match:
price_str = f"{match.group(1)}.{match.group(2)}"
try:
return float(price_str)
except ValueError:
continue
price = parse_price_text(text)
if price is not None:
return price
# Fallback: chercher les spans séparés a-price-whole et a-price-fraction
whole = soup.select_one("span.a-price-whole")
@@ -220,15 +233,24 @@ class AmazonStore(BaseStore):
if whole and fraction:
whole_text = whole.get_text(strip=True)
fraction_text = fraction.get_text(strip=True)
try:
price_str = f"{whole_text}.{fraction_text}"
return float(price_str)
except ValueError:
pass
price = parse_price_text(f"{whole_text}.{fraction_text}")
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Extract the list price (strikethrough MSRP) when present.

    Checks the legacy strikethrough span first, then the newer
    a-text-price offscreen span. Returns None when neither element
    exists or its text cannot be parsed as a price.
    """
    element = soup.select_one("span.priceBlockStrikePriceString")
    if element is None:
        element = soup.select_one("span.a-text-price span.a-offscreen")
    if element is None:
        return None
    return parse_price_text(element.get_text(strip=True))
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la devise."""
selectors = self.get_selector("currency", [])
@@ -270,6 +292,7 @@ class AmazonStore(BaseStore):
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
"""Extrait les URLs d'images."""
images = []
seen = set()
selectors = self.get_selector("images", [])
if isinstance(selectors, str):
selectors = [selectors]
@@ -278,19 +301,57 @@ class AmazonStore(BaseStore):
elements = soup.select(selector)
for element in elements:
# Attribut src ou data-src
url = element.get("src") or element.get("data-src")
url = element.get("src") or element.get("data-src") or element.get("data-old-hires")
if url and url.startswith("http"):
images.append(url)
if self._is_product_image(url) and url not in seen:
images.append(url)
seen.add(url)
dynamic = element.get("data-a-dynamic-image")
if dynamic:
urls = self._extract_dynamic_images(dynamic)
for dyn_url in urls:
if self._is_product_image(dyn_url) and dyn_url not in seen:
images.append(dyn_url)
seen.add(dyn_url)
# Fallback: chercher tous les img tags si aucune image trouvée
if not images:
all_imgs = soup.find_all("img")
for img in all_imgs:
url = img.get("src") or img.get("data-src")
if url and url.startswith("http"):
images.append(url)
if url and url.startswith("http") and self._is_product_image(url):
if url not in seen:
images.append(url)
seen.add(url)
return list(set(images)) # Dédupliquer
return images
def _extract_dynamic_images(self, raw: str) -> list[str]:
"""Extrait les URLs du JSON data-a-dynamic-image."""
try:
data = json.loads(unescape(raw))
except (TypeError, json.JSONDecodeError):
return []
urls = []
if isinstance(data, dict):
candidates = []
for url, dims in data.items():
if not isinstance(url, str) or not url.startswith("http"):
continue
size = dims[0] if isinstance(dims, list) and dims else 0
candidates.append((size, url))
candidates.sort(key=lambda item: item[0], reverse=True)
for _, url in candidates:
urls.append(url)
return urls
def _is_product_image(self, url: str) -> bool:
"""Filtre basique pour eviter les logos et sprites."""
lowered = url.lower()
if "prime_logo" in lowered or "sprite" in lowered:
return False
return True
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la catégorie depuis les breadcrumbs."""