- Add SPA support for Playwright with wait_for_network_idle and extra_wait_ms - Add BaseStore.get_spa_config() and requires_playwright() methods - Implement AliExpress SPA config with JSON price extraction patterns - Fix Amazon price parsing to prioritize whole+fraction combination - Fix AliExpress regex patterns (remove double backslashes) - Add CLI tests: detect, doctor, fetch, parse, run commands - Add API tests: auth, logs, products, scraping_logs, webhooks Tests: 417 passed, 85% coverage Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
440 lines
15 KiB
Python
Executable File
440 lines
15 KiB
Python
Executable File
"""
|
|
Store AliExpress - Parsing de produits AliExpress.com.
|
|
|
|
Supporte l'extraction de: titre, prix, SKU, images, etc.
|
|
Spécificité: Rendu client-side (SPA) - nécessite Playwright avec attente.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from pricewatch.app.core.logging import get_logger
|
|
from pricewatch.app.core.schema import (
|
|
DebugInfo,
|
|
DebugStatus,
|
|
FetchMethod,
|
|
ProductSnapshot,
|
|
StockStatus,
|
|
)
|
|
from pricewatch.app.stores.base import BaseStore
|
|
from pricewatch.app.stores.price_parser import parse_price_text
|
|
|
|
logger = get_logger("stores.aliexpress")
|
|
|
|
|
|
class AliexpressStore(BaseStore):
|
|
"""Store pour AliExpress.com (marketplace chinois).
|
|
|
|
AliExpress est une SPA (Single Page Application) qui charge
|
|
le contenu via JavaScript/AJAX. Nécessite Playwright avec
|
|
attente du chargement dynamique.
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialise le store AliExpress avec ses sélecteurs."""
|
|
selectors_path = Path(__file__).parent / "selectors.yml"
|
|
super().__init__(store_id="aliexpress", selectors_path=selectors_path)
|
|
|
|
def get_spa_config(self) -> dict:
|
|
"""
|
|
Configuration SPA pour AliExpress.
|
|
|
|
AliExpress charge les données produit (prix, titre) via AJAX.
|
|
Il faut attendre que le réseau soit inactif ET ajouter un délai
|
|
pour laisser le JS terminer le rendu.
|
|
|
|
Returns:
|
|
Configuration Playwright pour SPA
|
|
"""
|
|
return {
|
|
"wait_for_network_idle": True,
|
|
"wait_for_selector": "h1", # Titre du produit
|
|
"extra_wait_ms": 2000, # 2s pour le rendu JS
|
|
}
|
|
|
|
def requires_playwright(self) -> bool:
|
|
"""AliExpress nécessite Playwright pour le rendu SPA."""
|
|
return True
|
|
|
|
def match(self, url: str) -> float:
|
|
"""
|
|
Détecte si l'URL est AliExpress.
|
|
|
|
Returns:
|
|
0.9 pour aliexpress.com/aliexpress.fr
|
|
0.0 sinon
|
|
"""
|
|
if not url:
|
|
return 0.0
|
|
|
|
url_lower = url.lower()
|
|
|
|
if "aliexpress.com" in url_lower or "aliexpress.fr" in url_lower:
|
|
# Vérifier que c'est bien une page produit
|
|
if "/item/" in url_lower:
|
|
return 0.9
|
|
else:
|
|
return 0.5 # C'est AliExpress mais pas une page produit
|
|
|
|
return 0.0
|
|
|
|
def canonicalize(self, url: str) -> str:
|
|
"""
|
|
Normalise l'URL AliExpress.
|
|
|
|
Les URLs AliExpress ont généralement la forme:
|
|
https://fr.aliexpress.com/item/{ID}.html?params...
|
|
|
|
On garde juste: https://fr.aliexpress.com/item/{ID}.html
|
|
"""
|
|
if not url:
|
|
return url
|
|
|
|
parsed = urlparse(url)
|
|
|
|
# Extraire le path de base (sans query params)
|
|
path = parsed.path
|
|
|
|
# Garder seulement /item/{ID}.html
|
|
match = re.search(r"(/item/\d+\.html)", path)
|
|
if match:
|
|
clean_path = match.group(1)
|
|
return f"{parsed.scheme}://{parsed.netloc}{clean_path}"
|
|
|
|
# Si le pattern ne matche pas, retirer juste query params
|
|
return f"{parsed.scheme}://{parsed.netloc}{path}"
|
|
|
|
def extract_reference(self, url: str) -> Optional[str]:
|
|
"""
|
|
Extrait le SKU (Product ID) depuis l'URL.
|
|
|
|
Format typique: /item/{ID}.html
|
|
Exemple: /item/1005007187023722.html → "1005007187023722"
|
|
"""
|
|
if not url:
|
|
return None
|
|
|
|
# Pattern: /item/{ID}.html
|
|
match = re.search(r"/item/(\d+)\.html", url, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1)
|
|
|
|
return None
|
|
|
|
    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse AliExpress HTML into a ProductSnapshot.

        AliExpress renders client-side (SPA), therefore:
        - title/images are preferentially read from meta tags (og:title, og:image)
        - the price is extracted via regex (no stable CSS selector)
        - images come from the embedded window._d_c_.DCData JSON

        Args:
            html: Raw (ideally Playwright-rendered) page HTML.
            url: Page URL; used for the reference/SKU and currency hints.

        Returns:
            A ProductSnapshot; status is PARTIAL when title or price is missing.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # updated afterwards by the caller
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Field extraction — each helper records notes/errors on debug_info.
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(html, soup, debug_info)
        currency = self._extract_currency(url, soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(html, soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        description = self._extract_description(soup, debug_info)
        msrp = self._extract_msrp(html, debug_info)
        reference = self.extract_reference(url)

        # Heuristic note about client-side rendering: a fully rendered
        # AliExpress product page is large, so a small payload usually
        # means the SPA JavaScript never ran.
        if len(html) < 200000:  # HTML too small = probably not fully rendered
            debug_info.notes.append(
                "HTML court (<200KB) - possiblement non rendu. Utiliser Playwright avec wait."
            )

        # Decide the final status.
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            description=description,
            images=images,
            specs=specs,
            msrp=msrp,
            debug=debug_info,
        )

        logger.info(
            f"[AliExpress] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot
|
|
|
|
def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""Extrait le titre du produit."""
|
|
# Priorité 1: h1 (apparaît après rendu AJAX)
|
|
h1 = soup.find("h1")
|
|
if h1:
|
|
title = h1.get_text(strip=True)
|
|
if title and len(title) > 10: # Titre valide
|
|
return title
|
|
|
|
# Priorité 2: og:title (dans meta tags)
|
|
og_title = soup.find("meta", property="og:title")
|
|
if og_title:
|
|
title = og_title.get("content", "")
|
|
if title:
|
|
# Nettoyer " - AliExpress" à la fin
|
|
title = re.sub(r"\s*-\s*AliExpress.*$", "", title)
|
|
return title.strip()
|
|
|
|
debug.errors.append("Titre non trouvé")
|
|
return None
|
|
|
|
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""Extrait la description (meta tags)."""
|
|
meta = soup.find("meta", property="og:description") or soup.find(
|
|
"meta", attrs={"name": "description"}
|
|
)
|
|
if meta:
|
|
description = meta.get("content", "").strip()
|
|
if description:
|
|
return description
|
|
return None
|
|
|
|
    def _extract_price(
        self, html: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> Optional[float]:
        """
        Extract the price.

        AliExpress has NO stable CSS selector for the price.
        Multi-level strategy (first hit wins):
        1. Embedded JSON blobs in the raw HTML
        2. Spans/divs whose class names contain "price"
        3. Regex over the raw HTML ("amount €")
        4. Regex over the raw HTML ("€ amount")
        5. og:price meta tags (least reliable)

        Args:
            html: Raw page HTML (used by the regex fallbacks).
            soup: Parsed document (used by selector-based extraction).
            debug: Collector for notes/errors emitted while parsing.

        Returns:
            The price as a float, or None when nothing usable was found.
        """
        # Priority 1: embedded JSON keys (skuActivityAmount, formattedActivityPrice)
        json_patterns = [
            r'"skuActivityAmount"\s*:\s*\{\s*"value"\s*:\s*(\d+(?:\.\d+)?)',  # {"value": 123.45}
            r'"formattedActivityPrice"\s*:\s*"([0-9,.\s]+)\s*€"',  # "123,45 €"
            r'"formattedActivityPrice"\s*:\s*"€\s*([0-9,.\s]+)"',  # "€ 123.45"
            r'"minPrice"\s*:\s*"([0-9,.\s]+)"',  # "minPrice": "123.45"
            r'"price"\s*:\s*"([0-9,.\s]+)"',  # "price": "123.45"
            r'"activityAmount"\s*:\s*\{\s*"value"\s*:\s*(\d+(?:\.\d+)?)',  # activityAmount.value
        ]
        for pattern in json_patterns:
            match = re.search(pattern, html)
            if match:
                price = parse_price_text(match.group(1))
                if price is not None and price > 0:
                    debug.notes.append(f"Prix extrait depuis JSON: {price}")
                    return price

        # Priority 2: spans/divs whose class attribute contains "price"
        price_selectors = [
            'span[class*="price--current"]',
            'span[class*="price--sale"]',
            'div[class*="price--current"]',
            'span[class*="product-price"]',
            'span[class*="Price_Price"]',
            'div[class*="es--wrap"]',  # AliExpress-specific wrapper structure
        ]
        for selector in price_selectors:
            elements = soup.select(selector)
            for elem in elements:
                text = elem.get_text(strip=True)
                # Look for a "123,45 €" or "€ 123.45" shaped price in the text.
                price_match = re.search(r'(\d+[,.\s]*\d*)\s*€|€\s*(\d+[,.\s]*\d*)', text)
                if price_match:
                    price_str = price_match.group(1) or price_match.group(2)
                    price = parse_price_text(price_str)
                    if price is not None and price > 0:
                        debug.notes.append(f"Prix extrait depuis sélecteur {selector}")
                        return price

        # Priority 3: amount before € (e.g. "136,69€" or "136,69 €").
        # The character class also admits NBSP / thin-space thousand separators.
        match = re.search(r'(\d+[,.\s\u00a0\u202f\u2009]*\d*)\s*€', html)
        if match:
            price = parse_price_text(match.group(1))
            if price is not None and price > 0:
                return price

        # Priority 4: € before amount (e.g. "€136.69" or "€ 136.69")
        match = re.search(r'€\s*(\d+[,.\s\u00a0\u202f\u2009]*\d*)', html)
        if match:
            price = parse_price_text(match.group(1))
            if price is not None and price > 0:
                return price

        # Priority 5: meta tags (least reliable)
        og_price = soup.find("meta", property="og:price:amount")
        if og_price:
            price_str = og_price.get("content", "")
            price = parse_price_text(price_str)
            if price is not None and price > 0:
                return price

        debug.errors.append("Prix non trouvé")
        return None
|
|
|
|
def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
|
|
"""Extrait le prix conseille si present."""
|
|
match = re.search(r'originalPrice"\s*:\s*"([0-9\s.,]+)"', html)
|
|
if match:
|
|
price = parse_price_text(match.group(1))
|
|
if price is not None:
|
|
return price
|
|
return None
|
|
|
|
def _extract_currency(
|
|
self, url: str, soup: BeautifulSoup, debug: DebugInfo
|
|
) -> str:
|
|
"""Extrait la devise."""
|
|
# Priorité 1: og:price:currency
|
|
og_currency = soup.find("meta", property="og:price:currency")
|
|
if og_currency:
|
|
currency = og_currency.get("content", "")
|
|
if currency:
|
|
return currency.upper()
|
|
|
|
# Priorité 2: Détecter depuis l'URL
|
|
if "fr.aliexpress" in url.lower():
|
|
return "EUR"
|
|
elif "aliexpress.com" in url.lower():
|
|
return "USD"
|
|
|
|
# Défaut
|
|
return "EUR"
|
|
|
|
def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
|
|
"""Extrait le statut de stock."""
|
|
# Chercher le bouton "Add to cart" / "Ajouter au panier"
|
|
buttons = soup.find_all("button")
|
|
for btn in buttons:
|
|
text = btn.get_text(strip=True).lower()
|
|
if any(
|
|
keyword in text
|
|
for keyword in ["add to cart", "ajouter", "buy now", "acheter"]
|
|
):
|
|
# Bouton trouvé et pas disabled
|
|
if not btn.get("disabled"):
|
|
return StockStatus.IN_STOCK
|
|
|
|
# Fallback: chercher texte indiquant la disponibilité
|
|
text_lower = soup.get_text().lower()
|
|
if "out of stock" in text_lower or "rupture" in text_lower:
|
|
return StockStatus.OUT_OF_STOCK
|
|
|
|
return StockStatus.UNKNOWN
|
|
|
|
def _extract_images(
|
|
self, html: str, soup: BeautifulSoup, debug: DebugInfo
|
|
) -> list[str]:
|
|
"""
|
|
Extrait les URLs d'images.
|
|
|
|
Priorité: window._d_c_.DCData.imagePathList (JSON embarqué)
|
|
"""
|
|
images = []
|
|
|
|
# Priorité 1: Extraire depuis DCData JSON
|
|
match = re.search(
|
|
r"window\._d_c_\.DCData\s*=\s*(\{[^;]*\});", html, re.DOTALL
|
|
)
|
|
if match:
|
|
try:
|
|
data = json.loads(match.group(1))
|
|
if "imagePathList" in data:
|
|
image_list = data["imagePathList"]
|
|
if isinstance(image_list, list):
|
|
images.extend(image_list)
|
|
debug.notes.append(
|
|
f"Images extraites depuis DCData: {len(images)}"
|
|
)
|
|
except (json.JSONDecodeError, KeyError):
|
|
pass
|
|
|
|
# Priorité 2: og:image
|
|
if not images:
|
|
og_image = soup.find("meta", property="og:image")
|
|
if og_image:
|
|
img_url = og_image.get("content", "")
|
|
if img_url:
|
|
images.append(img_url)
|
|
|
|
# Priorité 3: Chercher dans les <img> avec alicdn.com
|
|
if not images:
|
|
img_elems = soup.find_all("img", src=True)
|
|
for img in img_elems:
|
|
src = img.get("src", "")
|
|
if "alicdn.com" in src and not any(
|
|
x in src for x in ["logo", "icon", "avatar"]
|
|
):
|
|
if src not in images:
|
|
images.append(src)
|
|
|
|
return images
|
|
|
|
def _extract_category(
|
|
self, soup: BeautifulSoup, debug: DebugInfo
|
|
) -> Optional[str]:
|
|
"""Extrait la catégorie depuis le breadcrumb."""
|
|
selectors = self.get_selector("category", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
elements = soup.select(selector)
|
|
if elements:
|
|
# Prendre le dernier élément du breadcrumb
|
|
categories = [
|
|
elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)
|
|
]
|
|
if categories:
|
|
return categories[-1]
|
|
|
|
return None
|
|
|
|
def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
|
|
"""Extrait les caractéristiques techniques."""
|
|
specs = {}
|
|
|
|
# Chercher les dl (definition lists)
|
|
dls = soup.find_all("dl")
|
|
for dl in dls:
|
|
dts = dl.find_all("dt")
|
|
dds = dl.find_all("dd")
|
|
|
|
for dt, dd in zip(dts, dds):
|
|
key = dt.get_text(strip=True)
|
|
value = dd.get_text(strip=True)
|
|
if key and value:
|
|
specs[key] = value
|
|
|
|
return specs
|