feat: improve SPA scraping and increase test coverage

- Add SPA support for Playwright with wait_for_network_idle and extra_wait_ms
- Add BaseStore.get_spa_config() and requires_playwright() methods
- Implement AliExpress SPA config with JSON price extraction patterns
- Fix Amazon price parsing to prioritize whole+fraction combination
- Fix AliExpress regex patterns (remove double backslashes)
- Add CLI tests: detect, doctor, fetch, parse, run commands
- Add API tests: auth, logs, products, scraping_logs, webhooks

Tests: 417 passed, 85% coverage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Gilles Soulier
2026-01-17 14:46:55 +01:00
parent cf7c415e22
commit 152c2724fc
14 changed files with 1307 additions and 22 deletions

View File

@@ -45,6 +45,8 @@ def fetch_playwright(
timeout_ms: int = 60000,
save_screenshot: bool = False,
wait_for_selector: Optional[str] = None,
wait_for_network_idle: bool = False,
extra_wait_ms: int = 0,
) -> PlaywrightFetchResult:
"""
Récupère une page avec Playwright.
@@ -55,6 +57,8 @@ def fetch_playwright(
timeout_ms: Timeout en millisecondes
save_screenshot: Prendre un screenshot
wait_for_selector: Attendre un sélecteur CSS avant de récupérer
wait_for_network_idle: Attendre que le réseau soit inactif (pour SPA)
extra_wait_ms: Délai supplémentaire après chargement (pour JS lent)
Returns:
PlaywrightFetchResult avec HTML, screenshot (optionnel), ou erreur
@@ -65,6 +69,8 @@ def fetch_playwright(
- Headful disponible pour debug visuel
- Screenshot optionnel pour diagnostiquer les échecs
- wait_for_selector permet d'attendre le chargement dynamique
- wait_for_network_idle utile pour les SPA qui chargent via AJAX
- extra_wait_ms pour les sites avec JS lent après DOM ready
"""
if not url or not url.strip():
logger.error("URL vide fournie")
@@ -101,7 +107,8 @@ def fetch_playwright(
# Naviguer vers la page
logger.debug(f"[Playwright] Navigation vers {url}")
-response = page.goto(url, wait_until="domcontentloaded")
+wait_until = "networkidle" if wait_for_network_idle else "domcontentloaded"
+response = page.goto(url, wait_until=wait_until)
if not response:
raise Exception("Pas de réponse du serveur")
@@ -116,6 +123,11 @@ def fetch_playwright(
f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
)
# Délai supplémentaire pour JS lent (SPA)
if extra_wait_ms > 0:
logger.debug(f"[Playwright] Attente supplémentaire: {extra_wait_ms}ms")
page.wait_for_timeout(extra_wait_ms)
# Récupérer le HTML
html = page.content()