feat: improve SPA scraping and increase test coverage
- Add SPA support for Playwright with wait_for_network_idle and extra_wait_ms
- Add BaseStore.get_spa_config() and requires_playwright() methods
- Implement AliExpress SPA config with JSON price extraction patterns
- Fix Amazon price parsing to prioritize whole+fraction combination
- Fix AliExpress regex patterns (remove double backslashes)
- Add CLI tests: detect, doctor, fetch, parse, run commands
- Add API tests: auth, logs, products, scraping_logs, webhooks

Tests: 417 passed, 85% coverage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -45,6 +45,8 @@ def fetch_playwright(
|
||||
timeout_ms: int = 60000,
|
||||
save_screenshot: bool = False,
|
||||
wait_for_selector: Optional[str] = None,
|
||||
wait_for_network_idle: bool = False,
|
||||
extra_wait_ms: int = 0,
|
||||
) -> PlaywrightFetchResult:
|
||||
"""
|
||||
Récupère une page avec Playwright.
|
||||
@@ -55,6 +57,8 @@ def fetch_playwright(
|
||||
timeout_ms: Timeout en millisecondes
|
||||
save_screenshot: Prendre un screenshot
|
||||
wait_for_selector: Attendre un sélecteur CSS avant de récupérer
|
||||
wait_for_network_idle: Attendre que le réseau soit inactif (pour SPA)
|
||||
extra_wait_ms: Délai supplémentaire après chargement (pour JS lent)
|
||||
|
||||
Returns:
|
||||
PlaywrightFetchResult avec HTML, screenshot (optionnel), ou erreur
|
||||
@@ -65,6 +69,8 @@ def fetch_playwright(
|
||||
- Headful disponible pour debug visuel
|
||||
- Screenshot optionnel pour diagnostiquer les échecs
|
||||
- wait_for_selector permet d'attendre le chargement dynamique
|
||||
- wait_for_network_idle utile pour les SPA qui chargent via AJAX
|
||||
- extra_wait_ms pour les sites avec JS lent après DOM ready
|
||||
"""
|
||||
if not url or not url.strip():
|
||||
logger.error("URL vide fournie")
|
||||
@@ -101,7 +107,8 @@ def fetch_playwright(
|
||||
|
||||
# Naviguer vers la page
|
||||
logger.debug(f"[Playwright] Navigation vers {url}")
|
||||
response = page.goto(url, wait_until="domcontentloaded")
|
||||
wait_until = "networkidle" if wait_for_network_idle else "domcontentloaded"
|
||||
response = page.goto(url, wait_until=wait_until)
|
||||
|
||||
if not response:
|
||||
raise Exception("Pas de réponse du serveur")
|
||||
@@ -116,6 +123,11 @@ def fetch_playwright(
|
||||
f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
|
||||
)
|
||||
|
||||
# Délai supplémentaire pour JS lent (SPA)
|
||||
if extra_wait_ms > 0:
|
||||
logger.debug(f"[Playwright] Attente supplémentaire: {extra_wait_ms}ms")
|
||||
page.wait_for_timeout(extra_wait_ms)
|
||||
|
||||
# Récupérer le HTML
|
||||
html = page.content()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user