Files
scrap/pricewatch/app/scraping/pw_fetch.py
Gilles Soulier 152c2724fc feat: improve SPA scraping and increase test coverage
- Add SPA support for Playwright with wait_for_network_idle and extra_wait_ms
- Add BaseStore.get_spa_config() and requires_playwright() methods
- Implement AliExpress SPA config with JSON price extraction patterns
- Fix Amazon price parsing to prioritize whole+fraction combination
- Fix AliExpress regex patterns (remove double backslashes)
- Add CLI tests: detect, doctor, fetch, parse, run commands
- Add API tests: auth, logs, products, scraping_logs, webhooks

Tests: 417 passed, 85% coverage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 14:46:55 +01:00

251 lines
7.6 KiB
Python
Executable File

"""
Récupération avec Playwright (fallback anti-bot).
Utilisé quand HTTP échoue (403, captcha, etc.).
Plus lent mais plus robuste contre les protections anti-scraping.
"""
import time
from typing import Optional
from playwright.sync_api import (
Browser,
Page,
Playwright,
sync_playwright,
TimeoutError as PlaywrightTimeout,
)
from pricewatch.app.core.logging import get_logger
logger = get_logger("scraping.playwright")
class PlaywrightFetchResult:
"""Résultat d'une récupération Playwright."""
def __init__(
self,
success: bool,
html: Optional[str] = None,
screenshot: Optional[bytes] = None,
error: Optional[str] = None,
duration_ms: Optional[int] = None,
):
self.success = success
self.html = html
self.screenshot = screenshot
self.error = error
self.duration_ms = duration_ms
def fetch_playwright(
url: str,
headless: bool = True,
timeout_ms: int = 60000,
save_screenshot: bool = False,
wait_for_selector: Optional[str] = None,
wait_for_network_idle: bool = False,
extra_wait_ms: int = 0,
) -> PlaywrightFetchResult:
"""
Récupère une page avec Playwright.
Args:
url: URL à récupérer
headless: Mode headless (True) ou visible (False)
timeout_ms: Timeout en millisecondes
save_screenshot: Prendre un screenshot
wait_for_selector: Attendre un sélecteur CSS avant de récupérer
wait_for_network_idle: Attendre que le réseau soit inactif (pour SPA)
extra_wait_ms: Délai supplémentaire après chargement (pour JS lent)
Returns:
PlaywrightFetchResult avec HTML, screenshot (optionnel), ou erreur
Justification technique:
- Playwright simule un vrai navigateur → contourne beaucoup d'anti-bots
- Headless par défaut pour performance
- Headful disponible pour debug visuel
- Screenshot optionnel pour diagnostiquer les échecs
- wait_for_selector permet d'attendre le chargement dynamique
- wait_for_network_idle utile pour les SPA qui chargent via AJAX
- extra_wait_ms pour les sites avec JS lent après DOM ready
"""
if not url or not url.strip():
logger.error("URL vide fournie")
return PlaywrightFetchResult(success=False, error="URL vide")
start_time = time.time()
logger.info(f"[Playwright] Récupération: {url} (headless={headless})")
playwright: Optional[Playwright] = None
browser: Optional[Browser] = None
page: Optional[Page] = None
try:
playwright = sync_playwright().start()
# Lancer le navigateur Chromium
browser = playwright.chromium.launch(headless=headless)
# Créer un contexte avec User-Agent réaliste
context = browser.new_context(
user_agent=(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
),
viewport={"width": 1920, "height": 1080},
locale="fr-FR",
)
page = context.new_page()
# Définir le timeout
page.set_default_timeout(timeout_ms)
# Naviguer vers la page
logger.debug(f"[Playwright] Navigation vers {url}")
wait_until = "networkidle" if wait_for_network_idle else "domcontentloaded"
response = page.goto(url, wait_until=wait_until)
if not response:
raise Exception("Pas de réponse du serveur")
# Attendre un sélecteur spécifique si demandé
if wait_for_selector:
logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
try:
page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
except PlaywrightTimeout:
logger.warning(
f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
)
# Délai supplémentaire pour JS lent (SPA)
if extra_wait_ms > 0:
logger.debug(f"[Playwright] Attente supplémentaire: {extra_wait_ms}ms")
page.wait_for_timeout(extra_wait_ms)
# Récupérer le HTML
html = page.content()
# Screenshot optionnel
screenshot = None
if save_screenshot:
logger.debug("[Playwright] Capture du screenshot")
screenshot = page.screenshot(full_page=False)
duration_ms = int((time.time() - start_time) * 1000)
logger.info(
f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
f"status={response.status}"
)
return PlaywrightFetchResult(
success=True,
html=html,
screenshot=screenshot,
duration_ms=duration_ms,
)
except PlaywrightTimeout:
duration_ms = int((time.time() - start_time) * 1000)
error = f"Timeout après {timeout_ms}ms"
logger.error(f"[Playwright] {error}")
# Tenter un screenshot même en cas d'erreur
screenshot = None
if save_screenshot and page:
try:
screenshot = page.screenshot(full_page=False)
except Exception:
pass
return PlaywrightFetchResult(
success=False,
error=error,
screenshot=screenshot,
duration_ms=duration_ms,
)
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
error = f"Erreur Playwright: {str(e)}"
logger.error(f"[Playwright] {error}")
# Tenter un screenshot même en cas d'erreur
screenshot = None
if save_screenshot and page:
try:
screenshot = page.screenshot(full_page=False)
except Exception:
pass
return PlaywrightFetchResult(
success=False,
error=error,
screenshot=screenshot,
duration_ms=duration_ms,
)
finally:
# Nettoyage
try:
if page:
page.close()
if browser:
browser.close()
if playwright:
playwright.stop()
except Exception as e:
logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")
def fetch_with_fallback(
url: str,
try_http_first: bool = True,
playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
"""
Stratégie de récupération avec fallback HTTP → Playwright.
Args:
url: URL à récupérer
try_http_first: Tenter HTTP d'abord (plus rapide)
playwright_options: Options pour Playwright si nécessaire
Returns:
PlaywrightFetchResult
Justification technique:
- HTTP d'abord car beaucoup plus rapide (~1s vs ~10s)
- Fallback Playwright si HTTP échoue (403, timeout, etc.)
- Économise des ressources quand HTTP suffit
"""
from pricewatch.app.scraping.http_fetch import fetch_http
playwright_options = playwright_options or {}
if try_http_first:
logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
http_result = fetch_http(url)
if http_result.success:
logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
return PlaywrightFetchResult(
success=True,
html=http_result.html,
duration_ms=http_result.duration_ms,
)
logger.warning(
f"[Fallback] HTTP échoué ({http_result.error}), "
"fallback vers Playwright"
)
# Playwright en fallback ou en méthode principale
return fetch_playwright(url, **playwright_options)