Files
scrap/pricewatch/app/scraping/pw_fetch.py
2026-01-13 19:49:04 +01:00

239 lines
7.0 KiB
Python
Executable File

"""
Récupération avec Playwright (fallback anti-bot).
Utilisé quand HTTP échoue (403, captcha, etc.).
Plus lent mais plus robuste contre les protections anti-scraping.
"""
import time
from typing import Optional
from playwright.sync_api import (
Browser,
Page,
Playwright,
sync_playwright,
TimeoutError as PlaywrightTimeout,
)
from pricewatch.app.core.logging import get_logger
logger = get_logger("scraping.playwright")
class PlaywrightFetchResult:
    """Result of a single Playwright fetch attempt.

    Attributes:
        success: True when the page was retrieved without a fatal error.
        html: Full page HTML on success, None otherwise.
        screenshot: Optional PNG bytes (requested explicitly, or best-effort on error).
        error: Human-readable error description when ``success`` is False.
        duration_ms: Wall-clock duration of the fetch, in milliseconds.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        screenshot: Optional[bytes] = None,
        error: Optional[str] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.screenshot = screenshot
        self.error = error
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Show sizes rather than the bulky html/screenshot payloads so that
        # logging a result stays readable.
        html_len = len(self.html) if self.html is not None else None
        return (
            f"{type(self).__name__}(success={self.success}, "
            f"html_len={html_len}, error={self.error!r}, "
            f"duration_ms={self.duration_ms})"
        )
def _screenshot_or_none(page: Optional[Page], save_screenshot: bool) -> Optional[bytes]:
    """Best-effort screenshot for error diagnostics; never raises."""
    if not (save_screenshot and page):
        return None
    try:
        return page.screenshot(full_page=False)
    except Exception:
        # The page may already be crashed or closed; this is diagnostics only.
        return None


def fetch_playwright(
    url: str,
    headless: bool = True,
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
    """Fetch a page with Playwright (anti-bot fallback).

    Args:
        url: URL to fetch.
        headless: Headless mode (True) or visible browser (False, for debugging).
        timeout_ms: Timeout in milliseconds for navigation and selector waits.
        save_screenshot: Capture a screenshot (also attempted best-effort on errors).
        wait_for_selector: Optional CSS selector to wait for before reading the HTML.

    Returns:
        PlaywrightFetchResult with the HTML, an optional screenshot, or an error.

    Technical rationale:
        - Playwright drives a real browser, bypassing many anti-bot protections.
        - Headless by default for performance; headful available for visual debug.
        - Optional screenshots help diagnose failures.
        - wait_for_selector supports pages with dynamically loaded content.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return PlaywrightFetchResult(success=False, error="URL vide")

    start_time = time.time()
    logger.info(f"[Playwright] Récupération: {url} (headless={headless})")

    playwright: Optional[Playwright] = None
    browser: Optional[Browser] = None
    page: Optional[Page] = None
    try:
        playwright = sync_playwright().start()
        # Launch the Chromium browser.
        browser = playwright.chromium.launch(headless=headless)
        # Realistic User-Agent and common desktop viewport to look less bot-like.
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale="fr-FR",
        )
        page = context.new_page()
        page.set_default_timeout(timeout_ms)

        logger.debug(f"[Playwright] Navigation vers {url}")
        response = page.goto(url, wait_until="domcontentloaded")
        if not response:
            raise Exception("Pas de réponse du serveur")

        # Wait for a specific selector when requested; a selector timeout is
        # non-fatal — we still return whatever HTML is available.
        if wait_for_selector:
            logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
            try:
                page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                logger.warning(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

        html = page.content()

        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Capture du screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = int((time.time() - start_time) * 1000)
        logger.info(
            f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )
        return PlaywrightFetchResult(
            success=True,
            html=html,
            screenshot=screenshot,
            duration_ms=duration_ms,
        )
    except PlaywrightTimeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout après {timeout_ms}ms"
        logger.error(f"[Playwright] {error}")
        return PlaywrightFetchResult(
            success=False,
            error=error,
            screenshot=_screenshot_or_none(page, save_screenshot),
            duration_ms=duration_ms,
        )
    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur Playwright: {str(e)}"
        logger.error(f"[Playwright] {error}")
        return PlaywrightFetchResult(
            success=False,
            error=error,
            screenshot=_screenshot_or_none(page, save_screenshot),
            duration_ms=duration_ms,
        )
    finally:
        # Close in reverse acquisition order; never let cleanup mask the result.
        try:
            if page:
                page.close()
            if browser:
                browser.close()
            if playwright:
                playwright.stop()
        except Exception as e:
            logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")
def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """Fetch with an HTTP-first, Playwright-second strategy.

    Args:
        url: URL to fetch.
        try_http_first: Attempt a plain HTTP fetch before Playwright (much faster).
        playwright_options: Keyword options forwarded to ``fetch_playwright``.

    Returns:
        PlaywrightFetchResult, whichever method produced it.

    Technical rationale:
        - HTTP first because it is far cheaper (~1s vs ~10s per page).
        - Playwright only kicks in when HTTP fails (403, timeout, captcha...).
        - Keeps browser resources for the pages that actually need them.
    """
    # Imported lazily to avoid a module-level import cycle with http_fetch.
    from pricewatch.app.scraping.http_fetch import fetch_http

    options = {} if playwright_options is None else playwright_options

    if try_http_first:
        logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
        http_result = fetch_http(url)

        if http_result.success:
            logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
            return PlaywrightFetchResult(
                success=True,
                html=http_result.html,
                duration_ms=http_result.duration_ms,
            )

        logger.warning(
            f"[Fallback] HTTP échoué ({http_result.error}), "
            "fallback vers Playwright"
        )

    # Either HTTP failed, or Playwright was requested as the primary method.
    return fetch_playwright(url, **options)