239 lines
7.0 KiB
Python
Executable File
239 lines
7.0 KiB
Python
Executable File
"""
|
|
Page retrieval with Playwright (anti-bot fallback).

Used when plain HTTP fails (403, captcha, etc.).
Slower, but more robust against anti-scraping protections.
|
|
"""
|
|
|
|
import time
|
|
from typing import Optional
|
|
|
|
from playwright.sync_api import (
|
|
Browser,
|
|
Page,
|
|
Playwright,
|
|
sync_playwright,
|
|
TimeoutError as PlaywrightTimeout,
|
|
)
|
|
|
|
from pricewatch.app.core.logging import get_logger
|
|
|
|
logger = get_logger("scraping.playwright")
|
|
|
|
|
|
class PlaywrightFetchResult:
    """Outcome of one page-retrieval attempt.

    Attributes:
        success: Whether the retrieval succeeded.
        html: Page HTML, or None when not available.
        screenshot: Screenshot bytes, or None when none was taken.
        error: Error message, or None when no error was recorded.
        duration_ms: Elapsed time in milliseconds, or None when not measured.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        screenshot: Optional[bytes] = None,
        error: Optional[str] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.screenshot = screenshot
        self.error = error
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Report payload sizes instead of the payloads themselves: a full
        # HTML document or PNG in a log line is unreadable.
        html_len = None if self.html is None else len(self.html)
        screenshot_len = None if self.screenshot is None else len(self.screenshot)
        return (
            f"{type(self).__name__}(success={self.success!r}, "
            f"html_len={html_len!r}, screenshot_len={screenshot_len!r}, "
            f"error={self.error!r}, duration_ms={self.duration_ms!r})"
        )
|
|
|
|
|
|
def _elapsed_ms(start: float) -> int:
    """Return whole milliseconds elapsed since ``start`` (a ``time.monotonic()`` value)."""
    return int((time.monotonic() - start) * 1000)


def _playwright_failure(
    error: str,
    page: "Optional[Page]",
    save_screenshot: bool,
    start: float,
) -> "PlaywrightFetchResult":
    """Log ``error`` and build a failed result, with a best-effort screenshot."""
    # Duration is taken before the screenshot attempt so the diagnostic
    # capture does not inflate the reported fetch time.
    duration_ms = _elapsed_ms(start)
    logger.error(f"[Playwright] {error}")

    screenshot = None
    if save_screenshot and page:
        try:
            screenshot = page.screenshot(full_page=False)
        except Exception as exc:
            # Best effort only: the page may already be unusable. Keep the
            # original failure as the reported error, but leave a trace.
            logger.debug(f"[Playwright] Screenshot d'erreur impossible: {exc}")

    return PlaywrightFetchResult(
        success=False,
        error=error,
        screenshot=screenshot,
        duration_ms=duration_ms,
    )


def _run_cleanup(*closers) -> None:
    """Call each non-None cleanup callable, isolating failures from one another."""
    for close in closers:
        if close is None:
            continue
        try:
            close()
        except Exception as e:
            logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")


def fetch_playwright(
    url: str,
    headless: bool = True,
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
    """
    Fetch a page with Playwright (Chromium).

    Args:
        url: URL to fetch.
        headless: Run the browser headless (True) or visible (False).
        timeout_ms: Default Playwright timeout, in milliseconds; also used
            for the optional selector wait.
        save_screenshot: Capture a viewport screenshot (also attempted on
            failure, to help diagnose it).
        wait_for_selector: CSS selector to wait for before reading the HTML.
            A timeout on this wait is logged as a warning and the HTML is
            still returned.

    Returns:
        PlaywrightFetchResult with the HTML and optional screenshot on
        success, or with an error message on failure. Exceptions derived
        from ``Exception`` are converted into a failed result, not raised.

    Design notes:
        - Headless by default; headful is available for visual debugging.
        - The screenshot is optional and meant for diagnosing failures.
        - Each resource is released independently in ``finally`` so that a
          failing ``page.close()`` cannot leave the browser or the
          Playwright driver running.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return PlaywrightFetchResult(success=False, error="URL vide")

    # Monotonic clock: the measured duration cannot be skewed by system
    # clock adjustments.
    start_time = time.monotonic()
    logger.info(f"[Playwright] Récupération: {url} (headless={headless})")

    playwright: Optional[Playwright] = None
    browser: Optional[Browser] = None
    context = None
    page: Optional[Page] = None

    try:
        playwright = sync_playwright().start()

        # Launch the Chromium browser.
        browser = playwright.chromium.launch(headless=headless)

        # Browser context with a desktop Chrome User-Agent, a 1920x1080
        # viewport and a French locale.
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale="fr-FR",
        )

        page = context.new_page()
        page.set_default_timeout(timeout_ms)

        logger.debug(f"[Playwright] Navigation vers {url}")
        response = page.goto(url, wait_until="domcontentloaded")

        if not response:
            raise RuntimeError("Pas de réponse du serveur")

        if wait_for_selector:
            logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
            try:
                page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                # Deliberately non-fatal: return whatever HTML is present.
                logger.warning(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

        html = page.content()

        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Capture du screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = _elapsed_ms(start_time)

        # NOTE(review): the HTTP status is only logged here; a 4xx/5xx
        # response is still returned as success=True. Confirm callers
        # expect that.
        logger.info(
            f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )

        return PlaywrightFetchResult(
            success=True,
            html=html,
            screenshot=screenshot,
            duration_ms=duration_ms,
        )

    except PlaywrightTimeout:
        return _playwright_failure(
            f"Timeout après {timeout_ms}ms", page, save_screenshot, start_time
        )

    except Exception as e:
        return _playwright_failure(
            f"Erreur Playwright: {str(e)}", page, save_screenshot, start_time
        )

    finally:
        # Release innermost resources first; each step is guarded on its
        # own so one failure does not skip the remaining ones.
        _run_cleanup(
            page.close if page else None,
            context.close if context else None,
            browser.close if browser else None,
            playwright.stop if playwright else None,
        )
|
|
|
|
|
|
def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """
    Fetch ``url`` with an HTTP -> Playwright fallback strategy.

    Args:
        url: URL to fetch.
        try_http_first: Attempt a plain HTTP fetch before Playwright.
        playwright_options: Keyword arguments forwarded to
            ``fetch_playwright`` when it is used; None or an empty dict
            means no extra arguments.

    Returns:
        PlaywrightFetchResult. When HTTP succeeds, it carries the HTML and
        duration reported by the HTTP fetch; otherwise it is whatever
        ``fetch_playwright`` returns.

    Design notes:
        - HTTP goes first because it is much faster (~1s vs ~10s).
        - Playwright is the fallback when HTTP fails (403, timeout, etc.).
        - Saves resources whenever HTTP alone is enough.
    """
    # Function-scope import, resolved on every call regardless of
    # ``try_http_first``.
    from pricewatch.app.scraping.http_fetch import fetch_http

    browser_kwargs = playwright_options if playwright_options else {}

    # Playwright as the primary method: skip the HTTP attempt entirely.
    if not try_http_first:
        return fetch_playwright(url, **browser_kwargs)

    logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
    http_outcome = fetch_http(url)

    if not http_outcome.success:
        logger.warning(
            f"[Fallback] HTTP échoué ({http_outcome.error}), "
            "fallback vers Playwright"
        )
        return fetch_playwright(url, **browser_kwargs)

    logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
    return PlaywrightFetchResult(
        success=True,
        html=http_outcome.html,
        duration_ms=http_outcome.duration_ms,
    )
|