"""Playwright-based page fetching (anti-bot fallback).

Used when a plain HTTP fetch fails (403, captcha, etc.). Slower than HTTP,
but more robust against anti-scraping protections.
"""

import time
from typing import Callable, Optional

from playwright.sync_api import (
    Browser,
    Page,
    Playwright,
    sync_playwright,
    TimeoutError as PlaywrightTimeout,
)

from pricewatch.app.core.logging import get_logger

logger = get_logger("scraping.playwright")

# Browser fingerprint applied to every context created by fetch_playwright().
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
_VIEWPORT = {"width": 1920, "height": 1080}
_LOCALE = "fr-FR"


class PlaywrightFetchResult:
    """Outcome of a fetch attempt.

    Attributes are plain public fields:
        success: True when the page HTML was retrieved.
        html: Page HTML on success, otherwise None.
        screenshot: Screenshot bytes when one was requested and could be
            captured, otherwise None.
        error: Error message on failure, otherwise None.
        duration_ms: Elapsed time of the attempt in milliseconds, when known.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        screenshot: Optional[bytes] = None,
        error: Optional[str] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.screenshot = screenshot
        self.error = error
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Report sizes rather than payloads: html/screenshot can be very large.
        html_len = None if self.html is None else len(self.html)
        shot_len = None if self.screenshot is None else len(self.screenshot)
        return (
            f"{type(self).__name__}(success={self.success!r}, "
            f"html_len={html_len!r}, screenshot_len={shot_len!r}, "
            f"error={self.error!r}, duration_ms={self.duration_ms!r})"
        )


def _elapsed_ms(start: float) -> int:
    """Return whole milliseconds elapsed since *start* (a time.monotonic() value)."""
    return int((time.monotonic() - start) * 1000)


def _release(closer: Callable[[], object]) -> None:
    """Run one cleanup callable, logging (not raising) any failure."""
    try:
        closer()
    except Exception as e:
        logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")


def _failure_result(
    error: str,
    start: float,
    page: Optional[Page],
    save_screenshot: bool,
) -> PlaywrightFetchResult:
    """Log *error* and build a failed result, with a best-effort screenshot."""
    # Measure before the screenshot so its cost is not counted in the duration.
    duration_ms = _elapsed_ms(start)
    logger.error(f"[Playwright] {error}")

    screenshot = None
    if save_screenshot and page is not None:
        # The page may already be unusable; a failed diagnostic screenshot
        # must never mask the original error.
        try:
            screenshot = page.screenshot(full_page=False)
        except Exception as e:
            logger.debug(f"[Playwright] Screenshot de diagnostic impossible: {e}")

    return PlaywrightFetchResult(
        success=False,
        error=error,
        screenshot=screenshot,
        duration_ms=duration_ms,
    )


def fetch_playwright(
    url: str,
    headless: bool = True,
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
    """Fetch a page with a Chromium browser driven by Playwright.

    Args:
        url: URL to fetch. An empty or whitespace-only value fails immediately.
        headless: Run the browser headless (True) or visible (False, useful
            for visual debugging).
        timeout_ms: Default Playwright timeout, in milliseconds, applied to
            the page and to the optional selector wait.
        save_screenshot: Capture a viewport screenshot (also attempted on
            failure, to help diagnose it).
        wait_for_selector: CSS selector to wait for before reading the HTML,
            for dynamically loaded content. A timeout on this wait is only
            logged; the HTML present at that point is still returned.

    Returns:
        PlaywrightFetchResult carrying the HTML (and optional screenshot) on
        success, or an error message on failure. This function does not
        raise: every exception is converted into a failed result.

    Note:
        The HTTP status is logged but not checked, so an error page that was
        served with a body is still reported as a success.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return PlaywrightFetchResult(success=False, error="URL vide")

    # Monotonic clock: immune to wall-clock adjustments during the fetch.
    start = time.monotonic()
    logger.info(f"[Playwright] Récupération: {url} (headless={headless})")

    playwright: Optional[Playwright] = None
    browser: Optional[Browser] = None
    page: Optional[Page] = None

    try:
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=headless)

        # Realistic desktop fingerprint to look like a regular browser.
        context = browser.new_context(
            user_agent=_USER_AGENT,
            viewport=_VIEWPORT,
            locale=_LOCALE,
        )
        page = context.new_page()
        page.set_default_timeout(timeout_ms)

        logger.debug(f"[Playwright] Navigation vers {url}")
        response = page.goto(url, wait_until="domcontentloaded")
        if not response:
            raise RuntimeError("Pas de réponse du serveur")

        if wait_for_selector:
            logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
            try:
                page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                # Deliberately non-fatal: return whatever HTML has loaded.
                logger.warning(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

        html = page.content()

        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Capture du screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = _elapsed_ms(start)
        logger.info(
            f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )
        return PlaywrightFetchResult(
            success=True,
            html=html,
            screenshot=screenshot,
            duration_ms=duration_ms,
        )

    except PlaywrightTimeout:
        return _failure_result(
            f"Timeout après {timeout_ms}ms", start, page, save_screenshot
        )

    except Exception as e:
        return _failure_result(
            f"Erreur Playwright: {str(e)}", start, page, save_screenshot
        )

    finally:
        # Release each resource independently: a failure closing the page
        # must not prevent the browser and the Playwright driver from being
        # shut down (otherwise their processes leak).
        if page is not None:
            _release(page.close)
        if browser is not None:
            _release(browser.close)
        if playwright is not None:
            _release(playwright.stop)


def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """Fetch a page over HTTP first, falling back to Playwright.

    Args:
        url: URL to fetch.
        try_http_first: Attempt the plain HTTP fetch before Playwright. When
            False, Playwright is used directly.
        playwright_options: Keyword arguments forwarded to fetch_playwright()
            (e.g. headless, timeout_ms). Must not contain "url".

    Returns:
        PlaywrightFetchResult. When HTTP succeeds, the result is built from
        the HTTP result's html and duration_ms and has no screenshot.
    """
    # Imported here rather than at module level, as in the original code.
    # NOTE(review): presumably to avoid an import cycle - confirm.
    from pricewatch.app.scraping.http_fetch import fetch_http

    playwright_options = playwright_options or {}

    if try_http_first:
        logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
        http_result = fetch_http(url)

        if http_result.success:
            logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
            return PlaywrightFetchResult(
                success=True,
                html=http_result.html,
                duration_ms=http_result.duration_ms,
            )

        logger.warning(
            f"[Fallback] HTTP échoué ({http_result.error}), "
            "fallback vers Playwright"
        )

    # Playwright, either as the fallback or as the primary method.
    return fetch_playwright(url, **playwright_options)