""" Récupération HTTP simple pour le scraping. Utilise requests avec rotation de User-Agent et gestion des erreurs. Méthode prioritaire avant le fallback Playwright (plus lent). """ import random import time from typing import Optional import requests from requests.exceptions import RequestException, Timeout from pricewatch.app.core.logging import get_logger logger = get_logger("scraping.http") # Liste de User-Agents réalistes pour éviter les blocages USER_AGENTS = [ # Chrome on Windows "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", # Chrome on macOS "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", # Firefox on Windows "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0", # Firefox on macOS "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0", # Safari on macOS "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15", # Edge on Windows "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0", ] class FetchResult: """Résultat d'une récupération HTTP.""" def __init__( self, success: bool, html: Optional[str] = None, error: Optional[str] = None, status_code: Optional[int] = None, duration_ms: Optional[int] = None, ): self.success = success self.html = html self.error = error self.status_code = status_code self.duration_ms = duration_ms def fetch_http( url: str, timeout: int = 30, headers: Optional[dict] = None, follow_redirects: bool = True, ) -> FetchResult: """ Récupère une page via HTTP simple avec requests. Args: url: URL à récupérer timeout: Timeout en secondes headers: Headers HTTP personnalisés (optionnel) follow_redirects: Suivre les redirections automatiquement Returns: FetchResult avec le HTML ou l'erreur Justification technique: - User-Agent aléatoire pour éviter les blocages basiques - Timeout configuré pour ne pas bloquer indéfiniment - Gestion explicite des codes d'erreur (403, 404, 429, etc.) - Headers Accept pour indiquer qu'on veut du HTML """ if not url or not url.strip(): logger.error("URL vide fournie") return FetchResult(success=False, error="URL vide") start_time = time.time() # Headers par défaut default_headers = { "User-Agent": random.choice(USER_AGENTS), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7", "Accept-Encoding": "gzip, deflate, br", "DNT": "1", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", } # Merge avec headers personnalisés if headers: default_headers.update(headers) logger.info(f"[HTTP] Récupération: {url}") logger.debug(f"[HTTP] User-Agent: {default_headers['User-Agent'][:50]}...") try: response = requests.get( url, headers=default_headers, timeout=timeout, allow_redirects=follow_redirects, ) duration_ms = int((time.time() - start_time) * 1000) # Vérifier le code de statut if response.status_code == 200: html = response.text logger.info( f"[HTTP] Succès: {len(html)} chars, {duration_ms}ms, " f"status={response.status_code}" ) return FetchResult( success=True, html=html, status_code=response.status_code, duration_ms=duration_ms, ) # Codes d'erreur courants elif response.status_code == 403: error = "403 Forbidden - Anti-bot détecté" logger.warning(f"[HTTP] {error}") return FetchResult( success=False, error=error, status_code=response.status_code, duration_ms=duration_ms, ) elif response.status_code == 404: error = "404 Not Found - Page introuvable" logger.warning(f"[HTTP] {error}") return FetchResult( success=False, error=error, status_code=response.status_code, duration_ms=duration_ms, ) elif response.status_code == 429: error = "429 Too Many Requests - Rate limit atteint" logger.warning(f"[HTTP] {error}") return FetchResult( success=False, error=error, status_code=response.status_code, duration_ms=duration_ms, ) elif response.status_code >= 500: error = f"{response.status_code} Server Error - Erreur serveur" logger.warning(f"[HTTP] {error}") return FetchResult( success=False, error=error, status_code=response.status_code, duration_ms=duration_ms, ) else: error = f"HTTP {response.status_code} - Erreur inconnue" logger.warning(f"[HTTP] {error}") return FetchResult( success=False, error=error, status_code=response.status_code, duration_ms=duration_ms, ) except Timeout: duration_ms = int((time.time() - start_time) * 1000) error = f"Timeout après {timeout}s" logger.error(f"[HTTP] {error}") return FetchResult(success=False, error=error, duration_ms=duration_ms) except RequestException as e: duration_ms = int((time.time() - start_time) * 1000) error = f"Erreur réseau: {str(e)}" logger.error(f"[HTTP] {error}") return FetchResult(success=False, error=error, duration_ms=duration_ms) except Exception as e: duration_ms = int((time.time() - start_time) * 1000) error = f"Erreur inattendue: {str(e)}" logger.error(f"[HTTP] {error}") return FetchResult(success=False, error=error, duration_ms=duration_ms)