# Files
# scrap/pricewatch/app/scraping/http_fetch.py
# 2026-01-13 19:49:04 +01:00
#
# 194 lines
# 6.4 KiB
# Python
# Executable File
"""
Récupération HTTP simple pour le scraping.
Utilise requests avec rotation de User-Agent et gestion des erreurs.
Méthode prioritaire avant le fallback Playwright (plus lent).
"""
import random
import time
from typing import Optional
import requests
from requests.exceptions import RequestException, Timeout
from pricewatch.app.core.logging import get_logger
logger = get_logger("scraping.http")
# Liste de User-Agents réalistes pour éviter les blocages
USER_AGENTS = [
# Chrome on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# Chrome on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# Firefox on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
# Firefox on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
# Safari on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
# Edge on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
]
class FetchResult:
    """Result of a single HTTP fetch attempt.

    Attributes:
        success: True when the page was retrieved with HTTP status 200.
        html: Raw HTML body on success, None otherwise.
        error: Human-readable error description on failure, None on success.
        status_code: HTTP status code when a response was received, else None.
        duration_ms: Total request duration in milliseconds, when measured.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        error: Optional[str] = None,
        status_code: Optional[int] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.error = error
        self.status_code = status_code
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Show the HTML size rather than its content to keep logs readable.
        html_info = f"{len(self.html)} chars" if self.html else None
        return (
            f"FetchResult(success={self.success}, status_code={self.status_code}, "
            f"html={html_info}, error={self.error!r}, duration_ms={self.duration_ms})"
        )
def fetch_http(
url: str,
timeout: int = 30,
headers: Optional[dict] = None,
follow_redirects: bool = True,
) -> FetchResult:
"""
Récupère une page via HTTP simple avec requests.
Args:
url: URL à récupérer
timeout: Timeout en secondes
headers: Headers HTTP personnalisés (optionnel)
follow_redirects: Suivre les redirections automatiquement
Returns:
FetchResult avec le HTML ou l'erreur
Justification technique:
- User-Agent aléatoire pour éviter les blocages basiques
- Timeout configuré pour ne pas bloquer indéfiniment
- Gestion explicite des codes d'erreur (403, 404, 429, etc.)
- Headers Accept pour indiquer qu'on veut du HTML
"""
if not url or not url.strip():
logger.error("URL vide fournie")
return FetchResult(success=False, error="URL vide")
start_time = time.time()
# Headers par défaut
default_headers = {
"User-Agent": random.choice(USER_AGENTS),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
# Merge avec headers personnalisés
if headers:
default_headers.update(headers)
logger.info(f"[HTTP] Récupération: {url}")
logger.debug(f"[HTTP] User-Agent: {default_headers['User-Agent'][:50]}...")
try:
response = requests.get(
url,
headers=default_headers,
timeout=timeout,
allow_redirects=follow_redirects,
)
duration_ms = int((time.time() - start_time) * 1000)
# Vérifier le code de statut
if response.status_code == 200:
html = response.text
logger.info(
f"[HTTP] Succès: {len(html)} chars, {duration_ms}ms, "
f"status={response.status_code}"
)
return FetchResult(
success=True,
html=html,
status_code=response.status_code,
duration_ms=duration_ms,
)
# Codes d'erreur courants
elif response.status_code == 403:
error = "403 Forbidden - Anti-bot détecté"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
elif response.status_code == 404:
error = "404 Not Found - Page introuvable"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
elif response.status_code == 429:
error = "429 Too Many Requests - Rate limit atteint"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
elif response.status_code >= 500:
error = f"{response.status_code} Server Error - Erreur serveur"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
else:
error = f"HTTP {response.status_code} - Erreur inconnue"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
except Timeout:
duration_ms = int((time.time() - start_time) * 1000)
error = f"Timeout après {timeout}s"
logger.error(f"[HTTP] {error}")
return FetchResult(success=False, error=error, duration_ms=duration_ms)
except RequestException as e:
duration_ms = int((time.time() - start_time) * 1000)
error = f"Erreur réseau: {str(e)}"
logger.error(f"[HTTP] {error}")
return FetchResult(success=False, error=error, duration_ms=duration_ms)
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
error = f"Erreur inattendue: {str(e)}"
logger.error(f"[HTTP] {error}")
return FetchResult(success=False, error=error, duration_ms=duration_ms)