"""
Simple HTTP retrieval for scraping.

Uses requests with User-Agent rotation and error handling.
Preferred method before the (slower) Playwright fallback.
"""
import random
import time
from typing import Optional

import requests
from requests.exceptions import RequestException, Timeout

from pricewatch.app.core.logging import get_logger
# Module-level logger for the HTTP fetch path.
logger = get_logger("scraping.http")

# Pool of realistic User-Agent strings to avoid naive bot blocking;
# one is picked at random for each request (see fetch_http).
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Firefox on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    # Edge on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
]
class FetchResult:
    """Result of a single HTTP fetch attempt.

    Attributes:
        success: True when the page was retrieved with status 200.
        html: Page body on success, None otherwise.
        error: Human-readable error description on failure.
        status_code: HTTP status code, when a response was received.
        duration_ms: Wall-clock duration of the attempt in milliseconds.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        error: Optional[str] = None,
        status_code: Optional[int] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.error = error
        self.status_code = status_code
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Debug-friendly representation; html is summarized by length so
        # logging a result never dumps a whole page.
        html_info = (
            f"html=<{len(self.html)} chars>" if self.html is not None else "html=None"
        )
        return (
            f"FetchResult(success={self.success}, {html_info}, "
            f"error={self.error!r}, status_code={self.status_code}, "
            f"duration_ms={self.duration_ms})"
        )
def fetch_http(
    url: str,
    timeout: int = 30,
    headers: Optional[dict] = None,
    follow_redirects: bool = True,
) -> FetchResult:
    """Fetch a page with a plain HTTP GET via requests.

    Args:
        url: URL to fetch.
        timeout: Timeout in seconds.
        headers: Optional custom HTTP headers (merged over the defaults,
            caller values win).
        follow_redirects: Whether to follow redirects automatically.

    Returns:
        FetchResult carrying the HTML on success, or the error otherwise.

    Technical rationale:
        - Random User-Agent to evade basic bot blocking.
        - Explicit timeout so the call never hangs indefinitely.
        - Explicit handling of common error codes (403, 404, 429, 5xx).
        - Accept headers signal that HTML content is wanted.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return FetchResult(success=False, error="URL vide")

    start_time = time.time()

    # Default headers; the User-Agent is rotated on every call.
    default_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    # Merge caller-supplied headers; they take precedence over the defaults.
    if headers:
        default_headers.update(headers)

    logger.info(f"[HTTP] Récupération: {url}")
    logger.debug(f"[HTTP] User-Agent: {default_headers['User-Agent'][:50]}...")

    try:
        response = requests.get(
            url,
            headers=default_headers,
            timeout=timeout,
            allow_redirects=follow_redirects,
        )

        duration_ms = int((time.time() - start_time) * 1000)
        status = response.status_code

        if status == 200:
            html = response.text
            logger.info(
                f"[HTTP] Succès: {len(html)} chars, {duration_ms}ms, "
                f"status={status}"
            )
            return FetchResult(
                success=True,
                html=html,
                status_code=status,
                duration_ms=duration_ms,
            )

        # Non-200: map the well-known statuses to a descriptive message;
        # everything else falls back to a generic 5xx / unknown message.
        known_errors = {
            403: "403 Forbidden - Anti-bot détecté",
            404: "404 Not Found - Page introuvable",
            429: "429 Too Many Requests - Rate limit atteint",
        }
        if status in known_errors:
            error = known_errors[status]
        elif status >= 500:
            error = f"{status} Server Error - Erreur serveur"
        else:
            error = f"HTTP {status} - Erreur inconnue"

        logger.warning(f"[HTTP] {error}")
        return FetchResult(
            success=False,
            error=error,
            status_code=status,
            duration_ms=duration_ms,
        )

    except Timeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout après {timeout}s"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except RequestException as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur réseau: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except Exception as e:
        # Last-resort boundary: log and report the failure to the caller
        # instead of crashing the scraping pipeline.
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur inattendue: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)