chore: sync project files
This commit is contained in:
193
pricewatch/app/scraping/http_fetch.py
Executable file
193
pricewatch/app/scraping/http_fetch.py
Executable file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Récupération HTTP simple pour le scraping.
|
||||
|
||||
Utilise requests avec rotation de User-Agent et gestion des erreurs.
|
||||
Méthode prioritaire avant le fallback Playwright (plus lent).
|
||||
"""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from requests.exceptions import RequestException, Timeout
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("scraping.http")
|
||||
|
||||
# Pool of realistic browser User-Agent strings; one is picked at random per
# request so that trivial User-Agent based blocking is less likely to trigger.
USER_AGENTS = [
    # Chrome on Windows
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # Chrome on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # Firefox on Windows
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
        "Gecko/20100101 Firefox/121.0"
    ),
    # Firefox on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) "
        "Gecko/20100101 Firefox/121.0"
    ),
    # Safari on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.1 Safari/605.1.15"
    ),
    # Edge on Windows
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    ),
]
|
||||
|
||||
|
||||
class FetchResult:
    """Outcome of a single HTTP fetch attempt.

    Plain value holder: every constructor argument is stored unchanged on
    the attribute of the same name.

    Attributes:
        success: Whether the fetch is considered successful.
        html: Page body, or None when no body is available.
        error: Human-readable error message, or None.
        status_code: HTTP status code, or None when no response was received.
        duration_ms: Elapsed time of the attempt in milliseconds, or None.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        error: Optional[str] = None,
        status_code: Optional[int] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.error = error
        self.status_code = status_code
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Report only the size of the HTML: a full page would make log lines
        # and debugger output unreadable.
        html_len = None if self.html is None else len(self.html)
        return (
            f"{type(self).__name__}(success={self.success!r}, "
            f"status_code={self.status_code!r}, error={self.error!r}, "
            f"duration_ms={self.duration_ms!r}, html_len={html_len!r})"
        )
|
||||
|
||||
|
||||
# Fixed messages for the status codes that get a dedicated explanation.
_STATUS_ERRORS = {
    403: "403 Forbidden - Anti-bot détecté",
    404: "404 Not Found - Page introuvable",
    429: "429 Too Many Requests - Rate limit atteint",
}


def _describe_status(status_code: int) -> str:
    """Return the error message reported for a non-200 status code."""
    known = _STATUS_ERRORS.get(status_code)
    if known is not None:
        return known
    if status_code >= 500:
        return f"{status_code} Server Error - Erreur serveur"
    return f"HTTP {status_code} - Erreur inconnue"


def _accept_encoding() -> str:
    """Return the Accept-Encoding value, advertising Brotli only if decodable.

    Brotli responses are only decompressed when a Brotli library is
    installed; advertising "br" without one can yield compressed bytes that
    are then returned as if they were the page text.
    """
    from importlib import import_module

    for module_name in ("brotlicffi", "brotli"):
        try:
            import_module(module_name)
        except ImportError:
            continue
        return "gzip, deflate, br"
    return "gzip, deflate"


def fetch_http(
    url: str,
    timeout: int = 30,
    headers: Optional[dict] = None,
    follow_redirects: bool = True,
) -> FetchResult:
    """
    Fetch a page over plain HTTP using ``requests``.

    Args:
        url: URL to fetch.
        timeout: Timeout in seconds, passed to ``requests.get``.
        headers: Optional custom HTTP headers; they override the defaults
            that use the same key.
        follow_redirects: Whether redirects are followed automatically.

    Returns:
        A FetchResult. Only a 200 response is reported as a success (with
        the response text as ``html``); any other status code, a timeout, a
        network error or an unexpected exception raised while fetching is
        reported as a failure with an error message.

    Design notes:
        - A random User-Agent is sent to avoid basic blocking.
        - A timeout is always set so the call cannot hang indefinitely.
        - Common error codes (403, 404, 429, 5xx) get explicit messages.
        - The Accept header states that HTML is wanted.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return FetchResult(success=False, error="URL vide")

    # perf_counter is monotonic, so a wall-clock adjustment during the
    # request cannot produce a negative or inflated duration.
    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    # Browser-like defaults; caller-supplied headers take precedence.
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
        "Accept-Encoding": _accept_encoding(),
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    if headers:
        request_headers.update(headers)

    logger.info(f"[HTTP] Récupération: {url}")
    logger.debug(f"[HTTP] User-Agent: {request_headers['User-Agent'][:50]}...")

    # The whole request/response handling stays inside the try block so that
    # failures there are reported through a FetchResult instead of raising.
    try:
        response = requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            allow_redirects=follow_redirects,
        )
        duration_ms = elapsed_ms()
        status = response.status_code

        if status == 200:
            html = response.text
            logger.info(
                f"[HTTP] Succès: {len(html)} chars, {duration_ms}ms, "
                f"status={status}"
            )
            return FetchResult(
                success=True,
                html=html,
                status_code=status,
                duration_ms=duration_ms,
            )

        error = _describe_status(status)
        logger.warning(f"[HTTP] {error}")
        return FetchResult(
            success=False,
            error=error,
            status_code=status,
            duration_ms=duration_ms,
        )

    except Timeout:
        duration_ms = elapsed_ms()
        error = f"Timeout après {timeout}s"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except RequestException as e:
        duration_ms = elapsed_ms()
        error = f"Erreur réseau: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except Exception as e:
        # Deliberate catch-all: the result object, not an exception, is how
        # failures are reported by this function.
        duration_ms = elapsed_ms()
        error = f"Erreur inattendue: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)
|
||||
Reference in New Issue
Block a user