chore: sync project files

Gilles Soulier
2026-01-13 19:49:04 +01:00
parent 53f8227941
commit ecda149a4b
149 changed files with 65272 additions and 1 deletion


Binary file not shown.

Binary file not shown.

pricewatch/app/scraping/http_fetch.py

@@ -0,0 +1,193 @@
"""
Récupération HTTP simple pour le scraping.
Utilise requests avec rotation de User-Agent et gestion des erreurs.
Méthode prioritaire avant le fallback Playwright (plus lent).
"""
import random
import time
from typing import Optional
import requests
from requests.exceptions import RequestException, Timeout
from pricewatch.app.core.logging import get_logger

logger = get_logger("scraping.http")

# Realistic User-Agents to avoid basic bot blocking
USER_AGENTS = [
# Chrome on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# Chrome on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# Firefox on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
# Firefox on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
# Safari on macOS
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
# Edge on Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
]


class FetchResult:
    """Result of an HTTP fetch."""
def __init__(
self,
success: bool,
html: Optional[str] = None,
error: Optional[str] = None,
status_code: Optional[int] = None,
duration_ms: Optional[int] = None,
):
self.success = success
self.html = html
self.error = error
self.status_code = status_code
self.duration_ms = duration_ms


def fetch_http(
url: str,
timeout: int = 30,
headers: Optional[dict] = None,
follow_redirects: bool = True,
) -> FetchResult:
"""
Récupère une page via HTTP simple avec requests.
Args:
url: URL à récupérer
timeout: Timeout en secondes
headers: Headers HTTP personnalisés (optionnel)
follow_redirects: Suivre les redirections automatiquement
Returns:
FetchResult avec le HTML ou l'erreur
Justification technique:
- User-Agent aléatoire pour éviter les blocages basiques
- Timeout configuré pour ne pas bloquer indéfiniment
- Gestion explicite des codes d'erreur (403, 404, 429, etc.)
- Headers Accept pour indiquer qu'on veut du HTML
"""
if not url or not url.strip():
logger.error("URL vide fournie")
return FetchResult(success=False, error="URL vide")
    start_time = time.time()

    # Default browser-like headers
    default_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
        # Note: requests only decodes "br" responses when a brotli package
        # (brotli/brotlicffi) is installed alongside urllib3.
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    # Merge in any custom headers
    if headers:
        default_headers.update(headers)

    logger.info(f"[HTTP] Fetching: {url}")
    logger.debug(f"[HTTP] User-Agent: {default_headers['User-Agent'][:50]}...")
try:
response = requests.get(
url,
headers=default_headers,
timeout=timeout,
allow_redirects=follow_redirects,
)
duration_ms = int((time.time() - start_time) * 1000)
        # Check the status code
if response.status_code == 200:
html = response.text
            logger.info(
                f"[HTTP] Success: {len(html)} chars, {duration_ms}ms, "
                f"status={response.status_code}"
            )
return FetchResult(
success=True,
html=html,
status_code=response.status_code,
duration_ms=duration_ms,
)
        # Common error codes
        elif response.status_code == 403:
            error = "403 Forbidden - anti-bot protection detected"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
        elif response.status_code == 404:
            error = "404 Not Found - page does not exist"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
        elif response.status_code == 429:
            error = "429 Too Many Requests - rate limit hit"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
        elif response.status_code >= 500:
            error = f"{response.status_code} Server Error"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
else:
error = f"HTTP {response.status_code} - Erreur inconnue"
logger.warning(f"[HTTP] {error}")
return FetchResult(
success=False,
error=error,
status_code=response.status_code,
duration_ms=duration_ms,
)
    except Timeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout after {timeout}s"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)
    except RequestException as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Network error: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)
    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Unexpected error: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)
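
A minimal usage sketch for fetch_http (the product URL is a hypothetical placeholder, not taken from this commit):

from pricewatch.app.scraping.http_fetch import fetch_http

result = fetch_http("https://example.com/product/123", timeout=15)
if result.success:
    print(f"Fetched {len(result.html)} chars in {result.duration_ms}ms")
else:
    print(f"Failed ({result.status_code}): {result.error}")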


@@ -0,0 +1,238 @@
"""
Récupération avec Playwright (fallback anti-bot).
Utilisé quand HTTP échoue (403, captcha, etc.).
Plus lent mais plus robuste contre les protections anti-scraping.
"""
import time
from typing import Optional
from playwright.sync_api import (
Browser,
Page,
Playwright,
sync_playwright,
TimeoutError as PlaywrightTimeout,
)
from pricewatch.app.core.logging import get_logger

logger = get_logger("scraping.playwright")


class PlaywrightFetchResult:
    """Result of a Playwright fetch."""
def __init__(
self,
success: bool,
html: Optional[str] = None,
screenshot: Optional[bytes] = None,
error: Optional[str] = None,
duration_ms: Optional[int] = None,
):
self.success = success
self.html = html
self.screenshot = screenshot
self.error = error
self.duration_ms = duration_ms


def fetch_playwright(
url: str,
headless: bool = True,
timeout_ms: int = 60000,
save_screenshot: bool = False,
wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
"""
Récupère une page avec Playwright.
Args:
url: URL à récupérer
headless: Mode headless (True) ou visible (False)
timeout_ms: Timeout en millisecondes
save_screenshot: Prendre un screenshot
wait_for_selector: Attendre un sélecteur CSS avant de récupérer
Returns:
PlaywrightFetchResult avec HTML, screenshot (optionnel), ou erreur
Justification technique:
- Playwright simule un vrai navigateur → contourne beaucoup d'anti-bots
- Headless par défaut pour performance
- Headful disponible pour debug visuel
- Screenshot optionnel pour diagnostiquer les échecs
- wait_for_selector permet d'attendre le chargement dynamique
"""
if not url or not url.strip():
logger.error("URL vide fournie")
return PlaywrightFetchResult(success=False, error="URL vide")
start_time = time.time()
logger.info(f"[Playwright] Récupération: {url} (headless={headless})")
playwright: Optional[Playwright] = None
browser: Optional[Browser] = None
page: Optional[Page] = None
try:
playwright = sync_playwright().start()
        # Launch the Chromium browser
        browser = playwright.chromium.launch(headless=headless)

        # Create a context with a realistic User-Agent
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale="fr-FR",
        )
        page = context.new_page()

        # Set the default timeout
        page.set_default_timeout(timeout_ms)

        # Navigate to the page
        logger.debug(f"[Playwright] Navigating to {url}")
        response = page.goto(url, wait_until="domcontentloaded")
        if not response:
            raise Exception("No response from server")

        # Wait for a specific selector if requested
        if wait_for_selector:
            logger.debug(f"[Playwright] Waiting for selector: {wait_for_selector}")
try:
page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                logger.warning(
                    f"[Playwright] Timed out waiting for selector: {wait_for_selector}"
                )

        # Grab the page HTML
        html = page.content()

        # Optional screenshot
        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Taking screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = int((time.time() - start_time) * 1000)
        logger.info(
            f"[Playwright] Success: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )
return PlaywrightFetchResult(
success=True,
html=html,
screenshot=screenshot,
duration_ms=duration_ms,
)
    except PlaywrightTimeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout after {timeout_ms}ms"
        logger.error(f"[Playwright] {error}")
        # Try to capture a screenshot even on failure
        screenshot = None
        if save_screenshot and page:
            try:
                screenshot = page.screenshot(full_page=False)
            except Exception:
                pass
return PlaywrightFetchResult(
success=False,
error=error,
screenshot=screenshot,
duration_ms=duration_ms,
)
    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Playwright error: {str(e)}"
        logger.error(f"[Playwright] {error}")
        # Try to capture a screenshot even on failure
        screenshot = None
        if save_screenshot and page:
            try:
                screenshot = page.screenshot(full_page=False)
            except Exception:
                pass
return PlaywrightFetchResult(
success=False,
error=error,
screenshot=screenshot,
duration_ms=duration_ms,
)
    finally:
        # Cleanup: close the page and browser, then stop Playwright
        try:
            if page:
                page.close()
            if browser:
                browser.close()
            if playwright:
                playwright.stop()
        except Exception as e:
            logger.warning(f"[Playwright] Cleanup error: {e}")


def fetch_with_fallback(
url: str,
try_http_first: bool = True,
playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
"""
Stratégie de récupération avec fallback HTTP → Playwright.
Args:
url: URL à récupérer
try_http_first: Tenter HTTP d'abord (plus rapide)
playwright_options: Options pour Playwright si nécessaire
Returns:
PlaywrightFetchResult
Justification technique:
- HTTP d'abord car beaucoup plus rapide (~1s vs ~10s)
- Fallback Playwright si HTTP échoue (403, timeout, etc.)
- Économise des ressources quand HTTP suffit
"""
from pricewatch.app.scraping.http_fetch import fetch_http
playwright_options = playwright_options or {}
    if try_http_first:
        logger.info(f"[Fallback] Trying HTTP first: {url}")
        http_result = fetch_http(url)
        if http_result.success:
            logger.info("[Fallback] HTTP succeeded, no need for Playwright")
            return PlaywrightFetchResult(
                success=True,
                html=http_result.html,
                duration_ms=http_result.duration_ms,
            )
        logger.warning(
            f"[Fallback] HTTP failed ({http_result.error}), "
            "falling back to Playwright"
        )

    # Playwright as the fallback, or as the primary method when HTTP is skipped
    return fetch_playwright(url, **playwright_options)
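
A minimal end-to-end sketch (hypothetical URL and selector; note that wait_for_selector only takes effect if the Playwright path is reached):

result = fetch_with_fallback(
    "https://example.com/product/123",
    playwright_options={"wait_for_selector": ".product-price"},
)
if result.success:
    print(f"Fetched {len(result.html)} chars in {result.duration_ms}ms")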