chore: sync project files
This commit is contained in:
0
pricewatch/app/scraping/__init__.py
Executable file
0
pricewatch/app/scraping/__init__.py
Executable file
BIN
pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/scraping/__pycache__/http_fetch.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/http_fetch.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/scraping/__pycache__/pw_fetch.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/pw_fetch.cpython-313.pyc
Executable file
Binary file not shown.
193
pricewatch/app/scraping/http_fetch.py
Executable file
193
pricewatch/app/scraping/http_fetch.py
Executable file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Récupération HTTP simple pour le scraping.
|
||||
|
||||
Utilise requests avec rotation de User-Agent et gestion des erreurs.
|
||||
Méthode prioritaire avant le fallback Playwright (plus lent).
|
||||
"""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from requests.exceptions import RequestException, Timeout
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("scraping.http")
|
||||
|
||||
# Realistic User-Agent strings; one is picked at random per request to
# avoid the most basic bot blocking.
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Firefox on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    # Edge on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
]
|
||||
|
||||
|
||||
class FetchResult:
    """Outcome of a plain-HTTP page fetch.

    On success, ``html`` holds the page body; on failure, ``error``
    describes what went wrong. ``status_code`` and ``duration_ms`` are
    filled in whenever they are known.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        error: Optional[str] = None,
        status_code: Optional[int] = None,
        duration_ms: Optional[int] = None,
    ):
        # Plain attribute-per-field storage so callers can read
        # result.html / result.error / result.status_code directly.
        (
            self.success,
            self.html,
            self.error,
            self.status_code,
            self.duration_ms,
        ) = (success, html, error, status_code, duration_ms)
|
||||
|
||||
|
||||
def fetch_http(
    url: str,
    timeout: int = 30,
    headers: Optional[dict] = None,
    follow_redirects: bool = True,
) -> FetchResult:
    """Fetch a page with a plain ``requests`` GET.

    Args:
        url: URL to fetch.
        timeout: Timeout in seconds.
        headers: Optional custom HTTP headers (merged over the defaults).
        follow_redirects: Follow redirects automatically.

    Returns:
        FetchResult holding the HTML on success, or an error message
        (plus status code / duration when known) on failure.

    Technical rationale:
        - Random User-Agent to dodge basic bot blocking.
        - Explicit timeout so the call can never hang forever.
        - Explicit handling of common error codes (403, 404, 429, 5xx).
        - Accept headers advertise that HTML is wanted.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return FetchResult(success=False, error="URL vide")

    start_time = time.time()

    # Browser-like default headers; the User-Agent is picked at random
    # from USER_AGENTS on every call.
    default_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    # Caller-supplied headers take precedence over the defaults.
    if headers:
        default_headers.update(headers)

    logger.info(f"[HTTP] Récupération: {url}")
    logger.debug(f"[HTTP] User-Agent: {default_headers['User-Agent'][:50]}...")

    try:
        response = requests.get(
            url,
            headers=default_headers,
            timeout=timeout,
            allow_redirects=follow_redirects,
        )

        duration_ms = int((time.time() - start_time) * 1000)
        status = response.status_code

        if status == 200:
            html = response.text
            logger.info(
                f"[HTTP] Succès: {len(html)} chars, {duration_ms}ms, "
                f"status={status}"
            )
            return FetchResult(
                success=True,
                html=html,
                status_code=status,
                duration_ms=duration_ms,
            )

        # Non-200: map the status code to a single error message instead
        # of one near-identical warn-and-return branch per code.
        known_errors = {
            403: "403 Forbidden - Anti-bot détecté",
            404: "404 Not Found - Page introuvable",
            429: "429 Too Many Requests - Rate limit atteint",
        }
        if status in known_errors:
            error = known_errors[status]
        elif status >= 500:
            error = f"{status} Server Error - Erreur serveur"
        else:
            error = f"HTTP {status} - Erreur inconnue"

        logger.warning(f"[HTTP] {error}")
        return FetchResult(
            success=False,
            error=error,
            status_code=status,
            duration_ms=duration_ms,
        )

    except Timeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout après {timeout}s"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except RequestException as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur réseau: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except Exception as e:
        # Last-resort guard: an unexpected error is reported as a failed
        # fetch rather than propagating to the caller.
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur inattendue: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)
|
||||
238
pricewatch/app/scraping/pw_fetch.py
Executable file
238
pricewatch/app/scraping/pw_fetch.py
Executable file
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Récupération avec Playwright (fallback anti-bot).
|
||||
|
||||
Utilisé quand HTTP échoue (403, captcha, etc.).
|
||||
Plus lent mais plus robuste contre les protections anti-scraping.
|
||||
"""
|
||||
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from playwright.sync_api import (
|
||||
Browser,
|
||||
Page,
|
||||
Playwright,
|
||||
sync_playwright,
|
||||
TimeoutError as PlaywrightTimeout,
|
||||
)
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("scraping.playwright")
|
||||
|
||||
|
||||
class PlaywrightFetchResult:
    """Outcome of a Playwright page fetch.

    On success, ``html`` holds the page content; on failure, ``error``
    describes the problem. ``screenshot`` (PNG bytes) and
    ``duration_ms`` are filled in when available.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        screenshot: Optional[bytes] = None,
        error: Optional[str] = None,
        duration_ms: Optional[int] = None,
    ):
        # Plain attribute-per-field storage so callers can read
        # result.html / result.screenshot / result.error directly.
        (
            self.success,
            self.html,
            self.screenshot,
            self.error,
            self.duration_ms,
        ) = (success, html, screenshot, error, duration_ms)
|
||||
|
||||
|
||||
def _error_screenshot(page: Optional["Page"], save_screenshot: bool) -> Optional[bytes]:
    """Best-effort screenshot of *page* after a failure.

    Returns the PNG bytes, or None when screenshots were not requested,
    the page never got created, or the capture itself fails.
    """
    if not (save_screenshot and page):
        return None
    try:
        return page.screenshot(full_page=False)
    except Exception:
        # The page may already be unusable (crashed/closed); a failed
        # diagnostic screenshot must never mask the original error.
        return None


def fetch_playwright(
    url: str,
    headless: bool = True,
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
    """Fetch a page with Playwright (anti-bot fallback).

    Args:
        url: URL to fetch.
        headless: Headless mode (True) or visible browser (False).
        timeout_ms: Timeout in milliseconds.
        save_screenshot: Capture a screenshot of the page.
        wait_for_selector: CSS selector to wait for before grabbing HTML.

    Returns:
        PlaywrightFetchResult with the HTML, an optional screenshot, or
        an error message.

    Technical rationale:
        - Playwright drives a real browser, defeating many anti-bot checks.
        - Headless by default for performance; headful for visual debugging.
        - Optional screenshot helps diagnose failures.
        - wait_for_selector allows waiting for dynamically loaded content.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return PlaywrightFetchResult(success=False, error="URL vide")

    start_time = time.time()
    logger.info(f"[Playwright] Récupération: {url} (headless={headless})")

    playwright: Optional[Playwright] = None
    browser: Optional[Browser] = None
    page: Optional[Page] = None

    try:
        playwright = sync_playwright().start()

        # Launch the Chromium browser.
        browser = playwright.chromium.launch(headless=headless)

        # Context with a realistic User-Agent, viewport and locale.
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale="fr-FR",
        )

        page = context.new_page()

        # Apply the timeout to every subsequent page operation.
        page.set_default_timeout(timeout_ms)

        # Navigate to the page.
        logger.debug(f"[Playwright] Navigation vers {url}")
        response = page.goto(url, wait_until="domcontentloaded")

        if not response:
            raise Exception("Pas de réponse du serveur")

        # Optionally wait for a specific selector (dynamic content);
        # a selector timeout is non-fatal — grab whatever HTML loaded.
        if wait_for_selector:
            logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
            try:
                page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                logger.warning(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

        # Retrieve the rendered HTML.
        html = page.content()

        # Optional screenshot of the successful page.
        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Capture du screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = int((time.time() - start_time) * 1000)

        logger.info(
            f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )

        return PlaywrightFetchResult(
            success=True,
            html=html,
            screenshot=screenshot,
            duration_ms=duration_ms,
        )

    except PlaywrightTimeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout après {timeout_ms}ms"
        logger.error(f"[Playwright] {error}")

        return PlaywrightFetchResult(
            success=False,
            error=error,
            # Try to capture the failing page for diagnosis.
            screenshot=_error_screenshot(page, save_screenshot),
            duration_ms=duration_ms,
        )

    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur Playwright: {str(e)}"
        logger.error(f"[Playwright] {error}")

        return PlaywrightFetchResult(
            success=False,
            error=error,
            # Try to capture the failing page for diagnosis.
            screenshot=_error_screenshot(page, save_screenshot),
            duration_ms=duration_ms,
        )

    finally:
        # Cleanup in reverse creation order; teardown errors must never
        # shadow the fetch result.
        try:
            if page:
                page.close()
            if browser:
                browser.close()
            if playwright:
                playwright.stop()
        except Exception as e:
            logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")
|
||||
|
||||
|
||||
def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """Fetch *url* trying cheap HTTP first, then falling back to Playwright.

    Args:
        url: URL to fetch.
        try_http_first: Attempt the fast HTTP path before Playwright.
        playwright_options: Keyword options forwarded to fetch_playwright
            when it runs.

    Returns:
        PlaywrightFetchResult (also used to wrap a successful HTTP fetch).

    Technical rationale:
        - HTTP first because it is much faster (~1s vs ~10s).
        - Playwright only when HTTP fails (403, timeout, etc.).
        - Saves resources whenever plain HTTP is enough.
    """
    # NOTE(review): imported locally — presumably to avoid a circular
    # import at module load time; confirm before moving to the top.
    from pricewatch.app.scraping.http_fetch import fetch_http

    options = playwright_options or {}

    # Guard clause: no HTTP attempt requested, go straight to Playwright.
    if not try_http_first:
        return fetch_playwright(url, **options)

    logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
    http_result = fetch_http(url)

    if http_result.success:
        # HTTP was enough; wrap its result in the Playwright result type
        # so callers get a uniform return shape.
        logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
        return PlaywrightFetchResult(
            success=True,
            html=http_result.html,
            duration_ms=http_result.duration_ms,
        )

    logger.warning(
        f"[Fallback] HTTP échoué ({http_result.error}), "
        "fallback vers Playwright"
    )
    return fetch_playwright(url, **options)
|
||||
Reference in New Issue
Block a user