chore: sync project files
This commit is contained in:
238
pricewatch/app/scraping/pw_fetch.py
Executable file
238
pricewatch/app/scraping/pw_fetch.py
Executable file
@@ -0,0 +1,238 @@
|
||||
"""
Page retrieval with Playwright (anti-bot fallback).

Used when plain HTTP fails (403, captcha, etc.).
Slower, but more robust against anti-scraping protections.
"""
|
||||
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from playwright.sync_api import (
|
||||
Browser,
|
||||
Page,
|
||||
Playwright,
|
||||
sync_playwright,
|
||||
TimeoutError as PlaywrightTimeout,
|
||||
)
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("scraping.playwright")
|
||||
|
||||
|
||||
class PlaywrightFetchResult:
    """Outcome of a page fetch attempt.

    Attributes are plain public fields set once by the constructor:

    - success: True when the page content was retrieved.
    - html: page HTML, or None when nothing was retrieved.
    - screenshot: raw screenshot bytes, or None when none was captured.
    - error: human-readable error message, or None on success.
    - duration_ms: elapsed time of the attempt in milliseconds, or None
      when it was not measured.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        screenshot: Optional[bytes] = None,
        error: Optional[str] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.screenshot = screenshot
        self.error = error
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # Report payload sizes rather than the payloads themselves: a full
        # HTML document or PNG dump would make log lines unreadable.
        html_len = None if self.html is None else len(self.html)
        screenshot_len = None if self.screenshot is None else len(self.screenshot)
        return (
            f"{type(self).__name__}(success={self.success!r}, "
            f"html_len={html_len!r}, screenshot_len={screenshot_len!r}, "
            f"error={self.error!r}, duration_ms={self.duration_ms!r})"
        )
|
||||
|
||||
|
||||
def _capture_failure_screenshot(
    page: "Optional[Page]", save_screenshot: bool
) -> Optional[bytes]:
    """Best-effort viewport screenshot used to diagnose a failed fetch."""
    if not save_screenshot or page is None:
        return None
    try:
        return page.screenshot(full_page=False)
    except Exception as exc:
        # Deliberately best-effort: the page may already be unusable, and a
        # failed diagnostic screenshot must not mask the original error.
        logger.debug(f"[Playwright] Screenshot d'erreur impossible: {exc}")
        return None


def _build_failure_result(
    error: str,
    page: "Optional[Page]",
    save_screenshot: bool,
    start_time: float,
) -> "PlaywrightFetchResult":
    """Log *error* and wrap it, with timing and optional screenshot, in a result."""
    duration_ms = int((time.perf_counter() - start_time) * 1000)
    logger.error(f"[Playwright] {error}")
    return PlaywrightFetchResult(
        success=False,
        error=error,
        screenshot=_capture_failure_screenshot(page, save_screenshot),
        duration_ms=duration_ms,
    )


def _release_playwright_resources(page, browser, playwright) -> None:
    """Release page, browser and driver independently of one another."""
    for resource, closer_name in (
        (page, "close"),
        (browser, "close"),
        (playwright, "stop"),
    ):
        if resource is None:
            continue
        # One try per resource: a failure closing the page must not prevent
        # the browser and the Playwright driver from being shut down.
        try:
            getattr(resource, closer_name)()
        except Exception as e:
            logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")


def fetch_playwright(
    url: str,
    headless: bool = True,
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
    """
    Fetch a page with Playwright (Chromium).

    Args:
        url: URL to fetch. Surrounding whitespace is stripped; an empty or
            whitespace-only value yields a failed result without starting
            a browser.
        headless: Run the browser headless (True) or visible (False).
        timeout_ms: Default page timeout in milliseconds; also used for the
            optional selector wait.
        save_screenshot: Capture a viewport screenshot. On failure a
            best-effort screenshot is attempted as well.
        wait_for_selector: CSS selector to wait for before reading the HTML.
            A timeout on this wait is only logged as a warning; the HTML
            present at that point is still returned.

    Returns:
        PlaywrightFetchResult carrying the HTML (and optional screenshot) on
        success, or an error message on failure. Exceptions raised while
        fetching are caught and reported through the result.

    Notes:
        - The HTTP status of the response is logged but not checked: any
          response object counts as success.
        - Page, browser and Playwright driver are always released, each
          independently, whatever the outcome.
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return PlaywrightFetchResult(success=False, error="URL vide")

    # Navigate to the same cleaned value that was validated above.
    url = url.strip()

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    logger.info(f"[Playwright] Récupération: {url} (headless={headless})")

    playwright: Optional[Playwright] = None
    browser: Optional[Browser] = None
    page: Optional[Page] = None

    try:
        playwright = sync_playwright().start()

        # Launch the Chromium browser.
        browser = playwright.chromium.launch(headless=headless)

        # Browser context configured with a fixed desktop Chrome user agent,
        # a 1920x1080 viewport and the fr-FR locale.
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale="fr-FR",
        )

        page = context.new_page()
        page.set_default_timeout(timeout_ms)

        logger.debug(f"[Playwright] Navigation vers {url}")
        response = page.goto(url, wait_until="domcontentloaded")

        if not response:
            # Caught by the generic handler below and turned into a result.
            raise RuntimeError("Pas de réponse du serveur")

        # Optionally wait for a specific selector before reading the HTML.
        if wait_for_selector:
            logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
            try:
                page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                # Non-fatal: continue with whatever HTML is available.
                logger.warning(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

        html = page.content()

        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Capture du screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = int((time.perf_counter() - start_time) * 1000)

        logger.info(
            f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )

        return PlaywrightFetchResult(
            success=True,
            html=html,
            screenshot=screenshot,
            duration_ms=duration_ms,
        )

    except PlaywrightTimeout:
        return _build_failure_result(
            f"Timeout après {timeout_ms}ms", page, save_screenshot, start_time
        )

    except Exception as e:
        return _build_failure_result(
            f"Erreur Playwright: {str(e)}", page, save_screenshot, start_time
        )

    finally:
        _release_playwright_resources(page, browser, playwright)
|
||||
|
||||
|
||||
def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """
    Fetch a page, trying plain HTTP first and Playwright second.

    Args:
        url: URL to fetch.
        try_http_first: When True, attempt ``fetch_http`` before Playwright;
            when False, go straight to Playwright.
        playwright_options: Keyword arguments forwarded to
            ``fetch_playwright``; None or an empty dict means defaults.

    Returns:
        PlaywrightFetchResult. When the HTTP attempt succeeds, the result is
        built from the HTTP result's ``html`` and ``duration_ms``; otherwise
        it is whatever ``fetch_playwright`` returns.
    """
    # Function-local import, resolved on every call before anything else runs.
    from pricewatch.app.scraping.http_fetch import fetch_http

    browser_kwargs = playwright_options if playwright_options else {}

    # Playwright requested as the primary method: skip the HTTP attempt.
    if not try_http_first:
        return fetch_playwright(url, **browser_kwargs)

    logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
    http_attempt = fetch_http(url)

    if not http_attempt.success:
        logger.warning(
            f"[Fallback] HTTP échoué ({http_attempt.error}), "
            "fallback vers Playwright"
        )
        return fetch_playwright(url, **browser_kwargs)

    logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
    return PlaywrightFetchResult(
        success=True,
        html=http_attempt.html,
        duration_ms=http_attempt.duration_ms,
    )
|
||||
Reference in New Issue
Block a user