This commit is contained in:
2026-01-19 06:16:38 +01:00
parent 4ff5d3ee79
commit dcb25e0163
74 changed files with 232377 additions and 177 deletions

View File

@@ -13,7 +13,13 @@ from sqlalchemy.orm import Session
from backend.app.core.config import load_config
from backend.app.db import database, models
from backend.app.scraper.amazon.parser import detect_blocked, extract_product_data
from backend.app.scraper.amazon.parser import extract_product_data
# Répertoires de stockage
SAMPLES_DIR = Path(__file__).resolve().parent.parent / "samples"
DEBUG_DIR = SAMPLES_DIR / "debug"
STORAGE_STATE_PATH = SAMPLES_DIR / "storage_state.json"
RAW_DATA_DIR = Path(__file__).resolve().parent.parent.parent / "data" / "raw"
def _create_run(session: Session) -> models.ScrapeRun:
@@ -32,9 +38,8 @@ def _finalize_run(run: models.ScrapeRun, session: Session, status: str) -> None:
def _save_raw_json(payload: dict, product_id: int) -> Path:
base_dir = Path(__file__).resolve().parent.parent.parent / "data" / "raw"
timestamp = datetime.utcnow().strftime("%Y-%m-%d")
folder = base_dir / timestamp
folder = RAW_DATA_DIR / timestamp
folder.mkdir(parents=True, exist_ok=True)
filename = f"{product_id}_{datetime.utcnow().strftime('%H%M%S')}.json"
path = folder / filename
@@ -42,15 +47,24 @@ def _save_raw_json(payload: dict, product_id: int) -> Path:
return path
def _save_debug_artifacts(page, product_id: int) -> tuple[Path, Path]:
base_dir = Path(__file__).resolve().parent.parent.parent / "data" / "screenshots"
base_dir.mkdir(parents=True, exist_ok=True)
def _save_debug_artifacts(page, product_id: int, suffix: str = "capture") -> dict:
"""Sauvegarde screenshot et HTML dans le répertoire debug."""
DEBUG_DIR.mkdir(parents=True, exist_ok=True)
stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
screenshot_path = base_dir / f"{product_id}_{stamp}.png"
html_path = base_dir / f"{product_id}_{stamp}.html"
page.screenshot(path=str(screenshot_path), full_page=True)
html_path.write_text(page.content())
return screenshot_path, html_path
debug_files = {}
try:
screenshot_path = DEBUG_DIR / f"{product_id}_{stamp}_{suffix}.png"
html_path = DEBUG_DIR / f"{product_id}_{stamp}_{suffix}.html"
page.screenshot(path=str(screenshot_path), full_page=True)
html_path.write_text(page.content(), encoding="utf-8")
debug_files = {
"screenshot": str(screenshot_path),
"html": str(html_path),
}
logger.info("Artifacts debug sauvegardés: screenshot={}, html={}", screenshot_path.name, html_path.name)
except Exception as e:
logger.warning("Impossible de générer les artifacts de debug: {}", e)
return debug_files
def _update_product_from_scrape(
@@ -101,77 +115,130 @@ def _create_snapshot(
session.commit()
def _create_browser_context(playwright, config):
    """Launch Chromium and build a browser context.

    Reuses the persisted session state (cookies, local storage) when a
    previous run saved one, so consecutive runs look like a returning visitor.

    Returns:
        (browser, context) — the caller owns both and must close them.
    """
    browser = playwright.chromium.launch(headless=config.scrape.headless)

    context_options = dict(
        locale=config.scrape.locale,
        timezone_id=config.scrape.timezone,
        user_agent=config.scrape.user_agent,
        viewport=config.scrape.viewport,
    )
    # Only attach storage_state when the file actually exists; Playwright
    # raises if given a missing path.
    if STORAGE_STATE_PATH.exists():
        context_options["storage_state"] = str(STORAGE_STATE_PATH)
        logger.info("Session persistée chargée: {}", STORAGE_STATE_PATH)

    return browser, browser.new_context(**context_options)
def _save_storage_state(context) -> None:
    """Persist the context's session state to disk for reuse by later runs.

    Best-effort: any failure is logged as a warning and never propagates,
    so a broken save cannot abort an otherwise successful scrape.
    """
    try:
        context.storage_state(path=str(STORAGE_STATE_PATH))
        logger.info("Session persistée sauvegardée: {}", STORAGE_STATE_PATH)
    except Exception as err:  # noqa: BLE001 — deliberately broad, best-effort save
        logger.warning("Impossible de sauvegarder la session: {}", err)
def _process_product(
    page,
    session: Session,
    product: models.Product,
    run: models.ScrapeRun,
    config,
) -> tuple[bool, dict]:
    """Scrape one product page and record a snapshot in the database.

    Navigates to the product URL, always captures debug artifacts
    (screenshot + HTML), extracts the product data, then writes a raw JSON
    dump and a snapshot row whose status reflects the outcome.

    Returns:
        (success, data): success is True only when every required field
        ("titre", "prix_actuel") was extracted.
    """
    logger.info("Scraping produit {} ({})", product.id, product.url)
    page.goto(product.url, wait_until="domcontentloaded", timeout=config.scrape.timeout_ms)

    # Debug artifacts are captured unconditionally, not just on failure.
    debug_files = _save_debug_artifacts(page, product.id, "capture")

    data = extract_product_data(page, product.url)

    # A missing title is treated as an Amazon anti-bot block.
    if not data.get("titre"):
        logger.warning("Titre absent pour produit {}, probable blocage Amazon", product.id)
        data["bloque"] = True
        data["debug_files"] = debug_files
        _create_snapshot(
            session,
            product,
            run,
            data,
            status="bloque",
            raw_json_path=_save_raw_json(data, product.id),
            error_message=f"Blocage détecté - debug: {debug_files.get('screenshot', 'N/A')}",
        )
        return False, data

    # Success or partial: persist the raw payload, then snapshot with a
    # status that records which required fields (if any) are missing.
    data["debug_files"] = debug_files
    raw_path = _save_raw_json(data, product.id)
    missing = [field for field in ("titre", "prix_actuel") if not data.get(field)]
    _create_snapshot(
        session,
        product,
        run,
        data,
        status="champs_manquants" if missing else "ok",
        raw_json_path=raw_path,
        error_message=", ".join(missing) if missing else None,
    )

    if missing:
        logger.warning("Champs manquants pour {}: {}", product.id, missing)
        return False, data

    logger.info("Scraping OK pour {} (titre={})", product.id, data.get("titre", "")[:50])
    return True, data
def scrape_product(product_id: int) -> None:
logger.info("Déclenchement du scraping pour le produit %s", product_id)
logger.info("Déclenchement du scraping pour le produit {}", product_id)
session = database.SessionLocal()
run = _create_run(session)
try:
product = session.get(models.Product, product_id)
if not product:
logger.warning("Produit %s introuvable", product_id)
logger.warning("Produit {} introuvable", product_id)
_finalize_run(run, session, "echec")
return
config = load_config()
run.nb_total = 1
session.commit()
with sync_playwright() as playwright:
browser = playwright.chromium.launch(headless=config.scrape.headless)
context = browser.new_context(
locale=config.scrape.locale,
timezone_id=config.scrape.timezone,
user_agent=config.scrape.user_agent,
viewport=config.scrape.viewport,
)
browser, context = _create_browser_context(playwright, config)
page = context.new_page()
page.set_default_timeout(config.scrape.timeout_ms)
try:
page.goto(product.url, wait_until="domcontentloaded", timeout=config.scrape.timeout_ms)
success, _ = _process_product(page, session, product, run, config)
run.nb_ok = 1 if success else 0
run.nb_echec = 0 if success else 1
_finalize_run(run, session, "succes" if success else "partiel")
html = page.content()
if detect_blocked(html):
screenshot_path, html_path = _save_debug_artifacts(page, product.id)
data = {"url": product.url, "asin": product.asin, "bloque": True}
raw_path = _save_raw_json(data, product.id)
_create_snapshot(
session,
product,
run,
data,
status="bloque",
raw_json_path=raw_path,
error_message=f"Bloque: {screenshot_path.name} / {html_path.name}",
)
run.nb_echec = 1
_finalize_run(run, session, "partiel")
return
data = extract_product_data(page, product.url)
raw_path = _save_raw_json(data, product.id)
required = ["titre", "prix_actuel", "note"]
missing = [field for field in required if not data.get(field)]
status = "champs_manquants" if missing else "ok"
_create_snapshot(
session,
product,
run,
data,
status=status,
raw_json_path=raw_path,
error_message=", ".join(missing) if missing else None,
)
run.nb_ok = 1 if not missing else 0
run.nb_echec = 0 if not missing else 1
_finalize_run(run, session, "succes" if not missing else "partiel")
# Sauvegarder la session pour réutilisation
_save_storage_state(context)
# Délai anti-blocage
delay_min, delay_max = config.scrape.delay_range_ms
time.sleep(random.uniform(delay_min, delay_max) / 1000.0)
finally:
# fermeture propre du navigateur
context.close()
browser.close()
except Exception: # pragma: no cover
logger.exception("Erreur pendant le scraping de %s", product_id)
except Exception as e:
logger.exception("Erreur pendant le scraping de {}: {}", product_id, e)
_finalize_run(run, session, "erreur")
finally:
session.close()
@@ -183,20 +250,19 @@ def scrape_all(product_ids: Iterable[int] | None = None) -> None:
run = _create_run(session)
try:
config = load_config()
products = session.query(models.Product).all()
products = session.query(models.Product).filter(models.Product.actif == True).all()
if product_ids:
products = [product for product in products if product.id in product_ids]
run.nb_total = len(products)
session.commit()
if not products:
logger.info("Aucun produit actif à scraper")
_finalize_run(run, session, "succes")
return
with sync_playwright() as playwright:
browser = playwright.chromium.launch(headless=config.scrape.headless)
context = browser.new_context(
locale=config.scrape.locale,
timezone_id=config.scrape.timezone,
user_agent=config.scrape.user_agent,
viewport=config.scrape.viewport,
)
browser, context = _create_browser_context(playwright, config)
page = context.new_page()
page.set_default_timeout(config.scrape.timeout_ms)
@@ -205,55 +271,31 @@ def scrape_all(product_ids: Iterable[int] | None = None) -> None:
try:
for product in products:
page.goto(product.url, wait_until="domcontentloaded", timeout=config.scrape.timeout_ms)
html = page.content()
if detect_blocked(html):
screenshot_path, html_path = _save_debug_artifacts(page, product.id)
data = {"url": product.url, "asin": product.asin, "bloque": True}
raw_path = _save_raw_json(data, product.id)
_create_snapshot(
session,
product,
run,
data,
status="bloque",
raw_json_path=raw_path,
error_message=f"Bloque: {screenshot_path.name} / {html_path.name}",
)
try:
success, _ = _process_product(page, session, product, run, config)
if success:
nb_ok += 1
else:
nb_echec += 1
except Exception as e:
logger.error("Erreur scraping produit {}: {}", product.id, e)
nb_echec += 1
continue
data = extract_product_data(page, product.url)
raw_path = _save_raw_json(data, product.id)
required = ["titre", "prix_actuel", "note"]
missing = [field for field in required if not data.get(field)]
status = "champs_manquants" if missing else "ok"
_create_snapshot(
session,
product,
run,
data,
status=status,
raw_json_path=raw_path,
error_message=", ".join(missing) if missing else None,
)
if missing:
nb_echec += 1
else:
nb_ok += 1
# Délai anti-blocage entre les produits
delay_min, delay_max = config.scrape.delay_range_ms
time.sleep(random.uniform(delay_min, delay_max) / 1000.0)
run.nb_ok = nb_ok
run.nb_echec = nb_echec
_finalize_run(run, session, "succes" if nb_echec == 0 else "partiel")
# Sauvegarder la session pour réutilisation
_save_storage_state(context)
finally:
# fermeture propre du navigateur
context.close()
browser.close()
except Exception: # pragma: no cover
logger.exception("Erreur du scraping global")
except Exception as e:
logger.exception("Erreur du scraping global: {}", e)
_finalize_run(run, session, "erreur")
finally:
session.close()