claude
@@ -1,11 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Body, HTTPException
|
||||
|
||||
from backend.app.core.config import BackendConfig, CONFIG_PATH, load_config
|
||||
|
||||
router = APIRouter(prefix="/config", tags=["config"])
|
||||
|
||||
# Chemin vers la config frontend
|
||||
FRONTEND_CONFIG_PATH = Path(__file__).resolve().parent.parent.parent.parent / "frontend" / "config_frontend.json"
|
||||
|
||||
|
||||
@router.get("/backend", response_model=BackendConfig)
|
||||
def read_backend_config() -> BackendConfig:
|
||||
@@ -18,9 +24,55 @@ def update_backend_config(payload: dict = Body(...)) -> BackendConfig:
|
||||
current = load_config()
|
||||
try:
|
||||
# validation via Pydantic avant écriture
|
||||
updated = current.copy(update=payload)
|
||||
CONFIG_PATH.write_text(updated.json(indent=2, ensure_ascii=False))
|
||||
updated = current.model_copy(update=payload)
|
||||
CONFIG_PATH.write_text(updated.model_dump_json(indent=2), encoding="utf-8")
|
||||
load_config.cache_clear()
|
||||
return load_config()
|
||||
except Exception as exc: # pragma: no cover
|
||||
raise HTTPException(status_code=400, detail=str(exc))
|
||||
|
||||
|
||||
@router.get("/frontend")
def read_frontend_config() -> dict:
    """Return the frontend configuration stored at ``FRONTEND_CONFIG_PATH``.

    Returns:
        The parsed JSON content of the frontend configuration file.

    Raises:
        HTTPException: 404 when the file does not exist, 500 when the file
            exists but does not contain valid JSON.
    """
    # Read first and handle the failure instead of checking exists() up front:
    # the file could disappear between the check and the read.
    try:
        raw = FRONTEND_CONFIG_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Config frontend introuvable") from None

    try:
        return json.loads(raw)
    except json.JSONDecodeError as exc:
        # Report a corrupt file explicitly rather than as an unhandled error.
        raise HTTPException(status_code=500, detail=f"Config frontend invalide: {exc}") from exc
|
||||
|
||||
|
||||
def _deep_merge(base: dict, update: dict) -> dict:
    """Return a copy of ``base`` recursively overridden by ``update``.

    When a key holds a dict on both sides the two dicts are merged key by key;
    in every other case the value from ``update`` replaces the one in ``base``.
    Neither argument is modified.
    """
    merged = base.copy()
    for key, value in update.items():
        if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
            merged[key] = _deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


@router.put("/frontend")
def update_frontend_config(payload: dict = Body(...)) -> dict:
    """Deep-merge ``payload`` into the frontend configuration and persist it.

    The current configuration is read from ``FRONTEND_CONFIG_PATH`` (an empty
    dict is used when the file does not exist), merged with ``payload`` and
    written back. The same content is also written to
    ``public/config_frontend.json`` next to it when that ``public`` directory
    exists.

    Args:
        payload: Partial configuration sent in the request body.

    Returns:
        The full configuration after the merge.

    Raises:
        HTTPException: 400 carrying the message of any error raised while
            reading, merging or writing the configuration.
    """
    try:
        # Load the current configuration, if any.
        current = {}
        if FRONTEND_CONFIG_PATH.exists():
            current = json.loads(FRONTEND_CONFIG_PATH.read_text(encoding="utf-8"))

        updated = _deep_merge(current, payload)

        # Serialize once; both destination files receive the same text.
        serialized = json.dumps(updated, indent=2, ensure_ascii=False)
        FRONTEND_CONFIG_PATH.write_text(serialized, encoding="utf-8")

        # Keep the copy under public/ in sync for the frontend dev setup.
        public_config = FRONTEND_CONFIG_PATH.parent / "public" / "config_frontend.json"
        if public_config.parent.exists():
            public_config.write_text(serialized, encoding="utf-8")

        return updated
    except Exception as exc:
        # Chain the original exception so the traceback keeps the root cause.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
|
||||
|
||||
@@ -10,10 +10,10 @@ from backend.app.scraper.runner import scrape_product
|
||||
router = APIRouter(prefix="/products", tags=["products"])
|
||||
|
||||
|
||||
@router.get("", response_model=list[schemas.ProductRead])
|
||||
def list_products(skip: int = 0, limit: int = 50, db: Session = Depends(get_db)) -> list[schemas.ProductRead]:
|
||||
# on retourne la liste paginée de produits
|
||||
return crud.list_products(db, skip=skip, limit=limit)
|
||||
@router.get("", response_model=list[schemas.ProductWithSnapshot])
|
||||
def list_products(skip: int = 0, limit: int = 50, db: Session = Depends(get_db)) -> list[schemas.ProductWithSnapshot]:
|
||||
# on retourne la liste paginée de produits enrichis avec les derniers snapshots
|
||||
return crud.list_products_with_snapshots(db, skip=skip, limit=limit)
|
||||
|
||||
|
||||
@router.post("", response_model=schemas.ProductRead, status_code=status.HTTP_201_CREATED)
|
||||
@@ -28,9 +28,9 @@ def create_product(
|
||||
return product
|
||||
|
||||
|
||||
@router.get("/{product_id}", response_model=schemas.ProductRead)
|
||||
def read_product(product_id: int, db: Session = Depends(get_db)) -> schemas.ProductRead:
|
||||
product = crud.get_product(db, product_id)
|
||||
@router.get("/{product_id}", response_model=schemas.ProductWithSnapshot)
|
||||
def read_product(product_id: int, db: Session = Depends(get_db)) -> schemas.ProductWithSnapshot:
|
||||
product = crud.get_product_with_snapshot(db, product_id)
|
||||
if not product:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Produit introuvable")
|
||||
return product
|
||||
|
||||
@@ -20,7 +20,14 @@ def list_products(db: Session, skip: int = 0, limit: int = 100) -> list[models.P
|
||||
|
||||
|
||||
def create_product(db: Session, data: schemas.ProductCreate) -> models.Product:
|
||||
product = models.Product(**data.dict())
|
||||
# Convertir les HttpUrl en strings pour SQLite
|
||||
data_dict = data.model_dump()
|
||||
if data_dict.get("url"):
|
||||
data_dict["url"] = str(data_dict["url"])
|
||||
if data_dict.get("url_image"):
|
||||
data_dict["url_image"] = str(data_dict["url_image"])
|
||||
|
||||
product = models.Product(**data_dict)
|
||||
db.add(product)
|
||||
try:
|
||||
db.commit()
|
||||
@@ -62,3 +69,63 @@ def get_latest_snapshot(db: Session, product_id: int) -> models.ProductSnapshot
|
||||
.order_by(models.ProductSnapshot.scrape_le.desc())
|
||||
.first()
|
||||
)
|
||||
|
||||
|
||||
def get_product_with_snapshot(db: Session, product_id: int) -> dict | None:
    """Look up a product by id and return it enriched with its latest snapshot.

    Returns ``None`` when ``get_product`` finds nothing for ``product_id``.
    """
    found = get_product(db, product_id)
    return _enrich_product_with_snapshot(db, found) if found else None
|
||||
|
||||
|
||||
def list_products_with_snapshots(db: Session, skip: int = 0, limit: int = 100) -> list[dict]:
    """Return one page of products, each enriched with its latest snapshot.

    ``skip`` and ``limit`` are forwarded unchanged to ``list_products``;
    ``_enrich_product_with_snapshot`` is then called once per product.
    """
    enriched = []
    for item in list_products(db, skip=skip, limit=limit):
        enriched.append(_enrich_product_with_snapshot(db, item))
    return enriched
|
||||
|
||||
|
||||
def _enrich_product_with_snapshot(db: Session, product: models.Product) -> dict:
    """Build a plain dict for ``product`` and add its latest snapshot's fields.

    The product's own columns are always present. The snapshot keys
    (``prix_actuel`` ... ``statut_scrap``) are added only when
    ``get_latest_snapshot`` returns a snapshot for the product.
    """
    latest = get_latest_snapshot(db, product.id)

    image_url = product.url_image
    enriched = {
        "id": product.id,
        "boutique": product.boutique,
        "url": str(product.url),
        "asin": product.asin,
        "titre": product.titre,
        "url_image": str(image_url) if image_url else None,
        "categorie": product.categorie,
        "type": product.type,
        "actif": product.actif,
        "cree_le": product.cree_le,
        "modifie_le": product.modifie_le,
    }

    if not latest:
        return enriched

    # Discount as a rounded percentage of the list price; left as None when
    # either price is missing or zero.
    current_price = latest.prix_actuel
    list_price = latest.prix_conseille
    if current_price and list_price:
        discount = round((1 - current_price / list_price) * 100)
    else:
        discount = None

    enriched["prix_actuel"] = current_price
    enriched["prix_conseille"] = list_price
    enriched["prix_min_30j"] = latest.prix_min_30j
    enriched["reduction_pourcent"] = discount
    enriched["etat_stock"] = latest.etat_stock
    enriched["en_stock"] = latest.en_stock
    enriched["note"] = latest.note
    enriched["nombre_avis"] = latest.nombre_avis
    enriched["prime"] = latest.prime
    enriched["choix_amazon"] = latest.choix_amazon
    enriched["offre_limitee"] = latest.offre_limitee
    enriched["exclusivite_amazon"] = latest.exclusivite_amazon
    enriched["dernier_scrape"] = latest.scrape_le
    enriched["statut_scrap"] = latest.statut_scrap
    return enriched
|
||||
|
||||
@@ -61,3 +61,29 @@ class ProductSnapshotRead(ProductSnapshotBase):
|
||||
|
||||
class Config:
|
||||
orm_mode = True
|
||||
|
||||
|
||||
class ProductWithSnapshot(ProductBase):
    """Product enriched with the data of its latest snapshot.

    Extends ``ProductBase`` with the product's identity and timestamps plus
    optional fields copied from the most recent snapshot. Every snapshot field
    defaults to ``None`` so the model is also valid for a product without a
    snapshot.
    """

    id: int
    cree_le: datetime
    modifie_le: datetime

    # Latest-snapshot data.
    prix_actuel: Optional[float] = None
    prix_conseille: Optional[float] = None
    prix_min_30j: Optional[float] = None
    reduction_pourcent: Optional[int] = None
    etat_stock: Optional[str] = None
    en_stock: Optional[bool] = None
    note: Optional[float] = None
    nombre_avis: Optional[int] = None
    prime: Optional[bool] = None
    choix_amazon: Optional[bool] = None
    offre_limitee: Optional[bool] = None
    exclusivite_amazon: Optional[bool] = None
    dernier_scrape: Optional[datetime] = None
    statut_scrap: Optional[str] = None

    # Pydantic v2 renamed the ``orm_mode`` config key to ``from_attributes``;
    # the old key only triggers a warning and is otherwise ignored.
    # NOTE(review): assumes Pydantic v2 (the code base calls ``model_dump`` /
    # ``model_copy``) — confirm before merging.
    model_config = {"from_attributes": True}
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
from os import getenv
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from backend.app.api import routes_config, routes_debug, routes_products, routes_scrape
|
||||
@@ -14,6 +15,15 @@ load_dotenv()
|
||||
|
||||
app = FastAPI(title="suivi_produit")
|
||||
|
||||
# CORS pour le frontend
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
app.include_router(routes_products.router)
|
||||
app.include_router(routes_scrape.router)
|
||||
app.include_router(routes_config.router)
|
||||
|
||||
11004
backend/app/samples/debug/10_20260119_035240_capture.html
Normal file
BIN
backend/app/samples/debug/10_20260119_035240_capture.png
Normal file
|
After Width: | Height: | Size: 3.8 MiB |
10447
backend/app/samples/debug/11_20260119_035408_capture.html
Normal file
BIN
backend/app/samples/debug/11_20260119_035408_capture.png
Normal file
|
After Width: | Height: | Size: 1.8 MiB |
9031
backend/app/samples/debug/1_20260119_033545_capture.html
Normal file
BIN
backend/app/samples/debug/1_20260119_033545_capture.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
8877
backend/app/samples/debug/1_20260119_034228_capture.html
Normal file
BIN
backend/app/samples/debug/1_20260119_034228_capture.png
Normal file
|
After Width: | Height: | Size: 1.4 MiB |
8341
backend/app/samples/debug/1_20260119_034843_capture.html
Normal file
BIN
backend/app/samples/debug/1_20260119_034843_capture.png
Normal file
|
After Width: | Height: | Size: 2.0 MiB |
9450
backend/app/samples/debug/2_20260119_033514_capture.html
Normal file
BIN
backend/app/samples/debug/2_20260119_033514_capture.png
Normal file
|
After Width: | Height: | Size: 1.9 MiB |
9457
backend/app/samples/debug/2_20260119_033516_capture.html
Normal file
BIN
backend/app/samples/debug/2_20260119_033516_capture.png
Normal file
|
After Width: | Height: | Size: 1.9 MiB |
9200
backend/app/samples/debug/2_20260119_034233_capture.html
Normal file
BIN
backend/app/samples/debug/2_20260119_034233_capture.png
Normal file
|
After Width: | Height: | Size: 1.8 MiB |
9227
backend/app/samples/debug/2_20260119_034848_capture.html
Normal file
BIN
backend/app/samples/debug/2_20260119_034848_capture.png
Normal file
|
After Width: | Height: | Size: 2.3 MiB |
10948
backend/app/samples/debug/3_20260119_033507_capture.html
Normal file
BIN
backend/app/samples/debug/3_20260119_033507_capture.png
Normal file
|
After Width: | Height: | Size: 3.4 MiB |
10742
backend/app/samples/debug/3_20260119_034238_capture.html
Normal file
BIN
backend/app/samples/debug/3_20260119_034238_capture.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
10758
backend/app/samples/debug/3_20260119_034854_capture.html
Normal file
BIN
backend/app/samples/debug/3_20260119_034854_capture.png
Normal file
|
After Width: | Height: | Size: 3.7 MiB |
9660
backend/app/samples/debug/4_20260119_033624_capture.html
Normal file
BIN
backend/app/samples/debug/4_20260119_033624_capture.png
Normal file
|
After Width: | Height: | Size: 1.4 MiB |
9649
backend/app/samples/debug/4_20260119_033635_capture.html
Normal file
BIN
backend/app/samples/debug/4_20260119_033635_capture.png
Normal file
|
After Width: | Height: | Size: 1.4 MiB |
9470
backend/app/samples/debug/4_20260119_034245_capture.html
Normal file
BIN
backend/app/samples/debug/4_20260119_034245_capture.png
Normal file
|
After Width: | Height: | Size: 1.3 MiB |
9477
backend/app/samples/debug/4_20260119_034902_capture.html
Normal file
BIN
backend/app/samples/debug/4_20260119_034902_capture.png
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
8561
backend/app/samples/debug/5_20260119_033709_capture.html
Normal file
BIN
backend/app/samples/debug/5_20260119_033709_capture.png
Normal file
|
After Width: | Height: | Size: 1.5 MiB |
8554
backend/app/samples/debug/5_20260119_034251_capture.html
Normal file
BIN
backend/app/samples/debug/5_20260119_034251_capture.png
Normal file
|
After Width: | Height: | Size: 1.3 MiB |
8559
backend/app/samples/debug/5_20260119_034907_capture.html
Normal file
BIN
backend/app/samples/debug/5_20260119_034907_capture.png
Normal file
|
After Width: | Height: | Size: 1.4 MiB |
10110
backend/app/samples/debug/6_20260119_034451_capture.html
Normal file
BIN
backend/app/samples/debug/6_20260119_034451_capture.png
Normal file
|
After Width: | Height: | Size: 2.2 MiB |
10120
backend/app/samples/debug/6_20260119_034914_capture.html
Normal file
BIN
backend/app/samples/debug/6_20260119_034914_capture.png
Normal file
|
After Width: | Height: | Size: 2.2 MiB |
10396
backend/app/samples/debug/7_20260119_035030_capture.html
Normal file
BIN
backend/app/samples/debug/7_20260119_035030_capture.png
Normal file
|
After Width: | Height: | Size: 3.7 MiB |
10468
backend/app/samples/debug/8_20260119_035115_capture.html
Normal file
BIN
backend/app/samples/debug/8_20260119_035115_capture.png
Normal file
|
After Width: | Height: | Size: 2.5 MiB |
8162
backend/app/samples/debug/9_20260119_035138_capture.html
Normal file
BIN
backend/app/samples/debug/9_20260119_035138_capture.png
Normal file
|
After Width: | Height: | Size: 913 KiB |
@@ -13,7 +13,13 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from backend.app.core.config import load_config
|
||||
from backend.app.db import database, models
|
||||
from backend.app.scraper.amazon.parser import detect_blocked, extract_product_data
|
||||
from backend.app.scraper.amazon.parser import extract_product_data
|
||||
|
||||
# Répertoires de stockage
|
||||
SAMPLES_DIR = Path(__file__).resolve().parent.parent / "samples"
|
||||
DEBUG_DIR = SAMPLES_DIR / "debug"
|
||||
STORAGE_STATE_PATH = SAMPLES_DIR / "storage_state.json"
|
||||
RAW_DATA_DIR = Path(__file__).resolve().parent.parent.parent / "data" / "raw"
|
||||
|
||||
|
||||
def _create_run(session: Session) -> models.ScrapeRun:
|
||||
@@ -32,9 +38,8 @@ def _finalize_run(run: models.ScrapeRun, session: Session, status: str) -> None:
|
||||
|
||||
|
||||
def _save_raw_json(payload: dict, product_id: int) -> Path:
|
||||
base_dir = Path(__file__).resolve().parent.parent.parent / "data" / "raw"
|
||||
timestamp = datetime.utcnow().strftime("%Y-%m-%d")
|
||||
folder = base_dir / timestamp
|
||||
folder = RAW_DATA_DIR / timestamp
|
||||
folder.mkdir(parents=True, exist_ok=True)
|
||||
filename = f"{product_id}_{datetime.utcnow().strftime('%H%M%S')}.json"
|
||||
path = folder / filename
|
||||
@@ -42,15 +47,24 @@ def _save_raw_json(payload: dict, product_id: int) -> Path:
|
||||
return path
|
||||
|
||||
|
||||
def _save_debug_artifacts(page, product_id: int) -> tuple[Path, Path]:
|
||||
base_dir = Path(__file__).resolve().parent.parent.parent / "data" / "screenshots"
|
||||
base_dir.mkdir(parents=True, exist_ok=True)
|
||||
def _save_debug_artifacts(page, product_id: int, suffix: str = "capture") -> dict:
|
||||
"""Sauvegarde screenshot et HTML dans le répertoire debug."""
|
||||
DEBUG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
||||
screenshot_path = base_dir / f"{product_id}_{stamp}.png"
|
||||
html_path = base_dir / f"{product_id}_{stamp}.html"
|
||||
page.screenshot(path=str(screenshot_path), full_page=True)
|
||||
html_path.write_text(page.content())
|
||||
return screenshot_path, html_path
|
||||
debug_files = {}
|
||||
try:
|
||||
screenshot_path = DEBUG_DIR / f"{product_id}_{stamp}_{suffix}.png"
|
||||
html_path = DEBUG_DIR / f"{product_id}_{stamp}_{suffix}.html"
|
||||
page.screenshot(path=str(screenshot_path), full_page=True)
|
||||
html_path.write_text(page.content(), encoding="utf-8")
|
||||
debug_files = {
|
||||
"screenshot": str(screenshot_path),
|
||||
"html": str(html_path),
|
||||
}
|
||||
logger.info("Artifacts debug sauvegardés: screenshot={}, html={}", screenshot_path.name, html_path.name)
|
||||
except Exception as e:
|
||||
logger.warning("Impossible de générer les artifacts de debug: {}", e)
|
||||
return debug_files
|
||||
|
||||
|
||||
def _update_product_from_scrape(
|
||||
@@ -101,77 +115,130 @@ def _create_snapshot(
|
||||
session.commit()
|
||||
|
||||
|
||||
def _create_browser_context(playwright, config):
    """Launch Chromium and open a browser context configured from ``config.scrape``.

    The context reuses the session stored at ``STORAGE_STATE_PATH`` when that
    file exists.

    Args:
        playwright: Object exposing ``chromium.launch``.
        config: Configuration whose ``scrape`` section provides ``headless``,
            ``locale``, ``timezone``, ``user_agent`` and ``viewport``.

    Returns:
        A ``(browser, context)`` tuple; the caller is responsible for closing
        both.

    Raises:
        Exception: Whatever ``launch`` or ``new_context`` raises. If
            ``new_context`` fails, the launched browser is closed first.
    """
    scrape_cfg = config.scrape
    browser = playwright.chromium.launch(headless=scrape_cfg.headless)

    context_kwargs = {
        "locale": scrape_cfg.locale,
        "timezone_id": scrape_cfg.timezone,
        "user_agent": scrape_cfg.user_agent,
        "viewport": scrape_cfg.viewport,
    }
    # Load the persisted session when one is available.
    if STORAGE_STATE_PATH.exists():
        context_kwargs["storage_state"] = str(STORAGE_STATE_PATH)
        logger.info("Session persistée chargée: {}", STORAGE_STATE_PATH)

    try:
        context = browser.new_context(**context_kwargs)
    except Exception:
        # The caller never receives the browser handle on failure, so it has
        # to be closed here to avoid leaving it running.
        browser.close()
        raise
    return browser, context
|
||||
|
||||
|
||||
def _save_storage_state(context) -> None:
    """Write the context's storage state to ``STORAGE_STATE_PATH``, best effort.

    Any exception raised while saving is logged as a warning and swallowed.
    """
    try:
        destination = str(STORAGE_STATE_PATH)
        context.storage_state(path=destination)
    except Exception as err:
        logger.warning("Impossible de sauvegarder la session: {}", err)
    else:
        logger.info("Session persistée sauvegardée: {}", STORAGE_STATE_PATH)
|
||||
|
||||
|
||||
def _process_product(
    page,
    session: Session,
    product: models.Product,
    run: models.ScrapeRun,
    config,
) -> tuple[bool, dict]:
    """Scrape one product page and record the outcome as a snapshot.

    Navigates ``page`` to ``product.url``, saves debug artifacts, extracts the
    product data, writes it as raw JSON and creates a snapshot whose status is
    ``"bloque"`` (no title extracted), ``"champs_manquants"`` (a required
    field is empty) or ``"ok"``.

    Returns:
        ``(True, data)`` for the ``"ok"`` status, ``(False, data)`` otherwise.
    """
    logger.info("Scraping produit {} ({})", product.id, product.url)
    page.goto(product.url, wait_until="domcontentloaded", timeout=config.scrape.timeout_ms)

    # Debug artifacts are captured for every page, not only for failures.
    debug_files = _save_debug_artifacts(page, product.id, "capture")
    data = extract_product_data(page, product.url)

    # A page without a title is treated as a probable block.
    blocked = not data.get("titre")
    if blocked:
        logger.warning("Titre absent pour produit {}, probable blocage Amazon", product.id)
        data["bloque"] = True

    # Both outcomes attach the debug files and persist the raw payload.
    data["debug_files"] = debug_files
    raw_path = _save_raw_json(data, product.id)

    if blocked:
        _create_snapshot(
            session,
            product,
            run,
            data,
            status="bloque",
            raw_json_path=raw_path,
            error_message=f"Blocage détecté - debug: {debug_files.get('screenshot', 'N/A')}",
        )
        return False, data

    # Success or partial result, depending on the required fields.
    missing = [name for name in ("titre", "prix_actuel") if not data.get(name)]
    _create_snapshot(
        session,
        product,
        run,
        data,
        status="champs_manquants" if missing else "ok",
        raw_json_path=raw_path,
        error_message=", ".join(missing) or None,
    )

    if missing:
        logger.warning("Champs manquants pour {}: {}", product.id, missing)
        return False, data

    logger.info("Scraping OK pour {} (titre={})", product.id, data.get("titre", "")[:50])
    return True, data
|
||||
|
||||
|
||||
def scrape_product(product_id: int) -> None:
|
||||
logger.info("Déclenchement du scraping pour le produit %s", product_id)
|
||||
logger.info("Déclenchement du scraping pour le produit {}", product_id)
|
||||
session = database.SessionLocal()
|
||||
run = _create_run(session)
|
||||
try:
|
||||
product = session.get(models.Product, product_id)
|
||||
if not product:
|
||||
logger.warning("Produit %s introuvable", product_id)
|
||||
logger.warning("Produit {} introuvable", product_id)
|
||||
_finalize_run(run, session, "echec")
|
||||
return
|
||||
|
||||
config = load_config()
|
||||
run.nb_total = 1
|
||||
session.commit()
|
||||
|
||||
with sync_playwright() as playwright:
|
||||
browser = playwright.chromium.launch(headless=config.scrape.headless)
|
||||
context = browser.new_context(
|
||||
locale=config.scrape.locale,
|
||||
timezone_id=config.scrape.timezone,
|
||||
user_agent=config.scrape.user_agent,
|
||||
viewport=config.scrape.viewport,
|
||||
)
|
||||
browser, context = _create_browser_context(playwright, config)
|
||||
page = context.new_page()
|
||||
page.set_default_timeout(config.scrape.timeout_ms)
|
||||
|
||||
try:
|
||||
page.goto(product.url, wait_until="domcontentloaded", timeout=config.scrape.timeout_ms)
|
||||
success, _ = _process_product(page, session, product, run, config)
|
||||
run.nb_ok = 1 if success else 0
|
||||
run.nb_echec = 0 if success else 1
|
||||
_finalize_run(run, session, "succes" if success else "partiel")
|
||||
|
||||
html = page.content()
|
||||
if detect_blocked(html):
|
||||
screenshot_path, html_path = _save_debug_artifacts(page, product.id)
|
||||
data = {"url": product.url, "asin": product.asin, "bloque": True}
|
||||
raw_path = _save_raw_json(data, product.id)
|
||||
_create_snapshot(
|
||||
session,
|
||||
product,
|
||||
run,
|
||||
data,
|
||||
status="bloque",
|
||||
raw_json_path=raw_path,
|
||||
error_message=f"Bloque: {screenshot_path.name} / {html_path.name}",
|
||||
)
|
||||
run.nb_echec = 1
|
||||
_finalize_run(run, session, "partiel")
|
||||
return
|
||||
|
||||
data = extract_product_data(page, product.url)
|
||||
raw_path = _save_raw_json(data, product.id)
|
||||
required = ["titre", "prix_actuel", "note"]
|
||||
missing = [field for field in required if not data.get(field)]
|
||||
status = "champs_manquants" if missing else "ok"
|
||||
_create_snapshot(
|
||||
session,
|
||||
product,
|
||||
run,
|
||||
data,
|
||||
status=status,
|
||||
raw_json_path=raw_path,
|
||||
error_message=", ".join(missing) if missing else None,
|
||||
)
|
||||
run.nb_ok = 1 if not missing else 0
|
||||
run.nb_echec = 0 if not missing else 1
|
||||
_finalize_run(run, session, "succes" if not missing else "partiel")
|
||||
# Sauvegarder la session pour réutilisation
|
||||
_save_storage_state(context)
|
||||
|
||||
# Délai anti-blocage
|
||||
delay_min, delay_max = config.scrape.delay_range_ms
|
||||
time.sleep(random.uniform(delay_min, delay_max) / 1000.0)
|
||||
finally:
|
||||
# fermeture propre du navigateur
|
||||
context.close()
|
||||
browser.close()
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception("Erreur pendant le scraping de %s", product_id)
|
||||
except Exception as e:
|
||||
logger.exception("Erreur pendant le scraping de {}: {}", product_id, e)
|
||||
_finalize_run(run, session, "erreur")
|
||||
finally:
|
||||
session.close()
|
||||
@@ -183,20 +250,19 @@ def scrape_all(product_ids: Iterable[int] | None = None) -> None:
|
||||
run = _create_run(session)
|
||||
try:
|
||||
config = load_config()
|
||||
products = session.query(models.Product).all()
|
||||
products = session.query(models.Product).filter(models.Product.actif == True).all()
|
||||
if product_ids:
|
||||
products = [product for product in products if product.id in product_ids]
|
||||
run.nb_total = len(products)
|
||||
session.commit()
|
||||
|
||||
if not products:
|
||||
logger.info("Aucun produit actif à scraper")
|
||||
_finalize_run(run, session, "succes")
|
||||
return
|
||||
|
||||
with sync_playwright() as playwright:
|
||||
browser = playwright.chromium.launch(headless=config.scrape.headless)
|
||||
context = browser.new_context(
|
||||
locale=config.scrape.locale,
|
||||
timezone_id=config.scrape.timezone,
|
||||
user_agent=config.scrape.user_agent,
|
||||
viewport=config.scrape.viewport,
|
||||
)
|
||||
browser, context = _create_browser_context(playwright, config)
|
||||
page = context.new_page()
|
||||
page.set_default_timeout(config.scrape.timeout_ms)
|
||||
|
||||
@@ -205,55 +271,31 @@ def scrape_all(product_ids: Iterable[int] | None = None) -> None:
|
||||
|
||||
try:
|
||||
for product in products:
|
||||
page.goto(product.url, wait_until="domcontentloaded", timeout=config.scrape.timeout_ms)
|
||||
html = page.content()
|
||||
if detect_blocked(html):
|
||||
screenshot_path, html_path = _save_debug_artifacts(page, product.id)
|
||||
data = {"url": product.url, "asin": product.asin, "bloque": True}
|
||||
raw_path = _save_raw_json(data, product.id)
|
||||
_create_snapshot(
|
||||
session,
|
||||
product,
|
||||
run,
|
||||
data,
|
||||
status="bloque",
|
||||
raw_json_path=raw_path,
|
||||
error_message=f"Bloque: {screenshot_path.name} / {html_path.name}",
|
||||
)
|
||||
try:
|
||||
success, _ = _process_product(page, session, product, run, config)
|
||||
if success:
|
||||
nb_ok += 1
|
||||
else:
|
||||
nb_echec += 1
|
||||
except Exception as e:
|
||||
logger.error("Erreur scraping produit {}: {}", product.id, e)
|
||||
nb_echec += 1
|
||||
continue
|
||||
|
||||
data = extract_product_data(page, product.url)
|
||||
raw_path = _save_raw_json(data, product.id)
|
||||
required = ["titre", "prix_actuel", "note"]
|
||||
missing = [field for field in required if not data.get(field)]
|
||||
status = "champs_manquants" if missing else "ok"
|
||||
_create_snapshot(
|
||||
session,
|
||||
product,
|
||||
run,
|
||||
data,
|
||||
status=status,
|
||||
raw_json_path=raw_path,
|
||||
error_message=", ".join(missing) if missing else None,
|
||||
)
|
||||
if missing:
|
||||
nb_echec += 1
|
||||
else:
|
||||
nb_ok += 1
|
||||
|
||||
# Délai anti-blocage entre les produits
|
||||
delay_min, delay_max = config.scrape.delay_range_ms
|
||||
time.sleep(random.uniform(delay_min, delay_max) / 1000.0)
|
||||
|
||||
run.nb_ok = nb_ok
|
||||
run.nb_echec = nb_echec
|
||||
_finalize_run(run, session, "succes" if nb_echec == 0 else "partiel")
|
||||
|
||||
# Sauvegarder la session pour réutilisation
|
||||
_save_storage_state(context)
|
||||
finally:
|
||||
# fermeture propre du navigateur
|
||||
context.close()
|
||||
browser.close()
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception("Erreur du scraping global")
|
||||
except Exception as e:
|
||||
logger.exception("Erreur du scraping global: {}", e)
|
||||
_finalize_run(run, session, "erreur")
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
@@ -10,21 +10,47 @@
|
||||
"headless": true,
|
||||
"timeout_ms": 30000,
|
||||
"retries": 1,
|
||||
"delay_range_ms": [1000, 3000],
|
||||
"delay_range_ms": [
|
||||
1000,
|
||||
3000
|
||||
],
|
||||
"user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
|
||||
"viewport": { "width": 1366, "height": 768 },
|
||||
"viewport": {
|
||||
"width": 1366,
|
||||
"height": 768
|
||||
},
|
||||
"locale": "fr-FR",
|
||||
"timezone": "Europe/Paris",
|
||||
"proxy": null
|
||||
},
|
||||
"stores_enabled": ["amazon_fr"],
|
||||
"stores_enabled": [
|
||||
"amazon_fr"
|
||||
],
|
||||
"taxonomy": {
|
||||
"categories": ["SSD", "CPU", "GPU", "RAM"],
|
||||
"categories": [
|
||||
"SSD",
|
||||
"CPU",
|
||||
"GPU",
|
||||
"RAM",
|
||||
"Laptop"
|
||||
],
|
||||
"types_by_category": {
|
||||
"SSD": ["NVMe", "SATA"],
|
||||
"CPU": ["Desktop", "Mobile"],
|
||||
"GPU": ["Gaming", "Workstation"],
|
||||
"RAM": ["DDR4", "DDR5"]
|
||||
"SSD": [
|
||||
"NVMe",
|
||||
"SATA"
|
||||
],
|
||||
"CPU": [
|
||||
"Desktop",
|
||||
"Mobile"
|
||||
],
|
||||
"GPU": [
|
||||
"Gaming",
|
||||
"Workstation"
|
||||
],
|
||||
"RAM": [
|
||||
"DDR4",
|
||||
"DDR5"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||