Gilles Soulier
2026-01-14 21:54:55 +01:00
parent c91c0f1fc9
commit d0b73b9319
140 changed files with 5822 additions and 161 deletions


@@ -0,0 +1,5 @@
"""Module API FastAPI."""
from pricewatch.app.api.main import app
__all__ = ["app"]


876
pricewatch/app/api/main.py Normal file

@@ -0,0 +1,876 @@
"""
API REST FastAPI pour PriceWatch (Phase 3).
"""
from __future__ import annotations
import csv
from collections import deque
from datetime import datetime, timezone
import os
from pathlib import Path
from io import StringIO
from typing import Generator, Optional
import httpx
from fastapi import Depends, FastAPI, Header, HTTPException, Response
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy import and_, desc, func
from sqlalchemy.orm import Session
from pricewatch.app.api.schemas import (
EnqueueRequest,
EnqueueResponse,
HealthStatus,
PriceHistoryOut,
PriceHistoryCreate,
PriceHistoryUpdate,
ProductOut,
ProductCreate,
ProductUpdate,
ScheduleRequest,
ScheduleResponse,
ScrapingLogOut,
ScrapingLogCreate,
ScrapingLogUpdate,
ScrapePreviewRequest,
ScrapePreviewResponse,
ScrapeCommitRequest,
ScrapeCommitResponse,
VersionResponse,
BackendLogEntry,
UvicornLogEntry,
WebhookOut,
WebhookCreate,
WebhookUpdate,
WebhookTestResponse,
)
from pricewatch.app.core.config import get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import ProductSnapshot
from pricewatch.app.db.connection import check_db_connection, get_session
from pricewatch.app.db.models import PriceHistory, Product, ScrapingLog, Webhook
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.tasks.scrape import scrape_product
from pricewatch.app.tasks.scheduler import RedisUnavailableError, check_redis_connection, ScrapingScheduler
logger = get_logger("api")
app = FastAPI(title="PriceWatch API", version="0.4.0")
# Buffer de logs backend en memoire pour debug UI.
BACKEND_LOGS = deque(maxlen=200)
UVICORN_LOG_PATH = Path(
os.environ.get("PW_UVICORN_LOG_PATH", "/app/logs/uvicorn.log")
)
def get_db_session() -> Generator[Session, None, None]:
"""Dependency: session SQLAlchemy."""
with get_session(get_config()) as session:
yield session
def require_token(authorization: Optional[str] = Header(default=None)) -> None:
"""Auth simple via token Bearer."""
config = get_config()
token = config.api_token
if not token:
raise HTTPException(status_code=500, detail="API token non configure")
if not authorization or not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Token manquant")
provided = authorization.split("Bearer ")[-1].strip()
if provided != token:
raise HTTPException(status_code=403, detail="Token invalide")
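
For reference, protected endpoints expect the configured api_token in a Bearer header. A minimal client sketch (the base URL and token value are placeholders, not part of this commit):

import httpx

API_URL = "http://localhost:8000"   # placeholder
API_TOKEN = "change-me"             # must match the configured api_token

resp = httpx.get(
    f"{API_URL}/products",
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    params={"limit": 10},
)
resp.raise_for_status()
print(resp.json())
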
@app.get("/health", response_model=HealthStatus)
def health_check() -> HealthStatus:
"""Health check DB + Redis."""
config = get_config()
return HealthStatus(
db=check_db_connection(config),
redis=check_redis_connection(config.redis.url),
)
@app.get("/version", response_model=VersionResponse)
def version_info() -> VersionResponse:
"""Expose la version API."""
return VersionResponse(api_version=app.version)
@app.get("/logs/backend", response_model=list[BackendLogEntry], dependencies=[Depends(require_token)])
def list_backend_logs() -> list[BackendLogEntry]:
"""Expose un buffer de logs backend."""
return list(BACKEND_LOGS)
@app.get("/logs/uvicorn", response_model=list[UvicornLogEntry], dependencies=[Depends(require_token)])
def list_uvicorn_logs(limit: int = 200) -> list[UvicornLogEntry]:
"""Expose les dernieres lignes du log Uvicorn."""
lines = _read_uvicorn_lines(limit=limit)
return [UvicornLogEntry(line=line) for line in lines]
@app.get("/products", response_model=list[ProductOut], dependencies=[Depends(require_token)])
def list_products(
source: Optional[str] = None,
reference: Optional[str] = None,
updated_after: Optional[datetime] = None,
price_min: Optional[float] = None,
price_max: Optional[float] = None,
fetched_after: Optional[datetime] = None,
fetched_before: Optional[datetime] = None,
stock_status: Optional[str] = None,
limit: int = 50,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> list[ProductOut]:
"""Liste des produits avec filtres optionnels."""
latest_price_subquery = (
session.query(
PriceHistory.product_id.label("product_id"),
func.max(PriceHistory.fetched_at).label("latest_fetched_at"),
)
.group_by(PriceHistory.product_id)
.subquery()
)
latest_price = (
session.query(PriceHistory)
.join(
latest_price_subquery,
and_(
PriceHistory.product_id == latest_price_subquery.c.product_id,
PriceHistory.fetched_at == latest_price_subquery.c.latest_fetched_at,
),
)
.subquery()
)
query = session.query(Product).outerjoin(latest_price, Product.id == latest_price.c.product_id)
if source:
query = query.filter(Product.source == source)
if reference:
query = query.filter(Product.reference == reference)
if updated_after:
query = query.filter(Product.last_updated_at >= updated_after)
if price_min is not None:
query = query.filter(latest_price.c.price >= price_min)
if price_max is not None:
query = query.filter(latest_price.c.price <= price_max)
if fetched_after:
query = query.filter(latest_price.c.fetched_at >= fetched_after)
if fetched_before:
query = query.filter(latest_price.c.fetched_at <= fetched_before)
if stock_status:
query = query.filter(latest_price.c.stock_status == stock_status)
products = query.order_by(desc(Product.last_updated_at)).offset(offset).limit(limit).all()
return [_product_to_out(session, product) for product in products]
@app.post("/products", response_model=ProductOut, dependencies=[Depends(require_token)])
def create_product(
payload: ProductCreate,
session: Session = Depends(get_db_session),
) -> ProductOut:
"""Cree un produit."""
product = Product(
source=payload.source,
reference=payload.reference,
url=payload.url,
title=payload.title,
category=payload.category,
description=payload.description,
currency=payload.currency,
msrp=payload.msrp,
)
session.add(product)
try:
session.commit()
session.refresh(product)
except IntegrityError as exc:
session.rollback()
raise HTTPException(status_code=409, detail="Produit deja existant") from exc
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _product_to_out(session, product)
@app.get("/products/{product_id}", response_model=ProductOut, dependencies=[Depends(require_token)])
def get_product(
product_id: int,
session: Session = Depends(get_db_session),
) -> ProductOut:
"""Detail produit + dernier prix."""
product = session.query(Product).filter(Product.id == product_id).one_or_none()
if not product:
raise HTTPException(status_code=404, detail="Produit non trouve")
return _product_to_out(session, product)
@app.patch("/products/{product_id}", response_model=ProductOut, dependencies=[Depends(require_token)])
def update_product(
product_id: int,
payload: ProductUpdate,
session: Session = Depends(get_db_session),
) -> ProductOut:
"""Met a jour un produit (partial)."""
product = session.query(Product).filter(Product.id == product_id).one_or_none()
if not product:
raise HTTPException(status_code=404, detail="Produit non trouve")
updates = payload.model_dump(exclude_unset=True)
for key, value in updates.items():
setattr(product, key, value)
try:
session.commit()
session.refresh(product)
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _product_to_out(session, product)
@app.delete("/products/{product_id}", dependencies=[Depends(require_token)])
def delete_product(
product_id: int,
session: Session = Depends(get_db_session),
) -> dict[str, str]:
"""Supprime un produit (cascade)."""
product = session.query(Product).filter(Product.id == product_id).one_or_none()
if not product:
raise HTTPException(status_code=404, detail="Produit non trouve")
session.delete(product)
try:
session.commit()
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return {"status": "deleted"}
@app.get(
"/products/{product_id}/prices",
response_model=list[PriceHistoryOut],
dependencies=[Depends(require_token)],
)
def list_prices(
product_id: int,
price_min: Optional[float] = None,
price_max: Optional[float] = None,
fetched_after: Optional[datetime] = None,
fetched_before: Optional[datetime] = None,
fetch_status: Optional[str] = None,
limit: int = 50,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> list[PriceHistoryOut]:
"""Historique de prix pour un produit."""
query = session.query(PriceHistory).filter(PriceHistory.product_id == product_id)
if price_min is not None:
query = query.filter(PriceHistory.price >= price_min)
if price_max is not None:
query = query.filter(PriceHistory.price <= price_max)
if fetched_after:
query = query.filter(PriceHistory.fetched_at >= fetched_after)
if fetched_before:
query = query.filter(PriceHistory.fetched_at <= fetched_before)
if fetch_status:
query = query.filter(PriceHistory.fetch_status == fetch_status)
prices = query.order_by(desc(PriceHistory.fetched_at)).offset(offset).limit(limit).all()
return [_price_to_out(price) for price in prices]
@app.post("/prices", response_model=PriceHistoryOut, dependencies=[Depends(require_token)])
def create_price(
payload: PriceHistoryCreate,
session: Session = Depends(get_db_session),
) -> PriceHistoryOut:
"""Ajoute une entree d'historique de prix."""
price = PriceHistory(
product_id=payload.product_id,
price=payload.price,
shipping_cost=payload.shipping_cost,
stock_status=payload.stock_status,
fetch_method=payload.fetch_method,
fetch_status=payload.fetch_status,
fetched_at=payload.fetched_at,
)
session.add(price)
try:
session.commit()
session.refresh(price)
except IntegrityError as exc:
session.rollback()
raise HTTPException(status_code=409, detail="Entree prix deja existante") from exc
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _price_to_out(price)
@app.patch("/prices/{price_id}", response_model=PriceHistoryOut, dependencies=[Depends(require_token)])
def update_price(
price_id: int,
payload: PriceHistoryUpdate,
session: Session = Depends(get_db_session),
) -> PriceHistoryOut:
"""Met a jour une entree de prix."""
price = session.query(PriceHistory).filter(PriceHistory.id == price_id).one_or_none()
if not price:
raise HTTPException(status_code=404, detail="Entree prix non trouvee")
updates = payload.model_dump(exclude_unset=True)
for key, value in updates.items():
setattr(price, key, value)
try:
session.commit()
session.refresh(price)
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _price_to_out(price)
@app.delete("/prices/{price_id}", dependencies=[Depends(require_token)])
def delete_price(
price_id: int,
session: Session = Depends(get_db_session),
) -> dict[str, str]:
"""Supprime une entree de prix."""
price = session.query(PriceHistory).filter(PriceHistory.id == price_id).one_or_none()
if not price:
raise HTTPException(status_code=404, detail="Entree prix non trouvee")
session.delete(price)
try:
session.commit()
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return {"status": "deleted"}
@app.get("/logs", response_model=list[ScrapingLogOut], dependencies=[Depends(require_token)])
def list_logs(
source: Optional[str] = None,
fetch_status: Optional[str] = None,
fetched_after: Optional[datetime] = None,
fetched_before: Optional[datetime] = None,
limit: int = 50,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> list[ScrapingLogOut]:
"""Liste des logs de scraping."""
query = session.query(ScrapingLog)
if source:
query = query.filter(ScrapingLog.source == source)
if fetch_status:
query = query.filter(ScrapingLog.fetch_status == fetch_status)
if fetched_after:
query = query.filter(ScrapingLog.fetched_at >= fetched_after)
if fetched_before:
query = query.filter(ScrapingLog.fetched_at <= fetched_before)
logs = query.order_by(desc(ScrapingLog.fetched_at)).offset(offset).limit(limit).all()
return [_log_to_out(log) for log in logs]
@app.post("/logs", response_model=ScrapingLogOut, dependencies=[Depends(require_token)])
def create_log(
payload: ScrapingLogCreate,
session: Session = Depends(get_db_session),
) -> ScrapingLogOut:
"""Cree un log de scraping."""
log_entry = ScrapingLog(
product_id=payload.product_id,
url=payload.url,
source=payload.source,
reference=payload.reference,
fetch_method=payload.fetch_method,
fetch_status=payload.fetch_status,
fetched_at=payload.fetched_at,
duration_ms=payload.duration_ms,
html_size_bytes=payload.html_size_bytes,
errors=payload.errors,
notes=payload.notes,
)
session.add(log_entry)
try:
session.commit()
session.refresh(log_entry)
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _log_to_out(log_entry)
@app.patch("/logs/{log_id}", response_model=ScrapingLogOut, dependencies=[Depends(require_token)])
def update_log(
log_id: int,
payload: ScrapingLogUpdate,
session: Session = Depends(get_db_session),
) -> ScrapingLogOut:
"""Met a jour un log."""
log_entry = session.query(ScrapingLog).filter(ScrapingLog.id == log_id).one_or_none()
if not log_entry:
raise HTTPException(status_code=404, detail="Log non trouve")
updates = payload.model_dump(exclude_unset=True)
for key, value in updates.items():
setattr(log_entry, key, value)
try:
session.commit()
session.refresh(log_entry)
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _log_to_out(log_entry)
@app.delete("/logs/{log_id}", dependencies=[Depends(require_token)])
def delete_log(
log_id: int,
session: Session = Depends(get_db_session),
) -> dict[str, str]:
"""Supprime un log."""
log_entry = session.query(ScrapingLog).filter(ScrapingLog.id == log_id).one_or_none()
if not log_entry:
raise HTTPException(status_code=404, detail="Log non trouve")
session.delete(log_entry)
try:
session.commit()
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return {"status": "deleted"}
@app.get("/products/export", dependencies=[Depends(require_token)])
def export_products(
source: Optional[str] = None,
reference: Optional[str] = None,
updated_after: Optional[datetime] = None,
price_min: Optional[float] = None,
price_max: Optional[float] = None,
fetched_after: Optional[datetime] = None,
fetched_before: Optional[datetime] = None,
stock_status: Optional[str] = None,
format: str = "csv",
limit: int = 500,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> Response:
"""Export produits en CSV/JSON."""
products = list_products(
source=source,
reference=reference,
updated_after=updated_after,
price_min=price_min,
price_max=price_max,
fetched_after=fetched_after,
fetched_before=fetched_before,
stock_status=stock_status,
limit=limit,
offset=offset,
session=session,
)
rows = [product.model_dump() for product in products]
fieldnames = list(ProductOut.model_fields.keys())
return _export_response(rows, fieldnames, "products", format)
@app.get("/prices/export", dependencies=[Depends(require_token)])
def export_prices(
product_id: Optional[int] = None,
price_min: Optional[float] = None,
price_max: Optional[float] = None,
fetched_after: Optional[datetime] = None,
fetched_before: Optional[datetime] = None,
fetch_status: Optional[str] = None,
format: str = "csv",
limit: int = 500,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> Response:
"""Export historique de prix en CSV/JSON."""
query = session.query(PriceHistory)
if product_id is not None:
query = query.filter(PriceHistory.product_id == product_id)
if price_min is not None:
query = query.filter(PriceHistory.price >= price_min)
if price_max is not None:
query = query.filter(PriceHistory.price <= price_max)
if fetched_after:
query = query.filter(PriceHistory.fetched_at >= fetched_after)
if fetched_before:
query = query.filter(PriceHistory.fetched_at <= fetched_before)
if fetch_status:
query = query.filter(PriceHistory.fetch_status == fetch_status)
prices = query.order_by(desc(PriceHistory.fetched_at)).offset(offset).limit(limit).all()
rows = [_price_to_out(price).model_dump() for price in prices]
fieldnames = list(PriceHistoryOut.model_fields.keys())
return _export_response(rows, fieldnames, "prices", format)
@app.get("/logs/export", dependencies=[Depends(require_token)])
def export_logs(
source: Optional[str] = None,
fetch_status: Optional[str] = None,
fetched_after: Optional[datetime] = None,
fetched_before: Optional[datetime] = None,
format: str = "csv",
limit: int = 500,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> Response:
"""Export logs de scraping en CSV/JSON."""
logs = list_logs(
source=source,
fetch_status=fetch_status,
fetched_after=fetched_after,
fetched_before=fetched_before,
limit=limit,
offset=offset,
session=session,
)
rows = [log.model_dump() for log in logs]
fieldnames = list(ScrapingLogOut.model_fields.keys())
return _export_response(rows, fieldnames, "logs", format)
@app.get("/webhooks", response_model=list[WebhookOut], dependencies=[Depends(require_token)])
def list_webhooks(
event: Optional[str] = None,
enabled: Optional[bool] = None,
limit: int = 50,
offset: int = 0,
session: Session = Depends(get_db_session),
) -> list[WebhookOut]:
"""Liste des webhooks."""
query = session.query(Webhook)
if event:
query = query.filter(Webhook.event == event)
if enabled is not None:
query = query.filter(Webhook.enabled == enabled)
webhooks = query.order_by(desc(Webhook.created_at)).offset(offset).limit(limit).all()
return [_webhook_to_out(webhook) for webhook in webhooks]
@app.post("/webhooks", response_model=WebhookOut, dependencies=[Depends(require_token)])
def create_webhook(
payload: WebhookCreate,
session: Session = Depends(get_db_session),
) -> WebhookOut:
"""Cree un webhook."""
webhook = Webhook(
event=payload.event,
url=payload.url,
enabled=payload.enabled,
secret=payload.secret,
)
session.add(webhook)
try:
session.commit()
session.refresh(webhook)
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _webhook_to_out(webhook)
@app.patch("/webhooks/{webhook_id}", response_model=WebhookOut, dependencies=[Depends(require_token)])
def update_webhook(
webhook_id: int,
payload: WebhookUpdate,
session: Session = Depends(get_db_session),
) -> WebhookOut:
"""Met a jour un webhook."""
webhook = session.query(Webhook).filter(Webhook.id == webhook_id).one_or_none()
if not webhook:
raise HTTPException(status_code=404, detail="Webhook non trouve")
updates = payload.model_dump(exclude_unset=True)
for key, value in updates.items():
setattr(webhook, key, value)
try:
session.commit()
session.refresh(webhook)
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return _webhook_to_out(webhook)
@app.delete("/webhooks/{webhook_id}", dependencies=[Depends(require_token)])
def delete_webhook(
webhook_id: int,
session: Session = Depends(get_db_session),
) -> dict[str, str]:
"""Supprime un webhook."""
webhook = session.query(Webhook).filter(Webhook.id == webhook_id).one_or_none()
if not webhook:
raise HTTPException(status_code=404, detail="Webhook non trouve")
session.delete(webhook)
try:
session.commit()
except SQLAlchemyError as exc:
session.rollback()
raise HTTPException(status_code=500, detail="Erreur DB") from exc
return {"status": "deleted"}
@app.post(
"/webhooks/{webhook_id}/test",
response_model=WebhookTestResponse,
dependencies=[Depends(require_token)],
)
def send_webhook_test(
webhook_id: int,
session: Session = Depends(get_db_session),
) -> WebhookTestResponse:
"""Envoie un evenement de test."""
webhook = session.query(Webhook).filter(Webhook.id == webhook_id).one_or_none()
if not webhook:
raise HTTPException(status_code=404, detail="Webhook non trouve")
if not webhook.enabled:
raise HTTPException(status_code=409, detail="Webhook desactive")
payload = {"message": "test webhook", "webhook_id": webhook.id}
_send_webhook(webhook, "test", payload)
return WebhookTestResponse(status="sent")
@app.post("/enqueue", response_model=EnqueueResponse, dependencies=[Depends(require_token)])
def enqueue_job(payload: EnqueueRequest) -> EnqueueResponse:
"""Enqueue un job immediat."""
try:
scheduler = ScrapingScheduler(get_config())
job = scheduler.enqueue_immediate(
payload.url,
use_playwright=payload.use_playwright,
save_db=payload.save_db,
)
return EnqueueResponse(job_id=job.id)
except RedisUnavailableError as exc:
raise HTTPException(status_code=503, detail=str(exc)) from exc
@app.post("/schedule", response_model=ScheduleResponse, dependencies=[Depends(require_token)])
def schedule_job(payload: ScheduleRequest) -> ScheduleResponse:
"""Planifie un job recurrent."""
try:
scheduler = ScrapingScheduler(get_config())
job_info = scheduler.schedule_product(
payload.url,
interval_hours=payload.interval_hours,
use_playwright=payload.use_playwright,
save_db=payload.save_db,
)
return ScheduleResponse(job_id=job_info.job_id, next_run=job_info.next_run)
except RedisUnavailableError as exc:
raise HTTPException(status_code=503, detail=str(exc)) from exc
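
Both queue endpoints return 503 when Redis is unreachable (RedisUnavailableError). Illustrative calls, with placeholder base URL and token:

import httpx

headers = {"Authorization": "Bearer change-me"}
base = "http://localhost:8000"

job = httpx.post(
    f"{base}/enqueue",
    json={"url": "https://www.amazon.fr/dp/B08N5WRWNW", "save_db": True},
    headers=headers,
).json()
print("job id:", job["job_id"])

planned = httpx.post(
    f"{base}/schedule",
    json={"url": "https://www.amazon.fr/dp/B08N5WRWNW", "interval_hours": 12},
    headers=headers,
).json()
print("next run:", planned["next_run"])
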
@app.post("/scrape/preview", response_model=ScrapePreviewResponse, dependencies=[Depends(require_token)])
def preview_scrape(payload: ScrapePreviewRequest) -> ScrapePreviewResponse:
"""Scrape un produit sans persistence pour previsualisation."""
_add_backend_log("INFO", f"Preview scraping: {payload.url}")
result = scrape_product(
payload.url,
use_playwright=payload.use_playwright,
save_db=False,
)
snapshot = result.get("snapshot")
if snapshot is None:
_add_backend_log("ERROR", f"Preview scraping KO: {payload.url}")
return ScrapePreviewResponse(success=False, snapshot=None, error=result.get("error"))
return ScrapePreviewResponse(
success=bool(result.get("success")),
snapshot=snapshot.model_dump(mode="json"),
error=result.get("error"),
)
@app.post("/scrape/commit", response_model=ScrapeCommitResponse, dependencies=[Depends(require_token)])
def commit_scrape(payload: ScrapeCommitRequest) -> ScrapeCommitResponse:
"""Persiste un snapshot previsualise."""
try:
snapshot = ProductSnapshot.model_validate(payload.snapshot)
except Exception as exc:
_add_backend_log("ERROR", "Commit scraping KO: snapshot invalide")
raise HTTPException(status_code=400, detail="Snapshot invalide") from exc
product_id = ScrapingPipeline(config=get_config()).process_snapshot(snapshot, save_to_db=True)
_add_backend_log("INFO", f"Commit scraping OK: product_id={product_id}")
return ScrapeCommitResponse(success=True, product_id=product_id)
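
The preview/commit pair splits scraping into a dry run and an explicit persist step: /scrape/preview returns the snapshot as JSON, and /scrape/commit validates it back into a ProductSnapshot before running the pipeline. A sketch of the round trip (placeholder URL and token):

import httpx

headers = {"Authorization": "Bearer change-me"}
base = "http://localhost:8000"

preview = httpx.post(
    f"{base}/scrape/preview",
    json={"url": "https://www.amazon.fr/dp/B08N5WRWNW", "use_playwright": True},
    headers=headers,
    timeout=120.0,
).json()

if preview["success"]:
    commit = httpx.post(
        f"{base}/scrape/commit",
        json={"snapshot": preview["snapshot"]},
        headers=headers,
    ).json()
    print("saved product_id:", commit["product_id"])
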
def _export_response(
rows: list[dict[str, object]],
fieldnames: list[str],
filename_prefix: str,
format: str,
) -> Response:
"""Expose une reponse CSV/JSON avec un nom de fichier stable."""
if format not in {"csv", "json"}:
raise HTTPException(status_code=400, detail="Format invalide (csv ou json)")
headers = {"Content-Disposition": f'attachment; filename="{filename_prefix}.{format}"'}
if format == "json":
return JSONResponse(content=jsonable_encoder(rows), headers=headers)
return _to_csv_response(rows, fieldnames, headers)
def _to_csv_response(
rows: list[dict[str, object]],
fieldnames: list[str],
headers: dict[str, str],
) -> Response:
buffer = StringIO()
writer = csv.DictWriter(buffer, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
return Response(content=buffer.getvalue(), media_type="text/csv", headers=headers)
def _send_webhook(webhook: Webhook, event: str, payload: dict[str, object]) -> None:
"""Envoie un webhook avec gestion d'erreur explicite."""
headers = {"Content-Type": "application/json"}
if webhook.secret:
headers["X-Webhook-Secret"] = webhook.secret
try:
response = httpx.post(
webhook.url,
json={"event": event, "payload": payload},
headers=headers,
timeout=5.0,
)
response.raise_for_status()
except httpx.HTTPError as exc:
logger.error("Erreur webhook", extra={"url": webhook.url, "event": event, "error": str(exc)})
raise HTTPException(status_code=502, detail="Echec webhook") from exc
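
On the receiving side, the contract is a POST with a JSON body of the form {"event": ..., "payload": ...} and, when a secret is configured, an X-Webhook-Secret header. A minimal receiver sketch (the endpoint path and secret value are assumptions):

from fastapi import FastAPI, Header, HTTPException, Request

receiver = FastAPI()
SHARED_SECRET = "change-me"  # placeholder; must match the webhook's secret

@receiver.post("/hooks/pricewatch")
async def pricewatch_hook(
    request: Request,
    x_webhook_secret: str | None = Header(default=None),
) -> dict[str, str]:
    if x_webhook_secret != SHARED_SECRET:
        raise HTTPException(status_code=403, detail="bad secret")
    body = await request.json()
    # body == {"event": "...", "payload": {...}} as built by _send_webhook
    return {"received": body["event"]}
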
def _add_backend_log(level: str, message: str) -> None:
BACKEND_LOGS.append(
BackendLogEntry(
time=datetime.now(timezone.utc),
level=level,
message=message,
)
)
def _read_uvicorn_lines(limit: int = 200) -> list[str]:
"""Lit les dernieres lignes du log Uvicorn si disponible."""
if limit <= 0:
return []
try:
if not UVICORN_LOG_PATH.exists():
return []
with UVICORN_LOG_PATH.open("r", encoding="utf-8", errors="ignore") as handle:
lines = handle.readlines()
return [line.rstrip("\n") for line in lines[-limit:]]
except Exception:
return []
def _product_to_out(session: Session, product: Product) -> ProductOut:
"""Helper pour mapper Product + dernier prix."""
latest = (
session.query(PriceHistory)
.filter(PriceHistory.product_id == product.id)
.order_by(desc(PriceHistory.fetched_at))
.first()
)
images = [image.image_url for image in product.images]
specs = {spec.spec_key: spec.spec_value for spec in product.specs}
discount_amount = None
discount_percent = None
if latest and latest.price is not None and product.msrp:
discount_amount = float(product.msrp) - float(latest.price)
if product.msrp > 0:
discount_percent = (discount_amount / float(product.msrp)) * 100
return ProductOut(
id=product.id,
source=product.source,
reference=product.reference,
url=product.url,
title=product.title,
category=product.category,
description=product.description,
currency=product.currency,
msrp=float(product.msrp) if product.msrp is not None else None,
first_seen_at=product.first_seen_at,
last_updated_at=product.last_updated_at,
latest_price=float(latest.price) if latest and latest.price is not None else None,
latest_shipping_cost=(
float(latest.shipping_cost) if latest and latest.shipping_cost is not None else None
),
latest_stock_status=latest.stock_status if latest else None,
latest_fetched_at=latest.fetched_at if latest else None,
images=images,
specs=specs,
discount_amount=discount_amount,
discount_percent=discount_percent,
)
def _price_to_out(price: PriceHistory) -> PriceHistoryOut:
return PriceHistoryOut(
id=price.id,
product_id=price.product_id,
price=float(price.price) if price.price is not None else None,
shipping_cost=float(price.shipping_cost) if price.shipping_cost is not None else None,
stock_status=price.stock_status,
fetch_method=price.fetch_method,
fetch_status=price.fetch_status,
fetched_at=price.fetched_at,
)
def _log_to_out(log: ScrapingLog) -> ScrapingLogOut:
return ScrapingLogOut(
id=log.id,
product_id=log.product_id,
url=log.url,
source=log.source,
reference=log.reference,
fetch_method=log.fetch_method,
fetch_status=log.fetch_status,
fetched_at=log.fetched_at,
duration_ms=log.duration_ms,
html_size_bytes=log.html_size_bytes,
errors=log.errors,
notes=log.notes,
)
def _webhook_to_out(webhook: Webhook) -> WebhookOut:
return WebhookOut(
id=webhook.id,
event=webhook.event,
url=webhook.url,
enabled=webhook.enabled,
secret=webhook.secret,
created_at=webhook.created_at,
)
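
One way to serve the module locally during development, from a small launcher script (a sketch; host, port and reload are arbitrary, and the Docker setup presumably starts uvicorn itself given PW_UVICORN_LOG_PATH):

import uvicorn

if __name__ == "__main__":
    uvicorn.run("pricewatch.app.api.main:app", host="0.0.0.0", port=8000, reload=True)
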


@@ -0,0 +1,212 @@
"""
Schemas API FastAPI pour Phase 3.
"""
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class HealthStatus(BaseModel):
db: bool
redis: bool
class ProductOut(BaseModel):
id: int
source: str
reference: str
url: str
title: Optional[str] = None
category: Optional[str] = None
description: Optional[str] = None
currency: Optional[str] = None
msrp: Optional[float] = None
first_seen_at: datetime
last_updated_at: datetime
latest_price: Optional[float] = None
latest_shipping_cost: Optional[float] = None
latest_stock_status: Optional[str] = None
latest_fetched_at: Optional[datetime] = None
images: list[str] = []
specs: dict[str, str] = {}
discount_amount: Optional[float] = None
discount_percent: Optional[float] = None
class ProductCreate(BaseModel):
source: str
reference: str
url: str
title: Optional[str] = None
category: Optional[str] = None
description: Optional[str] = None
currency: Optional[str] = None
msrp: Optional[float] = None
class ProductUpdate(BaseModel):
url: Optional[str] = None
title: Optional[str] = None
category: Optional[str] = None
description: Optional[str] = None
currency: Optional[str] = None
msrp: Optional[float] = None
class PriceHistoryOut(BaseModel):
id: int
product_id: int
price: Optional[float] = None
shipping_cost: Optional[float] = None
stock_status: Optional[str] = None
fetch_method: str
fetch_status: str
fetched_at: datetime
class PriceHistoryCreate(BaseModel):
product_id: int
price: Optional[float] = None
shipping_cost: Optional[float] = None
stock_status: Optional[str] = None
fetch_method: str
fetch_status: str
fetched_at: datetime
class PriceHistoryUpdate(BaseModel):
price: Optional[float] = None
shipping_cost: Optional[float] = None
stock_status: Optional[str] = None
fetch_method: Optional[str] = None
fetch_status: Optional[str] = None
fetched_at: Optional[datetime] = None
class ScrapingLogOut(BaseModel):
id: int
product_id: Optional[int] = None
url: str
source: str
reference: Optional[str] = None
fetch_method: str
fetch_status: str
fetched_at: datetime
duration_ms: Optional[int] = None
html_size_bytes: Optional[int] = None
errors: Optional[list[str]] = None
notes: Optional[list[str]] = None
class WebhookOut(BaseModel):
id: int
event: str
url: str
enabled: bool
secret: Optional[str] = None
created_at: datetime
class WebhookCreate(BaseModel):
event: str
url: str
enabled: bool = True
secret: Optional[str] = None
class WebhookUpdate(BaseModel):
event: Optional[str] = None
url: Optional[str] = None
enabled: Optional[bool] = None
secret: Optional[str] = None
class WebhookTestResponse(BaseModel):
status: str
class ScrapingLogCreate(BaseModel):
product_id: Optional[int] = None
url: str
source: str
reference: Optional[str] = None
fetch_method: str
fetch_status: str
fetched_at: datetime
duration_ms: Optional[int] = None
html_size_bytes: Optional[int] = None
errors: Optional[list[str]] = None
notes: Optional[list[str]] = None
class ScrapingLogUpdate(BaseModel):
product_id: Optional[int] = None
url: Optional[str] = None
source: Optional[str] = None
reference: Optional[str] = None
fetch_method: Optional[str] = None
fetch_status: Optional[str] = None
fetched_at: Optional[datetime] = None
duration_ms: Optional[int] = None
html_size_bytes: Optional[int] = None
errors: Optional[list[str]] = None
notes: Optional[list[str]] = None
class EnqueueRequest(BaseModel):
url: str = Field(..., description="URL du produit")
use_playwright: Optional[bool] = None
save_db: bool = True
class EnqueueResponse(BaseModel):
job_id: str
class ScheduleRequest(BaseModel):
url: str = Field(..., description="URL du produit")
interval_hours: int = Field(default=24, ge=1)
use_playwright: Optional[bool] = None
save_db: bool = True
class ScheduleResponse(BaseModel):
job_id: str
next_run: datetime
class ScrapePreviewRequest(BaseModel):
url: str
use_playwright: Optional[bool] = None
class ScrapePreviewResponse(BaseModel):
success: bool
snapshot: Optional[dict[str, object]] = None
error: Optional[str] = None
class ScrapeCommitRequest(BaseModel):
snapshot: dict[str, object]
class ScrapeCommitResponse(BaseModel):
success: bool
product_id: Optional[int] = None
error: Optional[str] = None
class VersionResponse(BaseModel):
api_version: str
class BackendLogEntry(BaseModel):
time: datetime
level: str
message: str
class UvicornLogEntry(BaseModel):
line: str

BIN
pricewatch/app/cli/__pycache__/main.cpython-313.pyc Executable file → Normal file



@@ -15,7 +15,7 @@ from typing import Optional
import redis
import typer
from rq import Connection, Worker
from rq import Worker
from alembic import command as alembic_command
from alembic.config import Config as AlembicConfig
from rich import print as rprint
@@ -34,7 +34,7 @@ from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore
from pricewatch.app.tasks.scheduler import ScrapingScheduler
from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler
# Créer l'application Typer
app = typer.Typer(
@@ -197,18 +197,21 @@ def run(
html = None
fetch_method = FetchMethod.HTTP
fetch_error = None
http_result = None
# Tenter HTTP d'abord
logger.info("Tentative HTTP...")
http_result = fetch_http(canonical_url)
if config.options.force_playwright:
logger.info("Playwright force, skip HTTP")
else:
logger.info("Tentative HTTP...")
http_result = fetch_http(canonical_url)
if http_result.success:
if http_result and http_result.success:
html = http_result.html
fetch_method = FetchMethod.HTTP
logger.info("✓ HTTP réussi")
elif config.options.use_playwright:
# Fallback Playwright
logger.warning(f"HTTP échoué: {http_result.error}, fallback Playwright")
fallback_reason = http_result.error if http_result else "force_playwright"
logger.warning(f"HTTP échoué: {fallback_reason}, fallback Playwright")
pw_result = fetch_playwright(
canonical_url,
headless=not config.options.headful,
@@ -231,7 +234,7 @@ def run(
fetch_error = pw_result.error
logger.error(f"✗ Playwright échoué: {fetch_error}")
else:
fetch_error = http_result.error
fetch_error = http_result.error if http_result else "skip_http"
logger.error(f"✗ HTTP échoué: {fetch_error}")
# Parser si on a du HTML
@@ -467,11 +470,25 @@ def worker(
Lance un worker RQ.
"""
config = get_config()
connection = redis.from_url(config.redis.url)
try:
connection = redis.from_url(config.redis.url)
# Verification connexion avant de lancer le worker
connection.ping()
except redis.exceptions.ConnectionError as e:
rprint(f"[red]✗ Impossible de se connecter a Redis ({config.redis.url})[/red]")
rprint(f"[red] Erreur: {e}[/red]")
rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
rprint(" docker compose up -d redis")
rprint(" # ou")
rprint(" redis-server")
raise typer.Exit(code=1)
except redis.exceptions.RedisError as e:
rprint(f"[red]✗ Erreur Redis: {e}[/red]")
raise typer.Exit(code=1)
with Connection(connection):
worker_instance = Worker([queue])
worker_instance.work(with_scheduler=with_scheduler)
# RQ 2.x: connexion passee directement au Worker
worker_instance = Worker([queue], connection=connection)
worker_instance.work(with_scheduler=with_scheduler)
@app.command()
@@ -486,9 +503,15 @@ def enqueue(
"""
Enqueue un scraping immediat.
"""
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
try:
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
except RedisUnavailableError as e:
rprint(f"[red]✗ {e.message}[/red]")
rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
rprint(" docker compose up -d redis")
raise typer.Exit(code=1)
@app.command()
@@ -504,16 +527,22 @@ def schedule(
"""
Planifie un scraping recurrent.
"""
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
job_info = scheduler.schedule_product(
url,
interval_hours=interval,
use_playwright=use_playwright,
save_db=save_db,
)
rprint(
f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
)
try:
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
job_info = scheduler.schedule_product(
url,
interval_hours=interval,
use_playwright=use_playwright,
save_db=save_db,
)
rprint(
f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
)
except RedisUnavailableError as e:
rprint(f"[red]✗ {e.message}[/red]")
rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
rprint(" docker compose up -d redis")
raise typer.Exit(code=1)
if __name__ == "__main__":

BIN
pricewatch/app/core/__pycache__/config.cpython-313.pyc Executable file → Normal file


BIN
pricewatch/app/core/__pycache__/schema.cpython-313.pyc Executable file → Normal file


6
pricewatch/app/core/config.py Executable file → Normal file

@@ -108,6 +108,11 @@ class AppConfig(BaseSettings):
default=True, description="Enable background worker functionality"
)
# API auth
api_token: Optional[str] = Field(
default=None, description="API token simple (Bearer)"
)
# Scraping defaults
default_playwright_timeout: int = Field(
default=60000, description="Default Playwright timeout in milliseconds"
@@ -138,6 +143,7 @@ class AppConfig(BaseSettings):
logger.info(f"Worker enabled: {self.enable_worker}")
logger.info(f"Worker timeout: {self.worker_timeout}s")
logger.info(f"Worker concurrency: {self.worker_concurrency}")
logger.info(f"API token configured: {bool(self.api_token)}")
logger.info("================================")


@@ -23,6 +23,9 @@ class ScrapingOptions(BaseModel):
use_playwright: bool = Field(
default=True, description="Utiliser Playwright en fallback"
)
force_playwright: bool = Field(
default=False, description="Forcer Playwright même si HTTP réussi"
)
headful: bool = Field(default=False, description="Mode headful (voir le navigateur)")
save_html: bool = Field(
default=True, description="Sauvegarder HTML pour debug"
@@ -94,7 +97,8 @@ def read_yaml_config(yaml_path: str | Path) -> ScrapingConfig:
config = ScrapingConfig.model_validate(data)
logger.info(
f"Configuration chargée: {len(config.urls)} URL(s), "
f"playwright={config.options.use_playwright}"
f"playwright={config.options.use_playwright}, "
f"force_playwright={config.options.force_playwright}"
)
return config
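
A hedged example of a YAML config enabling the new flag, loaded through read_yaml_config; the urls/options field names come from the usage above, but the exact file layout and the import path of read_yaml_config are assumptions:

from pathlib import Path

yaml_text = """
urls:
  - https://www.amazon.fr/dp/B08N5WRWNW
options:
  use_playwright: true
  force_playwright: true
"""
Path("force_playwright.yaml").write_text(yaml_text, encoding="utf-8")
# config = read_yaml_config("force_playwright.yaml")
# config.options.force_playwright -> True, so the CLI `run` command skips the HTTP attempt
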


@@ -9,7 +9,7 @@ from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field, HttpUrl, field_validator
from pydantic import BaseModel, ConfigDict, Field, HttpUrl, field_validator
class StockStatus(str, Enum):
@@ -38,6 +38,8 @@ class DebugStatus(str, Enum):
class DebugInfo(BaseModel):
"""Informations de debug pour tracer les problèmes de scraping."""
model_config = ConfigDict(use_enum_values=True)
method: FetchMethod = Field(
description="Méthode utilisée pour la récupération (http ou playwright)"
)
@@ -55,9 +57,6 @@ class DebugInfo(BaseModel):
default=None, description="Taille du HTML récupéré en octets"
)
class Config:
use_enum_values = True
class ProductSnapshot(BaseModel):
"""
@@ -81,6 +80,7 @@ class ProductSnapshot(BaseModel):
# Données produit principales
title: Optional[str] = Field(default=None, description="Nom du produit")
price: Optional[float] = Field(default=None, description="Prix du produit", ge=0)
msrp: Optional[float] = Field(default=None, description="Prix conseille", ge=0)
currency: str = Field(default="EUR", description="Devise (EUR, USD, etc.)")
shipping_cost: Optional[float] = Field(
default=None, description="Frais de port", ge=0
@@ -94,6 +94,7 @@ class ProductSnapshot(BaseModel):
default=None, description="Référence produit (ASIN, SKU, etc.)"
)
category: Optional[str] = Field(default=None, description="Catégorie du produit")
description: Optional[str] = Field(default=None, description="Description produit")
# Médias
images: list[str] = Field(
@@ -133,20 +134,22 @@ class ProductSnapshot(BaseModel):
"""Filtre les URLs d'images vides."""
return [url.strip() for url in v if url and url.strip()]
class Config:
use_enum_values = True
json_schema_extra = {
model_config = ConfigDict(
use_enum_values=True,
json_schema_extra={
"example": {
"source": "amazon",
"url": "https://www.amazon.fr/dp/B08N5WRWNW",
"fetched_at": "2026-01-13T10:30:00Z",
"title": "Exemple de produit",
"price": 299.99,
"msrp": 349.99,
"currency": "EUR",
"shipping_cost": 0.0,
"stock_status": "in_stock",
"reference": "B08N5WRWNW",
"category": "Electronics",
"description": "Chargeur USB-C multi-ports.",
"images": [
"https://example.com/image1.jpg",
"https://example.com/image2.jpg",
@@ -165,7 +168,8 @@ class ProductSnapshot(BaseModel):
"html_size_bytes": 145000,
},
}
}
},
)
def to_dict(self) -> dict:
"""Serialize vers un dictionnaire Python natif."""

2
pricewatch/app/db/__init__.py Executable file → Normal file

@@ -20,6 +20,7 @@ from pricewatch.app.db.models import (
ProductImage,
ProductSpec,
ScrapingLog,
Webhook,
)
__all__ = [
@@ -30,6 +31,7 @@ __all__ = [
"ProductImage",
"ProductSpec",
"ScrapingLog",
"Webhook",
"ProductRepository",
# Connection
"get_engine",

BIN
pricewatch/app/db/__pycache__/__init__.cpython-313.pyc Executable file → Normal file


BIN
pricewatch/app/db/__pycache__/models.cpython-313.pyc Executable file → Normal file


0
pricewatch/app/db/connection.py Executable file → Normal file

0
pricewatch/app/db/migrations/env.py Executable file → Normal file

0
pricewatch/app/db/migrations/script.py.mako Executable file → Normal file


@@ -0,0 +1,35 @@
"""Add webhooks table
Revision ID: 20260114_02
Revises: 20260114_01
Create Date: 2026-01-14 00:00:00
"""
from alembic import op
import sqlalchemy as sa
# Revision identifiers, used by Alembic.
revision = "20260114_02"
down_revision = "20260114_01"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.create_table(
"webhooks",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("event", sa.String(length=50), nullable=False),
sa.Column("url", sa.Text(), nullable=False),
sa.Column("enabled", sa.Boolean(), nullable=False, server_default=sa.text("true")),
sa.Column("secret", sa.String(length=200), nullable=True),
sa.Column("created_at", sa.TIMESTAMP(), nullable=False),
)
op.create_index("ix_webhook_event", "webhooks", ["event"], unique=False)
op.create_index("ix_webhook_enabled", "webhooks", ["enabled"], unique=False)
def downgrade() -> None:
op.drop_index("ix_webhook_enabled", table_name="webhooks")
op.drop_index("ix_webhook_event", table_name="webhooks")
op.drop_table("webhooks")


@@ -0,0 +1,26 @@
"""Ajout description et msrp sur products.
Revision ID: 20260115_02_product_details
Revises: 20260114_02
Create Date: 2026-01-15 10:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "20260115_02_product_details"
down_revision = "20260114_02"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column("products", sa.Column("description", sa.Text(), nullable=True))
op.add_column("products", sa.Column("msrp", sa.Numeric(10, 2), nullable=True))
def downgrade() -> None:
op.drop_column("products", "msrp")
op.drop_column("products", "description")

48
pricewatch/app/db/models.py Executable file → Normal file

@@ -15,7 +15,7 @@ Justification technique:
- JSONB uniquement pour données variables: errors, notes dans logs
"""
from datetime import datetime
from datetime import datetime, timezone
from decimal import Decimal
from typing import List, Optional
@@ -28,6 +28,7 @@ from sqlalchemy import (
Integer,
JSON,
Numeric,
Boolean,
String,
Text,
UniqueConstraint,
@@ -42,6 +43,10 @@ class Base(DeclarativeBase):
pass
def utcnow() -> datetime:
return datetime.now(timezone.utc)
class Product(Base):
"""
Catalogue produits (1 ligne par produit unique).
@@ -70,19 +75,25 @@ class Product(Base):
category: Mapped[Optional[str]] = mapped_column(
Text, nullable=True, comment="Product category (breadcrumb)"
)
description: Mapped[Optional[str]] = mapped_column(
Text, nullable=True, comment="Product description"
)
currency: Mapped[Optional[str]] = mapped_column(
String(3), nullable=True, comment="Currency code (EUR, USD, GBP)"
)
msrp: Mapped[Optional[Decimal]] = mapped_column(
Numeric(10, 2), nullable=True, comment="Recommended price"
)
# Timestamps
first_seen_at: Mapped[datetime] = mapped_column(
TIMESTAMP, nullable=False, default=datetime.utcnow, comment="First scraping timestamp"
TIMESTAMP, nullable=False, default=utcnow, comment="First scraping timestamp"
)
last_updated_at: Mapped[datetime] = mapped_column(
TIMESTAMP,
nullable=False,
default=datetime.utcnow,
onupdate=datetime.utcnow,
default=utcnow,
onupdate=utcnow,
comment="Last metadata update",
)
@@ -280,7 +291,7 @@ class ScrapingLog(Base):
String(20), nullable=False, comment="Fetch status (success, partial, failed)"
)
fetched_at: Mapped[datetime] = mapped_column(
TIMESTAMP, nullable=False, default=datetime.utcnow, comment="Scraping timestamp"
TIMESTAMP, nullable=False, default=utcnow, comment="Scraping timestamp"
)
# Performance metrics
@@ -318,3 +329,30 @@ class ScrapingLog(Base):
def __repr__(self) -> str:
return f"<ScrapingLog(id={self.id}, url={self.url}, status={self.fetch_status}, fetched_at={self.fetched_at})>"
class Webhook(Base):
"""
Webhooks pour notifications externes.
"""
__tablename__ = "webhooks"
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
event: Mapped[str] = mapped_column(String(50), nullable=False, comment="Event name")
url: Mapped[str] = mapped_column(Text, nullable=False, comment="Webhook URL")
enabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
secret: Mapped[Optional[str]] = mapped_column(
String(200), nullable=True, comment="Secret optionnel"
)
created_at: Mapped[datetime] = mapped_column(
TIMESTAMP, nullable=False, default=utcnow, comment="Creation timestamp"
)
__table_args__ = (
Index("ix_webhook_event", "event"),
Index("ix_webhook_enabled", "enabled"),
)
def __repr__(self) -> str:
return f"<Webhook(id={self.id}, event={self.event}, url={self.url})>"

4
pricewatch/app/db/repository.py Executable file → Normal file

@@ -49,8 +49,12 @@ class ProductRepository:
product.title = snapshot.title
if snapshot.category:
product.category = snapshot.category
if snapshot.description:
product.description = snapshot.description
if snapshot.currency:
product.currency = snapshot.currency
if snapshot.msrp is not None:
product.msrp = snapshot.msrp
def add_price_history(self, product: Product, snapshot: ProductSnapshot) -> Optional[PriceHistory]:
"""Ajoute une entree d'historique de prix si inexistante."""


0
pricewatch/app/scraping/pipeline.py Executable file → Normal file

@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.aliexpress")
@@ -126,6 +127,8 @@ class AliexpressStore(BaseStore):
images = self._extract_images(html, soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(html, debug_info)
reference = self.extract_reference(url)
# Note sur le rendu client-side
@@ -150,8 +153,10 @@ class AliexpressStore(BaseStore):
stock_status=stock_status,
reference=reference,
category=category,
description=description,
images=images,
specs=specs,
msrp=msrp,
debug=debug_info,
)
@@ -183,6 +188,17 @@ class AliexpressStore(BaseStore):
debug.errors.append("Titre non trouvé")
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la description (meta tags)."""
meta = soup.find("meta", property="og:description") or soup.find(
"meta", attrs={"name": "description"}
)
if meta:
description = meta.get("content", "").strip()
if description:
return description
return None
def _extract_price(
self, html: str, soup: BeautifulSoup, debug: DebugInfo
) -> Optional[float]:
@@ -193,35 +209,39 @@ class AliexpressStore(BaseStore):
On utilise regex sur le HTML brut.
"""
# Pattern 1: Prix avant € (ex: "136,69 €")
match = re.search(r"([0-9]+[.,][0-9]{2})\s*€", html)
match = re.search(r"([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)\\s*€", html)
if match:
price_str = match.group(1).replace(",", ".")
try:
return float(price_str)
except ValueError:
pass
price = parse_price_text(match.group(1))
if price is not None:
return price
# Pattern 2: € avant prix (ex: "€ 136.69")
match = re.search(r"\s*([0-9]+[.,][0-9]{2})", html)
match = re.search(r"\\s*([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)", html)
if match:
price_str = match.group(1).replace(",", ".")
try:
return float(price_str)
except ValueError:
pass
price = parse_price_text(match.group(1))
if price is not None:
return price
# Pattern 3: Chercher dans meta tags (moins fiable)
og_price = soup.find("meta", property="og:price:amount")
if og_price:
price_str = og_price.get("content", "")
try:
return float(price_str)
except ValueError:
pass
price = parse_price_text(price_str)
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix conseille si present."""
match = re.search(r"originalPrice\"\\s*:\\s*\"([0-9\\s.,]+)\"", html)
if match:
price = parse_price_text(match.group(1))
if price is not None:
return price
return None
def _extract_currency(
self, url: str, soup: BeautifulSoup, debug: DebugInfo
) -> str:


@@ -54,12 +54,12 @@ specs_table:
# ASIN (parfois dans les métadonnées)
asin:
- "input[name='ASIN']"
- "th:contains('ASIN') + td"
- "th:-soup-contains('ASIN') + td"
# Messages captcha / robot check
captcha_indicators:
- "form[action*='validateCaptcha']"
- "p.a-last:contains('Sorry')"
- "p.a-last:-soup-contains('Sorry')"
- "img[alt*='captcha']"
# Notes pour le parsing:


@@ -4,7 +4,9 @@ Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
"""
import json
import re
from html import unescape
from datetime import datetime
from pathlib import Path
from typing import Optional
@@ -21,6 +23,7 @@ from pricewatch.app.core.schema import (
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.amazon")
@@ -131,6 +134,8 @@ class AmazonStore(BaseStore):
images = self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
# Déterminer le statut final (ne pas écraser FAILED)
@@ -150,8 +155,10 @@ class AmazonStore(BaseStore):
stock_status=stock_status,
reference=reference,
category=category,
description=description,
images=images,
specs=specs,
msrp=msrp,
debug=debug_info,
)
@@ -195,6 +202,17 @@ class AmazonStore(BaseStore):
debug.errors.append("Titre non trouvé")
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la description (meta tags)."""
meta = soup.find("meta", property="og:description") or soup.find(
"meta", attrs={"name": "description"}
)
if meta:
description = meta.get("content", "").strip()
if description:
return description
return None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix."""
selectors = self.get_selector("price", [])
@@ -205,14 +223,9 @@ class AmazonStore(BaseStore):
elements = soup.select(selector)
for element in elements:
text = element.get_text(strip=True)
# Extraire nombre (format: "299,99" ou "299.99")
match = re.search(r"(\d+)[.,](\d+)", text)
if match:
price_str = f"{match.group(1)}.{match.group(2)}"
try:
return float(price_str)
except ValueError:
continue
price = parse_price_text(text)
if price is not None:
return price
# Fallback: chercher les spans séparés a-price-whole et a-price-fraction
whole = soup.select_one("span.a-price-whole")
@@ -220,15 +233,24 @@ class AmazonStore(BaseStore):
if whole and fraction:
whole_text = whole.get_text(strip=True)
fraction_text = fraction.get_text(strip=True)
try:
price_str = f"{whole_text}.{fraction_text}"
return float(price_str)
except ValueError:
pass
price = parse_price_text(f"{whole_text}.{fraction_text}")
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix conseille."""
strike = soup.select_one("span.priceBlockStrikePriceString") or soup.select_one(
"span.a-text-price span.a-offscreen"
)
if strike:
price = parse_price_text(strike.get_text(strip=True))
if price is not None:
return price
return None
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la devise."""
selectors = self.get_selector("currency", [])
@@ -270,6 +292,7 @@ class AmazonStore(BaseStore):
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
"""Extrait les URLs d'images."""
images = []
seen = set()
selectors = self.get_selector("images", [])
if isinstance(selectors, str):
selectors = [selectors]
@@ -278,19 +301,57 @@ class AmazonStore(BaseStore):
elements = soup.select(selector)
for element in elements:
# Attribut src ou data-src
url = element.get("src") or element.get("data-src")
url = element.get("src") or element.get("data-src") or element.get("data-old-hires")
if url and url.startswith("http"):
images.append(url)
if self._is_product_image(url) and url not in seen:
images.append(url)
seen.add(url)
dynamic = element.get("data-a-dynamic-image")
if dynamic:
urls = self._extract_dynamic_images(dynamic)
for dyn_url in urls:
if self._is_product_image(dyn_url) and dyn_url not in seen:
images.append(dyn_url)
seen.add(dyn_url)
# Fallback: chercher tous les img tags si aucune image trouvée
if not images:
all_imgs = soup.find_all("img")
for img in all_imgs:
url = img.get("src") or img.get("data-src")
if url and url.startswith("http"):
images.append(url)
if url and url.startswith("http") and self._is_product_image(url):
if url not in seen:
images.append(url)
seen.add(url)
return list(set(images)) # Dédupliquer
return images
def _extract_dynamic_images(self, raw: str) -> list[str]:
"""Extrait les URLs du JSON data-a-dynamic-image."""
try:
data = json.loads(unescape(raw))
except (TypeError, json.JSONDecodeError):
return []
urls = []
if isinstance(data, dict):
candidates = []
for url, dims in data.items():
if not isinstance(url, str) or not url.startswith("http"):
continue
size = dims[0] if isinstance(dims, list) and dims else 0
candidates.append((size, url))
candidates.sort(key=lambda item: item[0], reverse=True)
for _, url in candidates:
urls.append(url)
return urls
def _is_product_image(self, url: str) -> bool:
"""Filtre basique pour eviter les logos et sprites."""
lowered = url.lower()
if "prime_logo" in lowered or "sprite" in lowered:
return False
return True
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la catégorie depuis les breadcrumbs."""


@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.backmarket")
@@ -116,6 +117,8 @@ class BackmarketStore(BaseStore):
images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
reference = self.extract_reference(url)
# Spécifique Backmarket: condition (état du reconditionné)
@@ -140,8 +143,10 @@ class BackmarketStore(BaseStore):
stock_status=stock_status,
reference=reference,
category=category,
description=description,
images=images,
specs=specs,
msrp=msrp,
debug=debug_info,
)
@@ -213,6 +218,17 @@ class BackmarketStore(BaseStore):
debug.errors.append("Titre non trouvé")
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la description (meta tags)."""
meta = soup.find("meta", property="og:description") or soup.find(
"meta", attrs={"name": "description"}
)
if meta:
description = meta.get("content", "").strip()
if description:
return description
return None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix."""
selectors = self.get_selector("price", [])
@@ -225,20 +241,29 @@ class BackmarketStore(BaseStore):
# Attribut content (schema.org) ou texte
price_text = element.get("content") or element.get_text(strip=True)
price = parse_price_text(price_text)
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix conseille."""
selectors = [
".price--old",
".price--striked",
".price__old",
"del",
]
for selector in selectors:
element = soup.select_one(selector)
if element:
price = parse_price_text(element.get_text(strip=True))
if price is not None:
return price
return None
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la devise."""
selectors = self.get_selector("currency", [])

View File

@@ -4,6 +4,7 @@ Store Cdiscount - Parsing de produits Cdiscount.com.
Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
"""
import json
import re
from datetime import datetime
from pathlib import Path
@@ -21,6 +22,7 @@ from pricewatch.app.core.schema import (
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.cdiscount")
@@ -112,6 +114,8 @@ class CdiscountStore(BaseStore):
images = self._extract_images(soup, debug_info)
category = self._extract_category(soup, debug_info)
specs = self._extract_specs(soup, debug_info)
description = self._extract_description(soup, debug_info)
msrp = self._extract_msrp(soup, debug_info)
reference = self.extract_reference(url) or self._extract_sku_from_html(soup)
# Déterminer le statut final
@@ -130,8 +134,10 @@ class CdiscountStore(BaseStore):
stock_status=stock_status,
reference=reference,
category=category,
description=description,
images=images,
specs=specs,
msrp=msrp,
debug=debug_info,
)
@@ -158,6 +164,21 @@ class CdiscountStore(BaseStore):
debug.errors.append("Titre non trouvé")
return None
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la description (meta tags)."""
meta = soup.find("meta", property="og:description") or soup.find(
"meta", attrs={"name": "description"}
)
if meta:
description = meta.get("content", "").strip()
if description:
return description
product_ld = self._find_product_ld(soup)
desc_ld = product_ld.get("description") if product_ld else None
if isinstance(desc_ld, str) and desc_ld.strip():
return desc_ld.strip()
return None
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix."""
selectors = self.get_selector("price", [])
@@ -170,20 +191,29 @@ class CdiscountStore(BaseStore):
# Attribut content (schema.org) ou texte
price_text = element.get("content") or element.get_text(strip=True)
price = parse_price_text(price_text)
if price is not None:
return price
debug.errors.append("Prix non trouvé")
return None
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
"""Extrait le prix conseille."""
selectors = [
".jsStrikePrice",
".price__old",
".c-price__strike",
".price-strike",
]
for selector in selectors:
element = soup.select_one(selector)
if element:
price = parse_price_text(element.get_text(strip=True))
if price is not None:
return price
return None
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la devise."""
selectors = self.get_selector("currency", [])
@@ -249,7 +279,14 @@ class CdiscountStore(BaseStore):
url = f"https:{url}"
images.append(url)
ld_images = self._extract_ld_images(self._find_product_ld(soup))
for url in ld_images:
if url and url not in images:
if url.startswith("//"):
url = f"https:{url}"
images.append(url)
return list(dict.fromkeys(images))  # Préserver l'ordre
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
"""Extrait la catégorie depuis les breadcrumbs."""
@@ -275,6 +312,53 @@ class CdiscountStore(BaseStore):
return None
def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
"""Parse les scripts JSON-LD et retourne les objets."""
entries = []
scripts = soup.find_all("script", type="application/ld+json")
for script in scripts:
raw = script.string or script.text
if not raw:
continue
try:
payload = json.loads(raw.strip())
except (json.JSONDecodeError, TypeError):
continue
if isinstance(payload, list):
entries.extend(payload)
else:
entries.append(payload)
return entries
def _find_product_ld(self, soup: BeautifulSoup) -> dict:
"""Retourne lobjet Product JSON-LD si présent."""
for entry in self._extract_json_ld_entries(soup):
if not isinstance(entry, dict):
continue
type_field = entry.get("@type") or entry.get("type")
if isinstance(type_field, str) and "product" in type_field.lower():
return entry
return {}
def _extract_ld_images(self, product_ld: dict) -> list[str]:
"""Récupère les images listées dans le JSON-LD."""
if not product_ld:
return []
images = product_ld.get("image") or product_ld.get("images")
if not images:
return []
if isinstance(images, str):
images = [images]
extracted = []
for item in images:
if isinstance(item, str):
extracted.append(item)
elif isinstance(item, dict):
url = item.get("url")
if isinstance(url, str):
extracted.append(url)
return extracted
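# Usage sketch: a made-up JSON-LD <script> showing what _find_product_ld and
# _extract_ld_images pick up (image entries as plain strings or {"url": ...} dicts).
import json
from bs4 import BeautifulSoup

sample = (
    '<script type="application/ld+json">'
    '{"@type": "Product", "name": "Casque sans fil",'
    ' "image": ["https://cdn.example.com/front.jpg", {"url": "https://cdn.example.com/side.jpg"}]}'
    "</script>"
)
sample_soup = BeautifulSoup(sample, "html.parser")
entries = [
    json.loads((s.string or s.get_text()).strip())
    for s in sample_soup.find_all("script", type="application/ld+json")
]
product = next(
    (e for e in entries if isinstance(e, dict) and "product" in str(e.get("@type", "")).lower()),
    {},
)
images = [i if isinstance(i, str) else i.get("url") for i in product.get("image", [])]
print(images)  # -> ['https://cdn.example.com/front.jpg', 'https://cdn.example.com/side.jpg']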
def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
"""Extrait les caractéristiques techniques."""
specs = {}
@@ -298,6 +382,19 @@ class CdiscountStore(BaseStore):
if key and value:
specs[key] = value
product_ld = self._find_product_ld(soup)
additional = product_ld.get("additionalProperty") if product_ld else None
if isinstance(additional, dict):
additional = [additional]
if isinstance(additional, list):
for item in additional:
if not isinstance(item, dict):
continue
key = item.get("name") or item.get("propertyID")
value = item.get("value") or item.get("valueReference")
if key and value:
specs[key] = value
return specs
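# Hypothetical additionalProperty payload, consumed exactly like the loop above.
additional_sample = [
    {"name": "Capacite", "value": "128 Go"},
    {"propertyID": "couleur", "valueReference": "Noir"},
]
extracted_specs = {}
for item in additional_sample:
    key = item.get("name") or item.get("propertyID")
    value = item.get("value") or item.get("valueReference")
    if key and value:
        extracted_specs[key] = value
print(extracted_specs)  # -> {'Capacite': '128 Go', 'couleur': 'Noir'}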
def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:

View File

@@ -0,0 +1,48 @@
"""
Helpers pour parser des prix avec separateurs de milliers.
"""
from __future__ import annotations
import re
from typing import Optional
def parse_price_text(text: str) -> Optional[float]:
"""
Parse un texte de prix en float.
Gere les separateurs espace, point, virgule et espaces insécables.
"""
if not text:
return None
text = re.sub(r"(\d)\s*€\s*(\d)", r"\1,\2", text)
cleaned = text.replace("\u00a0", " ").replace("\u202f", " ").replace("\u2009", " ")
cleaned = "".join(ch for ch in cleaned if ch.isdigit() or ch in ".,")
if not cleaned:
return None
if "," in cleaned and "." in cleaned:
if cleaned.rfind(",") > cleaned.rfind("."):
cleaned = cleaned.replace(".", "")
cleaned = cleaned.replace(",", ".")
else:
cleaned = cleaned.replace(",", "")
elif "," in cleaned:
parts = cleaned.split(",")
if len(parts) > 1:
decimal = parts[-1]
integer = "".join(parts[:-1])
cleaned = f"{integer}.{decimal}" if decimal else integer
elif "." in cleaned:
parts = cleaned.split(".")
if len(parts) > 1:
decimal = parts[-1]
integer = "".join(parts[:-1])
cleaned = f"{integer}.{decimal}" if decimal else integer
try:
return float(cleaned)
except ValueError:
return None
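# Quick sanity checks for parse_price_text; the expected values follow from the
# branches above (decimal comma, thousands separators, "€" used as decimal mark).
from pricewatch.app.stores.price_parser import parse_price_text

assert parse_price_text("299,99 €") == 299.99          # decimal comma
assert parse_price_text("1\u202f299,00 €") == 1299.0   # narrow no-break space as thousands separator
assert parse_price_text("1.299,00") == 1299.0          # dot as thousands separator
assert parse_price_text("299€99") == 299.99            # euro sign between euros and cents
assert parse_price_text("n/a") is None                 # no digits -> None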

13
pricewatch/app/tasks/__init__.py Executable file → Normal file
View File

@@ -3,6 +3,15 @@ Module tasks pour les jobs RQ.
"""
from pricewatch.app.tasks.scrape import scrape_product
from pricewatch.app.tasks.scheduler import (
RedisUnavailableError,
ScrapingScheduler,
check_redis_connection,
)
__all__ = ["scrape_product", "ScrapingScheduler"]
__all__ = [
"scrape_product",
"ScrapingScheduler",
"RedisUnavailableError",
"check_redis_connection",
]

75
pricewatch/app/tasks/scheduler.py Executable file → Normal file
View File

@@ -9,6 +9,8 @@ from datetime import datetime, timedelta, timezone
from typing import Optional
import redis
from redis.exceptions import ConnectionError as RedisConnectionError
from redis.exceptions import RedisError, TimeoutError as RedisTimeoutError
from rq import Queue
from rq_scheduler import Scheduler
@@ -19,6 +21,15 @@ from pricewatch.app.tasks.scrape import scrape_product
logger = get_logger("tasks.scheduler")
class RedisUnavailableError(Exception):
"""Exception levee quand Redis n'est pas disponible."""
def __init__(self, message: str = "Redis non disponible", cause: Optional[Exception] = None):
self.message = message
self.cause = cause
super().__init__(self.message)
@dataclass
class ScheduledJobInfo:
"""Infos de retour pour un job planifie."""
@@ -27,14 +38,72 @@ class ScheduledJobInfo:
next_run: datetime
def check_redis_connection(redis_url: str) -> bool:
"""
Verifie si Redis est accessible.
Returns:
True si Redis repond, False sinon.
"""
try:
conn = redis.from_url(redis_url)
conn.ping()
return True
except (RedisConnectionError, RedisTimeoutError, RedisError) as e:
logger.debug(f"Redis ping echoue: {e}")
return False
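# Minimal guard sketch (hypothetical call site): check Redis before relying on
# the scheduler, e.g. from a CLI command or an API handler.
from pricewatch.app.core.config import get_config
from pricewatch.app.tasks.scheduler import check_redis_connection

config = get_config()
if not check_redis_connection(config.redis.url):
    print("Redis unreachable - skipping scheduled scraping")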
class ScrapingScheduler:
"""Scheduler pour les jobs de scraping avec RQ."""
def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
self.config = config or get_config()
self._queue_name = queue_name
self._redis: Optional[redis.Redis] = None
self._queue: Optional[Queue] = None
self._scheduler: Optional[Scheduler] = None
def _ensure_connected(self) -> None:
"""Etablit la connexion Redis si necessaire, leve RedisUnavailableError si echec."""
if self._redis is not None:
return
try:
self._redis = redis.from_url(self.config.redis.url)
# Ping pour verifier la connexion
self._redis.ping()
self._queue = Queue(self._queue_name, connection=self._redis)
self._scheduler = Scheduler(queue=self._queue, connection=self._redis)
logger.debug(f"Connexion Redis etablie: {self.config.redis.url}")
except (RedisConnectionError, RedisTimeoutError) as e:
self._redis = None
msg = f"Impossible de se connecter a Redis ({self.config.redis.url}): {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
except RedisError as e:
self._redis = None
msg = f"Erreur Redis: {e}"
logger.error(msg)
raise RedisUnavailableError(msg, cause=e) from e
@property
def redis(self) -> redis.Redis:
"""Acces a la connexion Redis (lazy)."""
self._ensure_connected()
return self._redis # type: ignore
@property
def queue(self) -> Queue:
"""Acces a la queue RQ (lazy)."""
self._ensure_connected()
return self._queue # type: ignore
@property
def scheduler(self) -> Scheduler:
"""Acces au scheduler RQ (lazy)."""
self._ensure_connected()
return self._scheduler # type: ignore
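# Usage sketch of the lazy-connection contract: instantiating the scheduler no
# longer opens a Redis connection; the first property access pings Redis and may
# raise RedisUnavailableError, which callers can now handle explicitly.
from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler

scheduler = ScrapingScheduler()  # no connection attempt yet
try:
    queue = scheduler.queue      # triggers _ensure_connected() and the ping
    print(f"Queue ready: {queue.name}")
except RedisUnavailableError as exc:
    print(f"Scheduler unavailable: {exc.message} (cause: {exc.cause})")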
def enqueue_immediate(
self,

33
pricewatch/app/tasks/scrape.py Executable file → Normal file
View File

@@ -4,6 +4,7 @@ Tache de scraping asynchrone pour RQ.
from __future__ import annotations
import time
from typing import Any, Optional
from pricewatch.app.core.config import AppConfig, get_config
@@ -46,6 +47,9 @@ def scrape_product(
Retourne un dict avec success, product_id, snapshot, error.
"""
job_start_time = time.time()
logger.info(f"[JOB START] Scraping: {url}")
config: AppConfig = get_config()
setup_stores()
@@ -58,6 +62,8 @@ def scrape_product(
registry = get_registry()
store = registry.detect_store(url)
if not store:
elapsed_ms = int((time.time() - job_start_time) * 1000)
logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)")
snapshot = ProductSnapshot(
source="unknown",
url=url,
@@ -70,6 +76,8 @@ def scrape_product(
ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
logger.info(f"[STORE] Detecte: {store.store_id}")
canonical_url = store.canonicalize(url)
html = None
@@ -79,13 +87,16 @@ def scrape_product(
html_size_bytes = None
pw_result = None
logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}")
http_result = fetch_http(canonical_url)
duration_ms = http_result.duration_ms
if http_result.success:
html = http_result.html
fetch_method = FetchMethod.HTTP
logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})")
elif use_playwright:
logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright")
pw_result = fetch_playwright(
canonical_url,
headless=not headful,
@@ -97,10 +108,13 @@ def scrape_product(
if pw_result.success:
html = pw_result.html
fetch_method = FetchMethod.PLAYWRIGHT
logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})")
else:
fetch_error = pw_result.error
logger.warning(f"[FETCH] Playwright echoue: {fetch_error}")
else:
fetch_error = http_result.error
logger.warning(f"[FETCH] HTTP echoue: {fetch_error}")
if html:
html_size_bytes = len(html.encode("utf-8"))
@@ -118,12 +132,18 @@ def scrape_product(
save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
try:
logger.debug(f"[PARSE] Parsing avec {store.store_id}...")
snapshot = store.parse(html, canonical_url)
snapshot.debug.method = fetch_method
snapshot.debug.duration_ms = duration_ms
snapshot.debug.html_size_bytes = html_size_bytes
success = snapshot.debug.status != DebugStatus.FAILED
if success:
logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}")
else:
logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}")
except Exception as exc:
logger.error(f"[PARSE] Exception: {exc}")
snapshot = ProductSnapshot(
source=store.store_id,
url=canonical_url,
@@ -152,6 +172,19 @@ def scrape_product(
product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
# Log final du job
elapsed_ms = int((time.time() - job_start_time) * 1000)
if success:
logger.info(
f"[JOB OK] {store.store_id}/{snapshot.reference} "
f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms"
)
else:
logger.warning(
f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
f"erreur={fetch_error} duree={elapsed_ms}ms"
)
return {
"success": success,
"product_id": product_id,