codex2
This commit is contained in:
5
pricewatch/app/api/__init__.py
Normal file
5
pricewatch/app/api/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Module API FastAPI."""
|
||||
|
||||
from pricewatch.app.api.main import app
|
||||
|
||||
__all__ = ["app"]
|
||||
BIN
pricewatch/app/api/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
pricewatch/app/api/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
pricewatch/app/api/__pycache__/main.cpython-313.pyc
Normal file
BIN
pricewatch/app/api/__pycache__/main.cpython-313.pyc
Normal file
Binary file not shown.
BIN
pricewatch/app/api/__pycache__/schemas.cpython-313.pyc
Normal file
BIN
pricewatch/app/api/__pycache__/schemas.cpython-313.pyc
Normal file
Binary file not shown.
876
pricewatch/app/api/main.py
Normal file
876
pricewatch/app/api/main.py
Normal file
@@ -0,0 +1,876 @@
|
||||
"""
|
||||
API REST FastAPI pour PriceWatch (Phase 3).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
from collections import deque
|
||||
from datetime import datetime, timezone
|
||||
import os
|
||||
from pathlib import Path
|
||||
from io import StringIO
|
||||
from typing import Generator, Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import Depends, FastAPI, Header, HTTPException, Response
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import JSONResponse
|
||||
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
|
||||
from sqlalchemy import and_, desc, func
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from pricewatch.app.api.schemas import (
|
||||
EnqueueRequest,
|
||||
EnqueueResponse,
|
||||
HealthStatus,
|
||||
PriceHistoryOut,
|
||||
PriceHistoryCreate,
|
||||
PriceHistoryUpdate,
|
||||
ProductOut,
|
||||
ProductCreate,
|
||||
ProductUpdate,
|
||||
ScheduleRequest,
|
||||
ScheduleResponse,
|
||||
ScrapingLogOut,
|
||||
ScrapingLogCreate,
|
||||
ScrapingLogUpdate,
|
||||
ScrapePreviewRequest,
|
||||
ScrapePreviewResponse,
|
||||
ScrapeCommitRequest,
|
||||
ScrapeCommitResponse,
|
||||
VersionResponse,
|
||||
BackendLogEntry,
|
||||
UvicornLogEntry,
|
||||
WebhookOut,
|
||||
WebhookCreate,
|
||||
WebhookUpdate,
|
||||
WebhookTestResponse,
|
||||
)
|
||||
from pricewatch.app.core.config import get_config
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
from pricewatch.app.db.connection import check_db_connection, get_session
|
||||
from pricewatch.app.db.models import PriceHistory, Product, ScrapingLog, Webhook
|
||||
from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
||||
from pricewatch.app.tasks.scrape import scrape_product
|
||||
from pricewatch.app.tasks.scheduler import RedisUnavailableError, check_redis_connection, ScrapingScheduler
|
||||
|
||||
# Module-level logger for the API layer.
logger = get_logger("api")

app = FastAPI(title="PriceWatch API", version="0.4.0")

# Bounded in-memory buffer of backend log entries for the debug UI;
# oldest entries are evicted once 200 items are held.
BACKEND_LOGS = deque(maxlen=200)

# Location of the Uvicorn log file; overridable via environment variable.
UVICORN_LOG_PATH = Path(
    os.environ.get("PW_UVICORN_LOG_PATH", "/app/logs/uvicorn.log")
)
|
||||
|
||||
|
||||
def get_db_session() -> Generator[Session, None, None]:
    """FastAPI dependency yielding a SQLAlchemy session scoped to the request."""
    config = get_config()
    with get_session(config) as db:
        yield db
|
||||
|
||||
|
||||
def require_token(authorization: Optional[str] = Header(default=None)) -> None:
    """Simple Bearer-token authentication dependency.

    Raises:
        HTTPException: 500 when no token is configured server-side,
            401 when the Authorization header is missing or malformed,
            403 when the provided token does not match.
    """
    import hmac  # stdlib; local import keeps the change self-contained

    config = get_config()
    token = config.api_token
    if not token:
        raise HTTPException(status_code=500, detail="API token non configure")

    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Token manquant")

    # removeprefix strips exactly one leading "Bearer " (split("Bearer ")[-1]
    # would mangle tokens that themselves contain "Bearer ").
    provided = authorization.removeprefix("Bearer ").strip()
    # Constant-time comparison avoids leaking match length via timing.
    if not hmac.compare_digest(provided.encode(), token.encode()):
        raise HTTPException(status_code=403, detail="Token invalide")
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthStatus)
|
||||
def health_check() -> HealthStatus:
|
||||
"""Health check DB + Redis."""
|
||||
config = get_config()
|
||||
return HealthStatus(
|
||||
db=check_db_connection(config),
|
||||
redis=check_redis_connection(config.redis.url),
|
||||
)
|
||||
|
||||
|
||||
@app.get("/version", response_model=VersionResponse)
|
||||
def version_info() -> VersionResponse:
|
||||
"""Expose la version API."""
|
||||
return VersionResponse(api_version=app.version)
|
||||
|
||||
|
||||
@app.get("/logs/backend", response_model=list[BackendLogEntry], dependencies=[Depends(require_token)])
|
||||
def list_backend_logs() -> list[BackendLogEntry]:
|
||||
"""Expose un buffer de logs backend."""
|
||||
return list(BACKEND_LOGS)
|
||||
|
||||
|
||||
@app.get("/logs/uvicorn", response_model=list[UvicornLogEntry], dependencies=[Depends(require_token)])
|
||||
def list_uvicorn_logs(limit: int = 200) -> list[UvicornLogEntry]:
|
||||
"""Expose les dernieres lignes du log Uvicorn."""
|
||||
lines = _read_uvicorn_lines(limit=limit)
|
||||
return [UvicornLogEntry(line=line) for line in lines]
|
||||
|
||||
|
||||
@app.get("/products", response_model=list[ProductOut], dependencies=[Depends(require_token)])
|
||||
def list_products(
|
||||
source: Optional[str] = None,
|
||||
reference: Optional[str] = None,
|
||||
updated_after: Optional[datetime] = None,
|
||||
price_min: Optional[float] = None,
|
||||
price_max: Optional[float] = None,
|
||||
fetched_after: Optional[datetime] = None,
|
||||
fetched_before: Optional[datetime] = None,
|
||||
stock_status: Optional[str] = None,
|
||||
limit: int = 50,
|
||||
offset: int = 0,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> list[ProductOut]:
|
||||
"""Liste des produits avec filtres optionnels."""
|
||||
latest_price_subquery = (
|
||||
session.query(
|
||||
PriceHistory.product_id.label("product_id"),
|
||||
func.max(PriceHistory.fetched_at).label("latest_fetched_at"),
|
||||
)
|
||||
.group_by(PriceHistory.product_id)
|
||||
.subquery()
|
||||
)
|
||||
latest_price = (
|
||||
session.query(PriceHistory)
|
||||
.join(
|
||||
latest_price_subquery,
|
||||
and_(
|
||||
PriceHistory.product_id == latest_price_subquery.c.product_id,
|
||||
PriceHistory.fetched_at == latest_price_subquery.c.latest_fetched_at,
|
||||
),
|
||||
)
|
||||
.subquery()
|
||||
)
|
||||
|
||||
query = session.query(Product).outerjoin(latest_price, Product.id == latest_price.c.product_id)
|
||||
if source:
|
||||
query = query.filter(Product.source == source)
|
||||
if reference:
|
||||
query = query.filter(Product.reference == reference)
|
||||
if updated_after:
|
||||
query = query.filter(Product.last_updated_at >= updated_after)
|
||||
if price_min is not None:
|
||||
query = query.filter(latest_price.c.price >= price_min)
|
||||
if price_max is not None:
|
||||
query = query.filter(latest_price.c.price <= price_max)
|
||||
if fetched_after:
|
||||
query = query.filter(latest_price.c.fetched_at >= fetched_after)
|
||||
if fetched_before:
|
||||
query = query.filter(latest_price.c.fetched_at <= fetched_before)
|
||||
if stock_status:
|
||||
query = query.filter(latest_price.c.stock_status == stock_status)
|
||||
|
||||
products = query.order_by(desc(Product.last_updated_at)).offset(offset).limit(limit).all()
|
||||
return [_product_to_out(session, product) for product in products]
|
||||
|
||||
|
||||
@app.post("/products", response_model=ProductOut, dependencies=[Depends(require_token)])
|
||||
def create_product(
|
||||
payload: ProductCreate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> ProductOut:
|
||||
"""Cree un produit."""
|
||||
product = Product(
|
||||
source=payload.source,
|
||||
reference=payload.reference,
|
||||
url=payload.url,
|
||||
title=payload.title,
|
||||
category=payload.category,
|
||||
description=payload.description,
|
||||
currency=payload.currency,
|
||||
msrp=payload.msrp,
|
||||
)
|
||||
session.add(product)
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(product)
|
||||
except IntegrityError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=409, detail="Produit deja existant") from exc
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _product_to_out(session, product)
|
||||
|
||||
|
||||
@app.get("/products/{product_id}", response_model=ProductOut, dependencies=[Depends(require_token)])
|
||||
def get_product(
|
||||
product_id: int,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> ProductOut:
|
||||
"""Detail produit + dernier prix."""
|
||||
product = session.query(Product).filter(Product.id == product_id).one_or_none()
|
||||
if not product:
|
||||
raise HTTPException(status_code=404, detail="Produit non trouve")
|
||||
return _product_to_out(session, product)
|
||||
|
||||
|
||||
@app.patch("/products/{product_id}", response_model=ProductOut, dependencies=[Depends(require_token)])
|
||||
def update_product(
|
||||
product_id: int,
|
||||
payload: ProductUpdate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> ProductOut:
|
||||
"""Met a jour un produit (partial)."""
|
||||
product = session.query(Product).filter(Product.id == product_id).one_or_none()
|
||||
if not product:
|
||||
raise HTTPException(status_code=404, detail="Produit non trouve")
|
||||
|
||||
updates = payload.model_dump(exclude_unset=True)
|
||||
for key, value in updates.items():
|
||||
setattr(product, key, value)
|
||||
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(product)
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _product_to_out(session, product)
|
||||
|
||||
|
||||
@app.delete("/products/{product_id}", dependencies=[Depends(require_token)])
|
||||
def delete_product(
|
||||
product_id: int,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> dict[str, str]:
|
||||
"""Supprime un produit (cascade)."""
|
||||
product = session.query(Product).filter(Product.id == product_id).one_or_none()
|
||||
if not product:
|
||||
raise HTTPException(status_code=404, detail="Produit non trouve")
|
||||
|
||||
session.delete(product)
|
||||
try:
|
||||
session.commit()
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return {"status": "deleted"}
|
||||
|
||||
|
||||
@app.get(
    "/products/{product_id}/prices",
    response_model=list[PriceHistoryOut],
    dependencies=[Depends(require_token)],
)
def list_prices(
    product_id: int,
    price_min: Optional[float] = None,
    price_max: Optional[float] = None,
    fetched_after: Optional[datetime] = None,
    fetched_before: Optional[datetime] = None,
    fetch_status: Optional[str] = None,
    limit: int = 50,
    offset: int = 0,
    session: Session = Depends(get_db_session),
) -> list[PriceHistoryOut]:
    """Price history for one product, newest first, with optional filters."""
    criteria = [PriceHistory.product_id == product_id]
    if price_min is not None:
        criteria.append(PriceHistory.price >= price_min)
    if price_max is not None:
        criteria.append(PriceHistory.price <= price_max)
    if fetched_after:
        criteria.append(PriceHistory.fetched_at >= fetched_after)
    if fetched_before:
        criteria.append(PriceHistory.fetched_at <= fetched_before)
    if fetch_status:
        criteria.append(PriceHistory.fetch_status == fetch_status)

    rows = (
        session.query(PriceHistory)
        .filter(*criteria)
        .order_by(desc(PriceHistory.fetched_at))
        .offset(offset)
        .limit(limit)
        .all()
    )
    return [_price_to_out(row) for row in rows]
|
||||
|
||||
|
||||
@app.post("/prices", response_model=PriceHistoryOut, dependencies=[Depends(require_token)])
|
||||
def create_price(
|
||||
payload: PriceHistoryCreate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> PriceHistoryOut:
|
||||
"""Ajoute une entree d'historique de prix."""
|
||||
price = PriceHistory(
|
||||
product_id=payload.product_id,
|
||||
price=payload.price,
|
||||
shipping_cost=payload.shipping_cost,
|
||||
stock_status=payload.stock_status,
|
||||
fetch_method=payload.fetch_method,
|
||||
fetch_status=payload.fetch_status,
|
||||
fetched_at=payload.fetched_at,
|
||||
)
|
||||
session.add(price)
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(price)
|
||||
except IntegrityError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=409, detail="Entree prix deja existante") from exc
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _price_to_out(price)
|
||||
|
||||
|
||||
@app.patch("/prices/{price_id}", response_model=PriceHistoryOut, dependencies=[Depends(require_token)])
|
||||
def update_price(
|
||||
price_id: int,
|
||||
payload: PriceHistoryUpdate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> PriceHistoryOut:
|
||||
"""Met a jour une entree de prix."""
|
||||
price = session.query(PriceHistory).filter(PriceHistory.id == price_id).one_or_none()
|
||||
if not price:
|
||||
raise HTTPException(status_code=404, detail="Entree prix non trouvee")
|
||||
|
||||
updates = payload.model_dump(exclude_unset=True)
|
||||
for key, value in updates.items():
|
||||
setattr(price, key, value)
|
||||
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(price)
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _price_to_out(price)
|
||||
|
||||
|
||||
@app.delete("/prices/{price_id}", dependencies=[Depends(require_token)])
|
||||
def delete_price(
|
||||
price_id: int,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> dict[str, str]:
|
||||
"""Supprime une entree de prix."""
|
||||
price = session.query(PriceHistory).filter(PriceHistory.id == price_id).one_or_none()
|
||||
if not price:
|
||||
raise HTTPException(status_code=404, detail="Entree prix non trouvee")
|
||||
|
||||
session.delete(price)
|
||||
try:
|
||||
session.commit()
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return {"status": "deleted"}
|
||||
|
||||
|
||||
@app.get("/logs", response_model=list[ScrapingLogOut], dependencies=[Depends(require_token)])
|
||||
def list_logs(
|
||||
source: Optional[str] = None,
|
||||
fetch_status: Optional[str] = None,
|
||||
fetched_after: Optional[datetime] = None,
|
||||
fetched_before: Optional[datetime] = None,
|
||||
limit: int = 50,
|
||||
offset: int = 0,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> list[ScrapingLogOut]:
|
||||
"""Liste des logs de scraping."""
|
||||
query = session.query(ScrapingLog)
|
||||
if source:
|
||||
query = query.filter(ScrapingLog.source == source)
|
||||
if fetch_status:
|
||||
query = query.filter(ScrapingLog.fetch_status == fetch_status)
|
||||
if fetched_after:
|
||||
query = query.filter(ScrapingLog.fetched_at >= fetched_after)
|
||||
if fetched_before:
|
||||
query = query.filter(ScrapingLog.fetched_at <= fetched_before)
|
||||
|
||||
logs = query.order_by(desc(ScrapingLog.fetched_at)).offset(offset).limit(limit).all()
|
||||
return [_log_to_out(log) for log in logs]
|
||||
|
||||
|
||||
@app.post("/logs", response_model=ScrapingLogOut, dependencies=[Depends(require_token)])
|
||||
def create_log(
|
||||
payload: ScrapingLogCreate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> ScrapingLogOut:
|
||||
"""Cree un log de scraping."""
|
||||
log_entry = ScrapingLog(
|
||||
product_id=payload.product_id,
|
||||
url=payload.url,
|
||||
source=payload.source,
|
||||
reference=payload.reference,
|
||||
fetch_method=payload.fetch_method,
|
||||
fetch_status=payload.fetch_status,
|
||||
fetched_at=payload.fetched_at,
|
||||
duration_ms=payload.duration_ms,
|
||||
html_size_bytes=payload.html_size_bytes,
|
||||
errors=payload.errors,
|
||||
notes=payload.notes,
|
||||
)
|
||||
session.add(log_entry)
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(log_entry)
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _log_to_out(log_entry)
|
||||
|
||||
|
||||
@app.patch("/logs/{log_id}", response_model=ScrapingLogOut, dependencies=[Depends(require_token)])
|
||||
def update_log(
|
||||
log_id: int,
|
||||
payload: ScrapingLogUpdate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> ScrapingLogOut:
|
||||
"""Met a jour un log."""
|
||||
log_entry = session.query(ScrapingLog).filter(ScrapingLog.id == log_id).one_or_none()
|
||||
if not log_entry:
|
||||
raise HTTPException(status_code=404, detail="Log non trouve")
|
||||
|
||||
updates = payload.model_dump(exclude_unset=True)
|
||||
for key, value in updates.items():
|
||||
setattr(log_entry, key, value)
|
||||
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(log_entry)
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _log_to_out(log_entry)
|
||||
|
||||
|
||||
@app.delete("/logs/{log_id}", dependencies=[Depends(require_token)])
|
||||
def delete_log(
|
||||
log_id: int,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> dict[str, str]:
|
||||
"""Supprime un log."""
|
||||
log_entry = session.query(ScrapingLog).filter(ScrapingLog.id == log_id).one_or_none()
|
||||
if not log_entry:
|
||||
raise HTTPException(status_code=404, detail="Log non trouve")
|
||||
|
||||
session.delete(log_entry)
|
||||
try:
|
||||
session.commit()
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return {"status": "deleted"}
|
||||
|
||||
|
||||
@app.get("/products/export", dependencies=[Depends(require_token)])
|
||||
def export_products(
|
||||
source: Optional[str] = None,
|
||||
reference: Optional[str] = None,
|
||||
updated_after: Optional[datetime] = None,
|
||||
price_min: Optional[float] = None,
|
||||
price_max: Optional[float] = None,
|
||||
fetched_after: Optional[datetime] = None,
|
||||
fetched_before: Optional[datetime] = None,
|
||||
stock_status: Optional[str] = None,
|
||||
format: str = "csv",
|
||||
limit: int = 500,
|
||||
offset: int = 0,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> Response:
|
||||
"""Export produits en CSV/JSON."""
|
||||
products = list_products(
|
||||
source=source,
|
||||
reference=reference,
|
||||
updated_after=updated_after,
|
||||
price_min=price_min,
|
||||
price_max=price_max,
|
||||
fetched_after=fetched_after,
|
||||
fetched_before=fetched_before,
|
||||
stock_status=stock_status,
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
session=session,
|
||||
)
|
||||
rows = [product.model_dump() for product in products]
|
||||
fieldnames = list(ProductOut.model_fields.keys())
|
||||
return _export_response(rows, fieldnames, "products", format)
|
||||
|
||||
|
||||
@app.get("/prices/export", dependencies=[Depends(require_token)])
|
||||
def export_prices(
|
||||
product_id: Optional[int] = None,
|
||||
price_min: Optional[float] = None,
|
||||
price_max: Optional[float] = None,
|
||||
fetched_after: Optional[datetime] = None,
|
||||
fetched_before: Optional[datetime] = None,
|
||||
fetch_status: Optional[str] = None,
|
||||
format: str = "csv",
|
||||
limit: int = 500,
|
||||
offset: int = 0,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> Response:
|
||||
"""Export historique de prix en CSV/JSON."""
|
||||
query = session.query(PriceHistory)
|
||||
if product_id is not None:
|
||||
query = query.filter(PriceHistory.product_id == product_id)
|
||||
if price_min is not None:
|
||||
query = query.filter(PriceHistory.price >= price_min)
|
||||
if price_max is not None:
|
||||
query = query.filter(PriceHistory.price <= price_max)
|
||||
if fetched_after:
|
||||
query = query.filter(PriceHistory.fetched_at >= fetched_after)
|
||||
if fetched_before:
|
||||
query = query.filter(PriceHistory.fetched_at <= fetched_before)
|
||||
if fetch_status:
|
||||
query = query.filter(PriceHistory.fetch_status == fetch_status)
|
||||
|
||||
prices = query.order_by(desc(PriceHistory.fetched_at)).offset(offset).limit(limit).all()
|
||||
rows = [_price_to_out(price).model_dump() for price in prices]
|
||||
fieldnames = list(PriceHistoryOut.model_fields.keys())
|
||||
return _export_response(rows, fieldnames, "prices", format)
|
||||
|
||||
|
||||
@app.get("/logs/export", dependencies=[Depends(require_token)])
|
||||
def export_logs(
|
||||
source: Optional[str] = None,
|
||||
fetch_status: Optional[str] = None,
|
||||
fetched_after: Optional[datetime] = None,
|
||||
fetched_before: Optional[datetime] = None,
|
||||
format: str = "csv",
|
||||
limit: int = 500,
|
||||
offset: int = 0,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> Response:
|
||||
"""Export logs de scraping en CSV/JSON."""
|
||||
logs = list_logs(
|
||||
source=source,
|
||||
fetch_status=fetch_status,
|
||||
fetched_after=fetched_after,
|
||||
fetched_before=fetched_before,
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
session=session,
|
||||
)
|
||||
rows = [log.model_dump() for log in logs]
|
||||
fieldnames = list(ScrapingLogOut.model_fields.keys())
|
||||
return _export_response(rows, fieldnames, "logs", format)
|
||||
|
||||
|
||||
@app.get("/webhooks", response_model=list[WebhookOut], dependencies=[Depends(require_token)])
|
||||
def list_webhooks(
|
||||
event: Optional[str] = None,
|
||||
enabled: Optional[bool] = None,
|
||||
limit: int = 50,
|
||||
offset: int = 0,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> list[WebhookOut]:
|
||||
"""Liste des webhooks."""
|
||||
query = session.query(Webhook)
|
||||
if event:
|
||||
query = query.filter(Webhook.event == event)
|
||||
if enabled is not None:
|
||||
query = query.filter(Webhook.enabled == enabled)
|
||||
|
||||
webhooks = query.order_by(desc(Webhook.created_at)).offset(offset).limit(limit).all()
|
||||
return [_webhook_to_out(webhook) for webhook in webhooks]
|
||||
|
||||
|
||||
@app.post("/webhooks", response_model=WebhookOut, dependencies=[Depends(require_token)])
|
||||
def create_webhook(
|
||||
payload: WebhookCreate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> WebhookOut:
|
||||
"""Cree un webhook."""
|
||||
webhook = Webhook(
|
||||
event=payload.event,
|
||||
url=payload.url,
|
||||
enabled=payload.enabled,
|
||||
secret=payload.secret,
|
||||
)
|
||||
session.add(webhook)
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(webhook)
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _webhook_to_out(webhook)
|
||||
|
||||
|
||||
@app.patch("/webhooks/{webhook_id}", response_model=WebhookOut, dependencies=[Depends(require_token)])
|
||||
def update_webhook(
|
||||
webhook_id: int,
|
||||
payload: WebhookUpdate,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> WebhookOut:
|
||||
"""Met a jour un webhook."""
|
||||
webhook = session.query(Webhook).filter(Webhook.id == webhook_id).one_or_none()
|
||||
if not webhook:
|
||||
raise HTTPException(status_code=404, detail="Webhook non trouve")
|
||||
|
||||
updates = payload.model_dump(exclude_unset=True)
|
||||
for key, value in updates.items():
|
||||
setattr(webhook, key, value)
|
||||
|
||||
try:
|
||||
session.commit()
|
||||
session.refresh(webhook)
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return _webhook_to_out(webhook)
|
||||
|
||||
|
||||
@app.delete("/webhooks/{webhook_id}", dependencies=[Depends(require_token)])
|
||||
def delete_webhook(
|
||||
webhook_id: int,
|
||||
session: Session = Depends(get_db_session),
|
||||
) -> dict[str, str]:
|
||||
"""Supprime un webhook."""
|
||||
webhook = session.query(Webhook).filter(Webhook.id == webhook_id).one_or_none()
|
||||
if not webhook:
|
||||
raise HTTPException(status_code=404, detail="Webhook non trouve")
|
||||
|
||||
session.delete(webhook)
|
||||
try:
|
||||
session.commit()
|
||||
except SQLAlchemyError as exc:
|
||||
session.rollback()
|
||||
raise HTTPException(status_code=500, detail="Erreur DB") from exc
|
||||
return {"status": "deleted"}
|
||||
|
||||
|
||||
@app.post(
    "/webhooks/{webhook_id}/test",
    response_model=WebhookTestResponse,
    dependencies=[Depends(require_token)],
)
def send_webhook_test(
    webhook_id: int,
    session: Session = Depends(get_db_session),
) -> WebhookTestResponse:
    """Fire a test event at a webhook; 404 when unknown, 409 when disabled."""
    webhook = session.query(Webhook).filter_by(id=webhook_id).one_or_none()
    if webhook is None:
        raise HTTPException(status_code=404, detail="Webhook non trouve")
    if not webhook.enabled:
        raise HTTPException(status_code=409, detail="Webhook desactive")

    _send_webhook(webhook, "test", {"message": "test webhook", "webhook_id": webhook.id})
    return WebhookTestResponse(status="sent")
|
||||
|
||||
@app.post("/enqueue", response_model=EnqueueResponse, dependencies=[Depends(require_token)])
|
||||
def enqueue_job(payload: EnqueueRequest) -> EnqueueResponse:
|
||||
"""Enqueue un job immediat."""
|
||||
try:
|
||||
scheduler = ScrapingScheduler(get_config())
|
||||
job = scheduler.enqueue_immediate(
|
||||
payload.url,
|
||||
use_playwright=payload.use_playwright,
|
||||
save_db=payload.save_db,
|
||||
)
|
||||
return EnqueueResponse(job_id=job.id)
|
||||
except RedisUnavailableError as exc:
|
||||
raise HTTPException(status_code=503, detail=str(exc)) from exc
|
||||
|
||||
|
||||
@app.post("/schedule", response_model=ScheduleResponse, dependencies=[Depends(require_token)])
|
||||
def schedule_job(payload: ScheduleRequest) -> ScheduleResponse:
|
||||
"""Planifie un job recurrent."""
|
||||
try:
|
||||
scheduler = ScrapingScheduler(get_config())
|
||||
job_info = scheduler.schedule_product(
|
||||
payload.url,
|
||||
interval_hours=payload.interval_hours,
|
||||
use_playwright=payload.use_playwright,
|
||||
save_db=payload.save_db,
|
||||
)
|
||||
return ScheduleResponse(job_id=job_info.job_id, next_run=job_info.next_run)
|
||||
except RedisUnavailableError as exc:
|
||||
raise HTTPException(status_code=503, detail=str(exc)) from exc
|
||||
|
||||
|
||||
@app.post("/scrape/preview", response_model=ScrapePreviewResponse, dependencies=[Depends(require_token)])
|
||||
def preview_scrape(payload: ScrapePreviewRequest) -> ScrapePreviewResponse:
|
||||
"""Scrape un produit sans persistence pour previsualisation."""
|
||||
_add_backend_log("INFO", f"Preview scraping: {payload.url}")
|
||||
result = scrape_product(
|
||||
payload.url,
|
||||
use_playwright=payload.use_playwright,
|
||||
save_db=False,
|
||||
)
|
||||
snapshot = result.get("snapshot")
|
||||
if snapshot is None:
|
||||
_add_backend_log("ERROR", f"Preview scraping KO: {payload.url}")
|
||||
return ScrapePreviewResponse(success=False, snapshot=None, error=result.get("error"))
|
||||
return ScrapePreviewResponse(
|
||||
success=bool(result.get("success")),
|
||||
snapshot=snapshot.model_dump(mode="json"),
|
||||
error=result.get("error"),
|
||||
)
|
||||
|
||||
|
||||
@app.post("/scrape/commit", response_model=ScrapeCommitResponse, dependencies=[Depends(require_token)])
|
||||
def commit_scrape(payload: ScrapeCommitRequest) -> ScrapeCommitResponse:
|
||||
"""Persiste un snapshot previsualise."""
|
||||
try:
|
||||
snapshot = ProductSnapshot.model_validate(payload.snapshot)
|
||||
except Exception as exc:
|
||||
_add_backend_log("ERROR", "Commit scraping KO: snapshot invalide")
|
||||
raise HTTPException(status_code=400, detail="Snapshot invalide") from exc
|
||||
|
||||
product_id = ScrapingPipeline(config=get_config()).process_snapshot(snapshot, save_to_db=True)
|
||||
_add_backend_log("INFO", f"Commit scraping OK: product_id={product_id}")
|
||||
return ScrapeCommitResponse(success=True, product_id=product_id)
|
||||
|
||||
|
||||
def _export_response(
    rows: list[dict[str, object]],
    fieldnames: list[str],
    filename_prefix: str,
    format: str,
) -> Response:
    """Build a CSV or JSON download response with a stable filename."""
    disposition = {"Content-Disposition": f'attachment; filename="{filename_prefix}.{format}"'}
    if format == "json":
        return JSONResponse(content=jsonable_encoder(rows), headers=disposition)
    if format == "csv":
        return _to_csv_response(rows, fieldnames, disposition)
    raise HTTPException(status_code=400, detail="Format invalide (csv ou json)")
|
||||
|
||||
|
||||
def _to_csv_response(
    rows: list[dict[str, object]],
    fieldnames: list[str],
    headers: dict[str, str],
) -> Response:
    """Serialize rows to CSV text and wrap them in an HTTP response."""
    out = StringIO()
    csv_writer = csv.DictWriter(out, fieldnames=fieldnames)
    csv_writer.writeheader()
    for row in rows:
        csv_writer.writerow(row)
    return Response(content=out.getvalue(), media_type="text/csv", headers=headers)
|
||||
|
||||
|
||||
def _send_webhook(webhook: Webhook, event: str, payload: dict[str, object]) -> None:
    """POST an event to a webhook URL; raises HTTP 502 on any HTTP failure."""
    request_headers = {"Content-Type": "application/json"}
    if webhook.secret:
        request_headers["X-Webhook-Secret"] = webhook.secret

    body = {"event": event, "payload": payload}
    try:
        response = httpx.post(webhook.url, json=body, headers=request_headers, timeout=5.0)
        response.raise_for_status()
    except httpx.HTTPError as exc:
        logger.error("Erreur webhook", extra={"url": webhook.url, "event": event, "error": str(exc)})
        raise HTTPException(status_code=502, detail="Echec webhook") from exc
|
||||
|
||||
|
||||
def _add_backend_log(level: str, message: str) -> None:
    """Append one timestamped entry to the bounded in-memory log buffer."""
    entry = BackendLogEntry(
        time=datetime.now(timezone.utc),
        level=level,
        message=message,
    )
    BACKEND_LOGS.append(entry)
|
||||
|
||||
|
||||
def _read_uvicorn_lines(limit: int = 200) -> list[str]:
    """Return up to *limit* trailing lines of the Uvicorn log, best effort.

    Returns an empty list when limit <= 0 or the file is missing/unreadable.
    """
    if limit <= 0:
        return []
    try:
        if not UVICORN_LOG_PATH.exists():
            return []
        with UVICORN_LOG_PATH.open("r", encoding="utf-8", errors="ignore") as handle:
            # deque(maxlen=limit) keeps only the tail while streaming the
            # file, instead of loading every line into memory at once.
            tail = deque(handle, maxlen=limit)
        return [line.rstrip("\n") for line in tail]
    except Exception:
        # Deliberate best-effort: log reading must never break the endpoint.
        return []
|
||||
|
||||
|
||||
def _product_to_out(session: Session, product: Product) -> ProductOut:
    """Map a Product row (plus its most recent price entry) to the API schema."""

    def as_float(value: object) -> Optional[float]:
        return None if value is None else float(value)

    latest = (
        session.query(PriceHistory)
        .filter(PriceHistory.product_id == product.id)
        .order_by(desc(PriceHistory.fetched_at))
        .first()
    )

    # Discounts are only derivable when both a latest price and an MSRP exist.
    discount_amount = None
    discount_percent = None
    if latest and latest.price is not None and product.msrp:
        msrp_value = float(product.msrp)
        discount_amount = msrp_value - float(latest.price)
        if product.msrp > 0:
            discount_percent = (discount_amount / msrp_value) * 100

    return ProductOut(
        id=product.id,
        source=product.source,
        reference=product.reference,
        url=product.url,
        title=product.title,
        category=product.category,
        description=product.description,
        currency=product.currency,
        msrp=as_float(product.msrp),
        first_seen_at=product.first_seen_at,
        last_updated_at=product.last_updated_at,
        latest_price=as_float(latest.price) if latest else None,
        latest_shipping_cost=as_float(latest.shipping_cost) if latest else None,
        latest_stock_status=latest.stock_status if latest else None,
        latest_fetched_at=latest.fetched_at if latest else None,
        images=[image.image_url for image in product.images],
        specs={spec.spec_key: spec.spec_value for spec in product.specs},
        discount_amount=discount_amount,
        discount_percent=discount_percent,
    )
|
||||
|
||||
|
||||
def _price_to_out(price: PriceHistory) -> PriceHistoryOut:
    """Map a PriceHistory ORM row to its API schema."""
    def _num(value):
        return float(value) if value is not None else None

    return PriceHistoryOut(
        id=price.id,
        product_id=price.product_id,
        price=_num(price.price),
        shipping_cost=_num(price.shipping_cost),
        stock_status=price.stock_status,
        fetch_method=price.fetch_method,
        fetch_status=price.fetch_status,
        fetched_at=price.fetched_at,
    )
|
||||
|
||||
|
||||
def _log_to_out(log: ScrapingLog) -> ScrapingLogOut:
    """Map a ScrapingLog ORM row to its API schema (field-for-field copy)."""
    fields = dict(
        id=log.id,
        product_id=log.product_id,
        url=log.url,
        source=log.source,
        reference=log.reference,
        fetch_method=log.fetch_method,
        fetch_status=log.fetch_status,
        fetched_at=log.fetched_at,
        duration_ms=log.duration_ms,
        html_size_bytes=log.html_size_bytes,
        errors=log.errors,
        notes=log.notes,
    )
    return ScrapingLogOut(**fields)
|
||||
|
||||
|
||||
def _webhook_to_out(webhook: Webhook) -> WebhookOut:
    """Map a Webhook ORM row to its API schema (includes the secret, if any)."""
    return WebhookOut(
        id=webhook.id,
        event=webhook.event,
        url=webhook.url,
        enabled=webhook.enabled,
        secret=webhook.secret,
        created_at=webhook.created_at,
    )
|
||||
212
pricewatch/app/api/schemas.py
Normal file
212
pricewatch/app/api/schemas.py
Normal file
@@ -0,0 +1,212 @@
|
||||
"""
|
||||
Schemas API FastAPI pour Phase 3.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class HealthStatus(BaseModel):
    """Health-check response: per-service reachability flags."""

    db: bool
    redis: bool
|
||||
|
||||
|
||||
class ProductOut(BaseModel):
    """Product as returned by the API, enriched with its latest price snapshot."""

    id: int
    source: str
    reference: str
    url: str
    title: Optional[str] = None
    category: Optional[str] = None
    description: Optional[str] = None
    currency: Optional[str] = None
    msrp: Optional[float] = None
    first_seen_at: datetime
    last_updated_at: datetime
    latest_price: Optional[float] = None
    latest_shipping_cost: Optional[float] = None
    latest_stock_status: Optional[str] = None
    latest_fetched_at: Optional[datetime] = None
    # default_factory gives each instance its own container; a literal []/{}
    # class default is the mutable-default anti-pattern (pydantic copies it,
    # but default_factory is the documented, explicit form).
    images: list[str] = Field(default_factory=list)
    specs: dict[str, str] = Field(default_factory=dict)
    discount_amount: Optional[float] = None
    discount_percent: Optional[float] = None
|
||||
|
||||
|
||||
class ProductCreate(BaseModel):
    """Payload for creating a product; (source, reference, url) are required."""

    source: str
    reference: str
    url: str
    title: Optional[str] = None
    category: Optional[str] = None
    description: Optional[str] = None
    currency: Optional[str] = None
    msrp: Optional[float] = None
|
||||
|
||||
|
||||
class ProductUpdate(BaseModel):
    """Partial product update; every field is optional (None = leave unchanged)."""

    url: Optional[str] = None
    title: Optional[str] = None
    category: Optional[str] = None
    description: Optional[str] = None
    currency: Optional[str] = None
    msrp: Optional[float] = None
|
||||
|
||||
|
||||
class PriceHistoryOut(BaseModel):
    """One price-history entry as returned by the API."""

    id: int
    product_id: int
    price: Optional[float] = None
    shipping_cost: Optional[float] = None
    stock_status: Optional[str] = None
    fetch_method: str
    fetch_status: str
    fetched_at: datetime
|
||||
|
||||
|
||||
class PriceHistoryCreate(BaseModel):
    """Payload for inserting a price-history entry for an existing product."""

    product_id: int
    price: Optional[float] = None
    shipping_cost: Optional[float] = None
    stock_status: Optional[str] = None
    fetch_method: str
    fetch_status: str
    fetched_at: datetime
|
||||
|
||||
|
||||
class PriceHistoryUpdate(BaseModel):
    """Partial price-history update; every field is optional."""

    price: Optional[float] = None
    shipping_cost: Optional[float] = None
    stock_status: Optional[str] = None
    fetch_method: Optional[str] = None
    fetch_status: Optional[str] = None
    fetched_at: Optional[datetime] = None
|
||||
|
||||
|
||||
class ScrapingLogOut(BaseModel):
    """Scraping-log entry as returned by the API, including timing metrics."""

    id: int
    product_id: Optional[int] = None
    url: str
    source: str
    reference: Optional[str] = None
    fetch_method: str
    fetch_status: str
    fetched_at: datetime
    duration_ms: Optional[int] = None
    html_size_bytes: Optional[int] = None
    errors: Optional[list[str]] = None
    notes: Optional[list[str]] = None
|
||||
|
||||
|
||||
class WebhookOut(BaseModel):
    """Webhook registration as returned by the API (secret included)."""

    id: int
    event: str
    url: str
    enabled: bool
    secret: Optional[str] = None
    created_at: datetime
|
||||
|
||||
|
||||
class WebhookCreate(BaseModel):
    """Payload for registering a webhook; enabled by default."""

    event: str
    url: str
    enabled: bool = True
    secret: Optional[str] = None
|
||||
|
||||
|
||||
class WebhookUpdate(BaseModel):
    """Partial webhook update; every field is optional."""

    event: Optional[str] = None
    url: Optional[str] = None
    enabled: Optional[bool] = None
    secret: Optional[str] = None
|
||||
|
||||
|
||||
class WebhookTestResponse(BaseModel):
    """Result of a webhook test delivery."""

    status: str
|
||||
|
||||
|
||||
class ScrapingLogCreate(BaseModel):
    """Payload for inserting a scraping-log entry."""

    product_id: Optional[int] = None
    url: str
    source: str
    reference: Optional[str] = None
    fetch_method: str
    fetch_status: str
    fetched_at: datetime
    duration_ms: Optional[int] = None
    html_size_bytes: Optional[int] = None
    errors: Optional[list[str]] = None
    notes: Optional[list[str]] = None
|
||||
|
||||
|
||||
class ScrapingLogUpdate(BaseModel):
    """Partial scraping-log update; every field is optional."""

    product_id: Optional[int] = None
    url: Optional[str] = None
    source: Optional[str] = None
    reference: Optional[str] = None
    fetch_method: Optional[str] = None
    fetch_status: Optional[str] = None
    fetched_at: Optional[datetime] = None
    duration_ms: Optional[int] = None
    html_size_bytes: Optional[int] = None
    errors: Optional[list[str]] = None
    notes: Optional[list[str]] = None
|
||||
|
||||
|
||||
class EnqueueRequest(BaseModel):
    """Request to enqueue an immediate scraping job."""

    url: str = Field(..., description="URL du produit")
    # None = let server config decide whether Playwright is used.
    use_playwright: Optional[bool] = None
    save_db: bool = True
|
||||
|
||||
|
||||
class EnqueueResponse(BaseModel):
    """Identifier of the enqueued job."""

    job_id: str
|
||||
|
||||
|
||||
class ScheduleRequest(BaseModel):
    """Request to schedule a recurring scrape (interval of at least one hour)."""

    url: str = Field(..., description="URL du produit")
    interval_hours: int = Field(default=24, ge=1)
    use_playwright: Optional[bool] = None
    save_db: bool = True
|
||||
|
||||
|
||||
class ScheduleResponse(BaseModel):
    """Scheduled job identifier and its next run time."""

    job_id: str
    next_run: datetime
|
||||
|
||||
|
||||
class ScrapePreviewRequest(BaseModel):
    """Request to scrape a URL without persisting the result."""

    url: str
    use_playwright: Optional[bool] = None
|
||||
|
||||
|
||||
class ScrapePreviewResponse(BaseModel):
    """Preview result: the raw snapshot dict on success, an error message otherwise."""

    success: bool
    snapshot: Optional[dict[str, object]] = None
    error: Optional[str] = None
|
||||
|
||||
|
||||
class ScrapeCommitRequest(BaseModel):
    """Request to persist a previously previewed snapshot."""

    snapshot: dict[str, object]
|
||||
|
||||
|
||||
class ScrapeCommitResponse(BaseModel):
    """Commit result: the persisted product id on success, an error otherwise."""

    success: bool
    product_id: Optional[int] = None
    error: Optional[str] = None
|
||||
|
||||
|
||||
class VersionResponse(BaseModel):
    """API version string."""

    api_version: str
|
||||
|
||||
|
||||
class BackendLogEntry(BaseModel):
    """Structured backend log line (timestamp, level, message)."""

    time: datetime
    level: str
    message: str
|
||||
|
||||
|
||||
class UvicornLogEntry(BaseModel):
    """Raw uvicorn log line, unparsed."""

    line: str
|
||||
BIN
pricewatch/app/cli/__pycache__/main.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/cli/__pycache__/main.cpython-313.pyc
Executable file → Normal file
Binary file not shown.
@@ -15,7 +15,7 @@ from typing import Optional
|
||||
|
||||
import redis
|
||||
import typer
|
||||
from rq import Connection, Worker
|
||||
from rq import Worker
|
||||
from alembic import command as alembic_command
|
||||
from alembic.config import Config as AlembicConfig
|
||||
from rich import print as rprint
|
||||
@@ -34,7 +34,7 @@ from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
from pricewatch.app.stores.amazon.store import AmazonStore
|
||||
from pricewatch.app.stores.cdiscount.store import CdiscountStore
|
||||
from pricewatch.app.tasks.scheduler import ScrapingScheduler
|
||||
from pricewatch.app.tasks.scheduler import RedisUnavailableError, ScrapingScheduler
|
||||
|
||||
# Créer l'application Typer
|
||||
app = typer.Typer(
|
||||
@@ -197,18 +197,21 @@ def run(
|
||||
html = None
|
||||
fetch_method = FetchMethod.HTTP
|
||||
fetch_error = None
|
||||
http_result = None
|
||||
|
||||
# Tenter HTTP d'abord
|
||||
logger.info("Tentative HTTP...")
|
||||
http_result = fetch_http(canonical_url)
|
||||
if config.options.force_playwright:
|
||||
logger.info("Playwright force, skip HTTP")
|
||||
else:
|
||||
logger.info("Tentative HTTP...")
|
||||
http_result = fetch_http(canonical_url)
|
||||
|
||||
if http_result.success:
|
||||
if http_result and http_result.success:
|
||||
html = http_result.html
|
||||
fetch_method = FetchMethod.HTTP
|
||||
logger.info("✓ HTTP réussi")
|
||||
elif config.options.use_playwright:
|
||||
# Fallback Playwright
|
||||
logger.warning(f"HTTP échoué: {http_result.error}, fallback Playwright")
|
||||
fallback_reason = http_result.error if http_result else "force_playwright"
|
||||
logger.warning(f"HTTP échoué: {fallback_reason}, fallback Playwright")
|
||||
pw_result = fetch_playwright(
|
||||
canonical_url,
|
||||
headless=not config.options.headful,
|
||||
@@ -231,7 +234,7 @@ def run(
|
||||
fetch_error = pw_result.error
|
||||
logger.error(f"✗ Playwright échoué: {fetch_error}")
|
||||
else:
|
||||
fetch_error = http_result.error
|
||||
fetch_error = http_result.error if http_result else "skip_http"
|
||||
logger.error(f"✗ HTTP échoué: {fetch_error}")
|
||||
|
||||
# Parser si on a du HTML
|
||||
@@ -467,11 +470,25 @@ def worker(
|
||||
Lance un worker RQ.
|
||||
"""
|
||||
config = get_config()
|
||||
connection = redis.from_url(config.redis.url)
|
||||
try:
|
||||
connection = redis.from_url(config.redis.url)
|
||||
# Verification connexion avant de lancer le worker
|
||||
connection.ping()
|
||||
except redis.exceptions.ConnectionError as e:
|
||||
rprint(f"[red]✗ Impossible de se connecter a Redis ({config.redis.url})[/red]")
|
||||
rprint(f"[red] Erreur: {e}[/red]")
|
||||
rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
|
||||
rprint(" docker compose up -d redis")
|
||||
rprint(" # ou")
|
||||
rprint(" redis-server")
|
||||
raise typer.Exit(code=1)
|
||||
except redis.exceptions.RedisError as e:
|
||||
rprint(f"[red]✗ Erreur Redis: {e}[/red]")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
with Connection(connection):
|
||||
worker_instance = Worker([queue])
|
||||
worker_instance.work(with_scheduler=with_scheduler)
|
||||
# RQ 2.x: connexion passee directement au Worker
|
||||
worker_instance = Worker([queue], connection=connection)
|
||||
worker_instance.work(with_scheduler=with_scheduler)
|
||||
|
||||
|
||||
@app.command()
|
||||
@@ -486,9 +503,15 @@ def enqueue(
|
||||
"""
|
||||
Enqueue un scraping immediat.
|
||||
"""
|
||||
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
|
||||
job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
|
||||
rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
|
||||
try:
|
||||
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
|
||||
job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
|
||||
rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
|
||||
except RedisUnavailableError as e:
|
||||
rprint(f"[red]✗ {e.message}[/red]")
|
||||
rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
|
||||
rprint(" docker compose up -d redis")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
|
||||
@@ -504,16 +527,22 @@ def schedule(
|
||||
"""
|
||||
Planifie un scraping recurrent.
|
||||
"""
|
||||
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
|
||||
job_info = scheduler.schedule_product(
|
||||
url,
|
||||
interval_hours=interval,
|
||||
use_playwright=use_playwright,
|
||||
save_db=save_db,
|
||||
)
|
||||
rprint(
|
||||
f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
|
||||
)
|
||||
try:
|
||||
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
|
||||
job_info = scheduler.schedule_product(
|
||||
url,
|
||||
interval_hours=interval,
|
||||
use_playwright=use_playwright,
|
||||
save_db=save_db,
|
||||
)
|
||||
rprint(
|
||||
f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
|
||||
)
|
||||
except RedisUnavailableError as e:
|
||||
rprint(f"[red]✗ {e.message}[/red]")
|
||||
rprint("\n[yellow]Verifiez que Redis est demarre:[/yellow]")
|
||||
rprint(" docker compose up -d redis")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
BIN
pricewatch/app/core/__pycache__/config.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/core/__pycache__/config.cpython-313.pyc
Executable file → Normal file
Binary file not shown.
BIN
pricewatch/app/core/__pycache__/schema.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/core/__pycache__/schema.cpython-313.pyc
Executable file → Normal file
Binary file not shown.
6
pricewatch/app/core/config.py
Executable file → Normal file
6
pricewatch/app/core/config.py
Executable file → Normal file
@@ -108,6 +108,11 @@ class AppConfig(BaseSettings):
|
||||
default=True, description="Enable background worker functionality"
|
||||
)
|
||||
|
||||
# API auth
|
||||
api_token: Optional[str] = Field(
|
||||
default=None, description="API token simple (Bearer)"
|
||||
)
|
||||
|
||||
# Scraping defaults
|
||||
default_playwright_timeout: int = Field(
|
||||
default=60000, description="Default Playwright timeout in milliseconds"
|
||||
@@ -138,6 +143,7 @@ class AppConfig(BaseSettings):
|
||||
logger.info(f"Worker enabled: {self.enable_worker}")
|
||||
logger.info(f"Worker timeout: {self.worker_timeout}s")
|
||||
logger.info(f"Worker concurrency: {self.worker_concurrency}")
|
||||
logger.info(f"API token configured: {bool(self.api_token)}")
|
||||
logger.info("================================")
|
||||
|
||||
|
||||
|
||||
@@ -23,6 +23,9 @@ class ScrapingOptions(BaseModel):
|
||||
use_playwright: bool = Field(
|
||||
default=True, description="Utiliser Playwright en fallback"
|
||||
)
|
||||
force_playwright: bool = Field(
|
||||
default=False, description="Forcer Playwright même si HTTP réussi"
|
||||
)
|
||||
headful: bool = Field(default=False, description="Mode headful (voir le navigateur)")
|
||||
save_html: bool = Field(
|
||||
default=True, description="Sauvegarder HTML pour debug"
|
||||
@@ -94,7 +97,8 @@ def read_yaml_config(yaml_path: str | Path) -> ScrapingConfig:
|
||||
config = ScrapingConfig.model_validate(data)
|
||||
logger.info(
|
||||
f"Configuration chargée: {len(config.urls)} URL(s), "
|
||||
f"playwright={config.options.use_playwright}"
|
||||
f"playwright={config.options.use_playwright}, "
|
||||
f"force_playwright={config.options.force_playwright}"
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field, HttpUrl, field_validator
|
||||
from pydantic import BaseModel, ConfigDict, Field, HttpUrl, field_validator
|
||||
|
||||
|
||||
class StockStatus(str, Enum):
|
||||
@@ -38,6 +38,8 @@ class DebugStatus(str, Enum):
|
||||
class DebugInfo(BaseModel):
|
||||
"""Informations de debug pour tracer les problèmes de scraping."""
|
||||
|
||||
model_config = ConfigDict(use_enum_values=True)
|
||||
|
||||
method: FetchMethod = Field(
|
||||
description="Méthode utilisée pour la récupération (http ou playwright)"
|
||||
)
|
||||
@@ -55,9 +57,6 @@ class DebugInfo(BaseModel):
|
||||
default=None, description="Taille du HTML récupéré en octets"
|
||||
)
|
||||
|
||||
class Config:
|
||||
use_enum_values = True
|
||||
|
||||
|
||||
class ProductSnapshot(BaseModel):
|
||||
"""
|
||||
@@ -81,6 +80,7 @@ class ProductSnapshot(BaseModel):
|
||||
# Données produit principales
|
||||
title: Optional[str] = Field(default=None, description="Nom du produit")
|
||||
price: Optional[float] = Field(default=None, description="Prix du produit", ge=0)
|
||||
msrp: Optional[float] = Field(default=None, description="Prix conseille", ge=0)
|
||||
currency: str = Field(default="EUR", description="Devise (EUR, USD, etc.)")
|
||||
shipping_cost: Optional[float] = Field(
|
||||
default=None, description="Frais de port", ge=0
|
||||
@@ -94,6 +94,7 @@ class ProductSnapshot(BaseModel):
|
||||
default=None, description="Référence produit (ASIN, SKU, etc.)"
|
||||
)
|
||||
category: Optional[str] = Field(default=None, description="Catégorie du produit")
|
||||
description: Optional[str] = Field(default=None, description="Description produit")
|
||||
|
||||
# Médias
|
||||
images: list[str] = Field(
|
||||
@@ -133,20 +134,22 @@ class ProductSnapshot(BaseModel):
|
||||
"""Filtre les URLs d'images vides."""
|
||||
return [url.strip() for url in v if url and url.strip()]
|
||||
|
||||
class Config:
|
||||
use_enum_values = True
|
||||
json_schema_extra = {
|
||||
model_config = ConfigDict(
|
||||
use_enum_values=True,
|
||||
json_schema_extra={
|
||||
"example": {
|
||||
"source": "amazon",
|
||||
"url": "https://www.amazon.fr/dp/B08N5WRWNW",
|
||||
"fetched_at": "2026-01-13T10:30:00Z",
|
||||
"title": "Exemple de produit",
|
||||
"price": 299.99,
|
||||
"msrp": 349.99,
|
||||
"currency": "EUR",
|
||||
"shipping_cost": 0.0,
|
||||
"stock_status": "in_stock",
|
||||
"reference": "B08N5WRWNW",
|
||||
"category": "Electronics",
|
||||
"description": "Chargeur USB-C multi-ports.",
|
||||
"images": [
|
||||
"https://example.com/image1.jpg",
|
||||
"https://example.com/image2.jpg",
|
||||
@@ -165,7 +168,8 @@ class ProductSnapshot(BaseModel):
|
||||
"html_size_bytes": 145000,
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Serialize vers un dictionnaire Python natif."""
|
||||
|
||||
2
pricewatch/app/db/__init__.py
Executable file → Normal file
2
pricewatch/app/db/__init__.py
Executable file → Normal file
@@ -20,6 +20,7 @@ from pricewatch.app.db.models import (
|
||||
ProductImage,
|
||||
ProductSpec,
|
||||
ScrapingLog,
|
||||
Webhook,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@@ -30,6 +31,7 @@ __all__ = [
|
||||
"ProductImage",
|
||||
"ProductSpec",
|
||||
"ScrapingLog",
|
||||
"Webhook",
|
||||
"ProductRepository",
|
||||
# Connection
|
||||
"get_engine",
|
||||
|
||||
BIN
pricewatch/app/db/__pycache__/__init__.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/db/__pycache__/__init__.cpython-313.pyc
Executable file → Normal file
Binary file not shown.
0
pricewatch/app/db/__pycache__/connection.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/db/__pycache__/connection.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/db/__pycache__/models.cpython-313.pyc
Executable file → Normal file
BIN
pricewatch/app/db/__pycache__/models.cpython-313.pyc
Executable file → Normal file
Binary file not shown.
0
pricewatch/app/db/__pycache__/repository.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/db/__pycache__/repository.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/db/connection.py
Executable file → Normal file
0
pricewatch/app/db/connection.py
Executable file → Normal file
0
pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/db/migrations/env.py
Executable file → Normal file
0
pricewatch/app/db/migrations/env.py
Executable file → Normal file
0
pricewatch/app/db/migrations/script.py.mako
Executable file → Normal file
0
pricewatch/app/db/migrations/script.py.mako
Executable file → Normal file
0
pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py
Executable file → Normal file
0
pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py
Executable file → Normal file
@@ -0,0 +1,35 @@
|
||||
"""Add webhooks table
|
||||
|
||||
Revision ID: 20260114_02
|
||||
Revises: 20260114_01
|
||||
Create Date: 2026-01-14 00:00:00
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# Revision identifiers, used by Alembic.
|
||||
revision = "20260114_02"
|
||||
down_revision = "20260114_01"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"webhooks",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("event", sa.String(length=50), nullable=False),
|
||||
sa.Column("url", sa.Text(), nullable=False),
|
||||
sa.Column("enabled", sa.Boolean(), nullable=False, server_default=sa.text("true")),
|
||||
sa.Column("secret", sa.String(length=200), nullable=True),
|
||||
sa.Column("created_at", sa.TIMESTAMP(), nullable=False),
|
||||
)
|
||||
op.create_index("ix_webhook_event", "webhooks", ["event"], unique=False)
|
||||
op.create_index("ix_webhook_enabled", "webhooks", ["enabled"], unique=False)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_index("ix_webhook_enabled", table_name="webhooks")
|
||||
op.drop_index("ix_webhook_event", table_name="webhooks")
|
||||
op.drop_table("webhooks")
|
||||
@@ -0,0 +1,26 @@
|
||||
"""Ajout description et msrp sur products.
|
||||
|
||||
Revision ID: 20260115_02_product_details
|
||||
Revises: 20260114_02
|
||||
Create Date: 2026-01-15 10:00:00.000000
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "20260115_02_product_details"
|
||||
down_revision = "20260114_02"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column("products", sa.Column("description", sa.Text(), nullable=True))
|
||||
op.add_column("products", sa.Column("msrp", sa.Numeric(10, 2), nullable=True))
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("products", "msrp")
|
||||
op.drop_column("products", "description")
|
||||
0
pricewatch/app/db/migrations/versions/__pycache__/20260114_01_initial_schema.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/db/migrations/versions/__pycache__/20260114_01_initial_schema.cpython-313.pyc
Executable file → Normal file
48
pricewatch/app/db/models.py
Executable file → Normal file
48
pricewatch/app/db/models.py
Executable file → Normal file
@@ -15,7 +15,7 @@ Justification technique:
|
||||
- JSONB uniquement pour données variables: errors, notes dans logs
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import List, Optional
|
||||
|
||||
@@ -28,6 +28,7 @@ from sqlalchemy import (
|
||||
Integer,
|
||||
JSON,
|
||||
Numeric,
|
||||
Boolean,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
@@ -42,6 +43,10 @@ class Base(DeclarativeBase):
|
||||
pass
|
||||
|
||||
|
||||
def utcnow() -> datetime:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
class Product(Base):
|
||||
"""
|
||||
Catalogue produits (1 ligne par produit unique).
|
||||
@@ -70,19 +75,25 @@ class Product(Base):
|
||||
category: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Product category (breadcrumb)"
|
||||
)
|
||||
description: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Product description"
|
||||
)
|
||||
currency: Mapped[Optional[str]] = mapped_column(
|
||||
String(3), nullable=True, comment="Currency code (EUR, USD, GBP)"
|
||||
)
|
||||
msrp: Mapped[Optional[Decimal]] = mapped_column(
|
||||
Numeric(10, 2), nullable=True, comment="Recommended price"
|
||||
)
|
||||
|
||||
# Timestamps
|
||||
first_seen_at: Mapped[datetime] = mapped_column(
|
||||
TIMESTAMP, nullable=False, default=datetime.utcnow, comment="First scraping timestamp"
|
||||
TIMESTAMP, nullable=False, default=utcnow, comment="First scraping timestamp"
|
||||
)
|
||||
last_updated_at: Mapped[datetime] = mapped_column(
|
||||
TIMESTAMP,
|
||||
nullable=False,
|
||||
default=datetime.utcnow,
|
||||
onupdate=datetime.utcnow,
|
||||
default=utcnow,
|
||||
onupdate=utcnow,
|
||||
comment="Last metadata update",
|
||||
)
|
||||
|
||||
@@ -280,7 +291,7 @@ class ScrapingLog(Base):
|
||||
String(20), nullable=False, comment="Fetch status (success, partial, failed)"
|
||||
)
|
||||
fetched_at: Mapped[datetime] = mapped_column(
|
||||
TIMESTAMP, nullable=False, default=datetime.utcnow, comment="Scraping timestamp"
|
||||
TIMESTAMP, nullable=False, default=utcnow, comment="Scraping timestamp"
|
||||
)
|
||||
|
||||
# Performance metrics
|
||||
@@ -318,3 +329,30 @@ class ScrapingLog(Base):
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<ScrapingLog(id={self.id}, url={self.url}, status={self.fetch_status}, fetched_at={self.fetched_at})>"
|
||||
|
||||
|
||||
class Webhook(Base):
|
||||
"""
|
||||
Webhooks pour notifications externes.
|
||||
"""
|
||||
|
||||
__tablename__ = "webhooks"
|
||||
|
||||
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
|
||||
event: Mapped[str] = mapped_column(String(50), nullable=False, comment="Event name")
|
||||
url: Mapped[str] = mapped_column(Text, nullable=False, comment="Webhook URL")
|
||||
enabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
|
||||
secret: Mapped[Optional[str]] = mapped_column(
|
||||
String(200), nullable=True, comment="Secret optionnel"
|
||||
)
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
TIMESTAMP, nullable=False, default=utcnow, comment="Creation timestamp"
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_webhook_event", "event"),
|
||||
Index("ix_webhook_enabled", "enabled"),
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<Webhook(id={self.id}, event={self.event}, url={self.url})>"
|
||||
|
||||
4
pricewatch/app/db/repository.py
Executable file → Normal file
4
pricewatch/app/db/repository.py
Executable file → Normal file
@@ -49,8 +49,12 @@ class ProductRepository:
|
||||
product.title = snapshot.title
|
||||
if snapshot.category:
|
||||
product.category = snapshot.category
|
||||
if snapshot.description:
|
||||
product.description = snapshot.description
|
||||
if snapshot.currency:
|
||||
product.currency = snapshot.currency
|
||||
if snapshot.msrp is not None:
|
||||
product.msrp = snapshot.msrp
|
||||
|
||||
def add_price_history(self, product: Product, snapshot: ProductSnapshot) -> Optional[PriceHistory]:
|
||||
"""Ajoute une entree d'historique de prix si inexistante."""
|
||||
|
||||
0
pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/scraping/pipeline.py
Executable file → Normal file
0
pricewatch/app/scraping/pipeline.py
Executable file → Normal file
BIN
pricewatch/app/stores/__pycache__/price_parser.cpython-313.pyc
Normal file
BIN
pricewatch/app/stores/__pycache__/price_parser.cpython-313.pyc
Normal file
Binary file not shown.
@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.aliexpress")
|
||||
|
||||
@@ -126,6 +127,8 @@ class AliexpressStore(BaseStore):
|
||||
images = self._extract_images(html, soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(html, debug_info)
|
||||
reference = self.extract_reference(url)
|
||||
|
||||
# Note sur le rendu client-side
|
||||
@@ -150,8 +153,10 @@ class AliexpressStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -183,6 +188,17 @@ class AliexpressStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la description (meta tags)."""
|
||||
meta = soup.find("meta", property="og:description") or soup.find(
|
||||
"meta", attrs={"name": "description"}
|
||||
)
|
||||
if meta:
|
||||
description = meta.get("content", "").strip()
|
||||
if description:
|
||||
return description
|
||||
return None
|
||||
|
||||
def _extract_price(
|
||||
self, html: str, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> Optional[float]:
|
||||
@@ -193,35 +209,39 @@ class AliexpressStore(BaseStore):
|
||||
On utilise regex sur le HTML brut.
|
||||
"""
|
||||
# Pattern 1: Prix avant € (ex: "136,69 €")
|
||||
match = re.search(r"([0-9]+[.,][0-9]{2})\s*€", html)
|
||||
match = re.search(r"([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)\\s*€", html)
|
||||
if match:
|
||||
price_str = match.group(1).replace(",", ".")
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(match.group(1))
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
# Pattern 2: € avant prix (ex: "€ 136.69")
|
||||
match = re.search(r"€\s*([0-9]+[.,][0-9]{2})", html)
|
||||
match = re.search(r"€\\s*([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)", html)
|
||||
if match:
|
||||
price_str = match.group(1).replace(",", ".")
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(match.group(1))
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
# Pattern 3: Chercher dans meta tags (moins fiable)
|
||||
og_price = soup.find("meta", property="og:price:amount")
|
||||
if og_price:
|
||||
price_str = og_price.get("content", "")
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(price_str)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix conseille si present."""
|
||||
match = re.search(r"originalPrice\"\\s*:\\s*\"([0-9\\s.,]+)\"", html)
|
||||
if match:
|
||||
price = parse_price_text(match.group(1))
|
||||
if price is not None:
|
||||
return price
|
||||
return None
|
||||
|
||||
def _extract_currency(
|
||||
self, url: str, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> str:
|
||||
|
||||
0
pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc
Executable file → Normal file
0
pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc
Executable file → Normal file
@@ -54,12 +54,12 @@ specs_table:
|
||||
# ASIN (parfois dans les métadonnées)
|
||||
asin:
|
||||
- "input[name='ASIN']"
|
||||
- "th:contains('ASIN') + td"
|
||||
- "th:-soup-contains('ASIN') + td"
|
||||
|
||||
# Messages captcha / robot check
|
||||
captcha_indicators:
|
||||
- "form[action*='validateCaptcha']"
|
||||
- "p.a-last:contains('Sorry')"
|
||||
- "p.a-last:-soup-contains('Sorry')"
|
||||
- "img[alt*='captcha']"
|
||||
|
||||
# Notes pour le parsing:
|
||||
|
||||
@@ -4,7 +4,9 @@ Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
|
||||
Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from html import unescape
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
@@ -21,6 +23,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.amazon")
|
||||
|
||||
@@ -131,6 +134,8 @@ class AmazonStore(BaseStore):
|
||||
images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
|
||||
|
||||
# Déterminer le statut final (ne pas écraser FAILED)
|
||||
@@ -150,8 +155,10 @@ class AmazonStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -195,6 +202,17 @@ class AmazonStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la description (meta tags)."""
|
||||
meta = soup.find("meta", property="og:description") or soup.find(
|
||||
"meta", attrs={"name": "description"}
|
||||
)
|
||||
if meta:
|
||||
description = meta.get("content", "").strip()
|
||||
if description:
|
||||
return description
|
||||
return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
selectors = self.get_selector("price", [])
|
||||
@@ -205,14 +223,9 @@ class AmazonStore(BaseStore):
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
text = element.get_text(strip=True)
|
||||
# Extraire nombre (format: "299,99" ou "299.99")
|
||||
match = re.search(r"(\d+)[.,](\d+)", text)
|
||||
if match:
|
||||
price_str = f"{match.group(1)}.{match.group(2)}"
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
price = parse_price_text(text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
# Fallback: chercher les spans séparés a-price-whole et a-price-fraction
|
||||
whole = soup.select_one("span.a-price-whole")
|
||||
@@ -220,15 +233,24 @@ class AmazonStore(BaseStore):
|
||||
if whole and fraction:
|
||||
whole_text = whole.get_text(strip=True)
|
||||
fraction_text = fraction.get_text(strip=True)
|
||||
try:
|
||||
price_str = f"{whole_text}.{fraction_text}"
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(f"{whole_text}.{fraction_text}")
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix conseille."""
|
||||
strike = soup.select_one("span.priceBlockStrikePriceString") or soup.select_one(
|
||||
"span.a-text-price span.a-offscreen"
|
||||
)
|
||||
if strike:
|
||||
price = parse_price_text(strike.get_text(strip=True))
|
||||
if price is not None:
|
||||
return price
|
||||
return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la devise."""
|
||||
selectors = self.get_selector("currency", [])
|
||||
@@ -270,6 +292,7 @@ class AmazonStore(BaseStore):
|
||||
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
|
||||
"""Extrait les URLs d'images."""
|
||||
images = []
|
||||
seen = set()
|
||||
selectors = self.get_selector("images", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -278,19 +301,57 @@ class AmazonStore(BaseStore):
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Attribut src ou data-src
|
||||
url = element.get("src") or element.get("data-src")
|
||||
url = element.get("src") or element.get("data-src") or element.get("data-old-hires")
|
||||
if url and url.startswith("http"):
|
||||
images.append(url)
|
||||
if self._is_product_image(url) and url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
dynamic = element.get("data-a-dynamic-image")
|
||||
if dynamic:
|
||||
urls = self._extract_dynamic_images(dynamic)
|
||||
for dyn_url in urls:
|
||||
if self._is_product_image(dyn_url) and dyn_url not in seen:
|
||||
images.append(dyn_url)
|
||||
seen.add(dyn_url)
|
||||
|
||||
# Fallback: chercher tous les img tags si aucune image trouvée
|
||||
if not images:
|
||||
all_imgs = soup.find_all("img")
|
||||
for img in all_imgs:
|
||||
url = img.get("src") or img.get("data-src")
|
||||
if url and url.startswith("http"):
|
||||
images.append(url)
|
||||
if url and url.startswith("http") and self._is_product_image(url):
|
||||
if url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
|
||||
return list(set(images)) # Dédupliquer
|
||||
return images
|
||||
|
||||
def _extract_dynamic_images(self, raw: str) -> list[str]:
|
||||
"""Extrait les URLs du JSON data-a-dynamic-image."""
|
||||
try:
|
||||
data = json.loads(unescape(raw))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
return []
|
||||
|
||||
urls = []
|
||||
if isinstance(data, dict):
|
||||
candidates = []
|
||||
for url, dims in data.items():
|
||||
if not isinstance(url, str) or not url.startswith("http"):
|
||||
continue
|
||||
size = dims[0] if isinstance(dims, list) and dims else 0
|
||||
candidates.append((size, url))
|
||||
candidates.sort(key=lambda item: item[0], reverse=True)
|
||||
for _, url in candidates:
|
||||
urls.append(url)
|
||||
return urls
|
||||
|
||||
def _is_product_image(self, url: str) -> bool:
|
||||
"""Filtre basique pour eviter les logos et sprites."""
|
||||
lowered = url.lower()
|
||||
if "prime_logo" in lowered or "sprite" in lowered:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la catégorie depuis les breadcrumbs."""
|
||||
|
||||
@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.backmarket")
|
||||
|
||||
@@ -116,6 +117,8 @@ class BackmarketStore(BaseStore):
|
||||
images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url)
|
||||
|
||||
# Spécifique Backmarket: condition (état du reconditionné)
|
||||
@@ -140,8 +143,10 @@ class BackmarketStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -213,6 +218,17 @@ class BackmarketStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la description (meta tags)."""
|
||||
meta = soup.find("meta", property="og:description") or soup.find(
|
||||
"meta", attrs={"name": "description"}
|
||||
)
|
||||
if meta:
|
||||
description = meta.get("content", "").strip()
|
||||
if description:
|
||||
return description
|
||||
return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
selectors = self.get_selector("price", [])
|
||||
@@ -225,20 +241,29 @@ class BackmarketStore(BaseStore):
|
||||
# Attribut content (schema.org) ou texte
|
||||
price_text = element.get("content") or element.get_text(strip=True)
|
||||
|
||||
# Extraire nombre (format: "299,99" ou "299.99" ou "299")
|
||||
match = re.search(r"(\d+)[.,]?(\d*)", price_text)
|
||||
if match:
|
||||
integer_part = match.group(1)
|
||||
decimal_part = match.group(2) or "00"
|
||||
price_str = f"{integer_part}.{decimal_part}"
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
price = parse_price_text(price_text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix conseille."""
|
||||
selectors = [
|
||||
".price--old",
|
||||
".price--striked",
|
||||
".price__old",
|
||||
"del",
|
||||
]
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
price = parse_price_text(element.get_text(strip=True))
|
||||
if price is not None:
|
||||
return price
|
||||
return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la devise."""
|
||||
selectors = self.get_selector("currency", [])
|
||||
|
||||
@@ -4,6 +4,7 @@ Store Cdiscount - Parsing de produits Cdiscount.com.
|
||||
Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
@@ -21,6 +22,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.cdiscount")
|
||||
|
||||
@@ -112,6 +114,8 @@ class CdiscountStore(BaseStore):
|
||||
images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url) or self._extract_sku_from_html(soup)
|
||||
|
||||
# Déterminer le statut final
|
||||
@@ -130,8 +134,10 @@ class CdiscountStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -158,6 +164,21 @@ class CdiscountStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la description (meta tags)."""
|
||||
meta = soup.find("meta", property="og:description") or soup.find(
|
||||
"meta", attrs={"name": "description"}
|
||||
)
|
||||
if meta:
|
||||
description = meta.get("content", "").strip()
|
||||
if description:
|
||||
return description
|
||||
product_ld = self._find_product_ld(soup)
|
||||
desc_ld = product_ld.get("description") if product_ld else None
|
||||
if isinstance(desc_ld, str) and desc_ld.strip():
|
||||
return desc_ld.strip()
|
||||
return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
selectors = self.get_selector("price", [])
|
||||
@@ -170,20 +191,29 @@ class CdiscountStore(BaseStore):
|
||||
# Attribut content (schema.org) ou texte
|
||||
price_text = element.get("content") or element.get_text(strip=True)
|
||||
|
||||
# Extraire nombre (format: "299,99" ou "299.99")
|
||||
match = re.search(r"(\d+)[.,]?(\d*)", price_text)
|
||||
if match:
|
||||
integer_part = match.group(1)
|
||||
decimal_part = match.group(2) or "00"
|
||||
price_str = f"{integer_part}.{decimal_part}"
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
price = parse_price_text(price_text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix conseille."""
|
||||
selectors = [
|
||||
".jsStrikePrice",
|
||||
".price__old",
|
||||
".c-price__strike",
|
||||
".price-strike",
|
||||
]
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
price = parse_price_text(element.get_text(strip=True))
|
||||
if price is not None:
|
||||
return price
|
||||
return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la devise."""
|
||||
selectors = self.get_selector("currency", [])
|
||||
@@ -249,7 +279,14 @@ class CdiscountStore(BaseStore):
|
||||
url = f"https:{url}"
|
||||
images.append(url)
|
||||
|
||||
return list(set(images)) # Dédupliquer
|
||||
ld_images = self._extract_ld_images(self._find_product_ld(soup))
|
||||
for url in ld_images:
|
||||
if url and url not in images:
|
||||
if url.startswith("//"):
|
||||
url = f"https:{url}"
|
||||
images.append(url)
|
||||
|
||||
return list(dict.fromkeys(images)) # Préserver l’ordre
|
||||
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la catégorie depuis les breadcrumbs."""
|
||||
@@ -275,6 +312,53 @@ class CdiscountStore(BaseStore):
|
||||
|
||||
return None
|
||||
|
||||
def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
|
||||
"""Parse les scripts JSON-LD et retourne les objets."""
|
||||
entries = []
|
||||
scripts = soup.find_all("script", type="application/ld+json")
|
||||
for script in scripts:
|
||||
raw = script.string or script.text
|
||||
if not raw:
|
||||
continue
|
||||
try:
|
||||
payload = json.loads(raw.strip())
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
continue
|
||||
if isinstance(payload, list):
|
||||
entries.extend(payload)
|
||||
else:
|
||||
entries.append(payload)
|
||||
return entries
|
||||
|
||||
def _find_product_ld(self, soup: BeautifulSoup) -> dict:
|
||||
"""Retourne l’objet Product JSON-LD si présent."""
|
||||
for entry in self._extract_json_ld_entries(soup):
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
type_field = entry.get("@type") or entry.get("type")
|
||||
if isinstance(type_field, str) and "product" in type_field.lower():
|
||||
return entry
|
||||
return {}
|
||||
|
||||
def _extract_ld_images(self, product_ld: dict) -> list[str]:
|
||||
"""Récupère les images listées dans le JSON-LD."""
|
||||
if not product_ld:
|
||||
return []
|
||||
images = product_ld.get("image") or product_ld.get("images")
|
||||
if not images:
|
||||
return []
|
||||
if isinstance(images, str):
|
||||
images = [images]
|
||||
extracted = []
|
||||
for item in images:
|
||||
if isinstance(item, str):
|
||||
extracted.append(item)
|
||||
elif isinstance(item, dict):
|
||||
url = item.get("url")
|
||||
if isinstance(url, str):
|
||||
extracted.append(url)
|
||||
return extracted
|
||||
|
||||
def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
|
||||
"""Extrait les caractéristiques techniques."""
|
||||
specs = {}
|
||||
@@ -298,6 +382,19 @@ class CdiscountStore(BaseStore):
|
||||
if key and value:
|
||||
specs[key] = value
|
||||
|
||||
product_ld = self._find_product_ld(soup)
|
||||
additional = product_ld.get("additionalProperty") if product_ld else None
|
||||
if isinstance(additional, dict):
|
||||
additional = [additional]
|
||||
if isinstance(additional, list):
|
||||
for item in additional:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
key = item.get("name") or item.get("propertyID")
|
||||
value = item.get("value") or item.get("valueReference")
|
||||
if key and value:
|
||||
specs[key] = value
|
||||
|
||||
return specs
|
||||
|
||||
def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
|
||||
48
pricewatch/app/stores/price_parser.py
Normal file
48
pricewatch/app/stores/price_parser.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Helpers pour parser des prix avec separateurs de milliers.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def parse_price_text(text: str) -> Optional[float]:
|
||||
"""
|
||||
Parse un texte de prix en float.
|
||||
|
||||
Gere les separateurs espace, point, virgule et espaces insécables.
|
||||
"""
|
||||
if not text:
|
||||
return None
|
||||
|
||||
text = re.sub(r"(\d)\s*€\s*(\d)", r"\1,\2", text)
|
||||
cleaned = text.replace("\u00a0", " ").replace("\u202f", " ").replace("\u2009", " ")
|
||||
cleaned = "".join(ch for ch in cleaned if ch.isdigit() or ch in ".,")
|
||||
if not cleaned:
|
||||
return None
|
||||
|
||||
if "," in cleaned and "." in cleaned:
|
||||
if cleaned.rfind(",") > cleaned.rfind("."):
|
||||
cleaned = cleaned.replace(".", "")
|
||||
cleaned = cleaned.replace(",", ".")
|
||||
else:
|
||||
cleaned = cleaned.replace(",", "")
|
||||
elif "," in cleaned:
|
||||
parts = cleaned.split(",")
|
||||
if len(parts) > 1:
|
||||
decimal = parts[-1]
|
||||
integer = "".join(parts[:-1])
|
||||
cleaned = f"{integer}.{decimal}" if decimal else integer
|
||||
elif "." in cleaned:
|
||||
parts = cleaned.split(".")
|
||||
if len(parts) > 1:
|
||||
decimal = parts[-1]
|
||||
integer = "".join(parts[:-1])
|
||||
cleaned = f"{integer}.{decimal}" if decimal else integer
|
||||
|
||||
try:
|
||||
return float(cleaned)
|
||||
except ValueError:
|
||||
return None
|
||||
13
pricewatch/app/tasks/__init__.py
Executable file → Normal file
13
pricewatch/app/tasks/__init__.py
Executable file → Normal file
@@ -3,6 +3,15 @@ Module tasks pour les jobs RQ.
|
||||
"""
|
||||
|
||||
from pricewatch.app.tasks.scrape import scrape_product
|
||||
from pricewatch.app.tasks.scheduler import ScrapingScheduler
|
||||
from pricewatch.app.tasks.scheduler import (
|
||||
RedisUnavailableError,
|
||||
ScrapingScheduler,
|
||||
check_redis_connection,
|
||||
)
|
||||
|
||||
__all__ = ["scrape_product", "ScrapingScheduler"]
|
||||
__all__ = [
|
||||
"scrape_product",
|
||||
"ScrapingScheduler",
|
||||
"RedisUnavailableError",
|
||||
"check_redis_connection",
|
||||
]
|
||||
|
||||
BIN
pricewatch/app/tasks/__pycache__/__init__.cpython-313.pyc
Normal file
BIN
pricewatch/app/tasks/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
pricewatch/app/tasks/__pycache__/scheduler.cpython-313.pyc
Normal file
BIN
pricewatch/app/tasks/__pycache__/scheduler.cpython-313.pyc
Normal file
Binary file not shown.
BIN
pricewatch/app/tasks/__pycache__/scrape.cpython-313.pyc
Normal file
BIN
pricewatch/app/tasks/__pycache__/scrape.cpython-313.pyc
Normal file
Binary file not shown.
75
pricewatch/app/tasks/scheduler.py
Executable file → Normal file
75
pricewatch/app/tasks/scheduler.py
Executable file → Normal file
@@ -9,6 +9,8 @@ from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
import redis
|
||||
from redis.exceptions import ConnectionError as RedisConnectionError
|
||||
from redis.exceptions import RedisError, TimeoutError as RedisTimeoutError
|
||||
from rq import Queue
|
||||
from rq_scheduler import Scheduler
|
||||
|
||||
@@ -19,6 +21,15 @@ from pricewatch.app.tasks.scrape import scrape_product
|
||||
logger = get_logger("tasks.scheduler")
|
||||
|
||||
|
||||
class RedisUnavailableError(Exception):
|
||||
"""Exception levee quand Redis n'est pas disponible."""
|
||||
|
||||
def __init__(self, message: str = "Redis non disponible", cause: Optional[Exception] = None):
|
||||
self.message = message
|
||||
self.cause = cause
|
||||
super().__init__(self.message)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScheduledJobInfo:
|
||||
"""Infos de retour pour un job planifie."""
|
||||
@@ -27,14 +38,72 @@ class ScheduledJobInfo:
|
||||
next_run: datetime
|
||||
|
||||
|
||||
def check_redis_connection(redis_url: str) -> bool:
|
||||
"""
|
||||
Verifie si Redis est accessible.
|
||||
|
||||
Returns:
|
||||
True si Redis repond, False sinon.
|
||||
"""
|
||||
try:
|
||||
conn = redis.from_url(redis_url)
|
||||
conn.ping()
|
||||
return True
|
||||
except (RedisConnectionError, RedisTimeoutError, RedisError) as e:
|
||||
logger.debug(f"Redis ping echoue: {e}")
|
||||
return False
|
||||
|
||||
|
||||
class ScrapingScheduler:
|
||||
"""Scheduler pour les jobs de scraping avec RQ."""
|
||||
|
||||
def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
|
||||
self.config = config or get_config()
|
||||
self.redis = redis.from_url(self.config.redis.url)
|
||||
self.queue = Queue(queue_name, connection=self.redis)
|
||||
self.scheduler = Scheduler(queue=self.queue, connection=self.redis)
|
||||
self._queue_name = queue_name
|
||||
self._redis: Optional[redis.Redis] = None
|
||||
self._queue: Optional[Queue] = None
|
||||
self._scheduler: Optional[Scheduler] = None
|
||||
|
||||
def _ensure_connected(self) -> None:
|
||||
"""Etablit la connexion Redis si necessaire, leve RedisUnavailableError si echec."""
|
||||
if self._redis is not None:
|
||||
return
|
||||
|
||||
try:
|
||||
self._redis = redis.from_url(self.config.redis.url)
|
||||
# Ping pour verifier la connexion
|
||||
self._redis.ping()
|
||||
self._queue = Queue(self._queue_name, connection=self._redis)
|
||||
self._scheduler = Scheduler(queue=self._queue, connection=self._redis)
|
||||
logger.debug(f"Connexion Redis etablie: {self.config.redis.url}")
|
||||
except (RedisConnectionError, RedisTimeoutError) as e:
|
||||
self._redis = None
|
||||
msg = f"Impossible de se connecter a Redis ({self.config.redis.url}): {e}"
|
||||
logger.error(msg)
|
||||
raise RedisUnavailableError(msg, cause=e) from e
|
||||
except RedisError as e:
|
||||
self._redis = None
|
||||
msg = f"Erreur Redis: {e}"
|
||||
logger.error(msg)
|
||||
raise RedisUnavailableError(msg, cause=e) from e
|
||||
|
||||
@property
|
||||
def redis(self) -> redis.Redis:
|
||||
"""Acces a la connexion Redis (lazy)."""
|
||||
self._ensure_connected()
|
||||
return self._redis # type: ignore
|
||||
|
||||
@property
|
||||
def queue(self) -> Queue:
|
||||
"""Acces a la queue RQ (lazy)."""
|
||||
self._ensure_connected()
|
||||
return self._queue # type: ignore
|
||||
|
||||
@property
|
||||
def scheduler(self) -> Scheduler:
|
||||
"""Acces au scheduler RQ (lazy)."""
|
||||
self._ensure_connected()
|
||||
return self._scheduler # type: ignore
|
||||
|
||||
def enqueue_immediate(
|
||||
self,
|
||||
|
||||
33
pricewatch/app/tasks/scrape.py
Executable file → Normal file
33
pricewatch/app/tasks/scrape.py
Executable file → Normal file
@@ -4,6 +4,7 @@ Tache de scraping asynchrone pour RQ.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from pricewatch.app.core.config import AppConfig, get_config
|
||||
@@ -46,6 +47,9 @@ def scrape_product(
|
||||
|
||||
Retourne un dict avec success, product_id, snapshot, error.
|
||||
"""
|
||||
job_start_time = time.time()
|
||||
logger.info(f"[JOB START] Scraping: {url}")
|
||||
|
||||
config: AppConfig = get_config()
|
||||
setup_stores()
|
||||
|
||||
@@ -58,6 +62,8 @@ def scrape_product(
|
||||
registry = get_registry()
|
||||
store = registry.detect_store(url)
|
||||
if not store:
|
||||
elapsed_ms = int((time.time() - job_start_time) * 1000)
|
||||
logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)")
|
||||
snapshot = ProductSnapshot(
|
||||
source="unknown",
|
||||
url=url,
|
||||
@@ -70,6 +76,8 @@ def scrape_product(
|
||||
ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
|
||||
return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}
|
||||
|
||||
logger.info(f"[STORE] Detecte: {store.store_id}")
|
||||
|
||||
canonical_url = store.canonicalize(url)
|
||||
|
||||
html = None
|
||||
@@ -79,13 +87,16 @@ def scrape_product(
|
||||
html_size_bytes = None
|
||||
pw_result = None
|
||||
|
||||
logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}")
|
||||
http_result = fetch_http(canonical_url)
|
||||
duration_ms = http_result.duration_ms
|
||||
|
||||
if http_result.success:
|
||||
html = http_result.html
|
||||
fetch_method = FetchMethod.HTTP
|
||||
logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})")
|
||||
elif use_playwright:
|
||||
logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright")
|
||||
pw_result = fetch_playwright(
|
||||
canonical_url,
|
||||
headless=not headful,
|
||||
@@ -97,10 +108,13 @@ def scrape_product(
|
||||
if pw_result.success:
|
||||
html = pw_result.html
|
||||
fetch_method = FetchMethod.PLAYWRIGHT
|
||||
logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})")
|
||||
else:
|
||||
fetch_error = pw_result.error
|
||||
logger.warning(f"[FETCH] Playwright echoue: {fetch_error}")
|
||||
else:
|
||||
fetch_error = http_result.error
|
||||
logger.warning(f"[FETCH] HTTP echoue: {fetch_error}")
|
||||
|
||||
if html:
|
||||
html_size_bytes = len(html.encode("utf-8"))
|
||||
@@ -118,12 +132,18 @@ def scrape_product(
|
||||
save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
|
||||
|
||||
try:
|
||||
logger.debug(f"[PARSE] Parsing avec {store.store_id}...")
|
||||
snapshot = store.parse(html, canonical_url)
|
||||
snapshot.debug.method = fetch_method
|
||||
snapshot.debug.duration_ms = duration_ms
|
||||
snapshot.debug.html_size_bytes = html_size_bytes
|
||||
success = snapshot.debug.status != DebugStatus.FAILED
|
||||
if success:
|
||||
logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}")
|
||||
else:
|
||||
logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}")
|
||||
except Exception as exc:
|
||||
logger.error(f"[PARSE] Exception: {exc}")
|
||||
snapshot = ProductSnapshot(
|
||||
source=store.store_id,
|
||||
url=canonical_url,
|
||||
@@ -152,6 +172,19 @@ def scrape_product(
|
||||
|
||||
product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
|
||||
|
||||
# Log final du job
|
||||
elapsed_ms = int((time.time() - job_start_time) * 1000)
|
||||
if success:
|
||||
logger.info(
|
||||
f"[JOB OK] {store.store_id}/{snapshot.reference} "
|
||||
f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
|
||||
f"erreur={fetch_error} duree={elapsed_ms}ms"
|
||||
)
|
||||
|
||||
return {
|
||||
"success": success,
|
||||
"product_id": product_id,
|
||||
|
||||
Reference in New Issue
Block a user