before claude
This commit is contained in:
@@ -22,6 +22,10 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from pricewatch.app.api.schemas import (
|
||||
BackendLogEntry,
|
||||
ClassificationOptionsOut,
|
||||
ClassificationRuleCreate,
|
||||
ClassificationRuleOut,
|
||||
ClassificationRuleUpdate,
|
||||
EnqueueRequest,
|
||||
EnqueueResponse,
|
||||
HealthStatus,
|
||||
@@ -52,7 +56,8 @@ from pricewatch.app.core.config import get_config
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
from pricewatch.app.db.connection import check_db_connection, get_session
|
||||
from pricewatch.app.db.models import PriceHistory, Product, ScrapingLog, Webhook
|
||||
from pricewatch.app.db.models import ClassificationRule, PriceHistory, Product, ScrapingLog, Webhook
|
||||
from pricewatch.app.db.repository import ProductRepository
|
||||
from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
||||
from pricewatch.app.tasks.scrape import scrape_product
|
||||
from pricewatch.app.tasks.scheduler import RedisUnavailableError, check_redis_connection, ScrapingScheduler
|
||||
@@ -188,6 +193,7 @@ def create_product(
|
||||
url=payload.url,
|
||||
title=payload.title,
|
||||
category=payload.category,
|
||||
type=payload.type,
|
||||
description=payload.description,
|
||||
currency=payload.currency,
|
||||
msrp=payload.msrp,
|
||||
@@ -241,6 +247,129 @@ def update_product(
|
||||
return _product_to_out(session, product)
|
||||
|
||||
|
||||
@app.get(
    "/classification/rules",
    response_model=list[ClassificationRuleOut],
    dependencies=[Depends(require_token)],
)
def list_classification_rules(
    session: Session = Depends(get_db_session),
) -> list[ClassificationRuleOut]:
    """Return every classification rule, ordered by sort_order then id."""
    ordered_query = session.query(ClassificationRule).order_by(
        ClassificationRule.sort_order, ClassificationRule.id
    )

    serialized: list[ClassificationRuleOut] = []
    for rule in ordered_query.all():
        serialized.append(
            ClassificationRuleOut(
                id=rule.id,
                category=rule.category,
                type=rule.type,
                # A falsy keywords value (e.g. None) is exposed as an empty list.
                keywords=rule.keywords or [],
                sort_order=rule.sort_order,
                is_active=rule.is_active,
            )
        )
    return serialized
|
||||
|
||||
|
||||
@app.post(
    "/classification/rules",
    response_model=ClassificationRuleOut,
    dependencies=[Depends(require_token)],
)
def create_classification_rule(
    payload: ClassificationRuleCreate,
    session: Session = Depends(get_db_session),
) -> ClassificationRuleOut:
    """Create a classification rule from the payload and return the stored row."""
    # An explicit null for is_active means "use the default", i.e. active.
    active_flag = payload.is_active
    if active_flag is None:
        active_flag = True

    new_rule = ClassificationRule(
        category=payload.category,
        type=payload.type,
        keywords=payload.keywords,
        # A null (or zero) sort order falls back to 0.
        sort_order=payload.sort_order or 0,
        is_active=active_flag,
    )
    session.add(new_rule)
    session.commit()
    # Reload the row so the attributes read below reflect what was stored.
    session.refresh(new_rule)

    return ClassificationRuleOut(
        id=new_rule.id,
        category=new_rule.category,
        type=new_rule.type,
        keywords=new_rule.keywords or [],
        sort_order=new_rule.sort_order,
        is_active=new_rule.is_active,
    )
|
||||
|
||||
|
||||
@app.patch(
    "/classification/rules/{rule_id}",
    response_model=ClassificationRuleOut,
    dependencies=[Depends(require_token)],
)
def update_classification_rule(
    rule_id: int,
    payload: ClassificationRuleUpdate,
    session: Session = Depends(get_db_session),
) -> ClassificationRuleOut:
    """Partially update a classification rule and return its new state.

    Only the fields explicitly present in the request body are applied.

    Args:
        rule_id: Primary key of the rule to update.
        payload: Partial update; unset fields are left untouched.
        session: Database session injected by FastAPI.

    Returns:
        The rule as stored after the update.

    Raises:
        HTTPException: 404 when no rule has the given id.
    """
    rule = (
        session.query(ClassificationRule)
        .filter(ClassificationRule.id == rule_id)
        .one_or_none()
    )
    if not rule:
        raise HTTPException(status_code=404, detail="Regle non trouvee")

    # The classification_rules migration creates these columns with
    # nullable=False. An explicit JSON null survives exclude_unset=True,
    # so writing it would make the commit fail on the NOT NULL constraint.
    # Treat an explicit null on these fields as "leave unchanged".
    non_nullable_fields = {"keywords", "sort_order", "is_active"}

    updates = payload.model_dump(exclude_unset=True)
    for key, value in updates.items():
        if value is None and key in non_nullable_fields:
            continue
        setattr(rule, key, value)

    session.commit()
    # Reload the row so the response reflects what was stored.
    session.refresh(rule)

    return ClassificationRuleOut(
        id=rule.id,
        category=rule.category,
        type=rule.type,
        keywords=rule.keywords or [],
        sort_order=rule.sort_order,
        is_active=rule.is_active,
    )
|
||||
|
||||
|
||||
@app.delete(
    "/classification/rules/{rule_id}",
    dependencies=[Depends(require_token)],
)
def delete_classification_rule(
    rule_id: int,
    session: Session = Depends(get_db_session),
) -> dict[str, str]:
    """Delete the classification rule with the given id.

    Raises an HTTP 404 when no such rule exists; otherwise returns a
    small status payload.
    """
    lookup = session.query(ClassificationRule).filter(
        ClassificationRule.id == rule_id
    )
    target = lookup.one_or_none()
    if not target:
        raise HTTPException(status_code=404, detail="Regle non trouvee")

    session.delete(target)
    session.commit()
    return {"status": "deleted"}
|
||||
|
||||
|
||||
@app.get(
    "/classification/options",
    response_model=ClassificationOptionsOut,
    dependencies=[Depends(require_token)],
)
def get_classification_options(
    session: Session = Depends(get_db_session),
) -> ClassificationOptionsOut:
    """Expose the distinct categories and types found on active rules.

    Both lists are de-duplicated and sorted; empty or null values are
    left out.
    """
    active_rules = (
        session.query(ClassificationRule)
        # SQL expression, not a Python comparison: keep the "== True" form.
        .filter(ClassificationRule.is_active == True)  # noqa: E712
        .order_by(ClassificationRule.sort_order, ClassificationRule.id)
        .all()
    )

    category_names = set()
    type_names = set()
    for rule in active_rules:
        if rule.category:
            category_names.add(rule.category)
        if rule.type:
            type_names.add(rule.type)

    return ClassificationOptionsOut(
        categories=sorted(category_names),
        types=sorted(type_names),
    )
|
||||
|
||||
|
||||
@app.delete("/products/{product_id}", dependencies=[Depends(require_token)])
|
||||
def delete_product(
|
||||
product_id: int,
|
||||
@@ -703,6 +832,13 @@ def preview_scrape(payload: ScrapePreviewRequest) -> ScrapePreviewResponse:
|
||||
if snapshot is None:
|
||||
_add_backend_log("ERROR", f"Preview scraping KO: {payload.url}")
|
||||
return ScrapePreviewResponse(success=False, snapshot=None, error=result.get("error"))
|
||||
config = get_config()
|
||||
if config.enable_db:
|
||||
try:
|
||||
with get_session(config) as session:
|
||||
ProductRepository(session).apply_classification(snapshot)
|
||||
except Exception as exc:
|
||||
snapshot.add_note(f"Classification ignoree: {exc}")
|
||||
return ScrapePreviewResponse(
|
||||
success=bool(result.get("success")),
|
||||
snapshot=snapshot.model_dump(mode="json"),
|
||||
@@ -719,7 +855,9 @@ def commit_scrape(payload: ScrapeCommitRequest) -> ScrapeCommitResponse:
|
||||
_add_backend_log("ERROR", "Commit scraping KO: snapshot invalide")
|
||||
raise HTTPException(status_code=400, detail="Snapshot invalide") from exc
|
||||
|
||||
product_id = ScrapingPipeline(config=get_config()).process_snapshot(snapshot, save_to_db=True)
|
||||
product_id = ScrapingPipeline(config=get_config()).process_snapshot(
|
||||
snapshot, save_to_db=True, apply_classification=False
|
||||
)
|
||||
_add_backend_log("INFO", f"Commit scraping OK: product_id={product_id}")
|
||||
return ScrapeCommitResponse(success=True, product_id=product_id)
|
||||
|
||||
@@ -808,12 +946,9 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
|
||||
)
|
||||
images = [image.image_url for image in product.images]
|
||||
specs = {spec.spec_key: spec.spec_value for spec in product.specs}
|
||||
discount_amount = None
|
||||
discount_percent = None
|
||||
if latest and latest.price is not None and product.msrp:
|
||||
discount_amount = float(product.msrp) - float(latest.price)
|
||||
if product.msrp > 0:
|
||||
discount_percent = (discount_amount / float(product.msrp)) * 100
|
||||
main_image = images[0] if images else None
|
||||
gallery_images = images[1:] if len(images) > 1 else []
|
||||
asin = product.reference if product.source == "amazon" else None
|
||||
history_rows = (
|
||||
session.query(PriceHistory)
|
||||
.filter(PriceHistory.product_id == product.id, PriceHistory.price != None)
|
||||
@@ -830,12 +965,23 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
|
||||
id=product.id,
|
||||
source=product.source,
|
||||
reference=product.reference,
|
||||
asin=asin,
|
||||
url=product.url,
|
||||
title=product.title,
|
||||
category=product.category,
|
||||
type=product.type,
|
||||
description=product.description,
|
||||
currency=product.currency,
|
||||
msrp=float(product.msrp) if product.msrp is not None else None,
|
||||
rating_value=float(product.rating_value) if product.rating_value is not None else None,
|
||||
rating_count=product.rating_count,
|
||||
amazon_choice=product.amazon_choice,
|
||||
amazon_choice_label=product.amazon_choice_label,
|
||||
discount_text=product.discount_text,
|
||||
stock_text=product.stock_text,
|
||||
in_stock=product.in_stock,
|
||||
model_number=product.model_number,
|
||||
model_name=product.model_name,
|
||||
first_seen_at=product.first_seen_at,
|
||||
last_updated_at=product.last_updated_at,
|
||||
latest_price=float(latest.price) if latest and latest.price is not None else None,
|
||||
@@ -845,9 +991,11 @@ def _product_to_out(session: Session, product: Product) -> ProductOut:
|
||||
latest_stock_status=latest.stock_status if latest else None,
|
||||
latest_fetched_at=latest.fetched_at if latest else None,
|
||||
images=images,
|
||||
main_image=main_image,
|
||||
gallery_images=gallery_images,
|
||||
specs=specs,
|
||||
discount_amount=discount_amount,
|
||||
discount_percent=discount_percent,
|
||||
discount_amount=None,
|
||||
discount_percent=None,
|
||||
history=history_points,
|
||||
)
|
||||
|
||||
|
||||
@@ -22,12 +22,23 @@ class ProductOut(BaseModel):
|
||||
id: int
|
||||
source: str
|
||||
reference: str
|
||||
asin: Optional[str] = None
|
||||
url: str
|
||||
title: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
type: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
currency: Optional[str] = None
|
||||
msrp: Optional[float] = None
|
||||
rating_value: Optional[float] = None
|
||||
rating_count: Optional[int] = None
|
||||
amazon_choice: Optional[bool] = None
|
||||
amazon_choice_label: Optional[str] = None
|
||||
discount_text: Optional[str] = None
|
||||
stock_text: Optional[str] = None
|
||||
in_stock: Optional[bool] = None
|
||||
model_number: Optional[str] = None
|
||||
model_name: Optional[str] = None
|
||||
first_seen_at: datetime
|
||||
last_updated_at: datetime
|
||||
latest_price: Optional[float] = None
|
||||
@@ -35,6 +46,8 @@ class ProductOut(BaseModel):
|
||||
latest_stock_status: Optional[str] = None
|
||||
latest_fetched_at: Optional[datetime] = None
|
||||
images: list[str] = []
|
||||
main_image: Optional[str] = None
|
||||
gallery_images: list[str] = []
|
||||
specs: dict[str, str] = {}
|
||||
discount_amount: Optional[float] = None
|
||||
discount_percent: Optional[float] = None
|
||||
@@ -47,6 +60,7 @@ class ProductCreate(BaseModel):
|
||||
url: str
|
||||
title: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
type: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
currency: Optional[str] = None
|
||||
msrp: Optional[float] = None
|
||||
@@ -56,6 +70,7 @@ class ProductUpdate(BaseModel):
|
||||
url: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
type: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
currency: Optional[str] = None
|
||||
msrp: Optional[float] = None
|
||||
@@ -208,6 +223,36 @@ class VersionResponse(BaseModel):
|
||||
api_version: str
|
||||
|
||||
|
||||
class ClassificationRuleOut(BaseModel):
    """Classification rule as returned by the API."""

    # Identifier of the rule (always present in responses).
    id: int
    # Category and type carried by the rule; either may be absent.
    category: Optional[str] = Field(default=None)
    type: Optional[str] = Field(default=None)
    # Keywords attached to the rule; defaults to an empty list.
    keywords: list[str] = Field(default_factory=list)
    # Ordering value and activation flag.
    sort_order: int = Field(default=0)
    is_active: bool = Field(default=True)
|
||||
|
||||
|
||||
class ClassificationRuleCreate(BaseModel):
    """Request body for creating a classification rule."""

    # Category and type carried by the rule; both optional.
    category: Optional[str] = Field(default=None)
    type: Optional[str] = Field(default=None)
    # Keywords attached to the rule; defaults to an empty list.
    keywords: list[str] = Field(default_factory=list)
    # Both accept an explicit null in addition to their defaults.
    sort_order: Optional[int] = Field(default=0)
    is_active: Optional[bool] = Field(default=True)
|
||||
|
||||
|
||||
class ClassificationRuleUpdate(BaseModel):
    """Request body for partially updating a classification rule.

    Every field is optional and defaults to None.
    """

    category: Optional[str] = Field(default=None)
    type: Optional[str] = Field(default=None)
    keywords: Optional[list[str]] = Field(default=None)
    sort_order: Optional[int] = Field(default=None)
    is_active: Optional[bool] = Field(default=None)
|
||||
|
||||
|
||||
class ClassificationOptionsOut(BaseModel):
    """Lists of category and type names offered as classification options."""

    # Both lists default to empty when not supplied.
    categories: list[str] = Field(default_factory=list)
    types: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class BackendLogEntry(BaseModel):
|
||||
time: datetime
|
||||
level: str
|
||||
|
||||
Binary file not shown.
@@ -93,13 +93,52 @@ class ProductSnapshot(BaseModel):
|
||||
reference: Optional[str] = Field(
|
||||
default=None, description="Référence produit (ASIN, SKU, etc.)"
|
||||
)
|
||||
asin: Optional[str] = Field(
|
||||
default=None, description="ASIN Amazon si disponible"
|
||||
)
|
||||
category: Optional[str] = Field(default=None, description="Catégorie du produit")
|
||||
type: Optional[str] = Field(default=None, description="Type du produit")
|
||||
description: Optional[str] = Field(default=None, description="Description produit")
|
||||
|
||||
# Données Amazon explicites (si disponibles)
|
||||
rating_value: Optional[float] = Field(
|
||||
default=None, description="Note moyenne affichée"
|
||||
)
|
||||
rating_count: Optional[int] = Field(
|
||||
default=None, description="Nombre d'évaluations"
|
||||
)
|
||||
amazon_choice: Optional[bool] = Field(
|
||||
default=None, description="Badge Choix d'Amazon présent"
|
||||
)
|
||||
amazon_choice_label: Optional[str] = Field(
|
||||
default=None, description="Libellé du badge Choix d'Amazon"
|
||||
)
|
||||
discount_text: Optional[str] = Field(
|
||||
default=None, description="Texte de réduction affiché"
|
||||
)
|
||||
stock_text: Optional[str] = Field(
|
||||
default=None, description="Texte brut de stock"
|
||||
)
|
||||
in_stock: Optional[bool] = Field(
|
||||
default=None, description="Disponibilité dérivée"
|
||||
)
|
||||
model_number: Optional[str] = Field(
|
||||
default=None, description="Numéro du modèle de l'article"
|
||||
)
|
||||
model_name: Optional[str] = Field(
|
||||
default=None, description="Nom du modèle explicite"
|
||||
)
|
||||
|
||||
# Médias
|
||||
images: list[str] = Field(
|
||||
default_factory=list, description="Liste des URLs d'images du produit"
|
||||
)
|
||||
main_image: Optional[str] = Field(
|
||||
default=None, description="Image principale du produit"
|
||||
)
|
||||
gallery_images: list[str] = Field(
|
||||
default_factory=list, description="Images de galerie dédoublonnées"
|
||||
)
|
||||
|
||||
# Caractéristiques techniques
|
||||
specs: dict[str, str] = Field(
|
||||
@@ -134,6 +173,12 @@ class ProductSnapshot(BaseModel):
|
||||
"""Filtre les URLs d'images vides."""
|
||||
return [url.strip() for url in v if url and url.strip()]
|
||||
|
||||
@field_validator("gallery_images")
@classmethod
def validate_gallery_images(cls, v: list[str]) -> list[str]:
    """Drop empty or whitespace-only gallery URLs and strip the rest."""
    cleaned: list[str] = []
    for url in v:
        # Skip falsy entries before touching .strip().
        if not url:
            continue
        trimmed = url.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
|
||||
|
||||
model_config = ConfigDict(
|
||||
use_enum_values=True,
|
||||
json_schema_extra={
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,350 @@
|
||||
"""Ajout champs Amazon produit
|
||||
|
||||
Revision ID: 0014e51c4927
|
||||
Revises: 20260115_02_product_details
|
||||
Create Date: 2026-01-17 19:23:01.866891
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Alembic revision identifiers: this revision and the one it revises.
revision = "0014e51c4927"
down_revision = "20260115_02_product_details"
# No branch label and no cross-revision dependency.
branch_labels = None
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the Amazon-specific product columns and set column comments.

    The operations run in the same order as the auto-generated script:
    comments on price_history, product_images and product_specs, then the
    new products columns, then comments on products, scraping_logs and
    webhooks.
    """
    # Each entry: (table, column, existing_type, comment, existing_nullable).
    comments_before_new_columns = [
        ("price_history", "price", sa.NUMERIC(precision=10, scale=2), "Product price", True),
        ("price_history", "shipping_cost", sa.NUMERIC(precision=10, scale=2), "Shipping cost", True),
        ("price_history", "stock_status", sa.VARCHAR(length=20), "Stock status (in_stock, out_of_stock, unknown)", True),
        ("price_history", "fetch_method", sa.VARCHAR(length=20), "Fetch method (http, playwright)", False),
        ("price_history", "fetch_status", sa.VARCHAR(length=20), "Fetch status (success, partial, failed)", False),
        ("price_history", "fetched_at", postgresql.TIMESTAMP(), "Scraping timestamp", False),
        ("product_images", "image_url", sa.TEXT(), "Image URL", False),
        ("product_images", "position", sa.INTEGER(), "Image position (0=main)", False),
        ("product_specs", "spec_key", sa.VARCHAR(length=200), "Specification key (e.g., 'Brand', 'Color')", False),
        ("product_specs", "spec_value", sa.TEXT(), "Specification value", False),
    ]

    # Each entry: (column name, column type, comment); all are nullable.
    new_product_columns = [
        ("rating_value", sa.Numeric(precision=3, scale=2), "Note moyenne"),
        ("rating_count", sa.Integer(), "Nombre d'evaluations"),
        ("amazon_choice", sa.Boolean(), "Badge Choix d'Amazon"),
        ("amazon_choice_label", sa.Text(), "Libelle Choix d'Amazon"),
        ("discount_text", sa.Text(), "Texte de reduction affiche"),
        ("stock_text", sa.Text(), "Texte brut du stock"),
        ("in_stock", sa.Boolean(), "Disponibilite derivee"),
        ("model_number", sa.Text(), "Numero du modele"),
        ("model_name", sa.Text(), "Nom du modele"),
    ]

    comments_after_new_columns = [
        ("products", "source", sa.VARCHAR(length=50), "Store ID (amazon, cdiscount, etc.)", False),
        ("products", "reference", sa.VARCHAR(length=100), "Product reference (ASIN, SKU, etc.)", False),
        ("products", "url", sa.TEXT(), "Canonical product URL", False),
        ("products", "title", sa.TEXT(), "Product title", True),
        ("products", "category", sa.TEXT(), "Product category (breadcrumb)", True),
        ("products", "description", sa.TEXT(), "Product description", True),
        ("products", "currency", sa.VARCHAR(length=3), "Currency code (EUR, USD, GBP)", True),
        ("products", "msrp", sa.NUMERIC(precision=10, scale=2), "Recommended price", True),
        ("products", "first_seen_at", postgresql.TIMESTAMP(), "First scraping timestamp", False),
        ("products", "last_updated_at", postgresql.TIMESTAMP(), "Last metadata update", False),
        ("scraping_logs", "url", sa.TEXT(), "Scraped URL", False),
        ("scraping_logs", "source", sa.VARCHAR(length=50), "Store ID (amazon, cdiscount, etc.)", False),
        ("scraping_logs", "reference", sa.VARCHAR(length=100), "Product reference (if extracted)", True),
        ("scraping_logs", "fetch_method", sa.VARCHAR(length=20), "Fetch method (http, playwright)", False),
        ("scraping_logs", "fetch_status", sa.VARCHAR(length=20), "Fetch status (success, partial, failed)", False),
        ("scraping_logs", "fetched_at", postgresql.TIMESTAMP(), "Scraping timestamp", False),
        ("scraping_logs", "duration_ms", sa.INTEGER(), "Fetch duration in milliseconds", True),
        ("scraping_logs", "html_size_bytes", sa.INTEGER(), "HTML response size in bytes", True),
        ("scraping_logs", "errors", postgresql.JSONB(astext_type=sa.Text()), "Error messages (list of strings)", True),
        ("scraping_logs", "notes", postgresql.JSONB(astext_type=sa.Text()), "Debug notes (list of strings)", True),
        ("webhooks", "event", sa.VARCHAR(length=50), "Event name", False),
        ("webhooks", "url", sa.TEXT(), "Webhook URL", False),
        ("webhooks", "secret", sa.VARCHAR(length=200), "Secret optionnel", True),
        ("webhooks", "created_at", postgresql.TIMESTAMP(), "Creation timestamp", False),
    ]

    def _apply_comments(entries):
        # Set the comment on each listed column, in list order.
        for table, column, existing_type, comment, nullable in entries:
            op.alter_column(
                table,
                column,
                existing_type=existing_type,
                comment=comment,
                existing_nullable=nullable,
            )

    _apply_comments(comments_before_new_columns)
    for name, column_type, comment in new_product_columns:
        op.add_column(
            "products",
            sa.Column(name, column_type, nullable=True, comment=comment),
        )
    _apply_comments(comments_after_new_columns)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove the column comments and drop the Amazon-specific columns.

    The operations run in the same order as the auto-generated script,
    which is the reverse of upgrade(): comments on webhooks, scraping_logs
    and products are cleared, the added products columns are dropped, then
    comments on product_specs, product_images and price_history are cleared.
    """
    # Each entry: (table, column, existing_type, existing_comment, existing_nullable).
    comments_before_drops = [
        ("webhooks", "created_at", postgresql.TIMESTAMP(), "Creation timestamp", False),
        ("webhooks", "secret", sa.VARCHAR(length=200), "Secret optionnel", True),
        ("webhooks", "url", sa.TEXT(), "Webhook URL", False),
        ("webhooks", "event", sa.VARCHAR(length=50), "Event name", False),
        ("scraping_logs", "notes", postgresql.JSONB(astext_type=sa.Text()), "Debug notes (list of strings)", True),
        ("scraping_logs", "errors", postgresql.JSONB(astext_type=sa.Text()), "Error messages (list of strings)", True),
        ("scraping_logs", "html_size_bytes", sa.INTEGER(), "HTML response size in bytes", True),
        ("scraping_logs", "duration_ms", sa.INTEGER(), "Fetch duration in milliseconds", True),
        ("scraping_logs", "fetched_at", postgresql.TIMESTAMP(), "Scraping timestamp", False),
        ("scraping_logs", "fetch_status", sa.VARCHAR(length=20), "Fetch status (success, partial, failed)", False),
        ("scraping_logs", "fetch_method", sa.VARCHAR(length=20), "Fetch method (http, playwright)", False),
        ("scraping_logs", "reference", sa.VARCHAR(length=100), "Product reference (if extracted)", True),
        ("scraping_logs", "source", sa.VARCHAR(length=50), "Store ID (amazon, cdiscount, etc.)", False),
        ("scraping_logs", "url", sa.TEXT(), "Scraped URL", False),
        ("products", "last_updated_at", postgresql.TIMESTAMP(), "Last metadata update", False),
        ("products", "first_seen_at", postgresql.TIMESTAMP(), "First scraping timestamp", False),
        ("products", "msrp", sa.NUMERIC(precision=10, scale=2), "Recommended price", True),
        ("products", "currency", sa.VARCHAR(length=3), "Currency code (EUR, USD, GBP)", True),
        ("products", "description", sa.TEXT(), "Product description", True),
        ("products", "category", sa.TEXT(), "Product category (breadcrumb)", True),
        ("products", "title", sa.TEXT(), "Product title", True),
        ("products", "url", sa.TEXT(), "Canonical product URL", False),
        ("products", "reference", sa.VARCHAR(length=100), "Product reference (ASIN, SKU, etc.)", False),
        ("products", "source", sa.VARCHAR(length=50), "Store ID (amazon, cdiscount, etc.)", False),
    ]

    # Columns added by upgrade(), dropped here in reverse order of creation.
    dropped_product_columns = [
        "model_name",
        "model_number",
        "in_stock",
        "stock_text",
        "discount_text",
        "amazon_choice_label",
        "amazon_choice",
        "rating_count",
        "rating_value",
    ]

    comments_after_drops = [
        ("product_specs", "spec_value", sa.TEXT(), "Specification value", False),
        ("product_specs", "spec_key", sa.VARCHAR(length=200), "Specification key (e.g., 'Brand', 'Color')", False),
        ("product_images", "position", sa.INTEGER(), "Image position (0=main)", False),
        ("product_images", "image_url", sa.TEXT(), "Image URL", False),
        ("price_history", "fetched_at", postgresql.TIMESTAMP(), "Scraping timestamp", False),
        ("price_history", "fetch_status", sa.VARCHAR(length=20), "Fetch status (success, partial, failed)", False),
        ("price_history", "fetch_method", sa.VARCHAR(length=20), "Fetch method (http, playwright)", False),
        ("price_history", "stock_status", sa.VARCHAR(length=20), "Stock status (in_stock, out_of_stock, unknown)", True),
        ("price_history", "shipping_cost", sa.NUMERIC(precision=10, scale=2), "Shipping cost", True),
        ("price_history", "price", sa.NUMERIC(precision=10, scale=2), "Product price", True),
    ]

    def _clear_comments(entries):
        # Reset the comment on each listed column to None, in list order.
        for table, column, existing_type, previous_comment, nullable in entries:
            op.alter_column(
                table,
                column,
                existing_type=existing_type,
                comment=None,
                existing_comment=previous_comment,
                existing_nullable=nullable,
            )

    _clear_comments(comments_before_drops)
    for column_name in dropped_product_columns:
        op.drop_column("products", column_name)
    _clear_comments(comments_after_drops)
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Ajout champs Amazon produit
|
||||
|
||||
Revision ID: 1467e98fcbea
|
||||
Revises: 3e68b0f0c9e4
|
||||
Create Date: 2026-01-17 20:08:32.991650
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Alembic revision identifiers: this revision and the one it revises.
revision = "1467e98fcbea"
# NOTE(review): revision 20260117_03_classification_rules declares the same
# down_revision; unless a merge revision exists elsewhere this presumably
# leaves the migration history with two heads - confirm.
down_revision = "3e68b0f0c9e4"
# No branch label and no cross-revision dependency.
branch_labels = None
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Apply this revision; it contains no schema operations."""
    return None
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert this revision; it contains no schema operations."""
    return None
|
||||
@@ -0,0 +1,114 @@
|
||||
"""Ajout classification rules et type produit
|
||||
|
||||
Revision ID: 20260117_03_classification_rules
|
||||
Revises: 3e68b0f0c9e4
|
||||
Create Date: 2026-01-17 20:05:00.000000
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# Alembic revision identifiers: this revision and the one it revises.
revision = "20260117_03_classification_rules"
# NOTE(review): revision 1467e98fcbea declares the same down_revision;
# unless a merge revision exists elsewhere this presumably leaves the
# migration history with two heads - confirm.
down_revision = "3e68b0f0c9e4"
# No branch label and no cross-revision dependency.
branch_labels = None
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add ``products.type`` and create the seeded ``classification_rules`` table.

    Steps, in order:
      1. add the nullable ``type`` column to ``products``;
      2. create ``classification_rules`` with its two indexes;
      3. insert the default "Informatique" rules, ordered by ``sort_order``.

    The ``sort_order`` and ``is_active`` columns carry the same comments as the
    ``ClassificationRule`` ORM model so that a later autogenerate run does not
    report comment-only differences for this table.
    """
    op.add_column(
        "products",
        sa.Column("type", sa.Text(), nullable=True, comment="Product type"),
    )

    op.create_table(
        "classification_rules",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("category", sa.String(length=80), nullable=True, comment="Categorie cible"),
        sa.Column("type", sa.String(length=80), nullable=True, comment="Type cible"),
        sa.Column(
            "keywords",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=False,
            comment="Mots-cles de matching",
        ),
        sa.Column(
            "sort_order",
            sa.Integer(),
            nullable=False,
            server_default="0",
            comment="Ordre de priorite (0=haut)",
        ),
        sa.Column(
            "is_active",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("true"),
            comment="Regle active",
        ),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(),
            nullable=False,
            server_default=sa.text("CURRENT_TIMESTAMP"),
            comment="Creation timestamp",
        ),
    )
    op.create_index("ix_classification_rule_order", "classification_rules", ["sort_order"])
    op.create_index("ix_classification_rule_active", "classification_rules", ["is_active"])

    # Lightweight table construct: only the columns needed by the seed insert.
    rules_table = sa.table(
        "classification_rules",
        sa.column("category", sa.String),
        sa.column("type", sa.String),
        sa.column("keywords", postgresql.JSONB),
        sa.column("sort_order", sa.Integer),
        sa.column("is_active", sa.Boolean),
        sa.column("created_at", sa.TIMESTAMP),
    )

    # (type, keywords) pairs; the position in this list becomes sort_order.
    default_rules = [
        ("Ecran", ["ecran", "moniteur", "display"]),
        ("PC portable", ["pc portable", "ordinateur portable", "laptop", "notebook"]),
        ("Unite centrale", ["unite centrale", "tour", "desktop", "pc fixe"]),
        ("Clavier", ["clavier", "keyboard"]),
        ("Souris", ["souris", "mouse"]),
    ]

    # One shared timestamp so every seeded rule has the same created_at.
    now = datetime.now(timezone.utc)
    op.bulk_insert(
        rules_table,
        [
            {
                "category": "Informatique",
                "type": rule_type,
                "keywords": keywords,
                "sort_order": position,
                "is_active": True,
                "created_at": now,
            }
            for position, (rule_type, keywords) in enumerate(default_rules)
        ],
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo ``upgrade``: drop the rule indexes and table, then ``products.type``.

    Seeded rules are removed together with the table.
    """
    # Indexes are dropped before the table they belong to.
    op.drop_index("ix_classification_rule_active", table_name="classification_rules")
    op.drop_index("ix_classification_rule_order", table_name="classification_rules")
    op.drop_table("classification_rules")
    op.drop_column("products", "type")
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Ajout champs Amazon produit
|
||||
|
||||
Revision ID: 3e68b0f0c9e4
|
||||
Revises: 0014e51c4927
|
||||
Create Date: 2026-01-17 19:45:03.730218
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# Revision identifiers, used by Alembic.
|
||||
revision = '3e68b0f0c9e4'
|
||||
down_revision = '0014e51c4927'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Apply this revision; it currently issues no schema operations (no-op)."""
    # NOTE(review): the module title announces Amazon product fields, yet no
    # operation is issued here - confirm those columns are created elsewhere.
    # ### commands auto generated by Alembic - please adjust! ###
    pass
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert this revision; nothing to undo because ``upgrade`` is a no-op."""
    # ### commands auto generated by Alembic - please adjust! ###
    pass
    # ### end Alembic commands ###
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -84,6 +84,36 @@ class Product(Base):
|
||||
msrp: Mapped[Optional[Decimal]] = mapped_column(
|
||||
Numeric(10, 2), nullable=True, comment="Recommended price"
|
||||
)
|
||||
type: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Product type"
|
||||
)
|
||||
rating_value: Mapped[Optional[Decimal]] = mapped_column(
|
||||
Numeric(3, 2), nullable=True, comment="Note moyenne"
|
||||
)
|
||||
rating_count: Mapped[Optional[int]] = mapped_column(
|
||||
Integer, nullable=True, comment="Nombre d'evaluations"
|
||||
)
|
||||
amazon_choice: Mapped[Optional[bool]] = mapped_column(
|
||||
Boolean, nullable=True, comment="Badge Choix d'Amazon"
|
||||
)
|
||||
amazon_choice_label: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Libelle Choix d'Amazon"
|
||||
)
|
||||
discount_text: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Texte de reduction affiche"
|
||||
)
|
||||
stock_text: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Texte brut du stock"
|
||||
)
|
||||
in_stock: Mapped[Optional[bool]] = mapped_column(
|
||||
Boolean, nullable=True, comment="Disponibilite derivee"
|
||||
)
|
||||
model_number: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Numero du modele"
|
||||
)
|
||||
model_name: Mapped[Optional[str]] = mapped_column(
|
||||
Text, nullable=True, comment="Nom du modele"
|
||||
)
|
||||
|
||||
# Timestamps
|
||||
first_seen_at: Mapped[datetime] = mapped_column(
|
||||
@@ -331,6 +361,45 @@ class ScrapingLog(Base):
|
||||
return f"<ScrapingLog(id={self.id}, url={self.url}, status={self.fetch_status}, fetched_at={self.fetched_at})>"
|
||||
|
||||
|
||||
class ClassificationRule(Base):
    """
    Keyword-based rules that assign a category/type to a product.

    Each rule carries a list of keywords plus the category and/or type to
    apply; ``sort_order`` gives the priority (0 = highest) and ``is_active``
    lets a rule be switched off without deleting it.
    """

    __tablename__ = "classification_rules"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Category to assign; optional.
    category: Mapped[Optional[str]] = mapped_column(
        String(80), nullable=True, comment="Categorie cible"
    )
    # Product type to assign; optional.
    type: Mapped[Optional[str]] = mapped_column(
        String(80), nullable=True, comment="Type cible"
    )
    # Stored as JSONB on PostgreSQL and as generic JSON on other dialects.
    keywords: Mapped[list[str]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=False,
        default=list,
        comment="Mots-cles de matching",
    )
    # Priority of the rule; lower values come first (0 = highest).
    sort_order: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="Ordre de priorite (0=haut)"
    )
    # Flag marking the rule as enabled.
    is_active: Mapped[bool] = mapped_column(
        Boolean, nullable=False, default=True, comment="Regle active"
    )
    created_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, default=utcnow, comment="Creation timestamp"
    )

    __table_args__ = (
        Index("ix_classification_rule_order", "sort_order"),
        Index("ix_classification_rule_active", "is_active"),
    )

    def __repr__(self) -> str:
        """Return a short debug representation (id, category, type)."""
        return f"<ClassificationRule(id={self.id}, category={self.category}, type={self.type})>"
|
||||
|
||||
|
||||
class Webhook(Base):
|
||||
"""
|
||||
Webhooks pour notifications externes.
|
||||
|
||||
@@ -13,7 +13,14 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
from pricewatch.app.db.models import PriceHistory, Product, ProductImage, ProductSpec, ScrapingLog
|
||||
from pricewatch.app.db.models import (
|
||||
ClassificationRule,
|
||||
PriceHistory,
|
||||
Product,
|
||||
ProductImage,
|
||||
ProductSpec,
|
||||
ScrapingLog,
|
||||
)
|
||||
|
||||
logger = get_logger("db.repository")
|
||||
|
||||
@@ -49,12 +56,58 @@ class ProductRepository:
|
||||
product.title = snapshot.title
|
||||
if snapshot.category:
|
||||
product.category = snapshot.category
|
||||
if snapshot.type:
|
||||
product.type = snapshot.type
|
||||
if snapshot.description:
|
||||
product.description = snapshot.description
|
||||
if snapshot.currency:
|
||||
product.currency = snapshot.currency
|
||||
if snapshot.msrp is not None:
|
||||
product.msrp = snapshot.msrp
|
||||
if snapshot.rating_value is not None:
|
||||
product.rating_value = snapshot.rating_value
|
||||
if snapshot.rating_count is not None:
|
||||
product.rating_count = snapshot.rating_count
|
||||
if snapshot.amazon_choice is not None:
|
||||
product.amazon_choice = snapshot.amazon_choice
|
||||
if snapshot.amazon_choice_label:
|
||||
product.amazon_choice_label = snapshot.amazon_choice_label
|
||||
if snapshot.discount_text:
|
||||
product.discount_text = snapshot.discount_text
|
||||
if snapshot.stock_text:
|
||||
product.stock_text = snapshot.stock_text
|
||||
if snapshot.in_stock is not None:
|
||||
product.in_stock = snapshot.in_stock
|
||||
if snapshot.model_number:
|
||||
product.model_number = snapshot.model_number
|
||||
if snapshot.model_name:
|
||||
product.model_name = snapshot.model_name
|
||||
|
||||
def apply_classification(self, snapshot: ProductSnapshot) -> None:
    """Apply the first matching active classification rule to ``snapshot``.

    Active rules are read ordered by ``sort_order`` then ``id``. A rule
    matches when one of its keywords occurs as a substring of the snapshot
    title. The comparison ignores case and diacritics, so an unaccented
    keyword such as "ecran" matches a title containing "Écran". Only the
    non-empty ``category`` / ``type`` of the matching rule overwrite the
    snapshot; the snapshot is left untouched when it has no title or when no
    rule matches.

    Args:
        snapshot: Snapshot to classify; modified in place.
    """
    import unicodedata  # local import: keeps this block self-contained

    def _fold(text: str) -> str:
        # Decompose accented characters, drop the combining marks, fold case.
        decomposed = unicodedata.normalize("NFKD", text)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch)).casefold()

    if not snapshot.title:
        return

    rules = (
        self.session.query(ClassificationRule)
        .filter(ClassificationRule.is_active.is_(True))
        .order_by(ClassificationRule.sort_order, ClassificationRule.id)
        .all()
    )
    if not rules:
        return

    title = _fold(snapshot.title)
    for rule in rules:
        keywords = rule.keywords or []
        if isinstance(keywords, str):
            # Tolerate a single keyword stored as a bare string.
            keywords = [keywords]
        # Ignore non-string JSON values and keywords that fold to nothing
        # (an empty needle would match every title).
        needles = [_fold(keyword) for keyword in keywords if isinstance(keyword, str)]
        if any(needle and needle in title for needle in needles):
            if rule.category:
                snapshot.category = rule.category
            if rule.type:
                snapshot.type = rule.type
            return
|
||||
|
||||
def add_price_history(self, product: Product, snapshot: ProductSnapshot) -> Optional[PriceHistory]:
|
||||
"""Ajoute une entree d'historique de prix si inexistante."""
|
||||
|
||||
Binary file not shown.
@@ -25,7 +25,12 @@ class ScrapingPipeline:
|
||||
def __init__(self, config: Optional[AppConfig] = None) -> None:
|
||||
self.config = config
|
||||
|
||||
def process_snapshot(self, snapshot: ProductSnapshot, save_to_db: bool = True) -> Optional[int]:
|
||||
def process_snapshot(
|
||||
self,
|
||||
snapshot: ProductSnapshot,
|
||||
save_to_db: bool = True,
|
||||
apply_classification: bool = True,
|
||||
) -> Optional[int]:
|
||||
"""
|
||||
Persiste un snapshot en base si active.
|
||||
|
||||
@@ -39,6 +44,8 @@ class ScrapingPipeline:
|
||||
try:
|
||||
with get_session(app_config) as session:
|
||||
repo = ProductRepository(session)
|
||||
if apply_classification:
|
||||
repo.apply_classification(snapshot)
|
||||
product_id = repo.safe_save_snapshot(snapshot)
|
||||
session.commit()
|
||||
return product_id
|
||||
|
||||
Binary file not shown.
@@ -15,6 +15,13 @@ price:
|
||||
- "#priceblock_dealprice"
|
||||
- ".a-price-range .a-price .a-offscreen"
|
||||
|
||||
# Texte de réduction explicite
|
||||
discount_text:
|
||||
- "#regularprice_savings"
|
||||
- "#dealprice_savings"
|
||||
- "#savingsPercentage"
|
||||
- "span.savingsPercentage"
|
||||
|
||||
# Devise (généralement dans le symbole)
|
||||
currency:
|
||||
- "span.a-price-symbol"
|
||||
@@ -32,6 +39,24 @@ stock_status:
|
||||
- "#availability"
|
||||
- ".a-declarative .a-size-medium"
|
||||
|
||||
# Note moyenne
|
||||
rating_value:
|
||||
- "#acrPopover"
|
||||
- "#averageCustomerReviews .a-icon-alt"
|
||||
- "#averageCustomerReviews span.a-icon-alt"
|
||||
|
||||
# Nombre d'évaluations
|
||||
rating_count:
|
||||
- "#acrCustomerReviewText"
|
||||
- "#acrCustomerReviewLink"
|
||||
|
||||
# Badge Choix d'Amazon
|
||||
amazon_choice:
|
||||
- "#acBadge_feature_div"
|
||||
- "#acBadge_feature_div .ac-badge"
|
||||
- "#acBadge_feature_div .ac-badge-rectangle"
|
||||
- "#acBadge_feature_div .ac-badge-rectangle-icon"
|
||||
|
||||
# Images produit
|
||||
images:
|
||||
- "#landingImage"
|
||||
@@ -44,6 +69,13 @@ category:
|
||||
- "#wayfinding-breadcrumbs_feature_div"
|
||||
- ".a-breadcrumb"
|
||||
|
||||
# Description (détails de l'article)
|
||||
description:
|
||||
- "#detailBullets_feature_div"
|
||||
- "#detailBulletsWrapper_feature_div"
|
||||
- "#productDetails_detailBullets_sections1"
|
||||
- "#feature-bullets"
|
||||
|
||||
# Caractéristiques techniques (table specs)
|
||||
specs_table:
|
||||
- "#productDetails_techSpec_section_1"
|
||||
|
||||
@@ -130,13 +130,19 @@ class AmazonStore(BaseStore):
|
||||
title = self._extract_title(soup, debug_info)
|
||||
price = self._extract_price(soup, debug_info)
|
||||
currency = self._extract_currency(soup, debug_info)
|
||||
stock_status = self._extract_stock(soup, debug_info)
|
||||
images = self._extract_images(soup, debug_info)
|
||||
stock_status, stock_text, in_stock = self._extract_stock_details(soup, debug_info)
|
||||
main_image, gallery_images, images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
|
||||
rating_value = self._extract_rating_value(soup, debug_info)
|
||||
rating_count = self._extract_rating_count(soup, debug_info)
|
||||
amazon_choice, amazon_choice_label = self._extract_amazon_choice(soup, debug_info)
|
||||
discount_text = self._extract_discount_text(soup, debug_info)
|
||||
model_number, model_name = self._extract_model_details(specs)
|
||||
asin = reference
|
||||
|
||||
# Déterminer le statut final (ne pas écraser FAILED)
|
||||
if debug_info.status != DebugStatus.FAILED:
|
||||
@@ -153,12 +159,24 @@ class AmazonStore(BaseStore):
|
||||
currency=currency or "EUR",
|
||||
shipping_cost=None, # Difficile à extraire
|
||||
stock_status=stock_status,
|
||||
stock_text=stock_text,
|
||||
in_stock=in_stock,
|
||||
reference=reference,
|
||||
asin=asin,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
main_image=main_image,
|
||||
gallery_images=gallery_images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
rating_value=rating_value,
|
||||
rating_count=rating_count,
|
||||
amazon_choice=amazon_choice,
|
||||
amazon_choice_label=amazon_choice_label,
|
||||
discount_text=discount_text,
|
||||
model_number=model_number,
|
||||
model_name=model_name,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -203,14 +221,26 @@ class AmazonStore(BaseStore):
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Extract the description from the item-details sections of the page.

    The selectors configured under ``description`` are tried in order. For
    the first element found that yields content, its non-empty ``<li>``
    entries are returned one per line; if it has none, all of its text is
    returned joined with single spaces. Returns ``None`` when nothing yields
    text. ``debug`` is not used here.
    """
    configured = self.get_selector("description", [])
    candidates = [configured] if isinstance(configured, str) else configured

    for css in candidates:
        node = soup.select_one(css)
        if not node:
            continue

        # Bullet entries take precedence over the raw text of the container.
        bullets = []
        for entry in node.select("li"):
            line = entry.get_text(" ", strip=True)
            if line:
                bullets.append(line)
        if bullets:
            return "\n".join(bullets)

        flattened = " ".join(node.stripped_strings)
        if flattened:
            return flattened

    return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
@@ -271,8 +301,10 @@ class AmazonStore(BaseStore):
|
||||
# Défaut basé sur le domaine
|
||||
return "EUR"
|
||||
|
||||
def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
|
||||
"""Extrait le statut de stock."""
|
||||
def _extract_stock_details(
|
||||
self, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> tuple[StockStatus, Optional[str], Optional[bool]]:
|
||||
"""Extrait le statut de stock avec texte brut."""
|
||||
selectors = self.get_selector("stock_status", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -280,22 +312,27 @@ class AmazonStore(BaseStore):
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True).lower()
|
||||
if "en stock" in text or "available" in text or "in stock" in text:
|
||||
return StockStatus.IN_STOCK
|
||||
text = element.get_text(strip=True)
|
||||
normalized = text.lower()
|
||||
if "en stock" in normalized or "available" in normalized or "in stock" in normalized:
|
||||
return StockStatus.IN_STOCK, text, True
|
||||
elif (
|
||||
"rupture" in text
|
||||
or "indisponible" in text
|
||||
or "out of stock" in text
|
||||
"rupture" in normalized
|
||||
or "indisponible" in normalized
|
||||
or "out of stock" in normalized
|
||||
):
|
||||
return StockStatus.OUT_OF_STOCK
|
||||
return StockStatus.OUT_OF_STOCK, text, False
|
||||
|
||||
return StockStatus.UNKNOWN
|
||||
return StockStatus.UNKNOWN, None, None
|
||||
|
||||
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
|
||||
"""Extrait les URLs d'images."""
|
||||
images = []
|
||||
seen = set()
|
||||
def _extract_images(
|
||||
self, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> tuple[Optional[str], list[str], list[str]]:
|
||||
"""Extrait l'image principale et la galerie."""
|
||||
images: list[str] = []
|
||||
seen: set[str] = set()
|
||||
main_image: Optional[str] = None
|
||||
max_gallery = 15
|
||||
selectors = self.get_selector("images", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -309,6 +346,8 @@ class AmazonStore(BaseStore):
|
||||
if self._is_product_image(url) and url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
if main_image is None:
|
||||
main_image = url
|
||||
dynamic = element.get("data-a-dynamic-image")
|
||||
if dynamic:
|
||||
urls = self._extract_dynamic_images(dynamic)
|
||||
@@ -316,6 +355,8 @@ class AmazonStore(BaseStore):
|
||||
if self._is_product_image(dyn_url) and dyn_url not in seen:
|
||||
images.append(dyn_url)
|
||||
seen.add(dyn_url)
|
||||
if main_image is None:
|
||||
main_image = dyn_url
|
||||
|
||||
# Fallback: chercher tous les img tags si aucune image trouvée
|
||||
if not images:
|
||||
@@ -326,8 +367,15 @@ class AmazonStore(BaseStore):
|
||||
if url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
if main_image is None:
|
||||
main_image = url
|
||||
|
||||
return images
|
||||
if main_image is None and images:
|
||||
main_image = images[0]
|
||||
gallery_images = [url for url in images if url != main_image]
|
||||
gallery_images = gallery_images[:max_gallery]
|
||||
final_images = [main_image] + gallery_images if main_image else gallery_images
|
||||
return main_image, gallery_images, final_images
|
||||
|
||||
def _extract_dynamic_images(self, raw: str) -> list[str]:
|
||||
"""Extrait les URLs du JSON data-a-dynamic-image."""
|
||||
@@ -393,8 +441,111 @@ class AmazonStore(BaseStore):
|
||||
if key and value:
|
||||
specs[key] = value
|
||||
|
||||
# Détails de l'article sous forme de liste
|
||||
detail_list = soup.select("#detailBullets_feature_div li")
|
||||
for item in detail_list:
|
||||
text = item.get_text(" ", strip=True)
|
||||
if ":" not in text:
|
||||
continue
|
||||
key, value = text.split(":", 1)
|
||||
key = key.strip()
|
||||
value = value.strip()
|
||||
if key and value and key not in specs:
|
||||
specs[key] = value
|
||||
|
||||
return specs
|
||||
|
||||
def _extract_rating_value(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Extract the average customer rating.

    The selectors configured under ``rating_value`` are tried in order and
    the first element containing a number wins.

    Args:
        soup: Parsed product page.
        debug: Debug collector; not used by this extractor.

    Returns:
        The rating as a float (a decimal comma is accepted), or ``None`` when
        no selector yields a number.
    """
    selectors = self.get_selector("rating_value", [])
    if isinstance(selectors, str):
        selectors = [selectors]

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue
        # Prefer the visible text; fall back to the ``title`` attribute.
        text = element.get_text(" ", strip=True) or element.get("title", "").strip()
        # The match must start with a digit: a lone "." or "," ahead of the
        # number can no longer be captured and make the element be skipped.
        match = re.search(r"\d+(?:[.,]\d+)?", text)
        if match:
            return float(match.group(0).replace(",", "."))
    return None
|
||||
|
||||
def _extract_rating_count(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[int]:
    """Extract the number of customer ratings.

    The selectors configured under ``rating_count`` are tried in order. The
    first number found in an element's text is returned. Digit groups may be
    separated by whitespace (including no-break spaces) or by "." / ","
    followed by exactly three digits, so "1 234", "1,234" and "1.234" all
    give 1234.

    Args:
        soup: Parsed product page.
        debug: Debug collector; not used by this extractor.

    Returns:
        The count as an int, or ``None`` when no selector yields a number.
    """
    selectors = self.get_selector("rating_count", [])
    if isinstance(selectors, str):
        selectors = [selectors]

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue
        text = element.get_text(" ", strip=True)
        # Start on a digit so leading words/spaces cannot produce an empty
        # match; accept "," and "." only as thousands separators.
        match = re.search(r"\d[\d\s\u202f\u00a0]*(?:[.,]\d{3}(?!\d))*", text)
        if match:
            return int(re.sub(r"\D", "", match.group(0)))
    return None
|
||||
|
||||
def _extract_amazon_choice(
    self, soup: BeautifulSoup, debug: DebugInfo
) -> tuple[Optional[bool], Optional[str]]:
    """Detect the "Amazon's Choice" badge and its label.

    The selectors configured under ``amazon_choice`` are tried in order. An
    element counts as a badge when it exposes a label (visible text,
    ``aria-label``, ``title`` or ``data-a-badge-label``) or when it is, or
    contains, an element whose class includes ``ac-badge``. A matched element
    with neither is skipped, so a bare wrapper matched by a broad selector is
    not reported as a badge.

    Args:
        soup: Parsed product page.
        debug: Debug collector; not used by this extractor.

    Returns:
        ``(True, label)`` when a labelled badge is found, ``(True, None)``
        for badge markup without a label, ``(None, None)`` otherwise.
    """
    selectors = self.get_selector("amazon_choice", [])
    if isinstance(selectors, str):
        selectors = [selectors]

    for selector in selectors:
        element = soup.select_one(selector)
        if not element:
            continue

        label_candidates = (
            element.get_text(" ", strip=True),
            element.get("aria-label", "").strip(),
            element.get("title", "").strip(),
            element.get("data-a-badge-label", "").strip(),
        )
        label = next((item for item in label_candidates if item), None)
        if label:
            return True, label

        # No label: only accept real badge markup.
        # NOTE(review): assumes the feature wrapper can be present but empty
        # on pages without the badge - confirm against captured pages.
        classes = element.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        if any("ac-badge" in css_class for css_class in classes):
            return True, None
        if element.select_one('[class*="ac-badge"]') is not None:
            return True, None

    return None, None
|
||||
|
||||
def _extract_discount_text(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Return the explicit discount text shown on the page, if any.

    The selectors configured under ``discount_text`` are tried in order; the
    first matched element with non-empty text wins. Returns ``None`` when no
    selector yields text. ``debug`` is not used here.
    """
    configured = self.get_selector("discount_text", [])
    candidates = [configured] if isinstance(configured, str) else configured

    # Lazy pipeline: lookups stop as soon as one element provides text.
    nodes = (soup.select_one(css) for css in candidates)
    texts = (node.get_text(" ", strip=True) for node in nodes if node)
    return next((content for content in texts if content), None)
|
||||
|
||||
def _extract_model_details(self, specs: dict[str, str]) -> tuple[Optional[str], Optional[str]]:
    """Pick the model number and model name out of the extracted specs.

    Spec keys are compared case-insensitively against the French labels, in
    both accented and unaccented spellings. Every entry is examined, so when
    several keys match the same field the last one wins.

    Returns:
        ``(model_number, model_name)``; each is ``None`` when not found.
    """
    number_labels = ("numéro du modèle de l'article", "numero du modele de l'article")
    name_labels = ("nom du modèle", "nom du modele")

    found_number = None
    found_name = None
    for label, content in specs.items():
        lowered = label.lower()
        if any(marker in lowered for marker in number_labels):
            found_number = content
        if any(marker in lowered for marker in name_labels):
            found_name = content
    return found_number, found_name
|
||||
|
||||
def _extract_asin_from_html(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extrait l'ASIN depuis le HTML (fallback)."""
|
||||
selectors = self.get_selector("asin", [])
|
||||
|
||||
@@ -6,6 +6,7 @@ from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
from typing import Optional
|
||||
|
||||
import redis
|
||||
@@ -127,11 +128,13 @@ class ScrapingScheduler:
|
||||
interval_hours: int = 24,
|
||||
use_playwright: Optional[bool] = None,
|
||||
save_db: bool = True,
|
||||
job_id: Optional[str] = None,
|
||||
) -> ScheduledJobInfo:
|
||||
"""Planifie un scraping recurrent (intervalle en heures)."""
|
||||
interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
|
||||
next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)
|
||||
|
||||
resolved_job_id = job_id or self._job_id_for_url(url)
|
||||
job = self.scheduler.schedule(
|
||||
scheduled_time=next_run,
|
||||
func=scrape_product,
|
||||
@@ -139,6 +142,13 @@ class ScrapingScheduler:
|
||||
kwargs={"use_playwright": use_playwright, "save_db": save_db},
|
||||
interval=interval_seconds,
|
||||
repeat=None,
|
||||
id=resolved_job_id,
|
||||
)
|
||||
logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}")
|
||||
return ScheduledJobInfo(job_id=job.id, next_run=next_run)
|
||||
|
||||
@staticmethod
def _job_id_for_url(url: str) -> str:
    """Build a stable job id from a URL so the same URL always maps to one id.

    The URL is stripped and lower-cased, hashed with SHA-1, and the hex
    digest is prefixed with ``scrape_``.
    """
    normalized_url = url.strip().lower()
    digest = hashlib.sha1(normalized_url.encode("utf-8")).hexdigest()
    return "scrape_" + digest
|
||||
|
||||
@@ -157,6 +157,36 @@ def scrape_product(
|
||||
)
|
||||
success = False
|
||||
fetch_error = str(exc)
|
||||
# Si captcha detecte via HTTP, forcer une tentative Playwright.
|
||||
if (
|
||||
fetch_method == FetchMethod.HTTP
|
||||
and use_playwright
|
||||
and snapshot.debug.errors
|
||||
and any("captcha" in error.lower() for error in snapshot.debug.errors)
|
||||
):
|
||||
logger.info("[FETCH] Captcha detecte, tentative Playwright")
|
||||
pw_result = fetch_playwright(
|
||||
canonical_url,
|
||||
headless=not headful,
|
||||
timeout_ms=timeout_ms,
|
||||
save_screenshot=save_screenshot,
|
||||
)
|
||||
if pw_result.success and pw_result.html:
|
||||
try:
|
||||
snapshot = store.parse(pw_result.html, canonical_url)
|
||||
snapshot.debug.method = FetchMethod.PLAYWRIGHT
|
||||
snapshot.debug.duration_ms = pw_result.duration_ms
|
||||
snapshot.debug.html_size_bytes = len(pw_result.html.encode("utf-8"))
|
||||
snapshot.add_note("Captcha detecte via HTTP, fallback Playwright")
|
||||
success = snapshot.debug.status != DebugStatus.FAILED
|
||||
except Exception as exc:
|
||||
snapshot.add_note(f"Fallback Playwright echoue: {exc}")
|
||||
logger.error(f"[PARSE] Exception fallback Playwright: {exc}")
|
||||
fetch_error = str(exc)
|
||||
else:
|
||||
error = pw_result.error or "Erreur Playwright"
|
||||
snapshot.add_note(f"Fallback Playwright echoue: {error}")
|
||||
fetch_error = error
|
||||
else:
|
||||
snapshot = ProductSnapshot(
|
||||
source=store.store_id,
|
||||
|
||||
Reference in New Issue
Block a user