Files
scrap/pricewatch/app/db/models.py
Gilles Soulier d0b73b9319 codex2
2026-01-14 21:54:55 +01:00

359 lines
12 KiB
Python

"""
Modèles SQLAlchemy pour PriceWatch Phase 2.
Schéma normalisé pour persistence PostgreSQL:
- products: Catalogue produits (déduplication sur source + reference)
- price_history: Historique prix time-series
- product_images: Images produit (N par produit)
- product_specs: Caractéristiques produit (key-value)
- scraping_logs: Logs observabilité pour debugging
Justification technique:
- Normalisation: products séparée de price_history (catalogue vs time-series)
- Clé naturelle: (source, reference) comme unique constraint (ASIN Amazon, etc.)
- Pas de JSONB pour données structurées: tables séparées pour images/specs
- JSONB uniquement pour données variables: errors, notes dans logs
"""
from datetime import datetime, timezone
from decimal import Decimal
from typing import List, Optional
from sqlalchemy import (
TIMESTAMP,
CheckConstraint,
Column,
ForeignKey,
Index,
Integer,
JSON,
Numeric,
Boolean,
String,
Text,
UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base class shared by every SQLAlchemy model in this module."""
def utcnow() -> datetime:
    """
    Return the current time as a timezone-aware ``datetime`` in UTC.

    Used as the ``default``/``onupdate`` callable for timestamp columns in this module.
    """
    # NOTE(review): the columns using this callable are declared with a bare TIMESTAMP
    # (no timezone=True) while the value produced here is timezone-aware; confirm how
    # the database driver in use handles that mismatch.
    return datetime.now(tz=timezone.utc)
class Product(Base):
    """
    Product catalogue: one row per unique product.

    The natural key is ``(source, reference)``, e.g. ``("amazon", "B08N5WRWNW")``,
    enforced by the ``uq_product_source_reference`` unique constraint.
    Per the original design notes, title, category and url are refreshed on every
    scrape, and prices are tracked through the one-to-many ``price_history`` relation.

    Ownership of related rows:
    - ``price_history``, ``images`` and ``specs`` belong to the product and are
      deleted with it (``cascade="all, delete-orphan"``).
    - ``logs`` are only linked to the product. ``ScrapingLog.product_id`` is nullable
      with ``ondelete="SET NULL"``, so logs must outlive the product; the relationship
      therefore uses the default cascade and the ORM nulls the foreign key on delete
      instead of deleting the log rows.
    """

    __tablename__ = "products"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Natural key; uniqueness of the pair is declared in __table_args__.
    source: Mapped[str] = mapped_column(
        String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)"
    )
    reference: Mapped[str] = mapped_column(
        String(100), nullable=False, comment="Product reference (ASIN, SKU, etc.)"
    )

    # Product metadata.
    url: Mapped[str] = mapped_column(Text, nullable=False, comment="Canonical product URL")
    title: Mapped[Optional[str]] = mapped_column(Text, nullable=True, comment="Product title")
    category: Mapped[Optional[str]] = mapped_column(
        Text, nullable=True, comment="Product category (breadcrumb)"
    )
    description: Mapped[Optional[str]] = mapped_column(
        Text, nullable=True, comment="Product description"
    )
    currency: Mapped[Optional[str]] = mapped_column(
        String(3), nullable=True, comment="Currency code (EUR, USD, GBP)"
    )
    msrp: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2), nullable=True, comment="Recommended price"
    )

    # Timestamps; last_updated_at is also refreshed on every ORM UPDATE via onupdate.
    first_seen_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, default=utcnow, comment="First scraping timestamp"
    )
    last_updated_at: Mapped[datetime] = mapped_column(
        TIMESTAMP,
        nullable=False,
        default=utcnow,
        onupdate=utcnow,
        comment="Last metadata update",
    )

    # Owned children: removed together with the product.
    price_history: Mapped[List["PriceHistory"]] = relationship(
        "PriceHistory", back_populates="product", cascade="all, delete-orphan"
    )
    images: Mapped[List["ProductImage"]] = relationship(
        "ProductImage", back_populates="product", cascade="all, delete-orphan"
    )
    specs: Mapped[List["ProductSpec"]] = relationship(
        "ProductSpec", back_populates="product", cascade="all, delete-orphan"
    )

    # Linked, not owned: no delete/delete-orphan cascade, so deleting a product (or
    # removing a log from this collection) only nulls ScrapingLog.product_id, matching
    # the ondelete="SET NULL" foreign key on the scraping_logs side.
    logs: Mapped[List["ScrapingLog"]] = relationship("ScrapingLog", back_populates="product")

    # Constraints and indexes.
    __table_args__ = (
        UniqueConstraint("source", "reference", name="uq_product_source_reference"),
        Index("ix_product_source", "source"),
        Index("ix_product_reference", "reference"),
        Index("ix_product_last_updated", "last_updated_at"),
    )

    def __repr__(self) -> str:
        """Return a short debug representation with the id and the natural key."""
        return f"<Product(id={self.id}, source={self.source}, reference={self.reference})>"
class PriceHistory(Base):
    """
    Price history (time series) for a product.

    Intended to hold one row per scrape that yielded a price. The unique constraint
    on ``(product_id, fetched_at)`` prevents duplicate samples for the same instant.
    """

    __tablename__ = "price_history"

    # Constraints and indexes; the CHECK constraints restrict the status/method
    # columns to their allowed string values.
    __table_args__ = (
        UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"),
        Index("ix_price_history_product_id", "product_id"),
        Index("ix_price_history_fetched_at", "fetched_at"),
        CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"),
        CheckConstraint("fetch_method IN ('http', 'playwright')"),
        CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )

    # Owning product; rows are removed by the database when the product is deleted.
    product_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("products.id", ondelete="CASCADE"),
        nullable=False,
    )

    # Price sample.
    price: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2),
        nullable=True,
        comment="Product price",
    )
    shipping_cost: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2),
        nullable=True,
        comment="Shipping cost",
    )
    stock_status: Mapped[Optional[str]] = mapped_column(
        String(20),
        nullable=True,
        comment="Stock status (in_stock, out_of_stock, unknown)",
    )

    # How and when the sample was fetched.
    fetch_method: Mapped[str] = mapped_column(
        String(20),
        nullable=False,
        comment="Fetch method (http, playwright)",
    )
    fetch_status: Mapped[str] = mapped_column(
        String(20),
        nullable=False,
        comment="Fetch status (success, partial, failed)",
    )
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP,
        nullable=False,
        comment="Scraping timestamp",
    )

    # Parent side of Product.price_history.
    product: Mapped["Product"] = relationship(
        "Product",
        back_populates="price_history",
    )

    def __repr__(self) -> str:
        """Return a short debug representation of the price sample."""
        details = ", ".join(
            (
                f"id={self.id}",
                f"product_id={self.product_id}",
                f"price={self.price}",
                f"fetched_at={self.fetched_at}",
            )
        )
        return f"<PriceHistory({details})>"
class ProductImage(Base):
    """
    Product images (N images per product).

    The unique constraint on ``(product_id, image_url)`` prevents duplicates, and
    ``position`` keeps the display order of the images.
    """

    __tablename__ = "product_images"

    # Constraints and indexes.
    __table_args__ = (
        UniqueConstraint("product_id", "image_url", name="uq_product_image_url"),
        Index("ix_product_image_product_id", "product_id"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )

    # Owning product; rows are removed by the database when the product is deleted.
    product_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("products.id", ondelete="CASCADE"),
        nullable=False,
    )

    # Image payload.
    image_url: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Image URL",
    )
    position: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
        comment="Image position (0=main)",
    )

    # Parent side of Product.images.
    product: Mapped["Product"] = relationship(
        "Product",
        back_populates="images",
    )

    def __repr__(self) -> str:
        """Return a short debug representation of the image row."""
        details = ", ".join(
            (
                f"id={self.id}",
                f"product_id={self.product_id}",
                f"position={self.position}",
            )
        )
        return f"<ProductImage({details})>"
class ProductSpec(Base):
    """
    Product specifications stored as key-value pairs.

    The unique constraint on ``(product_id, spec_key)`` prevents duplicate keys for a
    product; ``spec_key`` is also indexed on its own for lookups by key.
    """

    __tablename__ = "product_specs"

    # Constraints and indexes.
    __table_args__ = (
        UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"),
        Index("ix_product_spec_product_id", "product_id"),
        Index("ix_product_spec_key", "spec_key"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )

    # Owning product; rows are removed by the database when the product is deleted.
    product_id: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("products.id", ondelete="CASCADE"),
        nullable=False,
    )

    # Specification payload.
    spec_key: Mapped[str] = mapped_column(
        String(200),
        nullable=False,
        comment="Specification key (e.g., 'Brand', 'Color')",
    )
    spec_value: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Specification value",
    )

    # Parent side of Product.specs.
    product: Mapped["Product"] = relationship(
        "Product",
        back_populates="specs",
    )

    def __repr__(self) -> str:
        """Return a short debug representation of the specification row."""
        details = ", ".join(
            (
                f"id={self.id}",
                f"product_id={self.product_id}",
                f"key={self.spec_key}",
            )
        )
        return f"<ProductSpec({details})>"
class ScrapingLog(Base):
    """
    Observability log of a scraping attempt, kept for debugging.

    The foreign key to ``products`` is optional (nullable, ``ondelete="SET NULL"``),
    so a log row can exist without a product. ``errors`` and ``notes`` use a JSON
    column (JSONB on PostgreSQL) because their structure is variable. The indexed
    status/source/timestamp columns are meant to support analytics such as success
    rate or average duration.
    """

    __tablename__ = "scraping_logs"

    # Constraints and indexes; the CHECK constraints restrict the method/status
    # columns to their allowed string values.
    __table_args__ = (
        Index("ix_scraping_log_product_id", "product_id"),
        Index("ix_scraping_log_source", "source"),
        Index("ix_scraping_log_fetched_at", "fetched_at"),
        Index("ix_scraping_log_fetch_status", "fetch_status"),
        CheckConstraint("fetch_method IN ('http', 'playwright')"),
        CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )

    # Optional link to the product; nulled by the database if the product goes away.
    product_id: Mapped[Optional[int]] = mapped_column(
        Integer,
        ForeignKey("products.id", ondelete="SET NULL"),
        nullable=True,
    )

    # What was scraped.
    url: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Scraped URL",
    )
    source: Mapped[str] = mapped_column(
        String(50),
        nullable=False,
        comment="Store ID (amazon, cdiscount, etc.)",
    )
    reference: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
        comment="Product reference (if extracted)",
    )

    # How and when it was fetched.
    fetch_method: Mapped[str] = mapped_column(
        String(20),
        nullable=False,
        comment="Fetch method (http, playwright)",
    )
    fetch_status: Mapped[str] = mapped_column(
        String(20),
        nullable=False,
        comment="Fetch status (success, partial, failed)",
    )
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP,
        nullable=False,
        default=utcnow,
        comment="Scraping timestamp",
    )

    # Performance metrics.
    duration_ms: Mapped[Optional[int]] = mapped_column(
        Integer,
        nullable=True,
        comment="Fetch duration in milliseconds",
    )
    html_size_bytes: Mapped[Optional[int]] = mapped_column(
        Integer,
        nullable=True,
        comment="HTML response size in bytes",
    )

    # Debug payloads: generic JSON, switched to JSONB on the PostgreSQL dialect.
    errors: Mapped[Optional[list[str]]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=True,
        comment="Error messages (list of strings)",
    )
    notes: Mapped[Optional[list[str]]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=True,
        comment="Debug notes (list of strings)",
    )

    # Parent side of Product.logs (may be None).
    product: Mapped[Optional["Product"]] = relationship(
        "Product",
        back_populates="logs",
    )

    def __repr__(self) -> str:
        """Return a short debug representation of the log entry."""
        details = ", ".join(
            (
                f"id={self.id}",
                f"url={self.url}",
                f"status={self.fetch_status}",
                f"fetched_at={self.fetched_at}",
            )
        )
        return f"<ScrapingLog({details})>"
class Webhook(Base):
    """
    Webhook endpoint used for external notifications.

    Each row binds an event name to a target URL, with an ``enabled`` flag and an
    optional secret.
    """

    __tablename__ = "webhooks"

    # Indexes on the event name and the enabled flag.
    __table_args__ = (
        Index("ix_webhook_event", "event"),
        Index("ix_webhook_enabled", "enabled"),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )

    # Subscription definition.
    event: Mapped[str] = mapped_column(
        String(50),
        nullable=False,
        comment="Event name",
    )
    url: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Webhook URL",
    )
    enabled: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=True,
    )
    secret: Mapped[Optional[str]] = mapped_column(
        String(200),
        nullable=True,
        comment="Secret optionnel",
    )

    # Creation time, filled in by utcnow on insert.
    created_at: Mapped[datetime] = mapped_column(
        TIMESTAMP,
        nullable=False,
        default=utcnow,
        comment="Creation timestamp",
    )

    def __repr__(self) -> str:
        """Return a short debug representation of the webhook."""
        details = ", ".join(
            (
                f"id={self.id}",
                f"event={self.event}",
                f"url={self.url}",
            )
        )
        return f"<Webhook({details})>"