This commit is contained in:
2026-01-14 07:03:38 +01:00
parent ecda149a4b
commit c91c0f1fc9
61 changed files with 4388 additions and 38 deletions

41
pricewatch/app/db/__init__.py Executable file
View File

@@ -0,0 +1,41 @@
"""
Module de base de données pour PriceWatch Phase 2.
Gère la persistence PostgreSQL avec SQLAlchemy ORM.
"""
from pricewatch.app.db.connection import (
check_db_connection,
get_engine,
get_session,
get_session_factory,
init_db,
reset_engine,
)
from pricewatch.app.db.repository import ProductRepository
from pricewatch.app.db.models import (
Base,
Product,
PriceHistory,
ProductImage,
ProductSpec,
ScrapingLog,
)
__all__ = [
# Models
"Base",
"Product",
"PriceHistory",
"ProductImage",
"ProductSpec",
"ScrapingLog",
"ProductRepository",
# Connection
"get_engine",
"get_session_factory",
"get_session",
"init_db",
"check_db_connection",
"reset_engine",
]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

238
pricewatch/app/db/connection.py Executable file
View File

@@ -0,0 +1,238 @@
"""
Gestion des connexions PostgreSQL pour PriceWatch Phase 2.
Fournit:
- Engine SQLAlchemy avec connection pooling
- Session factory avec context manager
- Initialisation des tables
- Health check
Justification technique:
- Connection pooling: réutilisation connexions pour performance
- Context manager: garantit fermeture session (pas de leak)
- pool_pre_ping: vérifie connexion avant usage (robustesse)
- echo=debug: logs SQL en mode debug
"""
from contextlib import contextmanager
from typing import Generator, Optional
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from sqlalchemy.engine.url import make_url
from sqlalchemy.exc import OperationalError, SQLAlchemyError
from sqlalchemy.orm import Session, sessionmaker
from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.db.models import Base
logger = get_logger("db.connection")
# Global engine instance (singleton)
_engine: Optional[Engine] = None
_session_factory: Optional[sessionmaker] = None
def get_engine(config: Optional[AppConfig] = None) -> Engine:
    """
    Return the singleton SQLAlchemy Engine, creating it on first call.

    Args:
        config: Application config (falls back to get_config() when None).
            Only consulted on the first call; later calls reuse the engine.

    Returns:
        The configured SQLAlchemy Engine.

    Justification:
        - Singleton: one connection pool per application.
        - pool_pre_ping: validates a connection before use (avoids
          "connection closed" errors after idle periods).
        - pool_size=5 / max_overflow=10: at most 15 concurrent connections.
        - echo=config.debug: SQL statement logging while debugging.
    """
    global _engine
    if _engine is None:
        if config is None:
            config = get_config()
        db_url = config.db.url
        url = make_url(db_url)
        is_sqlite = url.get_backend_name() == "sqlite"
        # Never log the raw DSN: it may embed credentials. render_as_string
        # with hide_password=True masks the password portion.
        logger.info(f"Creating database engine: {url.render_as_string(hide_password=True)}")
        engine_kwargs = {
            "pool_pre_ping": True,   # check connection liveness before use
            "pool_recycle": 3600,    # recycle connections older than 1h
            "echo": config.debug,    # SQL logging in debug mode
        }
        if not is_sqlite:
            # SQLite has no server-side pool; sizing options only apply to
            # client/server backends and would raise on SQLite.
            engine_kwargs.update(
                {
                    "pool_size": 5,
                    "max_overflow": 10,
                }
            )
        _engine = create_engine(db_url, **engine_kwargs)
        logger.info("Database engine created successfully")
    return _engine
def init_db(config: Optional[AppConfig] = None) -> None:
    """
    Initialize the database by creating every table declared on Base.

    Args:
        config: Application config (falls back to get_config() when None).

    Raises:
        OperationalError: when the database cannot be reached.
        SQLAlchemyError: when table creation fails.

    Note:
        Base.metadata.create_all() is idempotent — existing tables are
        left untouched.
    """
    cfg = config if config is not None else get_config()
    logger.info("Initializing database...")
    try:
        # Create every table registered on Base.metadata.
        Base.metadata.create_all(bind=get_engine(cfg))
        logger.info("Database initialized successfully")
        logger.info(f"Tables created: {', '.join(Base.metadata.tables.keys())}")
    except OperationalError as exc:
        logger.error(f"Failed to connect to database: {exc}")
        raise
    except SQLAlchemyError as exc:
        logger.error(f"Failed to create tables: {exc}")
        raise
def get_session_factory(config: Optional[AppConfig] = None) -> sessionmaker:
    """
    Return the singleton session factory, building it on first use.

    Args:
        config: Application config (falls back to get_config() when None).
            Only consulted when the factory (and engine) must be created.

    Returns:
        The SQLAlchemy sessionmaker.

    Justification:
        - expire_on_commit=False: objects stay usable after commit.
        - autocommit=False / autoflush=False: explicit transaction control.
    """
    global _session_factory
    if _session_factory is not None:
        return _session_factory
    _session_factory = sessionmaker(
        bind=get_engine(config),
        expire_on_commit=False,  # objects remain readable after commit
        autocommit=False,        # commits are explicit
        autoflush=False,         # flushes are explicit
    )
    logger.debug("Session factory created")
    return _session_factory
@contextmanager
def get_session(config: Optional[AppConfig] = None) -> Generator[Session, None, None]:
    """
    Context manager yielding a SQLAlchemy Session.

    Args:
        config: Application config (falls back to get_config() when None).

    Yields:
        An open SQLAlchemy Session.

    Usage:
        with get_session() as session:
            product = session.query(Product).filter_by(reference="B08N5WRWNW").first()
            session.commit()

    Justification:
        - Guarantees the session is closed (no connection leak).
        - Rolls back automatically when the block raises.
        - Committing remains the caller's responsibility.
    """
    db_session = get_session_factory(config)()
    try:
        logger.debug("Session opened")
        yield db_session
    except Exception as exc:
        # Undo any staged work before propagating the error.
        logger.error(f"Session error, rolling back: {exc}")
        db_session.rollback()
        raise
    finally:
        logger.debug("Session closed")
        db_session.close()
def check_db_connection(config: Optional[AppConfig] = None) -> bool:
    """
    Health check: verify that the database is reachable.

    Args:
        config: Application config (falls back to get_config() when None).

    Returns:
        True when the probe query succeeds, False otherwise.

    Note:
        Runs the trivial query ``SELECT 1``.
    """
    if config is None:
        config = get_config()
    try:
        with get_engine(config).connect() as conn:
            # Fetch the scalar to force the round-trip to complete.
            conn.execute(text("SELECT 1")).scalar()
        logger.info("Database connection OK")
        return True
    except OperationalError as exc:
        logger.error(f"Database connection failed: {exc}")
        return False
    except SQLAlchemyError as exc:
        logger.error(f"Database health check failed: {exc}")
        return False
def reset_engine() -> None:
    """
    Reset the global engine and session factory (test helper).

    Disposes the engine's pooled connections (when one exists) and clears
    both module-level singletons so the next get_engine() /
    get_session_factory() call rebuilds them from scratch.
    """
    global _engine, _session_factory
    if _engine is not None:
        logger.debug("Disposing database engine")
        _engine.dispose()
        _engine = None
    # Clear the factory too so it cannot hand out sessions bound to a
    # disposed (or stale) engine.
    _session_factory = None
    logger.debug("Engine reset complete")

Binary file not shown.

View File

@@ -0,0 +1,80 @@
"""
Configuration Alembic pour PriceWatch.
Recupere l'URL DB depuis AppConfig pour garantir un setup coherent.
"""
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
from pricewatch.app.core.config import get_config
from pricewatch.app.db.models import Base
# Alembic Config object: gives access to the values of the alembic.ini file.
config = context.config

# Configure Python logging from the ini file, when one was provided.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# SQLAlchemy metadata used by `alembic revision --autogenerate` to diff
# the models against the live schema.
target_metadata = Base.metadata
def _get_database_url() -> str:
    """Resolve the database URL from the application's own configuration."""
    return get_config().db.url
def run_migrations_offline() -> None:
    """
    Run migrations in offline mode.

    Configures the context with only a URL — no Engine, no DBAPI
    connection — so Alembic emits SQL instead of executing it.
    """
    context.configure(
        url=_get_database_url(),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
        compare_type=True,
    )
    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """
    Run migrations in online mode.

    Builds an Engine from the ini section (with the URL overridden from the
    application config) and applies migrations over a live connection.
    NullPool avoids keeping connections open after the run.
    """
    ini_section = config.get_section(config.config_ini_section) or {}
    ini_section["sqlalchemy.url"] = _get_database_url()

    connectable = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            compare_type=True,
        )
        with context.begin_transaction():
            context.run_migrations()
# Entry point: Alembic selects offline mode for SQL-script generation
# (e.g. `alembic upgrade --sql`), online mode otherwise.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()

View File

@@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# Revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}

View File

@@ -0,0 +1,124 @@
"""Initial schema
Revision ID: 20260114_01
Revises: None
Create Date: 2026-01-14 00:00:00
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# Revision identifiers, used by Alembic.
revision = "20260114_01"
down_revision = None
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the initial Phase 2 schema: products, price_history,
    product_images, product_specs and scraping_logs, plus their indexes.

    Fix: errors/notes previously used a hard-coded postgresql.JSONB(),
    which fails on any non-PostgreSQL backend even though connection.py
    explicitly supports SQLite. They now use a generic JSON type with a
    JSONB variant on PostgreSQL, mirroring models.ScrapingLog.
    """
    # JSON on generic backends, JSONB when the dialect is PostgreSQL.
    json_type = sa.JSON().with_variant(postgresql.JSONB(), "postgresql")

    # Product catalogue — natural key (source, reference).
    op.create_table(
        "products",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("source", sa.String(length=50), nullable=False),
        sa.Column("reference", sa.String(length=100), nullable=False),
        sa.Column("url", sa.Text(), nullable=False),
        sa.Column("title", sa.Text(), nullable=True),
        sa.Column("category", sa.Text(), nullable=True),
        sa.Column("currency", sa.String(length=3), nullable=True),
        sa.Column("first_seen_at", sa.TIMESTAMP(), nullable=False),
        sa.Column("last_updated_at", sa.TIMESTAMP(), nullable=False),
        sa.UniqueConstraint("source", "reference", name="uq_product_source_reference"),
    )
    op.create_index("ix_product_source", "products", ["source"], unique=False)
    op.create_index("ix_product_reference", "products", ["reference"], unique=False)
    op.create_index("ix_product_last_updated", "products", ["last_updated_at"], unique=False)

    # Price time-series — one row per successful scrape.
    op.create_table(
        "price_history",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("product_id", sa.Integer(), nullable=False),
        sa.Column("price", sa.Numeric(10, 2), nullable=True),
        sa.Column("shipping_cost", sa.Numeric(10, 2), nullable=True),
        sa.Column("stock_status", sa.String(length=20), nullable=True),
        sa.Column("fetch_method", sa.String(length=20), nullable=False),
        sa.Column("fetch_status", sa.String(length=20), nullable=False),
        sa.Column("fetched_at", sa.TIMESTAMP(), nullable=False),
        sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"),
        sa.UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"),
        sa.CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"),
        sa.CheckConstraint("fetch_method IN ('http', 'playwright')"),
        sa.CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )
    op.create_index("ix_price_history_product_id", "price_history", ["product_id"], unique=False)
    op.create_index("ix_price_history_fetched_at", "price_history", ["fetched_at"], unique=False)

    # Product images — ordered by position, deduplicated by URL.
    op.create_table(
        "product_images",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("product_id", sa.Integer(), nullable=False),
        sa.Column("image_url", sa.Text(), nullable=False),
        sa.Column("position", sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"),
        sa.UniqueConstraint("product_id", "image_url", name="uq_product_image_url"),
    )
    op.create_index("ix_product_image_product_id", "product_images", ["product_id"], unique=False)

    # Product specs — key/value pairs, unique per (product, key).
    op.create_table(
        "product_specs",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("product_id", sa.Integer(), nullable=False),
        sa.Column("spec_key", sa.String(length=200), nullable=False),
        sa.Column("spec_value", sa.Text(), nullable=False),
        sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"),
        sa.UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"),
    )
    op.create_index("ix_product_spec_product_id", "product_specs", ["product_id"], unique=False)
    op.create_index("ix_product_spec_key", "product_specs", ["spec_key"], unique=False)

    # Scraping observability logs — FK is optional (SET NULL on delete).
    op.create_table(
        "scraping_logs",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("product_id", sa.Integer(), nullable=True),
        sa.Column("url", sa.Text(), nullable=False),
        sa.Column("source", sa.String(length=50), nullable=False),
        sa.Column("reference", sa.String(length=100), nullable=True),
        sa.Column("fetch_method", sa.String(length=20), nullable=False),
        sa.Column("fetch_status", sa.String(length=20), nullable=False),
        sa.Column("fetched_at", sa.TIMESTAMP(), nullable=False),
        sa.Column("duration_ms", sa.Integer(), nullable=True),
        sa.Column("html_size_bytes", sa.Integer(), nullable=True),
        sa.Column("errors", json_type, nullable=True),
        sa.Column("notes", json_type, nullable=True),
        sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="SET NULL"),
        sa.CheckConstraint("fetch_method IN ('http', 'playwright')"),
        sa.CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )
    op.create_index("ix_scraping_log_product_id", "scraping_logs", ["product_id"], unique=False)
    op.create_index("ix_scraping_log_source", "scraping_logs", ["source"], unique=False)
    op.create_index("ix_scraping_log_fetched_at", "scraping_logs", ["fetched_at"], unique=False)
    op.create_index("ix_scraping_log_fetch_status", "scraping_logs", ["fetch_status"], unique=False)
def downgrade() -> None:
    """Drop the Phase 2 schema in reverse dependency order (children first,
    then the parent products table), removing each table's indexes first."""
    drop_plan = [
        (
            "scraping_logs",
            [
                "ix_scraping_log_fetch_status",
                "ix_scraping_log_fetched_at",
                "ix_scraping_log_source",
                "ix_scraping_log_product_id",
            ],
        ),
        ("product_specs", ["ix_product_spec_key", "ix_product_spec_product_id"]),
        ("product_images", ["ix_product_image_product_id"]),
        ("price_history", ["ix_price_history_fetched_at", "ix_price_history_product_id"]),
        ("products", ["ix_product_last_updated", "ix_product_reference", "ix_product_source"]),
    ]
    for table_name, index_names in drop_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table_name)
        op.drop_table(table_name)

320
pricewatch/app/db/models.py Executable file
View File

@@ -0,0 +1,320 @@
"""
Modèles SQLAlchemy pour PriceWatch Phase 2.
Schéma normalisé pour persistence PostgreSQL:
- products: Catalogue produits (déduplication sur source + reference)
- price_history: Historique prix time-series
- product_images: Images produit (N par produit)
- product_specs: Caractéristiques produit (key-value)
- scraping_logs: Logs observabilité pour debugging
Justification technique:
- Normalisation: products séparée de price_history (catalogue vs time-series)
- Clé naturelle: (source, reference) comme unique constraint (ASIN Amazon, etc.)
- Pas de JSONB pour données structurées: tables séparées pour images/specs
- JSONB uniquement pour données variables: errors, notes dans logs
"""
from datetime import datetime
from decimal import Decimal
from typing import List, Optional
from sqlalchemy import (
TIMESTAMP,
CheckConstraint,
Column,
ForeignKey,
Index,
Integer,
JSON,
Numeric,
String,
Text,
UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base class shared by all PriceWatch SQLAlchemy models."""

    pass
class Product(Base):
    """
    Product catalogue (one row per unique product).

    Natural key: (source, reference) — e.g. (amazon, B08N5WRWNW).
    title/category/url are refreshed on each scrape; the price time-series
    lives in the 1-N PriceHistory relation.
    """

    __tablename__ = "products"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Natural key (unique together — see __table_args__).
    source: Mapped[str] = mapped_column(
        String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)"
    )
    reference: Mapped[str] = mapped_column(
        String(100), nullable=False, comment="Product reference (ASIN, SKU, etc.)"
    )

    # Product metadata, updated on every scrape.
    url: Mapped[str] = mapped_column(Text, nullable=False, comment="Canonical product URL")
    title: Mapped[Optional[str]] = mapped_column(Text, nullable=True, comment="Product title")
    category: Mapped[Optional[str]] = mapped_column(
        Text, nullable=True, comment="Product category (breadcrumb)"
    )
    currency: Mapped[Optional[str]] = mapped_column(
        String(3), nullable=True, comment="Currency code (EUR, USD, GBP)"
    )

    # Timestamps. NOTE(review): datetime.utcnow is deprecated since Python
    # 3.12 and yields naive datetimes — consider datetime.now(timezone.utc),
    # but confirm against the tz-naive TIMESTAMP columns before changing.
    first_seen_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, default=datetime.utcnow, comment="First scraping timestamp"
    )
    last_updated_at: Mapped[datetime] = mapped_column(
        TIMESTAMP,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        comment="Last metadata update",
    )

    # Child collections; "delete-orphan" removes children with the product.
    price_history: Mapped[List["PriceHistory"]] = relationship(
        "PriceHistory", back_populates="product", cascade="all, delete-orphan"
    )
    images: Mapped[List["ProductImage"]] = relationship(
        "ProductImage", back_populates="product", cascade="all, delete-orphan"
    )
    specs: Mapped[List["ProductSpec"]] = relationship(
        "ProductSpec", back_populates="product", cascade="all, delete-orphan"
    )
    logs: Mapped[List["ScrapingLog"]] = relationship(
        "ScrapingLog", back_populates="product", cascade="all, delete-orphan"
    )

    # Table-level constraints and indexes.
    __table_args__ = (
        UniqueConstraint("source", "reference", name="uq_product_source_reference"),
        Index("ix_product_source", "source"),
        Index("ix_product_reference", "reference"),
        Index("ix_product_last_updated", "last_updated_at"),
    )

    def __repr__(self) -> str:
        return f"<Product(id={self.id}, source={self.source}, reference={self.reference})>"
class PriceHistory(Base):
    """
    Price history (time-series).

    One row per scrape that successfully extracted a price; the unique
    constraint on (product_id, fetched_at) prevents duplicate entries.
    """

    __tablename__ = "price_history"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Owning product; rows are deleted with it (ON DELETE CASCADE).
    product_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False
    )

    # Price data (nullable: a scrape may fail to extract some fields).
    price: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2), nullable=True, comment="Product price"
    )
    shipping_cost: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2), nullable=True, comment="Shipping cost"
    )
    stock_status: Mapped[Optional[str]] = mapped_column(
        String(20), nullable=True, comment="Stock status (in_stock, out_of_stock, unknown)"
    )

    # Fetch metadata (allowed values enforced by the CheckConstraints below).
    fetch_method: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch method (http, playwright)"
    )
    fetch_status: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch status (success, partial, failed)"
    )
    # No default: callers must supply the scrape timestamp explicitly.
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, comment="Scraping timestamp"
    )

    # Back-reference to the owning product.
    product: Mapped["Product"] = relationship("Product", back_populates="price_history")

    # Table-level constraints and indexes.
    __table_args__ = (
        UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"),
        Index("ix_price_history_product_id", "product_id"),
        Index("ix_price_history_fetched_at", "fetched_at"),
        CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"),
        CheckConstraint("fetch_method IN ('http', 'playwright')"),
        CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )

    def __repr__(self) -> str:
        return f"<PriceHistory(id={self.id}, product_id={self.product_id}, price={self.price}, fetched_at={self.fetched_at})>"
class ProductImage(Base):
    """
    Product images (N images per product).

    The unique constraint on (product_id, image_url) prevents duplicates;
    `position` preserves display order (0 = main image).
    """

    __tablename__ = "product_images"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Owning product; rows are deleted with it (ON DELETE CASCADE).
    product_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False
    )

    # Image data.
    image_url: Mapped[str] = mapped_column(Text, nullable=False, comment="Image URL")
    position: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="Image position (0=main)"
    )

    # Back-reference to the owning product.
    product: Mapped["Product"] = relationship("Product", back_populates="images")

    # Table-level constraints and indexes.
    __table_args__ = (
        UniqueConstraint("product_id", "image_url", name="uq_product_image_url"),
        Index("ix_product_image_product_id", "product_id"),
    )

    def __repr__(self) -> str:
        return f"<ProductImage(id={self.id}, product_id={self.product_id}, position={self.position})>"
class ProductSpec(Base):
    """
    Product specifications (key-value pairs).

    The unique constraint on (product_id, spec_key) prevents duplicates and
    the spec_key index supports efficient per-key queries.
    """

    __tablename__ = "product_specs"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Owning product; rows are deleted with it (ON DELETE CASCADE).
    product_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False
    )

    # Specification data.
    spec_key: Mapped[str] = mapped_column(
        String(200), nullable=False, comment="Specification key (e.g., 'Brand', 'Color')"
    )
    spec_value: Mapped[str] = mapped_column(Text, nullable=False, comment="Specification value")

    # Back-reference to the owning product.
    product: Mapped["Product"] = relationship("Product", back_populates="specs")

    # Table-level constraints and indexes.
    __table_args__ = (
        UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"),
        Index("ix_product_spec_product_id", "product_id"),
        Index("ix_product_spec_key", "spec_key"),
    )

    def __repr__(self) -> str:
        return f"<ProductSpec(id={self.id}, product_id={self.product_id}, key={self.spec_key})>"
class ScrapingLog(Base):
    """
    Scraping observability log (debugging & analytics).

    The FK to products is optional so runs can be logged even when no
    product row was created. errors/notes use JSON (JSONB on PostgreSQL)
    because their structure varies. Enables analytics such as success rate
    and average fetch duration.
    """

    __tablename__ = "scraping_logs"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Optional FK: ON DELETE SET NULL keeps the log row if the product goes.
    product_id: Mapped[Optional[int]] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="SET NULL"), nullable=True
    )

    # Scraping metadata.
    url: Mapped[str] = mapped_column(Text, nullable=False, comment="Scraped URL")
    source: Mapped[str] = mapped_column(
        String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)"
    )
    reference: Mapped[Optional[str]] = mapped_column(
        String(100), nullable=True, comment="Product reference (if extracted)"
    )

    # Fetch metadata (allowed values enforced by the CheckConstraints below).
    fetch_method: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch method (http, playwright)"
    )
    fetch_status: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch status (success, partial, failed)"
    )
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12 —
    # consider datetime.now(timezone.utc); verify against the tz-naive
    # TIMESTAMP column before changing.
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, default=datetime.utcnow, comment="Scraping timestamp"
    )

    # Performance metrics.
    duration_ms: Mapped[Optional[int]] = mapped_column(
        Integer, nullable=True, comment="Fetch duration in milliseconds"
    )
    html_size_bytes: Mapped[Optional[int]] = mapped_column(
        Integer, nullable=True, comment="HTML response size in bytes"
    )

    # Debug payloads: generic JSON with a JSONB variant on PostgreSQL.
    errors: Mapped[Optional[list[str]]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=True,
        comment="Error messages (list of strings)",
    )
    notes: Mapped[Optional[list[str]]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=True,
        comment="Debug notes (list of strings)",
    )

    # Back-reference to the (optional) product.
    product: Mapped[Optional["Product"]] = relationship("Product", back_populates="logs")

    # Table-level constraints and indexes.
    __table_args__ = (
        Index("ix_scraping_log_product_id", "product_id"),
        Index("ix_scraping_log_source", "source"),
        Index("ix_scraping_log_fetched_at", "fetched_at"),
        Index("ix_scraping_log_fetch_status", "fetch_status"),
        CheckConstraint("fetch_method IN ('http', 'playwright')"),
        CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )

    def __repr__(self) -> str:
        return f"<ScrapingLog(id={self.id}, url={self.url}, status={self.fetch_status}, fetched_at={self.fetched_at})>"

140
pricewatch/app/db/repository.py Executable file
View File

@@ -0,0 +1,140 @@
"""
Repository pattern pour la persistence SQLAlchemy.
Centralise les operations CRUD sur les modeles DB a partir d'un ProductSnapshot.
"""
from __future__ import annotations
from typing import Optional
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import Session
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import ProductSnapshot
from pricewatch.app.db.models import PriceHistory, Product, ProductImage, ProductSpec, ScrapingLog
logger = get_logger("db.repository")
class ProductRepository:
    """Persistence repository for ProductSnapshot objects.

    Wraps an existing SQLAlchemy Session and only stages objects on it:
    transaction boundaries (commit/rollback) belong to the caller —
    presumably the get_session() context manager; verify at call sites.
    """

    def __init__(self, session: Session) -> None:
        # Session is injected; the repository never opens or commits one.
        self.session = session

    def get_or_create(self, source: str, reference: str, url: str) -> Product:
        """
        Fetch or create a product by its natural key (source, reference).

        flush() (not commit) assigns the new product's id while keeping the
        surrounding transaction open for the caller.
        """
        product = (
            self.session.query(Product)
            .filter(Product.source == source, Product.reference == reference)
            .one_or_none()
        )
        if product:
            return product
        product = Product(source=source, reference=reference, url=url)
        self.session.add(product)
        self.session.flush()
        return product

    def update_product_metadata(self, product: Product, snapshot: ProductSnapshot) -> None:
        """Refresh product metadata from the snapshot, field by field.

        Only truthy snapshot values overwrite — existing data is never
        cleared by an empty/missing field.
        """
        if snapshot.url:
            product.url = snapshot.url
        if snapshot.title:
            product.title = snapshot.title
        if snapshot.category:
            product.category = snapshot.category
        if snapshot.currency:
            product.currency = snapshot.currency

    def add_price_history(self, product: Product, snapshot: ProductSnapshot) -> Optional[PriceHistory]:
        """Append a price-history entry unless one already exists for this
        (product, fetched_at) pair — mirrors the DB unique constraint."""
        existing = (
            self.session.query(PriceHistory)
            .filter(
                PriceHistory.product_id == product.id,
                PriceHistory.fetched_at == snapshot.fetched_at,
            )
            .one_or_none()
        )
        if existing:
            return existing
        price_entry = PriceHistory(
            product_id=product.id,
            price=snapshot.price,
            shipping_cost=snapshot.shipping_cost,
            stock_status=snapshot.stock_status,
            fetch_method=snapshot.debug.method,
            fetch_status=snapshot.debug.status,
            fetched_at=snapshot.fetched_at,
        )
        self.session.add(price_entry)
        return price_entry

    def sync_images(self, product: Product, images: list[str]) -> None:
        """Sync images: add unseen URLs only.

        NOTE(review): positions are taken from the incoming list's order;
        pre-existing rows keep their stored position and are never
        re-numbered — confirm this is the intended ordering semantics.
        """
        existing_urls = {image.image_url for image in product.images}
        for position, url in enumerate(images):
            if url in existing_urls:
                continue
            self.session.add(ProductImage(product_id=product.id, image_url=url, position=position))

    def sync_specs(self, product: Product, specs: dict[str, str]) -> None:
        """Sync specs: upsert by key (update existing values, insert new keys).

        Keys absent from `specs` are left in place, not deleted.
        """
        existing_specs = {spec.spec_key: spec for spec in product.specs}
        for key, value in specs.items():
            if key in existing_specs:
                existing_specs[key].spec_value = value
            else:
                self.session.add(ProductSpec(product_id=product.id, spec_key=key, spec_value=value))

    def add_scraping_log(self, snapshot: ProductSnapshot, product_id: Optional[int]) -> ScrapingLog:
        """Record a scraping-run log entry for observability.

        product_id may be None when no product row exists (e.g. missing
        reference); empty error/note lists are stored as NULL.
        """
        log_entry = ScrapingLog(
            product_id=product_id,
            url=snapshot.url,
            source=snapshot.source,
            reference=snapshot.reference,
            fetch_method=snapshot.debug.method,
            fetch_status=snapshot.debug.status,
            fetched_at=snapshot.fetched_at,
            duration_ms=snapshot.debug.duration_ms,
            html_size_bytes=snapshot.debug.html_size_bytes,
            errors=snapshot.debug.errors or None,
            notes=snapshot.debug.notes or None,
        )
        self.session.add(log_entry)
        return log_entry

    def save_snapshot(self, snapshot: ProductSnapshot) -> Optional[int]:
        """
        Persist a complete ProductSnapshot (product, price, images, specs, log).

        Returns the product id, or None when the snapshot has no reference —
        in that case only a scraping log is recorded. Nothing is committed
        here; the caller owns the transaction.
        """
        if not snapshot.reference:
            logger.warning("Reference absente: persistence ignoree")
            self.add_scraping_log(snapshot, product_id=None)
            return None
        product = self.get_or_create(snapshot.source, snapshot.reference, snapshot.url)
        self.update_product_metadata(product, snapshot)
        self.add_price_history(product, snapshot)
        self.sync_images(product, snapshot.images)
        self.sync_specs(product, snapshot.specs)
        self.add_scraping_log(snapshot, product_id=product.id)
        return product.id

    def safe_save_snapshot(self, snapshot: ProductSnapshot) -> Optional[int]:
        """Save with SQLAlchemy error logging.

        Despite the name, errors are re-raised after logging; rollback is
        left to the session owner.
        """
        try:
            return self.save_snapshot(snapshot)
        except SQLAlchemyError as exc:
            logger.error(f"Erreur SQLAlchemy: {exc}")
            raise