codex
This commit is contained in:
Binary file not shown.
@@ -13,20 +13,28 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import redis
|
||||
import typer
|
||||
from rq import Connection, Worker
|
||||
from alembic import command as alembic_command
|
||||
from alembic.config import Config as AlembicConfig
|
||||
from rich import print as rprint
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from pricewatch.app.core import logging as app_logging
|
||||
from pricewatch.app.core.config import get_config
|
||||
from pricewatch.app.core.io import read_yaml_config, write_json_results
|
||||
from pricewatch.app.core.logging import get_logger, set_level
|
||||
from pricewatch.app.core.registry import get_registry, register_store
|
||||
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
|
||||
from pricewatch.app.db.connection import init_db
|
||||
from pricewatch.app.scraping.http_fetch import fetch_http
|
||||
from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
from pricewatch.app.stores.amazon.store import AmazonStore
|
||||
from pricewatch.app.stores.cdiscount.store import CdiscountStore
|
||||
from pricewatch.app.tasks.scheduler import ScrapingScheduler
|
||||
|
||||
# Créer l'application Typer
|
||||
app = typer.Typer(
|
||||
@@ -46,6 +54,75 @@ def setup_stores():
|
||||
registry.register(CdiscountStore())
|
||||
|
||||
|
||||
def get_alembic_config() -> AlembicConfig:
|
||||
"""Construit la configuration Alembic à partir du repository."""
|
||||
root_path = Path(__file__).resolve().parents[3]
|
||||
config_path = root_path / "alembic.ini"
|
||||
migrations_path = root_path / "pricewatch" / "app" / "db" / "migrations"
|
||||
|
||||
if not config_path.exists():
|
||||
logger.error(f"alembic.ini introuvable: {config_path}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
alembic_cfg = AlembicConfig(str(config_path))
|
||||
alembic_cfg.set_main_option("script_location", str(migrations_path))
|
||||
alembic_cfg.set_main_option("sqlalchemy.url", get_config().db.url)
|
||||
return alembic_cfg
|
||||
|
||||
|
||||
@app.command("init-db")
|
||||
def init_db_command():
|
||||
"""
|
||||
Initialise la base de donnees (creer toutes les tables).
|
||||
"""
|
||||
try:
|
||||
init_db(get_config())
|
||||
except Exception as e:
|
||||
logger.error(f"Init DB echoue: {e}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
|
||||
def migrate(
|
||||
message: str = typer.Argument(..., help="Message de migration"),
|
||||
autogenerate: bool = typer.Option(True, "--autogenerate/--no-autogenerate"),
|
||||
):
|
||||
"""
|
||||
Genere une migration Alembic.
|
||||
"""
|
||||
try:
|
||||
alembic_cfg = get_alembic_config()
|
||||
alembic_command.revision(alembic_cfg, message=message, autogenerate=autogenerate)
|
||||
except Exception as e:
|
||||
logger.error(f"Migration echouee: {e}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
|
||||
def upgrade(revision: str = typer.Argument("head", help="Revision cible")):
|
||||
"""
|
||||
Applique les migrations Alembic.
|
||||
"""
|
||||
try:
|
||||
alembic_cfg = get_alembic_config()
|
||||
alembic_command.upgrade(alembic_cfg, revision)
|
||||
except Exception as e:
|
||||
logger.error(f"Upgrade echoue: {e}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
|
||||
def downgrade(revision: str = typer.Argument("-1", help="Revision cible")):
|
||||
"""
|
||||
Rollback une migration Alembic.
|
||||
"""
|
||||
try:
|
||||
alembic_cfg = get_alembic_config()
|
||||
alembic_command.downgrade(alembic_cfg, revision)
|
||||
except Exception as e:
|
||||
logger.error(f"Downgrade echoue: {e}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
@app.command()
|
||||
def run(
|
||||
yaml: Path = typer.Option(
|
||||
@@ -67,6 +144,11 @@ def run(
|
||||
"-d",
|
||||
help="Activer le mode debug",
|
||||
),
|
||||
save_db: Optional[bool] = typer.Option(
|
||||
None,
|
||||
"--save-db/--no-db",
|
||||
help="Activer la persistence en base de donnees",
|
||||
),
|
||||
):
|
||||
"""
|
||||
Pipeline complet: scrape toutes les URLs du YAML et génère le JSON.
|
||||
@@ -88,6 +170,12 @@ def run(
|
||||
logger.error(f"Erreur lecture YAML: {e}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
app_config = get_config()
|
||||
if save_db is None:
|
||||
save_db = app_config.enable_db
|
||||
|
||||
pipeline = ScrapingPipeline(config=app_config)
|
||||
|
||||
logger.info(f"{len(config.urls)} URL(s) à scraper")
|
||||
|
||||
# Scraper chaque URL
|
||||
@@ -158,6 +246,11 @@ def run(
|
||||
|
||||
snapshot = store.parse(html, canonical_url)
|
||||
snapshot.debug.method = fetch_method
|
||||
if save_db:
|
||||
product_id = pipeline.process_snapshot(snapshot, save_to_db=True)
|
||||
if product_id:
|
||||
logger.info(f"DB: produit id={product_id}")
|
||||
|
||||
snapshots.append(snapshot)
|
||||
|
||||
status_emoji = "✓" if snapshot.is_complete() else "⚠"
|
||||
@@ -180,6 +273,8 @@ def run(
|
||||
errors=[f"Parsing failed: {str(e)}"],
|
||||
),
|
||||
)
|
||||
if save_db:
|
||||
pipeline.process_snapshot(snapshot, save_to_db=True)
|
||||
snapshots.append(snapshot)
|
||||
else:
|
||||
# Pas de HTML récupéré
|
||||
@@ -194,6 +289,8 @@ def run(
|
||||
errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
|
||||
),
|
||||
)
|
||||
if save_db:
|
||||
pipeline.process_snapshot(snapshot, save_to_db=True)
|
||||
snapshots.append(snapshot)
|
||||
|
||||
# Écrire les résultats
|
||||
@@ -359,5 +456,65 @@ def doctor():
|
||||
rprint("\n[green]✓ PriceWatch est prêt![/green]")
|
||||
|
||||
|
||||
@app.command()
|
||||
def worker(
|
||||
queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
|
||||
with_scheduler: bool = typer.Option(
|
||||
True, "--with-scheduler/--no-scheduler", help="Activer le scheduler RQ"
|
||||
),
|
||||
):
|
||||
"""
|
||||
Lance un worker RQ.
|
||||
"""
|
||||
config = get_config()
|
||||
connection = redis.from_url(config.redis.url)
|
||||
|
||||
with Connection(connection):
|
||||
worker_instance = Worker([queue])
|
||||
worker_instance.work(with_scheduler=with_scheduler)
|
||||
|
||||
|
||||
@app.command()
|
||||
def enqueue(
|
||||
url: str = typer.Argument(..., help="URL du produit a scraper"),
|
||||
queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
|
||||
save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"),
|
||||
use_playwright: Optional[bool] = typer.Option(
|
||||
None, "--playwright/--no-playwright", help="Forcer Playwright"
|
||||
),
|
||||
):
|
||||
"""
|
||||
Enqueue un scraping immediat.
|
||||
"""
|
||||
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
|
||||
job = scheduler.enqueue_immediate(url, use_playwright=use_playwright, save_db=save_db)
|
||||
rprint(f"[green]✓ Job enqueued: {job.id}[/green]")
|
||||
|
||||
|
||||
@app.command()
|
||||
def schedule(
|
||||
url: str = typer.Argument(..., help="URL du produit a planifier"),
|
||||
interval: int = typer.Option(24, "--interval", help="Intervalle en heures"),
|
||||
queue: str = typer.Option("default", "--queue", "-q", help="Nom de la queue RQ"),
|
||||
save_db: bool = typer.Option(True, "--save-db/--no-db", help="Activer la DB"),
|
||||
use_playwright: Optional[bool] = typer.Option(
|
||||
None, "--playwright/--no-playwright", help="Forcer Playwright"
|
||||
),
|
||||
):
|
||||
"""
|
||||
Planifie un scraping recurrent.
|
||||
"""
|
||||
scheduler = ScrapingScheduler(get_config(), queue_name=queue)
|
||||
job_info = scheduler.schedule_product(
|
||||
url,
|
||||
interval_hours=interval,
|
||||
use_playwright=use_playwright,
|
||||
save_db=save_db,
|
||||
)
|
||||
rprint(
|
||||
f"[green]✓ Job planifie: {job_info.job_id} (next={job_info.next_run.isoformat()})[/green]"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
||||
BIN
pricewatch/app/core/__pycache__/config.cpython-313.pyc
Executable file
BIN
pricewatch/app/core/__pycache__/config.cpython-313.pyc
Executable file
Binary file not shown.
186
pricewatch/app/core/config.py
Executable file
186
pricewatch/app/core/config.py
Executable file
@@ -0,0 +1,186 @@
|
||||
"""
|
||||
Configuration centralisée pour PriceWatch Phase 2.
|
||||
|
||||
Gère la configuration de la base de données, Redis, et l'application globale.
|
||||
Utilise Pydantic Settings pour validation et chargement depuis variables d'environnement.
|
||||
|
||||
Justification technique:
|
||||
- Pattern 12-factor app: configuration via env vars
|
||||
- Pydantic validation garantit config valide au démarrage
|
||||
- Valeurs par défaut pour développement local
|
||||
- Support .env file pour faciliter le setup
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("core.config")
|
||||
|
||||
|
||||
class DatabaseConfig(BaseSettings):
|
||||
"""Configuration PostgreSQL."""
|
||||
|
||||
host: str = Field(default="localhost", description="PostgreSQL host")
|
||||
port: int = Field(default=5432, description="PostgreSQL port")
|
||||
database: str = Field(default="pricewatch", description="Database name")
|
||||
user: str = Field(default="pricewatch", description="Database user")
|
||||
password: str = Field(default="pricewatch", description="Database password")
|
||||
|
||||
model_config = SettingsConfigDict(
|
||||
env_prefix="PW_DB_", # PW_DB_HOST, PW_DB_PORT, etc.
|
||||
env_file=".env",
|
||||
env_file_encoding="utf-8",
|
||||
extra="ignore",
|
||||
)
|
||||
|
||||
@property
|
||||
def url(self) -> str:
|
||||
"""
|
||||
SQLAlchemy connection URL.
|
||||
|
||||
Format: postgresql://user:password@host:port/database
|
||||
"""
|
||||
return f"postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}"
|
||||
|
||||
@property
|
||||
def url_async(self) -> str:
|
||||
"""
|
||||
Async SQLAlchemy connection URL (pour usage futur avec asyncpg).
|
||||
|
||||
Format: postgresql+asyncpg://user:password@host:port/database
|
||||
"""
|
||||
return f"postgresql+asyncpg://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}"
|
||||
|
||||
|
||||
class RedisConfig(BaseSettings):
|
||||
"""Configuration Redis pour RQ worker."""
|
||||
|
||||
host: str = Field(default="localhost", description="Redis host")
|
||||
port: int = Field(default=6379, description="Redis port")
|
||||
db: int = Field(default=0, description="Redis database number (0-15)")
|
||||
password: Optional[str] = Field(default=None, description="Redis password (optional)")
|
||||
|
||||
model_config = SettingsConfigDict(
|
||||
env_prefix="PW_REDIS_", # PW_REDIS_HOST, PW_REDIS_PORT, etc.
|
||||
env_file=".env",
|
||||
env_file_encoding="utf-8",
|
||||
extra="ignore",
|
||||
)
|
||||
|
||||
@property
|
||||
def url(self) -> str:
|
||||
"""
|
||||
Redis connection URL pour RQ.
|
||||
|
||||
Format: redis://[password@]host:port/db
|
||||
"""
|
||||
auth = f":{self.password}@" if self.password else ""
|
||||
return f"redis://{auth}{self.host}:{self.port}/{self.db}"
|
||||
|
||||
|
||||
class AppConfig(BaseSettings):
|
||||
"""Configuration globale de l'application."""
|
||||
|
||||
# Mode debug
|
||||
debug: bool = Field(
|
||||
default=False, description="Enable debug mode (verbose logging, SQL echo)"
|
||||
)
|
||||
|
||||
# Worker configuration
|
||||
worker_timeout: int = Field(
|
||||
default=300, description="Worker job timeout in seconds (5 minutes)"
|
||||
)
|
||||
|
||||
worker_concurrency: int = Field(
|
||||
default=2, description="Number of concurrent worker processes"
|
||||
)
|
||||
|
||||
# Feature flags
|
||||
enable_db: bool = Field(
|
||||
default=True, description="Enable database persistence (can disable for testing)"
|
||||
)
|
||||
|
||||
enable_worker: bool = Field(
|
||||
default=True, description="Enable background worker functionality"
|
||||
)
|
||||
|
||||
# Scraping defaults
|
||||
default_playwright_timeout: int = Field(
|
||||
default=60000, description="Default Playwright timeout in milliseconds"
|
||||
)
|
||||
|
||||
default_use_playwright: bool = Field(
|
||||
default=True, description="Use Playwright fallback by default"
|
||||
)
|
||||
|
||||
model_config = SettingsConfigDict(
|
||||
env_prefix="PW_", # PW_DEBUG, PW_WORKER_TIMEOUT, etc.
|
||||
env_file=".env",
|
||||
env_file_encoding="utf-8",
|
||||
extra="ignore",
|
||||
)
|
||||
|
||||
# Nested configs (instances, not classes)
|
||||
db: DatabaseConfig = Field(default_factory=DatabaseConfig)
|
||||
redis: RedisConfig = Field(default_factory=RedisConfig)
|
||||
|
||||
def log_config(self) -> None:
|
||||
"""Log la configuration active (sans password)."""
|
||||
logger.info("=== Configuration PriceWatch ===")
|
||||
logger.info(f"Debug mode: {self.debug}")
|
||||
logger.info(f"Database: {self.db.host}:{self.db.port}/{self.db.database}")
|
||||
logger.info(f"Redis: {self.redis.host}:{self.redis.port}/{self.redis.db}")
|
||||
logger.info(f"DB enabled: {self.enable_db}")
|
||||
logger.info(f"Worker enabled: {self.enable_worker}")
|
||||
logger.info(f"Worker timeout: {self.worker_timeout}s")
|
||||
logger.info(f"Worker concurrency: {self.worker_concurrency}")
|
||||
logger.info("================================")
|
||||
|
||||
|
||||
# Singleton global config instance
|
||||
_config: Optional[AppConfig] = None
|
||||
|
||||
|
||||
def get_config() -> AppConfig:
|
||||
"""
|
||||
Récupère l'instance globale de configuration (singleton).
|
||||
|
||||
Returns:
|
||||
Instance AppConfig
|
||||
|
||||
Justification:
|
||||
- Évite de recharger la config à chaque appel
|
||||
- Centralise la configuration pour toute l'application
|
||||
- Permet d'override pour les tests
|
||||
"""
|
||||
global _config
|
||||
|
||||
if _config is None:
|
||||
_config = AppConfig()
|
||||
if _config.debug:
|
||||
_config.log_config()
|
||||
|
||||
return _config
|
||||
|
||||
|
||||
def set_config(config: AppConfig) -> None:
|
||||
"""
|
||||
Override la configuration globale (principalement pour tests).
|
||||
|
||||
Args:
|
||||
config: Instance AppConfig à utiliser
|
||||
"""
|
||||
global _config
|
||||
_config = config
|
||||
logger.debug("Configuration overridden")
|
||||
|
||||
|
||||
def reset_config() -> None:
|
||||
"""Reset la configuration globale (pour tests)."""
|
||||
global _config
|
||||
_config = None
|
||||
logger.debug("Configuration reset")
|
||||
41
pricewatch/app/db/__init__.py
Executable file
41
pricewatch/app/db/__init__.py
Executable file
@@ -0,0 +1,41 @@
|
||||
"""
|
||||
Module de base de données pour PriceWatch Phase 2.
|
||||
|
||||
Gère la persistence PostgreSQL avec SQLAlchemy ORM.
|
||||
"""
|
||||
|
||||
from pricewatch.app.db.connection import (
|
||||
check_db_connection,
|
||||
get_engine,
|
||||
get_session,
|
||||
get_session_factory,
|
||||
init_db,
|
||||
reset_engine,
|
||||
)
|
||||
from pricewatch.app.db.repository import ProductRepository
|
||||
from pricewatch.app.db.models import (
|
||||
Base,
|
||||
Product,
|
||||
PriceHistory,
|
||||
ProductImage,
|
||||
ProductSpec,
|
||||
ScrapingLog,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Models
|
||||
"Base",
|
||||
"Product",
|
||||
"PriceHistory",
|
||||
"ProductImage",
|
||||
"ProductSpec",
|
||||
"ScrapingLog",
|
||||
"ProductRepository",
|
||||
# Connection
|
||||
"get_engine",
|
||||
"get_session_factory",
|
||||
"get_session",
|
||||
"init_db",
|
||||
"check_db_connection",
|
||||
"reset_engine",
|
||||
]
|
||||
BIN
pricewatch/app/db/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/db/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/db/__pycache__/connection.cpython-313.pyc
Executable file
BIN
pricewatch/app/db/__pycache__/connection.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/db/__pycache__/models.cpython-313.pyc
Executable file
BIN
pricewatch/app/db/__pycache__/models.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/db/__pycache__/repository.cpython-313.pyc
Executable file
BIN
pricewatch/app/db/__pycache__/repository.cpython-313.pyc
Executable file
Binary file not shown.
238
pricewatch/app/db/connection.py
Executable file
238
pricewatch/app/db/connection.py
Executable file
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Gestion des connexions PostgreSQL pour PriceWatch Phase 2.
|
||||
|
||||
Fournit:
|
||||
- Engine SQLAlchemy avec connection pooling
|
||||
- Session factory avec context manager
|
||||
- Initialisation des tables
|
||||
- Health check
|
||||
|
||||
Justification technique:
|
||||
- Connection pooling: réutilisation connexions pour performance
|
||||
- Context manager: garantit fermeture session (pas de leak)
|
||||
- pool_pre_ping: vérifie connexion avant usage (robustesse)
|
||||
- echo=debug: logs SQL en mode debug
|
||||
"""
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import Generator, Optional
|
||||
|
||||
from sqlalchemy import create_engine, text
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlalchemy.engine.url import make_url
|
||||
from sqlalchemy.exc import OperationalError, SQLAlchemyError
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
from pricewatch.app.core.config import AppConfig, get_config
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.db.models import Base
|
||||
|
||||
logger = get_logger("db.connection")
|
||||
|
||||
# Global engine instance (singleton)
|
||||
_engine: Optional[Engine] = None
|
||||
_session_factory: Optional[sessionmaker] = None
|
||||
|
||||
|
||||
def get_engine(config: Optional[AppConfig] = None) -> Engine:
|
||||
"""
|
||||
Récupère ou crée l'Engine SQLAlchemy (singleton).
|
||||
|
||||
Args:
|
||||
config: Configuration app (utilise get_config() si None)
|
||||
|
||||
Returns:
|
||||
Engine SQLAlchemy configuré
|
||||
|
||||
Justification:
|
||||
- Singleton: une seule pool de connexions par application
|
||||
- pool_pre_ping: vérifie connexion avant usage (évite "connection closed")
|
||||
- pool_size=5, max_overflow=10: limite connexions (15 max)
|
||||
- echo=debug: logs SQL pour debugging
|
||||
"""
|
||||
global _engine
|
||||
|
||||
if _engine is None:
|
||||
if config is None:
|
||||
config = get_config()
|
||||
|
||||
db_url = config.db.url
|
||||
url = make_url(db_url)
|
||||
is_sqlite = url.get_backend_name() == "sqlite"
|
||||
|
||||
logger.info(f"Creating database engine: {db_url}")
|
||||
|
||||
engine_kwargs = {
|
||||
"pool_pre_ping": True,
|
||||
"pool_recycle": 3600,
|
||||
"echo": config.debug,
|
||||
}
|
||||
|
||||
if not is_sqlite:
|
||||
engine_kwargs.update(
|
||||
{
|
||||
"pool_size": 5,
|
||||
"max_overflow": 10,
|
||||
}
|
||||
)
|
||||
|
||||
_engine = create_engine(db_url, **engine_kwargs)
|
||||
|
||||
logger.info("Database engine created successfully")
|
||||
|
||||
return _engine
|
||||
|
||||
|
||||
def init_db(config: Optional[AppConfig] = None) -> None:
|
||||
"""
|
||||
Initialise la base de données (crée toutes les tables).
|
||||
|
||||
Args:
|
||||
config: Configuration app (utilise get_config() si None)
|
||||
|
||||
Raises:
|
||||
OperationalError: Si connexion impossible
|
||||
SQLAlchemyError: Si création tables échoue
|
||||
|
||||
Note:
|
||||
Utilise Base.metadata.create_all() - idempotent (ne crash pas si tables existent)
|
||||
"""
|
||||
if config is None:
|
||||
config = get_config()
|
||||
|
||||
logger.info("Initializing database...")
|
||||
|
||||
try:
|
||||
engine = get_engine(config)
|
||||
|
||||
# Créer toutes les tables définies dans Base.metadata
|
||||
Base.metadata.create_all(bind=engine)
|
||||
|
||||
logger.info("Database initialized successfully")
|
||||
logger.info(f"Tables created: {', '.join(Base.metadata.tables.keys())}")
|
||||
|
||||
except OperationalError as e:
|
||||
logger.error(f"Failed to connect to database: {e}")
|
||||
raise
|
||||
except SQLAlchemyError as e:
|
||||
logger.error(f"Failed to create tables: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def get_session_factory(config: Optional[AppConfig] = None) -> sessionmaker:
|
||||
"""
|
||||
Récupère ou crée la session factory (singleton).
|
||||
|
||||
Args:
|
||||
config: Configuration app (utilise get_config() si None)
|
||||
|
||||
Returns:
|
||||
Session factory SQLAlchemy
|
||||
|
||||
Justification:
|
||||
- expire_on_commit=False: objets restent accessibles après commit
|
||||
- autocommit=False, autoflush=False: contrôle explicite
|
||||
"""
|
||||
global _session_factory
|
||||
|
||||
if _session_factory is None:
|
||||
engine = get_engine(config)
|
||||
|
||||
_session_factory = sessionmaker(
|
||||
bind=engine,
|
||||
expire_on_commit=False, # Objets restent accessibles après commit
|
||||
autocommit=False, # Contrôle explicite du commit
|
||||
autoflush=False, # Contrôle explicite du flush
|
||||
)
|
||||
|
||||
logger.debug("Session factory created")
|
||||
|
||||
return _session_factory
|
||||
|
||||
|
||||
@contextmanager
|
||||
def get_session(config: Optional[AppConfig] = None) -> Generator[Session, None, None]:
|
||||
"""
|
||||
Context manager pour session SQLAlchemy.
|
||||
|
||||
Args:
|
||||
config: Configuration app (utilise get_config() si None)
|
||||
|
||||
Yields:
|
||||
Session SQLAlchemy
|
||||
|
||||
Usage:
|
||||
with get_session() as session:
|
||||
product = session.query(Product).filter_by(reference="B08N5WRWNW").first()
|
||||
session.commit()
|
||||
|
||||
Justification:
|
||||
- Context manager: garantit fermeture session (pas de leak)
|
||||
- Rollback automatique sur exception
|
||||
- Close automatique en fin de bloc
|
||||
"""
|
||||
factory = get_session_factory(config)
|
||||
session = factory()
|
||||
|
||||
try:
|
||||
logger.debug("Session opened")
|
||||
yield session
|
||||
except Exception as e:
|
||||
logger.error(f"Session error, rolling back: {e}")
|
||||
session.rollback()
|
||||
raise
|
||||
finally:
|
||||
logger.debug("Session closed")
|
||||
session.close()
|
||||
|
||||
|
||||
def check_db_connection(config: Optional[AppConfig] = None) -> bool:
|
||||
"""
|
||||
Vérifie la connexion à la base de données (health check).
|
||||
|
||||
Args:
|
||||
config: Configuration app (utilise get_config() si None)
|
||||
|
||||
Returns:
|
||||
True si connexion OK, False sinon
|
||||
|
||||
Note:
|
||||
Execute une query simple: SELECT 1
|
||||
"""
|
||||
if config is None:
|
||||
config = get_config()
|
||||
|
||||
try:
|
||||
engine = get_engine(config)
|
||||
|
||||
with engine.connect() as conn:
|
||||
result = conn.execute(text("SELECT 1"))
|
||||
result.scalar()
|
||||
|
||||
logger.info("Database connection OK")
|
||||
return True
|
||||
|
||||
except OperationalError as e:
|
||||
logger.error(f"Database connection failed: {e}")
|
||||
return False
|
||||
except SQLAlchemyError as e:
|
||||
logger.error(f"Database health check failed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def reset_engine() -> None:
|
||||
"""
|
||||
Reset l'engine global (pour tests).
|
||||
|
||||
Note:
|
||||
Dispose l'engine et reset les singletons.
|
||||
"""
|
||||
global _engine, _session_factory
|
||||
|
||||
if _engine is not None:
|
||||
logger.debug("Disposing database engine")
|
||||
_engine.dispose()
|
||||
_engine = None
|
||||
|
||||
_session_factory = None
|
||||
logger.debug("Engine reset complete")
|
||||
BIN
pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc
Executable file
BIN
pricewatch/app/db/migrations/__pycache__/env.cpython-313.pyc
Executable file
Binary file not shown.
80
pricewatch/app/db/migrations/env.py
Executable file
80
pricewatch/app/db/migrations/env.py
Executable file
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Configuration Alembic pour PriceWatch.
|
||||
|
||||
Recupere l'URL DB depuis AppConfig pour garantir un setup coherent.
|
||||
"""
|
||||
|
||||
from logging.config import fileConfig
|
||||
|
||||
from alembic import context
|
||||
from sqlalchemy import engine_from_config, pool
|
||||
|
||||
from pricewatch.app.core.config import get_config
|
||||
from pricewatch.app.db.models import Base
|
||||
|
||||
# Alembic Config object
|
||||
config = context.config
|
||||
|
||||
# Configure logging
|
||||
if config.config_file_name is not None:
|
||||
fileConfig(config.config_file_name)
|
||||
|
||||
# Metadata SQLAlchemy pour autogenerate
|
||||
target_metadata = Base.metadata
|
||||
|
||||
|
||||
def _get_database_url() -> str:
|
||||
"""Construit l'URL DB depuis la config applicative."""
|
||||
app_config = get_config()
|
||||
return app_config.db.url
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
|
||||
"""
|
||||
Execute les migrations en mode offline.
|
||||
|
||||
Configure le contexte avec l'URL DB sans creer d'engine.
|
||||
"""
|
||||
url = _get_database_url()
|
||||
context.configure(
|
||||
url=url,
|
||||
target_metadata=target_metadata,
|
||||
literal_binds=True,
|
||||
dialect_opts={"paramstyle": "named"},
|
||||
compare_type=True,
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
|
||||
"""
|
||||
Execute les migrations en mode online.
|
||||
|
||||
Cree un engine SQLAlchemy et etablit la connexion.
|
||||
"""
|
||||
configuration = config.get_section(config.config_ini_section) or {}
|
||||
configuration["sqlalchemy.url"] = _get_database_url()
|
||||
|
||||
connectable = engine_from_config(
|
||||
configuration,
|
||||
prefix="sqlalchemy.",
|
||||
poolclass=pool.NullPool,
|
||||
)
|
||||
|
||||
with connectable.connect() as connection:
|
||||
context.configure(
|
||||
connection=connection,
|
||||
target_metadata=target_metadata,
|
||||
compare_type=True,
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
if context.is_offline_mode():
|
||||
run_migrations_offline()
|
||||
else:
|
||||
run_migrations_online()
|
||||
24
pricewatch/app/db/migrations/script.py.mako
Executable file
24
pricewatch/app/db/migrations/script.py.mako
Executable file
@@ -0,0 +1,24 @@
|
||||
"""${message}
|
||||
|
||||
Revision ID: ${up_revision}
|
||||
Revises: ${down_revision | comma,n}
|
||||
Create Date: ${create_date}
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
${imports if imports else ""}
|
||||
|
||||
# Revision identifiers, used by Alembic.
|
||||
revision = ${repr(up_revision)}
|
||||
down_revision = ${repr(down_revision)}
|
||||
branch_labels = ${repr(branch_labels)}
|
||||
depends_on = ${repr(depends_on)}
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
${upgrades if upgrades else "pass"}
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
${downgrades if downgrades else "pass"}
|
||||
124
pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py
Executable file
124
pricewatch/app/db/migrations/versions/20260114_01_initial_schema.py
Executable file
@@ -0,0 +1,124 @@
|
||||
"""Initial schema
|
||||
|
||||
Revision ID: 20260114_01
|
||||
Revises: None
|
||||
Create Date: 2026-01-14 00:00:00
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# Revision identifiers, used by Alembic.
|
||||
revision = "20260114_01"
|
||||
down_revision = None
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"products",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("source", sa.String(length=50), nullable=False),
|
||||
sa.Column("reference", sa.String(length=100), nullable=False),
|
||||
sa.Column("url", sa.Text(), nullable=False),
|
||||
sa.Column("title", sa.Text(), nullable=True),
|
||||
sa.Column("category", sa.Text(), nullable=True),
|
||||
sa.Column("currency", sa.String(length=3), nullable=True),
|
||||
sa.Column("first_seen_at", sa.TIMESTAMP(), nullable=False),
|
||||
sa.Column("last_updated_at", sa.TIMESTAMP(), nullable=False),
|
||||
sa.UniqueConstraint("source", "reference", name="uq_product_source_reference"),
|
||||
)
|
||||
op.create_index("ix_product_source", "products", ["source"], unique=False)
|
||||
op.create_index("ix_product_reference", "products", ["reference"], unique=False)
|
||||
op.create_index("ix_product_last_updated", "products", ["last_updated_at"], unique=False)
|
||||
|
||||
op.create_table(
|
||||
"price_history",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("product_id", sa.Integer(), nullable=False),
|
||||
sa.Column("price", sa.Numeric(10, 2), nullable=True),
|
||||
sa.Column("shipping_cost", sa.Numeric(10, 2), nullable=True),
|
||||
sa.Column("stock_status", sa.String(length=20), nullable=True),
|
||||
sa.Column("fetch_method", sa.String(length=20), nullable=False),
|
||||
sa.Column("fetch_status", sa.String(length=20), nullable=False),
|
||||
sa.Column("fetched_at", sa.TIMESTAMP(), nullable=False),
|
||||
sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"),
|
||||
sa.UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"),
|
||||
sa.CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"),
|
||||
sa.CheckConstraint("fetch_method IN ('http', 'playwright')"),
|
||||
sa.CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
|
||||
)
|
||||
op.create_index("ix_price_history_product_id", "price_history", ["product_id"], unique=False)
|
||||
op.create_index("ix_price_history_fetched_at", "price_history", ["fetched_at"], unique=False)
|
||||
|
||||
op.create_table(
|
||||
"product_images",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("product_id", sa.Integer(), nullable=False),
|
||||
sa.Column("image_url", sa.Text(), nullable=False),
|
||||
sa.Column("position", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"),
|
||||
sa.UniqueConstraint("product_id", "image_url", name="uq_product_image_url"),
|
||||
)
|
||||
op.create_index("ix_product_image_product_id", "product_images", ["product_id"], unique=False)
|
||||
|
||||
op.create_table(
|
||||
"product_specs",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("product_id", sa.Integer(), nullable=False),
|
||||
sa.Column("spec_key", sa.String(length=200), nullable=False),
|
||||
sa.Column("spec_value", sa.Text(), nullable=False),
|
||||
sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="CASCADE"),
|
||||
sa.UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"),
|
||||
)
|
||||
op.create_index("ix_product_spec_product_id", "product_specs", ["product_id"], unique=False)
|
||||
op.create_index("ix_product_spec_key", "product_specs", ["spec_key"], unique=False)
|
||||
|
||||
op.create_table(
|
||||
"scraping_logs",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("product_id", sa.Integer(), nullable=True),
|
||||
sa.Column("url", sa.Text(), nullable=False),
|
||||
sa.Column("source", sa.String(length=50), nullable=False),
|
||||
sa.Column("reference", sa.String(length=100), nullable=True),
|
||||
sa.Column("fetch_method", sa.String(length=20), nullable=False),
|
||||
sa.Column("fetch_status", sa.String(length=20), nullable=False),
|
||||
sa.Column("fetched_at", sa.TIMESTAMP(), nullable=False),
|
||||
sa.Column("duration_ms", sa.Integer(), nullable=True),
|
||||
sa.Column("html_size_bytes", sa.Integer(), nullable=True),
|
||||
sa.Column("errors", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("notes", postgresql.JSONB(), nullable=True),
|
||||
sa.ForeignKeyConstraint(["product_id"], ["products.id"], ondelete="SET NULL"),
|
||||
sa.CheckConstraint("fetch_method IN ('http', 'playwright')"),
|
||||
sa.CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
|
||||
)
|
||||
op.create_index("ix_scraping_log_product_id", "scraping_logs", ["product_id"], unique=False)
|
||||
op.create_index("ix_scraping_log_source", "scraping_logs", ["source"], unique=False)
|
||||
op.create_index("ix_scraping_log_fetched_at", "scraping_logs", ["fetched_at"], unique=False)
|
||||
op.create_index("ix_scraping_log_fetch_status", "scraping_logs", ["fetch_status"], unique=False)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert the Phase 2 schema: drop every table in reverse dependency order."""
    # Indexes are dropped explicitly before their owning table.
    op.drop_index("ix_scraping_log_fetch_status", table_name="scraping_logs")
    op.drop_index("ix_scraping_log_fetched_at", table_name="scraping_logs")
    op.drop_index("ix_scraping_log_source", table_name="scraping_logs")
    op.drop_index("ix_scraping_log_product_id", table_name="scraping_logs")
    op.drop_table("scraping_logs")

    op.drop_index("ix_product_spec_key", table_name="product_specs")
    op.drop_index("ix_product_spec_product_id", table_name="product_specs")
    op.drop_table("product_specs")

    op.drop_index("ix_product_image_product_id", table_name="product_images")
    op.drop_table("product_images")

    op.drop_index("ix_price_history_fetched_at", table_name="price_history")
    op.drop_index("ix_price_history_product_id", table_name="price_history")
    op.drop_table("price_history")

    # products last: every other table holds a FK to products.id.
    op.drop_index("ix_product_last_updated", table_name="products")
    op.drop_index("ix_product_reference", table_name="products")
    op.drop_index("ix_product_source", table_name="products")
    op.drop_table("products")
|
||||
Binary file not shown.
320
pricewatch/app/db/models.py
Executable file
320
pricewatch/app/db/models.py
Executable file
@@ -0,0 +1,320 @@
|
||||
"""
|
||||
Modèles SQLAlchemy pour PriceWatch Phase 2.
|
||||
|
||||
Schéma normalisé pour persistence PostgreSQL:
|
||||
- products: Catalogue produits (déduplication sur source + reference)
|
||||
- price_history: Historique prix time-series
|
||||
- product_images: Images produit (N par produit)
|
||||
- product_specs: Caractéristiques produit (key-value)
|
||||
- scraping_logs: Logs observabilité pour debugging
|
||||
|
||||
Justification technique:
|
||||
- Normalisation: products séparée de price_history (catalogue vs time-series)
|
||||
- Clé naturelle: (source, reference) comme unique constraint (ASIN Amazon, etc.)
|
||||
- Pas de JSONB pour données structurées: tables séparées pour images/specs
|
||||
- JSONB uniquement pour données variables: errors, notes dans logs
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import List, Optional
|
||||
|
||||
from sqlalchemy import (
|
||||
TIMESTAMP,
|
||||
CheckConstraint,
|
||||
Column,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
JSON,
|
||||
Numeric,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by all PriceWatch ORM models."""

    pass
|
||||
|
||||
|
||||
class Product(Base):
    """
    Product catalogue (one row per unique product).

    Natural key: (source, reference) — e.g. (amazon, B08N5WRWNW).
    title, category and url are refreshed on every scraping run;
    the price history lives in the 1-N relation to PriceHistory.
    """

    __tablename__ = "products"

    # Surrogate primary key
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Natural key (enforced unique via uq_product_source_reference below)
    source: Mapped[str] = mapped_column(
        String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)"
    )
    reference: Mapped[str] = mapped_column(
        String(100), nullable=False, comment="Product reference (ASIN, SKU, etc.)"
    )

    # Product metadata, refreshed on each scrape
    url: Mapped[str] = mapped_column(Text, nullable=False, comment="Canonical product URL")
    title: Mapped[Optional[str]] = mapped_column(Text, nullable=True, comment="Product title")
    category: Mapped[Optional[str]] = mapped_column(
        Text, nullable=True, comment="Product category (breadcrumb)"
    )
    currency: Mapped[Optional[str]] = mapped_column(
        String(3), nullable=True, comment="Currency code (EUR, USD, GBP)"
    )

    # Timestamps.
    # NOTE(review): datetime.utcnow produces naive datetimes and TIMESTAMP here is
    # not timezone-aware — confirm this naive-UTC convention is intended project-wide.
    first_seen_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, default=datetime.utcnow, comment="First scraping timestamp"
    )
    last_updated_at: Mapped[datetime] = mapped_column(
        TIMESTAMP,
        nullable=False,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        comment="Last metadata update",
    )

    # Child collections; delete-orphan cascade removes children with the product.
    price_history: Mapped[List["PriceHistory"]] = relationship(
        "PriceHistory", back_populates="product", cascade="all, delete-orphan"
    )
    images: Mapped[List["ProductImage"]] = relationship(
        "ProductImage", back_populates="product", cascade="all, delete-orphan"
    )
    specs: Mapped[List["ProductSpec"]] = relationship(
        "ProductSpec", back_populates="product", cascade="all, delete-orphan"
    )
    logs: Mapped[List["ScrapingLog"]] = relationship(
        "ScrapingLog", back_populates="product", cascade="all, delete-orphan"
    )

    # Constraints and secondary indexes
    __table_args__ = (
        UniqueConstraint("source", "reference", name="uq_product_source_reference"),
        Index("ix_product_source", "source"),
        Index("ix_product_reference", "reference"),
        Index("ix_product_last_updated", "last_updated_at"),
    )

    def __repr__(self) -> str:
        return f"<Product(id={self.id}, source={self.source}, reference={self.reference})>"
|
||||
|
||||
|
||||
class PriceHistory(Base):
    """
    Price history (time-series).

    One row per successful scrape with an extracted price.
    The unique constraint on (product_id, fetched_at) prevents duplicates.
    """

    __tablename__ = "price_history"

    # Surrogate primary key
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Owning product; rows are removed with the product (ON DELETE CASCADE).
    product_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False
    )

    # Price data (nullable: a scrape may fail to extract some fields)
    price: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2), nullable=True, comment="Product price"
    )
    shipping_cost: Mapped[Optional[Decimal]] = mapped_column(
        Numeric(10, 2), nullable=True, comment="Shipping cost"
    )
    stock_status: Mapped[Optional[str]] = mapped_column(
        String(20), nullable=True, comment="Stock status (in_stock, out_of_stock, unknown)"
    )

    # Fetch metadata (allowed values enforced by the CHECK constraints below)
    fetch_method: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch method (http, playwright)"
    )
    fetch_status: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch status (success, partial, failed)"
    )
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, comment="Scraping timestamp"
    )

    # Parent product
    product: Mapped["Product"] = relationship("Product", back_populates="price_history")

    # Constraints and indexes
    __table_args__ = (
        UniqueConstraint("product_id", "fetched_at", name="uq_price_history_product_time"),
        Index("ix_price_history_product_id", "product_id"),
        Index("ix_price_history_fetched_at", "fetched_at"),
        CheckConstraint("stock_status IN ('in_stock', 'out_of_stock', 'unknown')"),
        CheckConstraint("fetch_method IN ('http', 'playwright')"),
        CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )

    def __repr__(self) -> str:
        return f"<PriceHistory(id={self.id}, product_id={self.product_id}, price={self.price}, fetched_at={self.fetched_at})>"
|
||||
|
||||
|
||||
class ProductImage(Base):
    """
    Product images (N images per product).

    Unique constraint on (product_id, image_url) avoids duplicates;
    position preserves the display order of the images.
    """

    __tablename__ = "product_images"

    # Surrogate primary key
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Owning product; rows are removed with the product (ON DELETE CASCADE).
    product_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False
    )

    # Image data
    image_url: Mapped[str] = mapped_column(Text, nullable=False, comment="Image URL")
    position: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="Image position (0=main)"
    )

    # Parent product
    product: Mapped["Product"] = relationship("Product", back_populates="images")

    # Constraints and indexes
    __table_args__ = (
        UniqueConstraint("product_id", "image_url", name="uq_product_image_url"),
        Index("ix_product_image_product_id", "product_id"),
    )

    def __repr__(self) -> str:
        return f"<ProductImage(id={self.id}, product_id={self.product_id}, position={self.position})>"
|
||||
|
||||
|
||||
class ProductSpec(Base):
    """
    Product specifications (key-value pairs).

    Unique constraint on (product_id, spec_key) avoids duplicates and
    supports efficient lookups by key.
    """

    __tablename__ = "product_specs"

    # Surrogate primary key
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Owning product; rows are removed with the product (ON DELETE CASCADE).
    product_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="CASCADE"), nullable=False
    )

    # Spec data
    spec_key: Mapped[str] = mapped_column(
        String(200), nullable=False, comment="Specification key (e.g., 'Brand', 'Color')"
    )
    spec_value: Mapped[str] = mapped_column(Text, nullable=False, comment="Specification value")

    # Parent product
    product: Mapped["Product"] = relationship("Product", back_populates="specs")

    # Constraints and indexes
    __table_args__ = (
        UniqueConstraint("product_id", "spec_key", name="uq_product_spec_key"),
        Index("ix_product_spec_product_id", "product_id"),
        Index("ix_product_spec_key", "spec_key"),
    )

    def __repr__(self) -> str:
        return f"<ProductSpec(id={self.id}, product_id={self.product_id}, key={self.spec_key})>"
|
||||
|
||||
|
||||
class ScrapingLog(Base):
    """
    Observability log of scraping runs, for debugging and analytics.

    The FK to products is optional so a log row can exist even when no
    product was created. errors/notes use JSON(B) because their structure
    varies. Enables analytics such as success rate and average duration.
    """

    __tablename__ = "scraping_logs"

    # Surrogate primary key
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Optional owning product; set to NULL when the product is deleted.
    product_id: Mapped[Optional[int]] = mapped_column(
        Integer, ForeignKey("products.id", ondelete="SET NULL"), nullable=True
    )

    # Scraping metadata
    url: Mapped[str] = mapped_column(Text, nullable=False, comment="Scraped URL")
    source: Mapped[str] = mapped_column(
        String(50), nullable=False, comment="Store ID (amazon, cdiscount, etc.)"
    )
    reference: Mapped[Optional[str]] = mapped_column(
        String(100), nullable=True, comment="Product reference (if extracted)"
    )

    # Fetch metadata (allowed values enforced by the CHECK constraints below)
    fetch_method: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch method (http, playwright)"
    )
    fetch_status: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="Fetch status (success, partial, failed)"
    )
    fetched_at: Mapped[datetime] = mapped_column(
        TIMESTAMP, nullable=False, default=datetime.utcnow, comment="Scraping timestamp"
    )

    # Performance metrics
    duration_ms: Mapped[Optional[int]] = mapped_column(
        Integer, nullable=True, comment="Fetch duration in milliseconds"
    )
    html_size_bytes: Mapped[Optional[int]] = mapped_column(
        Integer, nullable=True, comment="HTML response size in bytes"
    )

    # Debug payloads: generic JSON, upgraded to JSONB on PostgreSQL.
    errors: Mapped[Optional[list[str]]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=True,
        comment="Error messages (list of strings)",
    )
    notes: Mapped[Optional[list[str]]] = mapped_column(
        JSON().with_variant(JSONB, "postgresql"),
        nullable=True,
        comment="Debug notes (list of strings)",
    )

    # Parent product (may be None)
    product: Mapped[Optional["Product"]] = relationship("Product", back_populates="logs")

    # Indexes and CHECK constraints
    __table_args__ = (
        Index("ix_scraping_log_product_id", "product_id"),
        Index("ix_scraping_log_source", "source"),
        Index("ix_scraping_log_fetched_at", "fetched_at"),
        Index("ix_scraping_log_fetch_status", "fetch_status"),
        CheckConstraint("fetch_method IN ('http', 'playwright')"),
        CheckConstraint("fetch_status IN ('success', 'partial', 'failed')"),
    )

    def __repr__(self) -> str:
        return f"<ScrapingLog(id={self.id}, url={self.url}, status={self.fetch_status}, fetched_at={self.fetched_at})>"
|
||||
140
pricewatch/app/db/repository.py
Executable file
140
pricewatch/app/db/repository.py
Executable file
@@ -0,0 +1,140 @@
|
||||
"""
|
||||
Repository pattern pour la persistence SQLAlchemy.
|
||||
|
||||
Centralise les operations CRUD sur les modeles DB a partir d'un ProductSnapshot.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
from pricewatch.app.db.models import PriceHistory, Product, ProductImage, ProductSpec, ScrapingLog
|
||||
|
||||
logger = get_logger("db.repository")
|
||||
|
||||
|
||||
class ProductRepository:
    """CRUD persistence layer mapping a ProductSnapshot onto the ORM models."""

    def __init__(self, session: Session) -> None:
        # The caller owns the session lifecycle (commit/rollback/close).
        self.session = session

    def get_or_create(self, source: str, reference: str, url: str) -> Product:
        """
        Fetch a product by natural key (source, reference), creating it if missing.

        flush() is called on creation so the new product receives its id
        before the caller uses it for FK rows.
        """
        product = (
            self.session.query(Product)
            .filter(Product.source == source, Product.reference == reference)
            .one_or_none()
        )
        if product:
            return product

        product = Product(source=source, reference=reference, url=url)
        self.session.add(product)
        self.session.flush()
        return product

    def update_product_metadata(self, product: Product, snapshot: ProductSnapshot) -> None:
        """Refresh product metadata from the snapshot; falsy values leave fields untouched."""
        if snapshot.url:
            product.url = snapshot.url
        if snapshot.title:
            product.title = snapshot.title
        if snapshot.category:
            product.category = snapshot.category
        if snapshot.currency:
            product.currency = snapshot.currency

    def add_price_history(self, product: Product, snapshot: ProductSnapshot) -> Optional[PriceHistory]:
        """Add a price-history entry unless one exists for (product_id, fetched_at)."""
        existing = (
            self.session.query(PriceHistory)
            .filter(
                PriceHistory.product_id == product.id,
                PriceHistory.fetched_at == snapshot.fetched_at,
            )
            .one_or_none()
        )
        if existing:
            # Idempotent: re-saving the same snapshot returns the existing row.
            return existing

        price_entry = PriceHistory(
            product_id=product.id,
            price=snapshot.price,
            shipping_cost=snapshot.shipping_cost,
            stock_status=snapshot.stock_status,
            fetch_method=snapshot.debug.method,
            fetch_status=snapshot.debug.status,
            fetched_at=snapshot.fetched_at,
        )
        self.session.add(price_entry)
        return price_entry

    def sync_images(self, product: Product, images: list[str]) -> None:
        """Synchronise images: insert new URLs only; existing rows are never updated or removed."""
        existing_urls = {image.image_url for image in product.images}
        for position, url in enumerate(images):
            if url in existing_urls:
                continue
            self.session.add(ProductImage(product_id=product.id, image_url=url, position=position))

    def sync_specs(self, product: Product, specs: dict[str, str]) -> None:
        """Synchronise specs: upsert keyed on spec_key (update value or insert row)."""
        existing_specs = {spec.spec_key: spec for spec in product.specs}
        for key, value in specs.items():
            if key in existing_specs:
                existing_specs[key].spec_value = value
            else:
                self.session.add(ProductSpec(product_id=product.id, spec_key=key, spec_value=value))

    def add_scraping_log(self, snapshot: ProductSnapshot, product_id: Optional[int]) -> ScrapingLog:
        """Record a scraping log row for observability (product_id may be None)."""
        log_entry = ScrapingLog(
            product_id=product_id,
            url=snapshot.url,
            source=snapshot.source,
            reference=snapshot.reference,
            fetch_method=snapshot.debug.method,
            fetch_status=snapshot.debug.status,
            fetched_at=snapshot.fetched_at,
            duration_ms=snapshot.debug.duration_ms,
            html_size_bytes=snapshot.debug.html_size_bytes,
            # Empty lists are stored as NULL rather than as [].
            errors=snapshot.debug.errors or None,
            notes=snapshot.debug.notes or None,
        )
        self.session.add(log_entry)
        return log_entry

    def save_snapshot(self, snapshot: ProductSnapshot) -> Optional[int]:
        """
        Persist a complete ProductSnapshot (product, price, images, specs, log).

        Returns the product id, or None when the snapshot has no reference —
        in that case only a scraping log row is written.
        """
        if not snapshot.reference:
            logger.warning("Reference absente: persistence ignoree")
            self.add_scraping_log(snapshot, product_id=None)
            return None

        product = self.get_or_create(snapshot.source, snapshot.reference, snapshot.url)
        self.update_product_metadata(product, snapshot)
        self.add_price_history(product, snapshot)
        self.sync_images(product, snapshot.images)
        self.sync_specs(product, snapshot.specs)
        self.add_scraping_log(snapshot, product_id=product.id)
        return product.id

    def safe_save_snapshot(self, snapshot: ProductSnapshot) -> Optional[int]:
        """Save with SQLAlchemy error logging; the exception is re-raised to the caller."""
        try:
            return self.save_snapshot(snapshot)
        except SQLAlchemyError as exc:
            logger.error(f"Erreur SQLAlchemy: {exc}")
            raise
|
||||
@@ -0,0 +1,3 @@
|
||||
from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
||||
|
||||
__all__ = ["ScrapingPipeline"]
|
||||
|
||||
Binary file not shown.
BIN
pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/pipeline.cpython-313.pyc
Executable file
Binary file not shown.
52
pricewatch/app/scraping/pipeline.py
Executable file
52
pricewatch/app/scraping/pipeline.py
Executable file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Pipeline de persistence pour les snapshots de scraping.
|
||||
|
||||
Ne doit jamais bloquer le pipeline principal si la DB est indisponible.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
|
||||
from pricewatch.app.core.config import AppConfig, get_config
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
from pricewatch.app.db.connection import get_session
|
||||
from pricewatch.app.db.repository import ProductRepository
|
||||
|
||||
logger = get_logger("scraping.pipeline")
|
||||
|
||||
|
||||
class ScrapingPipeline:
    """
    DB persistence orchestration for a ProductSnapshot.

    Must never break the main scraping flow when the DB is unavailable:
    every failure is logged, noted on the snapshot, and swallowed.
    """

    def __init__(self, config: Optional[AppConfig] = None) -> None:
        # When None, the global config is resolved lazily in process_snapshot.
        self.config = config

    def process_snapshot(self, snapshot: ProductSnapshot, save_to_db: bool = True) -> Optional[int]:
        """
        Persist a snapshot to the database when persistence is enabled.

        Returns the product id on success, otherwise None (disabled or failed).
        """
        app_config = self.config or get_config()
        if not save_to_db or not app_config.enable_db:
            logger.debug("Persistence DB desactivee")
            return None

        try:
            with get_session(app_config) as session:
                repo = ProductRepository(session)
                product_id = repo.safe_save_snapshot(snapshot)
                session.commit()
                return product_id
        except SQLAlchemyError as exc:
            # DB-level failure: annotate the snapshot and keep going.
            snapshot.add_note(f"Persistence DB echouee: {exc}")
            logger.error(f"Persistence DB echouee: {exc}")
            return None
        except Exception as exc:
            # Deliberate broad catch: persistence must never crash scraping.
            snapshot.add_note(f"Erreur pipeline DB: {exc}")
            logger.error(f"Erreur pipeline DB: {exc}")
            return None
|
||||
Binary file not shown.
@@ -214,6 +214,18 @@ class AmazonStore(BaseStore):
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Fallback: chercher les spans séparés a-price-whole et a-price-fraction
|
||||
whole = soup.select_one("span.a-price-whole")
|
||||
fraction = soup.select_one("span.a-price-fraction")
|
||||
if whole and fraction:
|
||||
whole_text = whole.get_text(strip=True)
|
||||
fraction_text = fraction.get_text(strip=True)
|
||||
try:
|
||||
price_str = f"{whole_text}.{fraction_text}"
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
@@ -270,6 +282,14 @@ class AmazonStore(BaseStore):
|
||||
if url and url.startswith("http"):
|
||||
images.append(url)
|
||||
|
||||
# Fallback: chercher tous les img tags si aucune image trouvée
|
||||
if not images:
|
||||
all_imgs = soup.find_all("img")
|
||||
for img in all_imgs:
|
||||
url = img.get("src") or img.get("data-src")
|
||||
if url and url.startswith("http"):
|
||||
images.append(url)
|
||||
|
||||
return list(set(images)) # Dédupliquer
|
||||
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
|
||||
8
pricewatch/app/tasks/__init__.py
Executable file
8
pricewatch/app/tasks/__init__.py
Executable file
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
Module tasks pour les jobs RQ.
|
||||
"""
|
||||
|
||||
from pricewatch.app.tasks.scrape import scrape_product
|
||||
from pricewatch.app.tasks.scheduler import ScrapingScheduler
|
||||
|
||||
__all__ = ["scrape_product", "ScrapingScheduler"]
|
||||
75
pricewatch/app/tasks/scheduler.py
Executable file
75
pricewatch/app/tasks/scheduler.py
Executable file
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
Planification des jobs de scraping via RQ Scheduler.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
import redis
|
||||
from rq import Queue
|
||||
from rq_scheduler import Scheduler
|
||||
|
||||
from pricewatch.app.core.config import AppConfig, get_config
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.tasks.scrape import scrape_product
|
||||
|
||||
logger = get_logger("tasks.scheduler")
|
||||
|
||||
|
||||
@dataclass
class ScheduledJobInfo:
    """Return payload describing a job handed to the scheduler."""

    job_id: str  # identifier assigned by RQ
    next_run: datetime  # timestamp of the next planned execution
|
||||
|
||||
|
||||
class ScrapingScheduler:
    """RQ-based scheduler for scraping jobs (immediate and recurring)."""

    def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
        # One Redis connection shared by the queue and the rq-scheduler.
        self.config = config or get_config()
        self.redis = redis.from_url(self.config.redis.url)
        self.queue = Queue(queue_name, connection=self.redis)
        self.scheduler = Scheduler(queue=self.queue, connection=self.redis)

    def enqueue_immediate(
        self,
        url: str,
        use_playwright: Optional[bool] = None,
        save_db: bool = True,
    ):
        """Enqueue a one-off scraping job to run as soon as a worker is free.

        Returns the RQ Job object.
        """
        job = self.queue.enqueue(
            scrape_product,
            url,
            use_playwright=use_playwright,
            save_db=save_db,
        )
        logger.info(f"Job enqueued: {job.id}")
        return job

    def schedule_product(
        self,
        url: str,
        interval_hours: int = 24,
        use_playwright: Optional[bool] = None,
        save_db: bool = True,
    ) -> ScheduledJobInfo:
        """
        Schedule a recurring scrape (interval expressed in hours).

        The first run happens one full interval from now; with rq-scheduler,
        repeat=None makes the job repeat indefinitely at `interval` seconds.
        """
        interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
        next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)

        job = self.scheduler.schedule(
            scheduled_time=next_run,
            func=scrape_product,
            args=[url],
            kwargs={"use_playwright": use_playwright, "save_db": save_db},
            interval=interval_seconds,
            repeat=None,
        )
        logger.info(f"Job planifie: {job.id}, prochaine execution: {next_run.isoformat()}")
        return ScheduledJobInfo(job_id=job.id, next_run=next_run)
|
||||
160
pricewatch/app/tasks/scrape.py
Executable file
160
pricewatch/app/tasks/scrape.py
Executable file
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
Tache de scraping asynchrone pour RQ.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from pricewatch.app.core.config import AppConfig, get_config
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.registry import get_registry
|
||||
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
|
||||
from pricewatch.app.scraping.http_fetch import fetch_http
|
||||
from pricewatch.app.scraping.pipeline import ScrapingPipeline
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
from pricewatch.app.stores.aliexpress.store import AliexpressStore
|
||||
from pricewatch.app.stores.amazon.store import AmazonStore
|
||||
from pricewatch.app.stores.backmarket.store import BackmarketStore
|
||||
from pricewatch.app.stores.cdiscount.store import CdiscountStore
|
||||
|
||||
logger = get_logger("tasks.scrape")
|
||||
|
||||
|
||||
def setup_stores() -> None:
    """Register all available stores in the global registry (idempotent)."""
    registry = get_registry()
    if registry.list_stores():
        # Already populated (e.g. by a previous job in the same worker process).
        return
    registry.register(AmazonStore())
    registry.register(CdiscountStore())
    registry.register(BackmarketStore())
    registry.register(AliexpressStore())
|
||||
|
||||
|
||||
def scrape_product(
    url: str,
    use_playwright: Optional[bool] = None,
    save_db: bool = True,
    save_html: bool = False,
    save_screenshot: bool = False,
    headful: bool = False,
    timeout_ms: Optional[int] = None,
) -> dict[str, Any]:
    """
    Scrape a product and persist it to the database via ScrapingPipeline.

    Fetch strategy: plain HTTP first, then Playwright as a fallback when
    allowed. A snapshot (possibly FAILED) is always built and handed to the
    pipeline, so every attempt leaves a scraping log.

    Args:
        url: Product page URL.
        use_playwright: Allow Playwright fallback; None = use config default.
        save_db: Persist the snapshot to the database.
        save_html: Dump the fetched HTML for debugging.
        save_screenshot: Capture/dump a Playwright screenshot.
        headful: Run the browser with a visible window.
        timeout_ms: Playwright timeout; None = use config default.

    Returns:
        Dict with keys success, product_id, snapshot, error.
    """
    config: AppConfig = get_config()
    setup_stores()

    # Fill in config defaults for unspecified options.
    if use_playwright is None:
        use_playwright = config.default_use_playwright

    if timeout_ms is None:
        timeout_ms = config.default_playwright_timeout

    registry = get_registry()
    store = registry.detect_store(url)
    if not store:
        # No store matches this URL: log a FAILED snapshot and bail out.
        snapshot = ProductSnapshot(
            source="unknown",
            url=url,
            debug=DebugInfo(
                method=FetchMethod.HTTP,
                status=DebugStatus.FAILED,
                errors=["Aucun store detecte"],
            ),
        )
        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}

    canonical_url = store.canonicalize(url)

    # Fetch state, filled by the HTTP attempt and optional Playwright fallback.
    html = None
    fetch_method = FetchMethod.HTTP
    fetch_error = None
    duration_ms = None
    html_size_bytes = None
    pw_result = None

    # First attempt: plain HTTP (cheap).
    http_result = fetch_http(canonical_url)
    duration_ms = http_result.duration_ms

    if http_result.success:
        html = http_result.html
        fetch_method = FetchMethod.HTTP
    elif use_playwright:
        # Fallback: headless browser (duration_ms then reflects this attempt).
        pw_result = fetch_playwright(
            canonical_url,
            headless=not headful,
            timeout_ms=timeout_ms,
            save_screenshot=save_screenshot,
        )
        duration_ms = pw_result.duration_ms

        if pw_result.success:
            html = pw_result.html
            fetch_method = FetchMethod.PLAYWRIGHT
        else:
            fetch_error = pw_result.error
    else:
        fetch_error = http_result.error

    if html:
        html_size_bytes = len(html.encode("utf-8"))
        if save_html:
            # Local import keeps the I/O helper out of the hot path.
            from pricewatch.app.core.io import save_debug_html

            ref = store.extract_reference(canonical_url) or "unknown"
            save_debug_html(html, f"{store.store_id}_{ref}")

        if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result:
            from pricewatch.app.core.io import save_debug_screenshot

            if pw_result and pw_result.screenshot:
                ref = store.extract_reference(canonical_url) or "unknown"
                save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")

        try:
            snapshot = store.parse(html, canonical_url)
            # Stamp fetch metadata onto the parsed snapshot.
            snapshot.debug.method = fetch_method
            snapshot.debug.duration_ms = duration_ms
            snapshot.debug.html_size_bytes = html_size_bytes
            success = snapshot.debug.status != DebugStatus.FAILED
        except Exception as exc:
            # Parsing blew up: build a FAILED snapshot so the attempt is still logged.
            snapshot = ProductSnapshot(
                source=store.store_id,
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Parsing failed: {exc}"],
                    duration_ms=duration_ms,
                    html_size_bytes=html_size_bytes,
                ),
            )
            success = False
            fetch_error = str(exc)
    else:
        # Both fetch attempts failed (or fallback was disabled).
        snapshot = ProductSnapshot(
            source=store.store_id,
            url=canonical_url,
            debug=DebugInfo(
                method=fetch_method,
                status=DebugStatus.FAILED,
                errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                duration_ms=duration_ms,
            ),
        )
        success = False

    # Persist whatever we got; the pipeline never raises.
    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)

    return {
        "success": success,
        "product_id": product_id,
        "snapshot": snapshot,
        "error": fetch_error,
    }
|
||||
Reference in New Issue
Block a user