scrap/pricewatch/app/scraping/pipeline.py

"""
Pipeline de persistence pour les snapshots de scraping.

Ne doit jamais bloquer le pipeline principal si la DB est indisponible.
"""

from __future__ import annotations

from typing import Optional

from sqlalchemy.exc import SQLAlchemyError

from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import ProductSnapshot
from pricewatch.app.db.connection import get_session
from pricewatch.app.db.repository import ProductRepository

logger = get_logger("scraping.pipeline")


class ScrapingPipeline:
    """Orchestration de persistence DB pour un ProductSnapshot."""

    def __init__(self, config: Optional[AppConfig] = None) -> None:
        self.config = config

    def process_snapshot(
        self,
        snapshot: ProductSnapshot,
        save_to_db: bool = True,
        apply_classification: bool = True,
    ) -> Optional[int]:
        """
        Persiste un snapshot en base si active.

        Retourne l'id produit si sauve, sinon None.
        """
        app_config = self.config or get_config()
        if not save_to_db or not app_config.enable_db:
            logger.debug("Persistence DB desactivee")
            return None

        try:
            with get_session(app_config) as session:
                repo = ProductRepository(session)
                if apply_classification:
                    repo.apply_classification(snapshot)
                product_id = repo.safe_save_snapshot(snapshot)
                session.commit()
                return product_id
        except SQLAlchemyError as exc:
            snapshot.add_note(f"Persistence DB echouee: {exc}")
            logger.error(f"Persistence DB echouee: {exc}")
            return None
        except Exception as exc:
            snapshot.add_note(f"Erreur pipeline DB: {exc}")
            logger.error(f"Erreur pipeline DB: {exc}")
            return None