# scrap/pricewatch/app/core/io.py

"""
Fonctions d'entrée/sortie pour PriceWatch.

Gère la lecture de la configuration YAML et l'écriture des résultats JSON.
"""

import json
from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, Field, field_validator

from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import ProductSnapshot

# Module-level logger shared by every helper in this file, obtained from the
# project's logging helper under the name "core.io".
logger = get_logger("core.io")


class ScrapingOptions(BaseModel):
    """Scraping options as declared in the YAML file.

    Every field carries a default, so the model can be built with no
    arguments at all.
    """

    # Use Playwright as a fallback.
    use_playwright: bool = Field(True, description="Utiliser Playwright en fallback")
    # Force Playwright even when the HTTP request succeeded.
    force_playwright: bool = Field(
        False, description="Forcer Playwright même si HTTP réussi"
    )
    # Headful mode (show the browser window).
    headful: bool = Field(False, description="Mode headful (voir le navigateur)")
    # Keep the fetched HTML for debugging.
    save_html: bool = Field(True, description="Sauvegarder HTML pour debug")
    # Keep a screenshot for debugging.
    save_screenshot: bool = Field(
        True, description="Sauvegarder screenshot pour debug"
    )
    # Per-page timeout in milliseconds; values below 1000 are rejected.
    timeout_ms: int = Field(
        60000, ge=1000, description="Timeout par page en millisecondes"
    )


class ScrapingConfig(BaseModel):
    """Full scraping configuration as loaded from the YAML file."""

    # Required: the list of URLs to scrape.
    urls: list[str] = Field(description="Liste des URLs à scraper")
    # Optional: falls back to a default-constructed ScrapingOptions.
    options: ScrapingOptions = Field(
        description="Options de scraping", default_factory=ScrapingOptions
    )

    @field_validator("urls")
    @classmethod
    def validate_urls(cls, v: list[str]) -> list[str]:
        """Strip surrounding whitespace from each URL and drop blank entries.

        Args:
            v: URL list as provided in the configuration.

        Returns:
            The stripped, non-empty URLs in their original order.

        Raises:
            ValueError: If the list is empty, or if nothing remains once
                blank entries have been removed.
        """
        if not v:
            raise ValueError("Au moins une URL doit être fournie")

        kept: list[str] = []
        for candidate in v:
            if not candidate:
                continue
            trimmed = candidate.strip()
            if trimmed:
                kept.append(trimmed)

        if kept:
            return kept
        raise ValueError("Aucune URL valide trouvée")


def read_yaml_config(yaml_path: str | Path) -> ScrapingConfig:
    """
    Load the YAML configuration file and validate it as a ScrapingConfig.

    Args:
        yaml_path: Path to the YAML file.

    Returns:
        The validated configuration.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the YAML cannot be parsed, or the document is empty.

    Any other exception raised while reading or validating is logged and
    re-raised unchanged.

    Technical rationale:
    - Pydantic validates the structure of the YAML document.
    - This avoids bugs caused by a malformed file.
    - Errors stay explicit for the user.
    """
    path = Path(yaml_path)

    if not path.exists():
        missing = f"Fichier YAML introuvable: {path}"
        logger.error(missing)
        raise FileNotFoundError(missing)

    logger.info(f"Lecture configuration: {path}")

    try:
        with open(path, "r", encoding="utf-8") as handle:
            raw = yaml.safe_load(handle)

        # An empty document loads as None; any other falsy value is rejected too.
        if not raw:
            raise ValueError("Fichier YAML vide")

        config = ScrapingConfig.model_validate(raw)
        opts = config.options
        summary = (
            f"Configuration chargée: {len(config.urls)} URL(s), "
            f"playwright={opts.use_playwright}, "
            f"force_playwright={opts.force_playwright}"
        )
        logger.info(summary)
        return config

    except yaml.YAMLError as exc:
        # Parsing failures are surfaced to callers as ValueError.
        logger.error(f"Erreur parsing YAML: {exc}")
        raise ValueError(f"YAML invalide: {exc}") from exc
    except Exception as exc:
        # Everything else (including the empty-file ValueError above) is
        # logged and propagated as-is.
        logger.error(f"Erreur validation config: {exc}")
        raise


def write_json_results(
    snapshots: list[ProductSnapshot], json_path: str | Path, indent: int | None = 2
) -> None:
    """
    Write the scraping results to a JSON file.

    Args:
        snapshots: ProductSnapshot instances to save.
        json_path: Path of the output JSON file.
        indent: Indentation used for readability; None produces compact
            output. The annotation accepts None to match this contract.

    Raises:
        Exception: Any error raised while serializing or writing is logged
            and re-raised unchanged.

    Technical rationale:
    - Serialization goes through Pydantic to guarantee the structure.
    - Pretty-printing by default (indent=2) eases manual debugging.
    - Parent directories are created automatically when missing.
    """
    json_path = Path(json_path)

    # Create the parent directory if needed (outside the try: a failure here
    # propagates without the "write error" log line).
    json_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {json_path}")

    try:
        # mode="json" makes each snapshot dump to JSON-compatible values.
        data = [snapshot.model_dump(mode="json") for snapshot in snapshots]

        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=indent, ensure_ascii=False)

        logger.info(f"Résultats sauvegardés: {json_path} ({json_path.stat().st_size} bytes)")

    except Exception as e:
        logger.error(f"Erreur écriture JSON: {e}")
        raise


def read_json_results(json_path: str | Path) -> list[ProductSnapshot]:
    """
    Load a JSON results file and validate each entry as a ProductSnapshot.

    Args:
        json_path: Path to the JSON file.

    Returns:
        The validated ProductSnapshot list, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the JSON cannot be parsed, or its top-level value is
            not a list.

    Any other exception raised while reading or validating is logged and
    re-raised unchanged.
    """
    path = Path(json_path)

    if not path.exists():
        missing = f"Fichier JSON introuvable: {path}"
        logger.error(missing)
        raise FileNotFoundError(missing)

    logger.info(f"Lecture résultats: {path}")

    try:
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        if not isinstance(payload, list):
            raise ValueError("Le JSON doit contenir une liste")

        loaded = list(map(ProductSnapshot.model_validate, payload))
        logger.info(f"{len(loaded)} snapshot(s) chargé(s)")
        return loaded

    except json.JSONDecodeError as exc:
        # Must come before the generic handler: JSONDecodeError is itself a
        # ValueError and would otherwise be logged as a validation error.
        logger.error(f"Erreur parsing JSON: {exc}")
        raise ValueError(f"JSON invalide: {exc}") from exc
    except Exception as exc:
        logger.error(f"Erreur validation snapshots: {exc}")
        raise


def save_debug_html(html: str, filename: str, output_dir: str | Path = "scraped") -> Path:
    """
    Save fetched HTML to disk for debugging.

    Args:
        html: HTML content to write.
        filename: File name without extension; ".html" is appended.
        output_dir: Destination directory, created if it does not exist.

    Returns:
        Path of the file that was written.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    filepath = target_dir / f"{filename}.html"
    with filepath.open("w", encoding="utf-8") as handle:
        handle.write(html)

    logger.debug(f"HTML sauvegardé: {filepath} ({len(html)} chars)")
    return filepath


def save_debug_screenshot(
    screenshot_bytes: bytes, filename: str, output_dir: str | Path = "scraped"
) -> Path:
    """
    Save a screenshot to disk for debugging.

    Args:
        screenshot_bytes: Binary screenshot data.
        filename: File name without extension; ".png" is appended.
        output_dir: Destination directory, created if it does not exist.

    Returns:
        Path of the file that was written.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    filepath = target_dir / f"{filename}.png"
    with filepath.open("wb") as handle:
        handle.write(screenshot_bytes)

    logger.debug(f"Screenshot sauvegardé: {filepath} ({len(screenshot_bytes)} bytes)")
    return filepath