chore: sync project files

2026-01-13 19:49:04 +01:00
parent 53f8227941
commit ecda149a4b
149 changed files with 65272 additions and 1 deletions
--- a/pricewatch/app/core/io.py
+++ b/pricewatch/app/core/io.py
@@ -0,0 +1,234 @@
+"""
+Fonctions d'entrée/sortie pour PriceWatch.
+
+Gère la lecture de la configuration YAML et l'écriture des résultats JSON.
+"""
+
+import json
+from pathlib import Path
+from typing import Any
+
+import yaml
+from pydantic import BaseModel, Field, field_validator
+
+from pricewatch.app.core.logging import get_logger
+from pricewatch.app.core.schema import ProductSnapshot
+
+logger = get_logger("core.io")
+
+
+class ScrapingOptions(BaseModel):
+    """Scraping options loaded from the YAML file."""
+
+    use_playwright: bool = Field(
+        description="Utiliser Playwright en fallback", default=True
+    )
+    headful: bool = Field(description="Mode headful (voir le navigateur)", default=False)
+    # Debug artefacts, both enabled by default.
+    save_html: bool = Field(description="Sauvegarder HTML pour debug", default=True)
+    save_screenshot: bool = Field(
+        description="Sauvegarder screenshot pour debug", default=True
+    )
+    # ``ge=1000`` rejects timeouts shorter than one second.
+    timeout_ms: int = Field(
+        ge=1000, default=60000, description="Timeout par page en millisecondes"
+    )
+
+
+class ScrapingConfig(BaseModel):
+    """Complete scraping configuration loaded from YAML."""
+
+    urls: list[str] = Field(description="Liste des URLs à scraper")
+    options: ScrapingOptions = Field(
+        description="Options de scraping", default_factory=ScrapingOptions
+    )
+
+    @field_validator("urls")
+    @classmethod
+    def validate_urls(cls, v: list[str]) -> list[str]:
+        """Strip each URL, drop blank entries, and reject an empty result."""
+        if not v:
+            raise ValueError("Au moins une URL doit être fournie")
+        # One strip() per entry; falsy or whitespace-only entries are skipped.
+        stripped = [s for raw in v if raw and (s := raw.strip())]
+        if not stripped:
+            raise ValueError("Aucune URL valide trouvée")
+
+        return stripped
+
+
+def read_yaml_config(yaml_path: str | Path) -> ScrapingConfig:
+    """
+    Load the YAML configuration file and validate it as a ScrapingConfig.
+
+    Args:
+        yaml_path: Path to the YAML file.
+
+    Returns:
+        The validated configuration.
+
+    Raises:
+        FileNotFoundError: If the path does not exist.
+        ValueError: If the YAML cannot be parsed or the document is empty.
+
+    Any other failure (for example model validation) is logged and re-raised
+    unchanged; Pydantic validation makes a malformed file fail explicitly.
+    """
+    source = Path(yaml_path)
+
+    if not source.exists():
+        missing = f"Fichier YAML introuvable: {source}"
+        logger.error(missing)
+        raise FileNotFoundError(missing)
+
+    logger.info(f"Lecture configuration: {source}")
+
+    try:
+        with open(source, "r", encoding="utf-8") as stream:
+            raw = yaml.safe_load(stream)
+
+        # Any falsy payload (e.g. an empty document) is rejected.
+        if not raw:
+            raise ValueError("Fichier YAML vide")
+
+        cfg = ScrapingConfig.model_validate(raw)
+        logger.info(
+            f"Configuration chargée: {len(cfg.urls)} URL(s), "
+            f"playwright={cfg.options.use_playwright}"
+        )
+        return cfg
+
+    except yaml.YAMLError as exc:
+        logger.error(f"Erreur parsing YAML: {exc}")
+        raise ValueError(f"YAML invalide: {exc}") from exc
+    except Exception as exc:
+        logger.error(f"Erreur validation config: {exc}")
+        raise
+
+
+def write_json_results(
+    snapshots: list[ProductSnapshot], json_path: str | Path, indent: int | None = 2
+) -> None:
+    """
+    Write scraping results to a JSON file as a list of objects.
+
+    Args:
+        snapshots: ProductSnapshot instances to save.
+        json_path: Destination JSON file; missing parent folders are created.
+        indent: Indentation passed to json.dump (None = compact output).
+
+    Raises:
+        Exception: Any serialization or write error is logged and re-raised.
+
+    Output is pretty-printed by default (indent=2) to ease manual debugging,
+    and ensure_ascii=False keeps non-ASCII characters unescaped in the file.
+    """
+    json_path = Path(json_path)
+
+    json_path.parent.mkdir(parents=True, exist_ok=True)
+
+    logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {json_path}")
+
+    try:
+        # Serialize everything first so a model error cannot truncate the file.
+        data = [snapshot.model_dump(mode="json") for snapshot in snapshots]
+
+        with open(json_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+
+        logger.info(f"Résultats sauvegardés: {json_path} ({json_path.stat().st_size} bytes)")
+
+    except Exception as e:
+        logger.error(f"Erreur écriture JSON: {e}")
+        raise
+
+
+def read_json_results(json_path: str | Path) -> list[ProductSnapshot]:
+    """
+    Load a JSON results file and validate each entry as a ProductSnapshot.
+
+    Args:
+        json_path: Path to the JSON file.
+
+    Returns:
+        The validated snapshots, in file order.
+
+    Raises:
+        FileNotFoundError: If the path does not exist.
+        ValueError: If the JSON cannot be parsed or its top level is not a list.
+    """
+    source = Path(json_path)
+
+    if not source.exists():
+        missing = f"Fichier JSON introuvable: {source}"
+        logger.error(missing)
+        raise FileNotFoundError(missing)
+
+    logger.info(f"Lecture résultats: {source}")
+
+    try:
+        payload = json.loads(source.read_text(encoding="utf-8"))
+
+        if not isinstance(payload, list):
+            raise ValueError("Le JSON doit contenir une liste")
+
+        loaded = list(map(ProductSnapshot.model_validate, payload))
+        logger.info(f"{len(loaded)} snapshot(s) chargé(s)")
+        return loaded
+
+    except json.JSONDecodeError as exc:
+        logger.error(f"Erreur parsing JSON: {exc}")
+        raise ValueError(f"JSON invalide: {exc}") from exc
+    except Exception as exc:
+        logger.error(f"Erreur validation snapshots: {exc}")
+        raise
+
+
+def save_debug_html(html: str, filename: str, output_dir: str | Path = "scraped") -> Path:
+    """
+    Dump fetched HTML to ``<output_dir>/<filename>.html`` for debugging.
+
+    Args:
+        html: HTML content to write (UTF-8 encoded on disk).
+        filename: File name without extension.
+        output_dir: Destination folder, created if missing.
+
+    Returns:
+        Path of the written file.
+    """
+    target_dir = Path(output_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    filepath = target_dir / f"{filename}.html"
+    # filename is interpolated as-is, so any separators in it end up in the path.
+    with filepath.open("w", encoding="utf-8") as sink:
+        sink.write(html)
+
+    logger.debug(f"HTML sauvegardé: {filepath} ({len(html)} chars)")
+    return filepath
+
+
+def save_debug_screenshot(
+    screenshot_bytes: bytes, filename: str, output_dir: str | Path = "scraped"
+) -> Path:
+    """
+    Dump a screenshot to ``<output_dir>/<filename>.png`` for debugging.
+
+    Args:
+        screenshot_bytes: Raw screenshot bytes, written unchanged.
+        filename: File name without extension.
+        output_dir: Destination folder, created if missing.
+
+    Returns:
+        Path of the written file.
+    """
+    target_dir = Path(output_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    filepath = target_dir / f"{filename}.png"
+    with filepath.open("wb") as sink:
+        sink.write(screenshot_bytes)
+
+    byte_count = len(screenshot_bytes)
+    logger.debug(f"Screenshot sauvegardé: {filepath} ({byte_count} bytes)")
+    return filepath