chore: sync project files
This commit is contained in:
0
pricewatch/app/__init__.py
Executable file
0
pricewatch/app/__init__.py
Executable file
BIN
pricewatch/app/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
0
pricewatch/app/cli/__init__.py
Executable file
0
pricewatch/app/cli/__init__.py
Executable file
BIN
pricewatch/app/cli/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/cli/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/cli/__pycache__/main.cpython-313.pyc
Executable file
BIN
pricewatch/app/cli/__pycache__/main.cpython-313.pyc
Executable file
Binary file not shown.
363
pricewatch/app/cli/main.py
Executable file
363
pricewatch/app/cli/main.py
Executable file
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
CLI PriceWatch - Interface en ligne de commande.
|
||||
|
||||
Commandes disponibles:
|
||||
- run: Pipeline complet YAML → JSON
|
||||
- detect: Détection du store depuis une URL
|
||||
- fetch: Récupération d'une page (HTTP ou Playwright)
|
||||
- parse: Parsing d'un fichier HTML
|
||||
- doctor: Vérification de l'installation
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from rich import print as rprint
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from pricewatch.app.core import logging as app_logging
|
||||
from pricewatch.app.core.io import read_yaml_config, write_json_results
|
||||
from pricewatch.app.core.logging import get_logger, set_level
|
||||
from pricewatch.app.core.registry import get_registry, register_store
|
||||
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod
|
||||
from pricewatch.app.scraping.http_fetch import fetch_http
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
from pricewatch.app.stores.amazon.store import AmazonStore
|
||||
from pricewatch.app.stores.cdiscount.store import CdiscountStore
|
||||
|
||||
# Create the Typer application (the CLI entry point exposed as `pricewatch`).
app = typer.Typer(
    name="pricewatch",
    help="Application de suivi de prix e-commerce",
    add_completion=False,
)

# Shared Rich console for table output, and the module-level CLI logger.
console = Console()
logger = get_logger("cli")
|
||||
|
||||
|
||||
def setup_stores():
    """Register every available store implementation in the global registry."""
    # Registration order matters only for tie-breaking in detect_store().
    for store in (AmazonStore(), CdiscountStore()):
        get_registry().register(store)
|
||||
|
||||
|
||||
@app.command()
def run(
    yaml: Path = typer.Option(
        "scrap_url.yaml",
        "--yaml",
        "-y",
        help="Fichier YAML de configuration",
        exists=True,
    ),
    out: Path = typer.Option(
        "scraped_store.json",
        "--out",
        "-o",
        help="Fichier JSON de sortie",
    ),
    debug: bool = typer.Option(
        False,
        "--debug",
        "-d",
        help="Activer le mode debug",
    ),
):
    """
    Pipeline complet: scrape toutes les URLs du YAML et génère le JSON.
    """
    # Note: this docstring is the typer `--help` text and is kept verbatim.
    # Flow: register stores -> read YAML config -> for each URL detect the
    # store, fetch (HTTP first, Playwright fallback), parse -> write JSON.
    if debug:
        set_level("DEBUG")

    logger.info("=== Démarrage du pipeline PriceWatch ===")

    # Initialise the stores (idempotent: re-registering replaces by id)
    setup_stores()
    registry = get_registry()
    logger.info(f"Stores enregistrés: {', '.join(registry.list_stores())}")

    # Read the configuration; abort the whole run on any config error
    try:
        config = read_yaml_config(yaml)
    except Exception as e:
        logger.error(f"Erreur lecture YAML: {e}")
        raise typer.Exit(code=1)

    logger.info(f"{len(config.urls)} URL(s) à scraper")

    # Scrape each URL; failures on one URL never abort the others
    snapshots = []
    for i, url in enumerate(config.urls, 1):
        logger.info(f"[{i}/{len(config.urls)}] Traitement: {url}")

        # Detect the store; an unmatched URL is skipped (no snapshot emitted)
        store = registry.detect_store(url)
        if not store:
            logger.error(f"Aucun store trouvé pour: {url}")
            continue

        # Canonicalize the URL (strip tracking params etc.)
        canonical_url = store.canonicalize(url)
        logger.info(f"URL canonique: {canonical_url}")

        # Fetch the page
        html = None
        fetch_method = FetchMethod.HTTP
        fetch_error = None

        # Try plain HTTP first (cheaper than a browser)
        logger.info("Tentative HTTP...")
        http_result = fetch_http(canonical_url)

        if http_result.success:
            html = http_result.html
            fetch_method = FetchMethod.HTTP
            logger.info("✓ HTTP réussi")
        elif config.options.use_playwright:
            # Playwright fallback (only when enabled in the YAML options)
            logger.warning(f"HTTP échoué: {http_result.error}, fallback Playwright")
            pw_result = fetch_playwright(
                canonical_url,
                headless=not config.options.headful,
                timeout_ms=config.options.timeout_ms,
                save_screenshot=config.options.save_screenshot,
            )

            if pw_result.success:
                html = pw_result.html
                fetch_method = FetchMethod.PLAYWRIGHT
                logger.info("✓ Playwright réussi")

                # Save the screenshot if requested (import deferred to avoid
                # paying the cost when the option is off)
                if config.options.save_screenshot and pw_result.screenshot:
                    from pricewatch.app.core.io import save_debug_screenshot

                    ref = store.extract_reference(canonical_url) or f"url_{i}"
                    save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")
            else:
                fetch_error = pw_result.error
                logger.error(f"✗ Playwright échoué: {fetch_error}")
        else:
            fetch_error = http_result.error
            logger.error(f"✗ HTTP échoué: {fetch_error}")

        # Parse when we got HTML; otherwise emit a FAILED snapshot below
        if html:
            try:
                # Save the raw HTML if requested (deferred import, same as above)
                if config.options.save_html:
                    from pricewatch.app.core.io import save_debug_html

                    ref = store.extract_reference(canonical_url) or f"url_{i}"
                    save_debug_html(html, f"{store.store_id}_{ref}")

                snapshot = store.parse(html, canonical_url)
                # Record which fetch method actually produced the HTML
                snapshot.debug.method = fetch_method
                snapshots.append(snapshot)

                status_emoji = "✓" if snapshot.is_complete() else "⚠"
                logger.info(
                    f"{status_emoji} Parsing: title={bool(snapshot.title)}, "
                    f"price={snapshot.price is not None}"
                )

            except Exception as e:
                logger.error(f"✗ Erreur parsing: {e}")
                # Build a FAILED snapshot so the output still records this URL
                from pricewatch.app.core.schema import ProductSnapshot

                snapshot = ProductSnapshot(
                    source=store.store_id,
                    url=canonical_url,
                    debug=DebugInfo(
                        method=fetch_method,
                        status=DebugStatus.FAILED,
                        errors=[f"Parsing failed: {str(e)}"],
                    ),
                )
                snapshots.append(snapshot)
        else:
            # No HTML fetched: record a FAILED snapshot with the fetch error
            from pricewatch.app.core.schema import ProductSnapshot

            snapshot = ProductSnapshot(
                source=store.store_id if store else "unknown",
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                ),
            )
            snapshots.append(snapshot)

    # Write the results; exit code 1 on write failure
    logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {out}")
    try:
        write_json_results(snapshots, out)
        logger.info("✓ Pipeline terminé avec succès")
    except Exception as e:
        logger.error(f"✗ Erreur écriture JSON: {e}")
        raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
def detect(url: str):
    """
    Détecte le store correspondant à une URL.
    """
    # Guard-clause style: fail fast when nothing matches, then report details.
    logger.info(f"Détection du store pour: {url}")

    setup_stores()
    matched = get_registry().detect_store(url)

    if matched is None:
        rprint("[red]✗ Aucun store trouvé[/red]")
        raise typer.Exit(code=1)

    rprint(f"[green]✓ Store détecté: {matched.store_id}[/green]")
    rprint(f" URL canonique: {matched.canonicalize(url)}")
    rprint(f" Référence: {matched.extract_reference(url)}")
|
||||
|
||||
|
||||
@app.command()
def fetch(
    url: str,
    http: bool = typer.Option(False, "--http", help="Forcer HTTP"),
    playwright: bool = typer.Option(False, "--playwright", help="Forcer Playwright"),
    headful: bool = typer.Option(False, "--headful", help="Mode Playwright visible"),
    debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
    """
    Récupère une page via HTTP ou Playwright.
    """
    if debug:
        set_level("DEBUG")

    # The two force-flags are mutually exclusive.
    if http and playwright:
        rprint("[red]✗ Impossible de spécifier --http et --playwright ensemble[/red]")
        raise typer.Exit(code=1)

    # HTTP only when explicitly forced; Playwright is the default otherwise.
    use_http = http
    if use_http:
        logger.info(f"Récupération via HTTP: {url}")
        result = fetch_http(url)
    else:
        logger.info(f"Récupération via Playwright: {url}")
        result = fetch_playwright(url, headless=not headful)

    if not result.success:
        rprint(f"[red]✗ Échec: {result.error}[/red]")
        raise typer.Exit(code=1)

    # Shared success report; the HTTP path additionally shows the status code.
    rprint(f"[green]✓ Succès[/green]")
    rprint(f" Taille HTML: {len(result.html)} chars")
    if use_http:
        rprint(f" Status: {result.status_code}")
    rprint(f" Durée: {result.duration_ms}ms")
|
||||
|
||||
|
||||
@app.command()
def parse(
    store: str = typer.Argument(..., help="Store ID (amazon, cdiscount)"),
    html_file: Path = typer.Option(
        ..., "--in", "-i", help="Fichier HTML à parser", exists=True
    ),
    debug: bool = typer.Option(False, "--debug", "-d", help="Mode debug"),
):
    """
    Parse un fichier HTML avec un store spécifique.
    """
    if debug:
        set_level("DEBUG")

    setup_stores()
    registry = get_registry()

    # Resolve the requested store; on failure list the known IDs and exit.
    handler = registry.get_store(store)
    if handler is None:
        rprint(f"[red]✗ Store inconnu: {store}[/red]")
        rprint(f"Stores disponibles: {', '.join(registry.list_stores())}")
        raise typer.Exit(code=1)

    logger.info(f"Parsing avec {store}: {html_file}")

    html = html_file.read_text(encoding="utf-8")

    try:
        snapshot = handler.parse(html, url="file://local")

        # Summary banner, then one line per extracted field.
        if snapshot.is_complete():
            rprint("[green]✓ Parsing réussi[/green]")
        else:
            rprint("[yellow]⚠ Parsing partiel[/yellow]")

        rprint(f" Titre: {snapshot.title or 'N/A'}")
        rprint(f" Prix: {snapshot.price} {snapshot.currency}")
        rprint(f" Référence: {snapshot.reference or 'N/A'}")
        rprint(f" Stock: {snapshot.stock_status}")
        rprint(f" Images: {len(snapshot.images)}")
        rprint(f" Specs: {len(snapshot.specs)}")

    except Exception as e:
        rprint(f"[red]✗ Erreur parsing: {e}[/red]")
        raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command()
def doctor():
    """
    Vérifie l'installation de PriceWatch.
    """
    report = Table(title="PriceWatch Doctor")
    report.add_column("Composant", style="cyan")
    report.add_column("Statut", style="green")

    # Interpreter version.
    report.add_row("Python", f"{sys.version.split()[0]} ✓")

    # Third-party dependencies: (display name, importable module name).
    for name, module in (
        ("typer", "typer"),
        ("pydantic", "pydantic"),
        ("requests", "requests"),
        ("playwright", "playwright"),
        ("beautifulsoup4", "bs4"),
        ("pyyaml", "yaml"),
    ):
        try:
            __import__(module)
        except ImportError:
            report.add_row(name, "✗ Manquant")
        else:
            report.add_row(name, "✓ Installé")

    # Registered stores.
    setup_stores()
    registry = get_registry()
    report.add_row("Stores", f"{len(registry)} enregistrés: {', '.join(registry.list_stores())}")

    console.print(report)

    rprint("\n[green]✓ PriceWatch est prêt![/green]")
|
||||
|
||||
|
||||
# Allow running the CLI directly with `python main.py` (the installed
# entry point calls app() as well).
if __name__ == "__main__":
    app()
|
||||
0
pricewatch/app/core/__init__.py
Executable file
0
pricewatch/app/core/__init__.py
Executable file
BIN
pricewatch/app/core/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/core/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/core/__pycache__/io.cpython-313.pyc
Executable file
BIN
pricewatch/app/core/__pycache__/io.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/core/__pycache__/logging.cpython-313.pyc
Executable file
BIN
pricewatch/app/core/__pycache__/logging.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/core/__pycache__/registry.cpython-313.pyc
Executable file
BIN
pricewatch/app/core/__pycache__/registry.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/core/__pycache__/schema.cpython-313.pyc
Executable file
BIN
pricewatch/app/core/__pycache__/schema.cpython-313.pyc
Executable file
Binary file not shown.
234
pricewatch/app/core/io.py
Executable file
234
pricewatch/app/core/io.py
Executable file
@@ -0,0 +1,234 @@
|
||||
"""
|
||||
Fonctions d'entrée/sortie pour PriceWatch.
|
||||
|
||||
Gère la lecture de la configuration YAML et l'écriture des résultats JSON.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
|
||||
# Module-level logger, child of the 'pricewatch' root logger.
logger = get_logger("core.io")
|
||||
|
||||
|
||||
class ScrapingOptions(BaseModel):
    """Scraping options parsed from the YAML configuration file.

    All fields have defaults, so an empty `options:` mapping (or none at
    all) yields a fully usable configuration.
    """

    # Fall back to Playwright when plain HTTP fails.
    use_playwright: bool = Field(
        default=True, description="Utiliser Playwright en fallback"
    )
    # Run the browser with a visible window (headful) instead of headless.
    headful: bool = Field(default=False, description="Mode headful (voir le navigateur)")
    # Persist the fetched HTML to disk for debugging.
    save_html: bool = Field(
        default=True, description="Sauvegarder HTML pour debug"
    )
    # Persist a page screenshot (Playwright only) for debugging.
    save_screenshot: bool = Field(
        default=True, description="Sauvegarder screenshot pour debug"
    )
    # Per-page timeout; Pydantic enforces a minimum of 1000 ms (ge=1000).
    timeout_ms: int = Field(
        default=60000, description="Timeout par page en millisecondes", ge=1000
    )
|
||||
|
||||
|
||||
class ScrapingConfig(BaseModel):
    """Complete scraping configuration loaded from the YAML file."""

    # URLs to scrape; cleaned and validated by validate_urls below.
    urls: list[str] = Field(description="Liste des URLs à scraper")
    options: ScrapingOptions = Field(
        default_factory=ScrapingOptions, description="Options de scraping"
    )

    @field_validator("urls")
    @classmethod
    def validate_urls(cls, v: list[str]) -> list[str]:
        """Validate and clean the URL list.

        Strips surrounding whitespace, drops empty entries, and raises
        ValueError when no usable URL remains.
        """
        if not v:
            raise ValueError("Au moins une URL doit être fournie")

        # Drop empty / whitespace-only entries and trim the rest.
        cleaned = [url.strip() for url in v if url and url.strip()]
        if not cleaned:
            raise ValueError("Aucune URL valide trouvée")

        return cleaned
|
||||
|
||||
|
||||
def read_yaml_config(yaml_path: str | Path) -> ScrapingConfig:
    """
    Read and validate the YAML configuration file.

    Args:
        yaml_path: Path to the YAML file

    Returns:
        The validated configuration

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the YAML is malformed or empty

    Technical rationale:
        - Pydantic validates the YAML structure, so malformed files fail
          with explicit, user-readable errors instead of downstream bugs.
    """
    yaml_path = Path(yaml_path)

    if not yaml_path.exists():
        logger.error(f"Fichier YAML introuvable: {yaml_path}")
        raise FileNotFoundError(f"Fichier YAML introuvable: {yaml_path}")

    logger.info(f"Lecture configuration: {yaml_path}")

    try:
        data = yaml.safe_load(yaml_path.read_text(encoding="utf-8"))

        if not data:
            raise ValueError("Fichier YAML vide")

        config = ScrapingConfig.model_validate(data)
        logger.info(
            f"Configuration chargée: {len(config.urls)} URL(s), "
            f"playwright={config.options.use_playwright}"
        )
        return config

    except yaml.YAMLError as e:
        # Syntax errors get wrapped so callers only deal with ValueError.
        logger.error(f"Erreur parsing YAML: {e}")
        raise ValueError(f"YAML invalide: {e}") from e
    except Exception as e:
        # Validation errors are logged but re-raised unchanged.
        logger.error(f"Erreur validation config: {e}")
        raise
|
||||
|
||||
|
||||
def write_json_results(
    snapshots: list[ProductSnapshot], json_path: str | Path, indent: int = 2
) -> None:
    """
    Write the scraping results to a JSON file.

    Args:
        snapshots: ProductSnapshot list to persist
        json_path: Output JSON file path
        indent: Indentation for readability (None = compact)

    Technical rationale:
        - Serialization goes through Pydantic to guarantee the structure
        - Pretty-printed by default (indent=2) to ease manual debugging
        - Parent directories are created automatically when missing
    """
    json_path = Path(json_path)

    # Make sure the destination directory exists.
    json_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Écriture de {len(snapshots)} snapshot(s) dans: {json_path}")

    try:
        # Pydantic-driven serialization (mode="json" yields JSON-safe types).
        payload = [snap.model_dump(mode="json") for snap in snapshots]

        with open(json_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=indent, ensure_ascii=False)

        logger.info(f"Résultats sauvegardés: {json_path} ({json_path.stat().st_size} bytes)")

    except Exception as e:
        logger.error(f"Erreur écriture JSON: {e}")
        raise
|
||||
|
||||
|
||||
def read_json_results(json_path: str | Path) -> list[ProductSnapshot]:
    """
    Load and validate a JSON results file.

    Args:
        json_path: Path to the JSON file

    Returns:
        Validated list of ProductSnapshot

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the JSON is invalid or is not a list
    """
    json_path = Path(json_path)

    if not json_path.exists():
        logger.error(f"Fichier JSON introuvable: {json_path}")
        raise FileNotFoundError(f"Fichier JSON introuvable: {json_path}")

    logger.info(f"Lecture résultats: {json_path}")

    try:
        raw = json.loads(json_path.read_text(encoding="utf-8"))

        if not isinstance(raw, list):
            raise ValueError("Le JSON doit contenir une liste")

        results = [ProductSnapshot.model_validate(entry) for entry in raw]
        logger.info(f"{len(results)} snapshot(s) chargé(s)")
        return results

    except json.JSONDecodeError as e:
        # Syntax errors get wrapped so callers only deal with ValueError.
        logger.error(f"Erreur parsing JSON: {e}")
        raise ValueError(f"JSON invalide: {e}") from e
    except Exception as e:
        # Validation errors are logged but re-raised unchanged.
        logger.error(f"Erreur validation snapshots: {e}")
        raise
|
||||
|
||||
|
||||
def save_debug_html(html: str, filename: str, output_dir: str | Path = "scraped") -> Path:
    """
    Save fetched HTML to disk for debugging.

    Args:
        html: HTML content
        filename: File name (without extension)
        output_dir: Output directory (created if missing)

    Returns:
        Path of the saved file
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # BUG FIX: the f-string previously had no placeholder, so `filename` was
    # ignored and every page overwrote the same file in output_dir.
    filepath = output_dir / f"{filename}.html"

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(html)

    logger.debug(f"HTML sauvegardé: {filepath} ({len(html)} chars)")
    return filepath
|
||||
|
||||
|
||||
def save_debug_screenshot(
    screenshot_bytes: bytes, filename: str, output_dir: str | Path = "scraped"
) -> Path:
    """
    Save a screenshot to disk for debugging.

    Args:
        screenshot_bytes: Binary screenshot data (PNG)
        filename: File name (without extension)
        output_dir: Output directory (created if missing)

    Returns:
        Path of the saved file
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # BUG FIX: the f-string previously had no placeholder, so `filename` was
    # ignored and every screenshot overwrote the same file in output_dir.
    filepath = output_dir / f"{filename}.png"

    with open(filepath, "wb") as f:
        f.write(screenshot_bytes)

    logger.debug(f"Screenshot sauvegardé: {filepath} ({len(screenshot_bytes)} bytes)")
    return filepath
|
||||
112
pricewatch/app/core/logging.py
Executable file
112
pricewatch/app/core/logging.py
Executable file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Configuration du système de logging pour PriceWatch.
|
||||
|
||||
Fournit un logger configuré avec formatage coloré et niveaux appropriés.
|
||||
Les logs incluent : timestamp, niveau, module, et message.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class ColoredFormatter(logging.Formatter):
    """Formatter that colorizes records for readability on a terminal."""

    # ANSI escape code per level name.
    COLORS = {
        "DEBUG": "\033[36m",  # cyan
        "INFO": "\033[32m",  # green
        "WARNING": "\033[33m",  # yellow
        "ERROR": "\033[31m",  # red
        "CRITICAL": "\033[35m",  # magenta
    }
    RESET = "\033[0m"
    BOLD = "\033[1m"

    def format(self, record: logging.LogRecord) -> str:
        """Render the record with a bold colored level and a dimmed logger name.

        Note: the record's levelname/name attributes are rewritten in place,
        so this formatter should own the records it formats.
        """
        color = self.COLORS.get(record.levelname, self.RESET)
        record.levelname = f"{color}{self.BOLD}{record.levelname}{self.RESET}"
        record.name = f"\033[90m{record.name}{self.RESET}"
        return super().format(record)
|
||||
|
||||
|
||||
def setup_logging(level: str = "INFO", enable_colors: bool = True) -> logging.Logger:
    """
    Configure the 'pricewatch' root logger (idempotent).

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        enable_colors: Colorize output (disable for file logs / non-TTY)

    Returns:
        The configured logger

    Technical rationale:
        - A single stdout handler avoids duplicated log lines
        - Detailed format with ISO8601-style timestamps eases debugging
        - Colorization is optional and only applied on a real TTY
    """
    root = logging.getLogger("pricewatch")

    # Already configured: keep the existing handler/level untouched.
    if root.handlers:
        return root

    root.setLevel(getattr(logging, level.upper(), logging.INFO))
    root.propagate = False

    # Single console handler on stdout.
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(root.level)

    fmt = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
    datefmt = "%Y-%m-%d %H:%M:%S"

    # Colors only when requested AND stdout is an interactive terminal.
    use_colors = enable_colors and sys.stdout.isatty()
    formatter_cls = ColoredFormatter if use_colors else logging.Formatter
    handler.setFormatter(formatter_cls(fmt, datefmt=datefmt))

    root.addHandler(handler)
    return root
|
||||
|
||||
|
||||
def get_logger(name: Optional[str] = None) -> logging.Logger:
    """
    Return a child logger of the 'pricewatch' root logger.

    Args:
        name: Sub-module suffix (e.g. 'scraping.http'); None for the root

    Returns:
        The requested logger
    """
    full_name = f"pricewatch.{name}" if name else "pricewatch"
    return logging.getLogger(full_name)
|
||||
|
||||
|
||||
def set_level(level: str) -> None:
    """
    Dynamically change the log level of the 'pricewatch' logger and of
    every handler attached to it.

    Args:
        level: New level name (DEBUG, INFO, WARNING, ERROR, CRITICAL);
            unknown names fall back to INFO
    """
    target = getattr(logging, level.upper(), logging.INFO)
    root = logging.getLogger("pricewatch")
    root.setLevel(target)
    for handler in root.handlers:
        handler.setLevel(target)
|
||||
|
||||
|
||||
# Default initialisation on first import: guarantees the 'pricewatch'
# logger has a handler even if the application never calls setup_logging().
_default_logger = setup_logging()
|
||||
191
pricewatch/app/core/registry.py
Executable file
191
pricewatch/app/core/registry.py
Executable file
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
Registry pour la détection automatique des stores.
|
||||
|
||||
Le Registry maintient une liste de tous les stores disponibles et
|
||||
peut détecter automatiquement quel store correspond à une URL donnée.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
|
||||
logger = get_logger("core.registry")
|
||||
|
||||
|
||||
class StoreRegistry:
    """
    Central registry for all stores.

    Allows registering stores and automatically detecting which store
    handles a given URL via each store's match() method.

    Technical rationale:
    - Registry pattern decouples store detection from business code
    - Extensible: adding a new store = just register() a new object
    - No hardcoded dependencies between modules
    """

    def __init__(self):
        """Initialise an empty registry."""
        self._stores: list[BaseStore] = []
        logger.debug("Registry initialisé")

    def register(self, store: BaseStore) -> None:
        """
        Register a new store in the registry.

        A store with the same store_id as an existing one replaces it
        (with a warning), so re-registering is safe.

        Args:
            store: BaseStore instance to register

        Raises:
            TypeError: If ``store`` is not a BaseStore instance.
        """
        if not isinstance(store, BaseStore):
            raise TypeError(f"Expected BaseStore, got {type(store)}")

        # Avoid duplicates: drop any previously registered store with the same id.
        if any(s.store_id == store.store_id for s in self._stores):
            logger.warning(f"Store '{store.store_id}' déjà enregistré, remplacement")
            self._stores = [s for s in self._stores if s.store_id != store.store_id]

        self._stores.append(store)
        logger.info(f"Store enregistré: {store.store_id}")

    def unregister(self, store_id: str) -> bool:
        """
        Remove a store from the registry.

        Args:
            store_id: ID of the store to remove

        Returns:
            True if the store was removed, False if it was not present
        """
        initial_count = len(self._stores)
        self._stores = [s for s in self._stores if s.store_id != store_id]
        removed = len(self._stores) < initial_count

        if removed:
            logger.info(f"Store désenregistré: {store_id}")
        else:
            logger.warning(f"Store non trouvé pour désenregistrement: {store_id}")

        return removed

    def get_store(self, store_id: str) -> Optional[BaseStore]:
        """
        Look up a store by its ID.

        Args:
            store_id: ID of the store to fetch

        Returns:
            The store instance, or None if not found
        """
        for store in self._stores:
            if store.store_id == store_id:
                return store
        return None

    def detect_store(self, url: str) -> Optional[BaseStore]:
        """
        Automatically detect which store handles a given URL.

        Args:
            url: URL to analyse

        Returns:
            The store with the highest match score, or None if no store
            scores above 0. On a tie, the earliest-registered store wins
            (strict '>' comparison).

        Technical rationale:
        - Tries every registered store via its match() method
        - Returns the one with the highest score (> 0)
        - Allows resolving ambiguities (e.g. multiple sub-domains)
        """
        if not url or not url.strip():
            logger.warning("URL vide fournie pour détection")
            return None

        if not self._stores:
            logger.warning("Aucun store enregistré dans le registry")
            return None

        best_store: Optional[BaseStore] = None
        best_score = 0.0

        logger.debug(f"Détection du store pour: {url}")

        for store in self._stores:
            try:
                score = store.match(url)
                logger.debug(f" {store.store_id}: score={score:.2f}")

                if score > best_score:
                    best_score = score
                    best_store = store

            except Exception as e:
                # One broken store must not prevent the others from matching.
                logger.error(f"Erreur lors du match de {store.store_id}: {e}")
                continue

        if best_store:
            logger.info(
                f"Store détecté: {best_store.store_id} (score={best_score:.2f})"
            )
        else:
            logger.warning(f"Aucun store trouvé pour: {url}")

        return best_store

    def list_stores(self) -> list[str]:
        """
        List all registered stores.

        Returns:
            List of store IDs, in registration order
        """
        return [store.store_id for store in self._stores]

    def __len__(self) -> int:
        """Return the number of registered stores."""
        return len(self._stores)

    def __repr__(self) -> str:
        stores_list = ", ".join(self.list_stores())
        return f"<StoreRegistry stores=[{stores_list}]>"
|
||||
|
||||
|
||||
# Global registry instance (module-level singleton).
# Stores register themselves here when they are imported.
_global_registry = StoreRegistry()
|
||||
|
||||
|
||||
def get_registry() -> StoreRegistry:
    """
    Return the global registry instance.

    Returns:
        The registry singleton
    """
    return _global_registry
|
||||
|
||||
|
||||
def register_store(store: BaseStore) -> None:
    """
    Register a store in the global registry (module-level convenience).

    Args:
        store: BaseStore instance
    """
    _global_registry.register(store)
|
||||
|
||||
|
||||
def detect_store(url: str) -> Optional[BaseStore]:
    """
    Detect the store for a URL using the global registry
    (module-level convenience).

    Args:
        url: URL to analyse

    Returns:
        Detected store or None
    """
    return _global_registry.detect_store(url)
|
||||
197
pricewatch/app/core/schema.py
Executable file
197
pricewatch/app/core/schema.py
Executable file
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
Modèles de données Pydantic pour PriceWatch.
|
||||
|
||||
Ce module définit ProductSnapshot, le modèle canonique représentant
|
||||
toutes les informations récupérées lors du scraping d'un produit.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field, HttpUrl, field_validator
|
||||
|
||||
|
||||
class StockStatus(str, Enum):
    """Product availability status."""

    IN_STOCK = "in_stock"
    OUT_OF_STOCK = "out_of_stock"
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class FetchMethod(str, Enum):
    """Method used to fetch the page."""

    HTTP = "http"
    PLAYWRIGHT = "playwright"
|
||||
|
||||
|
||||
class DebugStatus(str, Enum):
    """Technical status of the fetch/parse attempt."""

    SUCCESS = "success"  # Fetch complete and parsing succeeded
    PARTIAL = "partial"  # Fetch OK but parsing incomplete
    FAILED = "failed"  # Complete failure (403, timeout, captcha, etc.)
|
||||
|
||||
|
||||
class DebugInfo(BaseModel):
    """Debug information used to trace scraping problems.

    Attached to every ProductSnapshot so failures (anti-bot, timeouts,
    partial parses) can be diagnosed after the fact.
    """

    # Fetch method actually used (http or playwright).
    method: FetchMethod = Field(
        description="Méthode utilisée pour la récupération (http ou playwright)"
    )
    status: DebugStatus = Field(description="Statut de la récupération")
    errors: list[str] = Field(
        default_factory=list, description="Liste des erreurs rencontrées"
    )
    notes: list[str] = Field(
        default_factory=list, description="Notes techniques sur la récupération"
    )
    duration_ms: Optional[int] = Field(
        default=None, description="Durée de la récupération en millisecondes"
    )
    html_size_bytes: Optional[int] = Field(
        default=None, description="Taille du HTML récupéré en octets"
    )

    class Config:
        # NOTE(review): pydantic-v1 style config while the surrounding code
        # uses the v2 API (field_validator, model_dump) — works but is
        # deprecated; consider migrating to model_config = ConfigDict(...).
        use_enum_values = True
|
||||
|
||||
|
||||
class ProductSnapshot(BaseModel):
    """
    Canonical model of a product as scraped at a point in time.

    Unifies the data of every store (Amazon, Cdiscount, etc.) into one
    common structure. Fields may be null when the information is not
    available on the source site.
    """

    # Metadata
    source: str = Field(
        description="Identifiant du store source (amazon, cdiscount, unknown)"
    )
    url: str = Field(description="URL canonique du produit")
    # NOTE(review): datetime.now() is timezone-naive, yet the schema example
    # below shows a "Z" (UTC) suffix — confirm whether aware UTC timestamps
    # were intended here.
    fetched_at: datetime = Field(
        default_factory=datetime.now,
        description="Date et heure de récupération (ISO 8601)",
    )

    # Main product data
    title: Optional[str] = Field(default=None, description="Nom du produit")
    price: Optional[float] = Field(default=None, description="Prix du produit", ge=0)
    currency: str = Field(default="EUR", description="Devise (EUR, USD, etc.)")
    shipping_cost: Optional[float] = Field(
        default=None, description="Frais de port", ge=0
    )
    stock_status: StockStatus = Field(
        default=StockStatus.UNKNOWN, description="Statut de disponibilité"
    )

    # Identifiers and categorisation
    reference: Optional[str] = Field(
        default=None, description="Référence produit (ASIN, SKU, etc.)"
    )
    category: Optional[str] = Field(default=None, description="Catégorie du produit")

    # Media
    images: list[str] = Field(
        default_factory=list, description="Liste des URLs d'images du produit"
    )

    # Technical specifications
    specs: dict[str, str] = Field(
        default_factory=dict,
        description="Caractéristiques techniques (clé/valeur)",
    )

    # Debug / traceability
    debug: DebugInfo = Field(
        description="Informations de debug pour traçabilité"
    )

    @field_validator("url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Reject empty URLs; strip surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError("URL cannot be empty")
        return v.strip()

    @field_validator("source")
    @classmethod
    def validate_source(cls, v: str) -> str:
        """Reject empty store names; normalise to lower case."""
        if not v or not v.strip():
            raise ValueError("Source cannot be empty")
        return v.strip().lower()

    @field_validator("images")
    @classmethod
    def validate_images(cls, v: list[str]) -> list[str]:
        """Drop empty image URLs and strip whitespace from the rest."""
        return [url.strip() for url in v if url and url.strip()]

    class Config:
        # NOTE(review): pydantic-v1 style config while the class otherwise
        # uses the v2 API — works but deprecated.
        use_enum_values = True
        json_schema_extra = {
            "example": {
                "source": "amazon",
                "url": "https://www.amazon.fr/dp/B08N5WRWNW",
                "fetched_at": "2026-01-13T10:30:00Z",
                "title": "Exemple de produit",
                "price": 299.99,
                "currency": "EUR",
                "shipping_cost": 0.0,
                "stock_status": "in_stock",
                "reference": "B08N5WRWNW",
                "category": "Electronics",
                "images": [
                    "https://example.com/image1.jpg",
                    "https://example.com/image2.jpg",
                ],
                "specs": {
                    "Marque": "ExampleBrand",
                    "Couleur": "Noir",
                    "Poids": "2.5 kg",
                },
                "debug": {
                    "method": "http",
                    "status": "success",
                    "errors": [],
                    "notes": ["Récupération réussie du premier coup"],
                    "duration_ms": 1250,
                    "html_size_bytes": 145000,
                },
            }
        }

    def to_dict(self) -> dict:
        """Serialize to a plain Python dict with JSON-compatible values."""
        return self.model_dump(mode="json")

    def to_json(self, **kwargs) -> str:
        """Serialize to a JSON string; kwargs are forwarded to model_dump_json."""
        return self.model_dump_json(**kwargs)

    @classmethod
    def from_json(cls, json_str: str) -> "ProductSnapshot":
        """Deserialize a snapshot from a JSON string."""
        return cls.model_validate_json(json_str)

    def is_complete(self) -> bool:
        """
        Check whether the snapshot holds the minimum essential data.

        Returns True when both title AND price are present.
        """
        return self.title is not None and self.price is not None

    def add_error(self, error: str) -> None:
        """Append an error message to the debug info."""
        self.debug.errors.append(error)

    def add_note(self, note: str) -> None:
        """Append a technical note to the debug info."""
        self.debug.notes.append(note)
|
||||
0
pricewatch/app/scraping/__init__.py
Executable file
0
pricewatch/app/scraping/__init__.py
Executable file
BIN
pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/scraping/__pycache__/http_fetch.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/http_fetch.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/scraping/__pycache__/pw_fetch.cpython-313.pyc
Executable file
BIN
pricewatch/app/scraping/__pycache__/pw_fetch.cpython-313.pyc
Executable file
Binary file not shown.
193
pricewatch/app/scraping/http_fetch.py
Executable file
193
pricewatch/app/scraping/http_fetch.py
Executable file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Récupération HTTP simple pour le scraping.
|
||||
|
||||
Utilise requests avec rotation de User-Agent et gestion des erreurs.
|
||||
Méthode prioritaire avant le fallback Playwright (plus lent).
|
||||
"""
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from requests.exceptions import RequestException, Timeout
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("scraping.http")
|
||||
|
||||
# Realistic desktop User-Agent strings; one is picked at random per request
# (see fetch_http) to avoid naive UA-based blocking.
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Firefox on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    # Edge on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
]
|
||||
|
||||
|
||||
class FetchResult:
    """Outcome of a plain-HTTP fetch.

    Attributes:
        success: True when the page was retrieved with status 200.
        html: Page body when successful, otherwise None.
        error: Human-readable error description on failure.
        status_code: HTTP status code when a response was received.
        duration_ms: Wall-clock duration of the request in milliseconds.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        error: Optional[str] = None,
        status_code: Optional[int] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.error = error
        self.status_code = status_code
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # html deliberately omitted: it can be hundreds of KB.
        return (
            f"{type(self).__name__}(success={self.success!r}, "
            f"error={self.error!r}, status_code={self.status_code!r}, "
            f"duration_ms={self.duration_ms!r})"
        )
|
||||
|
||||
|
||||
def fetch_http(
    url: str,
    timeout: int = 30,
    headers: Optional[dict] = None,
    follow_redirects: bool = True,
) -> FetchResult:
    """
    Fetch a page with a plain HTTP GET via requests.

    Args:
        url: URL to fetch.
        timeout: Timeout in seconds.
        headers: Extra HTTP headers merged over the defaults (optional).
        follow_redirects: Follow redirects automatically.

    Returns:
        FetchResult carrying the HTML on success, or an error description.

    Technical rationale:
        - Random User-Agent to dodge naive UA-based blocking
        - Explicit timeout so the call never hangs indefinitely
        - Known failure codes (403, 404, 429, 5xx) map to specific messages
        - Accept headers advertise that we expect HTML
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return FetchResult(success=False, error="URL vide")

    start_time = time.time()

    # Default browser-like headers.
    default_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # Caller-supplied headers win over the defaults.
    if headers:
        default_headers.update(headers)

    logger.info(f"[HTTP] Récupération: {url}")
    logger.debug(f"[HTTP] User-Agent: {default_headers['User-Agent'][:50]}...")

    try:
        response = requests.get(
            url,
            headers=default_headers,
            timeout=timeout,
            allow_redirects=follow_redirects,
        )

        duration_ms = int((time.time() - start_time) * 1000)

        if response.status_code == 200:
            html = response.text
            logger.info(
                f"[HTTP] Succès: {len(html)} chars, {duration_ms}ms, "
                f"status={response.status_code}"
            )
            return FetchResult(
                success=True,
                html=html,
                status_code=response.status_code,
                duration_ms=duration_ms,
            )

        # Map known failure codes to their diagnostic messages (single return
        # path instead of five duplicated branches; messages unchanged).
        if response.status_code == 403:
            error = "403 Forbidden - Anti-bot détecté"
        elif response.status_code == 404:
            error = "404 Not Found - Page introuvable"
        elif response.status_code == 429:
            error = "429 Too Many Requests - Rate limit atteint"
        elif response.status_code >= 500:
            error = f"{response.status_code} Server Error - Erreur serveur"
        else:
            error = f"HTTP {response.status_code} - Erreur inconnue"

        logger.warning(f"[HTTP] {error}")
        return FetchResult(
            success=False,
            error=error,
            status_code=response.status_code,
            duration_ms=duration_ms,
        )

    except Timeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout après {timeout}s"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except RequestException as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur réseau: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)

    except Exception as e:
        # Last-resort guard so a scraping run never crashes on one URL.
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur inattendue: {str(e)}"
        logger.error(f"[HTTP] {error}")
        return FetchResult(success=False, error=error, duration_ms=duration_ms)
|
||||
238
pricewatch/app/scraping/pw_fetch.py
Executable file
238
pricewatch/app/scraping/pw_fetch.py
Executable file
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Récupération avec Playwright (fallback anti-bot).
|
||||
|
||||
Utilisé quand HTTP échoue (403, captcha, etc.).
|
||||
Plus lent mais plus robuste contre les protections anti-scraping.
|
||||
"""
|
||||
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from playwright.sync_api import (
|
||||
Browser,
|
||||
Page,
|
||||
Playwright,
|
||||
sync_playwright,
|
||||
TimeoutError as PlaywrightTimeout,
|
||||
)
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
|
||||
logger = get_logger("scraping.playwright")
|
||||
|
||||
|
||||
class PlaywrightFetchResult:
    """Outcome of a Playwright fetch.

    Attributes:
        success: True when the page was retrieved and rendered.
        html: Rendered page content when successful, otherwise None.
        screenshot: Optional PNG bytes (also captured on some failures).
        error: Human-readable error description on failure.
        duration_ms: Wall-clock duration of the fetch in milliseconds.
    """

    def __init__(
        self,
        success: bool,
        html: Optional[str] = None,
        screenshot: Optional[bytes] = None,
        error: Optional[str] = None,
        duration_ms: Optional[int] = None,
    ):
        self.success = success
        self.html = html
        self.screenshot = screenshot
        self.error = error
        self.duration_ms = duration_ms

    def __repr__(self) -> str:
        # html and screenshot deliberately omitted: both can be very large.
        return (
            f"{type(self).__name__}(success={self.success!r}, "
            f"error={self.error!r}, duration_ms={self.duration_ms!r})"
        )
|
||||
|
||||
|
||||
def fetch_playwright(
    url: str,
    headless: bool = True,
    timeout_ms: int = 60000,
    save_screenshot: bool = False,
    wait_for_selector: Optional[str] = None,
) -> PlaywrightFetchResult:
    """
    Fetch a page with Playwright (real-browser rendering).

    Args:
        url: URL to fetch.
        headless: Headless (True) or visible (False) browser.
        timeout_ms: Timeout in milliseconds.
        save_screenshot: Capture a screenshot of the page.
        wait_for_selector: CSS selector to wait for before reading the DOM.

    Returns:
        PlaywrightFetchResult with HTML, optional screenshot, or an error.

    Technical rationale:
        - Playwright drives a real browser → defeats many anti-bot checks
        - Headless by default for performance; headful available for debugging
        - Optional screenshot to diagnose failures
        - wait_for_selector handles client-side (AJAX) rendering
    """
    if not url or not url.strip():
        logger.error("URL vide fournie")
        return PlaywrightFetchResult(success=False, error="URL vide")

    start_time = time.time()
    logger.info(f"[Playwright] Récupération: {url} (headless={headless})")

    playwright: Optional[Playwright] = None
    browser: Optional[Browser] = None
    page: Optional[Page] = None

    def _error_screenshot() -> Optional[bytes]:
        # Best-effort screenshot for failure diagnosis; never raises.
        if save_screenshot and page:
            try:
                return page.screenshot(full_page=False)
            except Exception:
                pass
        return None

    try:
        playwright = sync_playwright().start()

        # Launch the Chromium browser.
        browser = playwright.chromium.launch(headless=headless)

        # Browser context with a realistic User-Agent and viewport.
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1920, "height": 1080},
            locale="fr-FR",
        )

        page = context.new_page()
        page.set_default_timeout(timeout_ms)

        logger.debug(f"[Playwright] Navigation vers {url}")
        response = page.goto(url, wait_until="domcontentloaded")

        if not response:
            raise Exception("Pas de réponse du serveur")

        # Optionally wait for dynamic content; a timeout here is non-fatal.
        if wait_for_selector:
            logger.debug(f"[Playwright] Attente du sélecteur: {wait_for_selector}")
            try:
                page.wait_for_selector(wait_for_selector, timeout=timeout_ms)
            except PlaywrightTimeout:
                logger.warning(
                    f"[Playwright] Timeout en attendant le sélecteur: {wait_for_selector}"
                )

        html = page.content()

        screenshot = None
        if save_screenshot:
            logger.debug("[Playwright] Capture du screenshot")
            screenshot = page.screenshot(full_page=False)

        duration_ms = int((time.time() - start_time) * 1000)

        logger.info(
            f"[Playwright] Succès: {len(html)} chars, {duration_ms}ms, "
            f"status={response.status}"
        )

        return PlaywrightFetchResult(
            success=True,
            html=html,
            screenshot=screenshot,
            duration_ms=duration_ms,
        )

    except PlaywrightTimeout:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Timeout après {timeout_ms}ms"
        logger.error(f"[Playwright] {error}")
        return PlaywrightFetchResult(
            success=False,
            error=error,
            screenshot=_error_screenshot(),
            duration_ms=duration_ms,
        )

    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        error = f"Erreur Playwright: {str(e)}"
        logger.error(f"[Playwright] {error}")
        return PlaywrightFetchResult(
            success=False,
            error=error,
            screenshot=_error_screenshot(),
            duration_ms=duration_ms,
        )

    finally:
        # Cleanup: closing the browser also closes its contexts/pages, but we
        # close explicitly and tolerate errors so teardown never masks results.
        try:
            if page:
                page.close()
            if browser:
                browser.close()
            if playwright:
                playwright.stop()
        except Exception as e:
            logger.warning(f"[Playwright] Erreur lors du nettoyage: {e}")
|
||||
|
||||
|
||||
def fetch_with_fallback(
    url: str,
    try_http_first: bool = True,
    playwright_options: Optional[dict] = None,
) -> PlaywrightFetchResult:
    """
    Fetch *url*, preferring cheap HTTP with a Playwright fallback.

    Args:
        url: URL to fetch.
        try_http_first: Attempt plain HTTP before launching a browser.
        playwright_options: Keyword arguments forwarded to fetch_playwright.

    Returns:
        PlaywrightFetchResult (also used to wrap a successful HTTP fetch).

    Technical rationale:
        - HTTP first because it is much faster (~1s vs ~10s)
        - Playwright only when HTTP fails (403, timeout, etc.)
        - Saves browser resources whenever HTTP is enough
    """
    # Local import avoids a circular dependency at module load time.
    from pricewatch.app.scraping.http_fetch import fetch_http

    options = playwright_options or {}

    if try_http_first:
        logger.info(f"[Fallback] Tentative HTTP d'abord: {url}")
        http_result = fetch_http(url)

        if http_result.success:
            logger.info("[Fallback] HTTP a réussi, pas besoin de Playwright")
            return PlaywrightFetchResult(
                success=True,
                html=http_result.html,
                duration_ms=http_result.duration_ms,
            )

        logger.warning(
            f"[Fallback] HTTP échoué ({http_result.error}), "
            "fallback vers Playwright"
        )

    # Playwright as fallback, or as the primary method.
    return fetch_playwright(url, **options)
|
||||
0
pricewatch/app/stores/__init__.py
Executable file
0
pricewatch/app/stores/__init__.py
Executable file
BIN
pricewatch/app/stores/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/stores/__pycache__/base.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/__pycache__/base.cpython-313.pyc
Executable file
Binary file not shown.
5
pricewatch/app/stores/aliexpress/__init__.py
Executable file
5
pricewatch/app/stores/aliexpress/__init__.py
Executable file
@@ -0,0 +1,5 @@
|
||||
"""Store AliExpress."""
|
||||
|
||||
from pricewatch.app.stores.aliexpress.store import AliexpressStore
|
||||
|
||||
__all__ = ["AliexpressStore"]
|
||||
BIN
pricewatch/app/stores/aliexpress/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/aliexpress/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/stores/aliexpress/__pycache__/store.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/aliexpress/__pycache__/store.cpython-313.pyc
Executable file
Binary file not shown.
163
pricewatch/app/stores/aliexpress/fixtures/README.md
Executable file
163
pricewatch/app/stores/aliexpress/fixtures/README.md
Executable file
@@ -0,0 +1,163 @@
|
||||
# Fixtures AliExpress
|
||||
|
||||
Ce dossier contient des fichiers HTML réels capturés depuis AliExpress pour les tests.
|
||||
|
||||
## ⚠️ Note importante sur AliExpress
|
||||
|
||||
AliExpress utilise un **rendu client-side (SPA React/Vue)**:
|
||||
- HTTP simple retourne **HTML minimal** (75KB sans contenu)
|
||||
- **Playwright est OBLIGATOIRE** avec attente (~3s)
|
||||
- Attendre le sélecteur `.product-title` pour obtenir les données
|
||||
- Données chargées via **AJAX** après le render initial
|
||||
|
||||
## Spécificité AliExpress
|
||||
|
||||
AliExpress est un **marketplace chinois** avec des particularités:
|
||||
- **Pas de JSON-LD** schema.org
|
||||
- **Prix**: Extrait par **regex** (aucun sélecteur CSS stable)
|
||||
- **Images**: Extraites depuis `window._d_c_.DCData.imagePathList` (JSON embarqué)
|
||||
- **Classes CSS**: Générées aléatoirement (hachées) → **TRÈS instables**
|
||||
- **SKU**: ID numérique long (13–16 chiffres, ex. 1005007187023722) depuis l'URL
|
||||
|
||||
## Fichiers
|
||||
|
||||
### aliexpress_1005007187023722.html
|
||||
- **Produit**: Samsung serveur DDR4 mémoire Ram ECC
|
||||
- **SKU**: 1005007187023722
|
||||
- **URL**: https://fr.aliexpress.com/item/1005007187023722.html
|
||||
- **Taille**: 378 KB (rendu complet)
|
||||
- **Date capture**: 2026-01-13
|
||||
- **Méthode**: Playwright avec wait_for_selector='.product-title'
|
||||
- **Prix capturé**: 136,69 EUR
|
||||
- **Usage**: Test complet parsing produit électronique
|
||||
|
||||
## Structure HTML AliExpress
|
||||
|
||||
### JSON-LD Schema.org ✗
|
||||
AliExpress **n'utilise PAS** JSON-LD (contrairement à Backmarket).
|
||||
|
||||
### Données embarquées ✓
|
||||
AliExpress embarque les données dans des variables JavaScript:
|
||||
|
||||
```javascript
|
||||
window._d_c_.DCData = {
|
||||
"imagePathList": ["https://ae01.alicdn.com/kf/..."],
|
||||
"summImagePathList": ["https://ae01.alicdn.com/kf/..."],
|
||||
"i18nMap": {...},
|
||||
"extParams": {...}
|
||||
}
|
||||
```
|
||||
|
||||
### Sélecteurs identifiés
|
||||
|
||||
#### Titre
|
||||
```css
|
||||
h1 /* Apparaît après AJAX */
|
||||
meta[property="og:title"] /* Fallback dans meta tags */
|
||||
```
|
||||
Le h1 n'existe PAS dans le HTML initial, il est ajouté dynamiquement.
|
||||
|
||||
#### Prix
|
||||
⚠️ **AUCUN SÉLECTEUR CSS STABLE** - Utiliser regex:
|
||||
```regex
|
||||
([0-9]+[.,][0-9]{2})\s*€ /* Prix avant € */
|
||||
€\s*([0-9]+[.,][0-9]{2}) /* € avant prix */
|
||||
```
|
||||
|
||||
#### Images
|
||||
Priorité: **window._d_c_.DCData.imagePathList**
|
||||
Fallback: `meta[property="og:image"]`
|
||||
|
||||
URLs CDN: `https://ae01.alicdn.com/kf/...`
|
||||
|
||||
#### SKU
|
||||
Extraction depuis l'URL:
|
||||
```regex
|
||||
/item/(\d+)\.html
|
||||
```
|
||||
Exemple: `/item/1005007187023722.html` → SKU = "1005007187023722"
|
||||
|
||||
#### Stock
|
||||
Chercher bouton "Add to cart" / "Ajouter au panier"
|
||||
```css
|
||||
button[class*='add-to-cart']
|
||||
```
|
||||
|
||||
## Comparaison avec autres stores
|
||||
|
||||
| Aspect | Amazon | Cdiscount | Backmarket | **AliExpress** |
|
||||
|--------|--------|-----------|------------|----------------|
|
||||
| **Anti-bot** | Faible | Fort | Fort | Moyen |
|
||||
| **Méthode** | HTTP OK | Playwright | Playwright | **Playwright** |
|
||||
| **JSON-LD** | Partiel | ✗ Non | ✓ Oui | **✗ Non** |
|
||||
| **Sélecteurs** | Stables (IDs) | Instables | Stables | **Très instables** |
|
||||
| **SKU format** | `/dp/{ASIN}` | `/f-{cat}-{SKU}` | `/p/{slug}` | **/item/{ID}.html** |
|
||||
| **Prix extraction** | CSS | CSS/Regex | JSON-LD | **Regex uniquement** |
|
||||
| **Rendu** | Server-side | Server-side | Server-side | **Client-side (SPA)** |
|
||||
| **Particularité** | - | Prix dynamiques | Reconditionné | **SPA React/Vue** |
|
||||
|
||||
## Utilisation dans les tests
|
||||
|
||||
```python
|
||||
@pytest.fixture
|
||||
def aliexpress_fixture_samsung():
|
||||
fixture_path = Path(__file__).parent.parent.parent / \
|
||||
"pricewatch/app/stores/aliexpress/fixtures/aliexpress_1005007187023722.html"
|
||||
with open(fixture_path, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
def test_parse_real_fixture(store, aliexpress_fixture_samsung):
|
||||
url = "https://fr.aliexpress.com/item/1005007187023722.html"
|
||||
snapshot = store.parse(aliexpress_fixture_samsung, url)
|
||||
|
||||
assert snapshot.title.startswith("Samsung serveur DDR4")
|
||||
assert snapshot.price == 136.69
|
||||
assert snapshot.reference == "1005007187023722"
|
||||
assert snapshot.currency == "EUR"
|
||||
assert len(snapshot.images) >= 6
|
||||
```
|
||||
|
||||
## Points d'attention pour les tests
|
||||
|
||||
1. **HTML volumineux** - 378KB pour une page (SPA chargée)
|
||||
2. **Prix instable** - Peut changer selon promo/devise
|
||||
3. **Ne pas tester le prix exact** - Tester le format et la présence
|
||||
4. **Images multiples** - Toujours 6+ images par produit
|
||||
5. **Titre long** - Souvent 100-150 caractères
|
||||
6. **Stock variable** - Peut changer rapidement
|
||||
|
||||
## Comment capturer une nouvelle fixture
|
||||
|
||||
```python
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
|
||||
url = "https://fr.aliexpress.com/item/..."
|
||||
result = fetch_playwright(
|
||||
url,
|
||||
headless=True,
|
||||
timeout_ms=15000,
|
||||
wait_for_selector=".product-title" # IMPORTANT!
|
||||
)
|
||||
|
||||
if result.success:
|
||||
with open("fixture.html", "w", encoding="utf-8") as f:
|
||||
f.write(result.html)
|
||||
```
|
||||
|
||||
⚠️ **N'utilisez PAS** `fetch_http()` - il retourne un HTML minimal (75KB)!
|
||||
⚠️ **Utilisez TOUJOURS** `wait_for_selector=".product-title"` avec Playwright!
|
||||
|
||||
## Avantages de AliExpress
|
||||
|
||||
✓ **HTTP fonctionne** → Pas d'anti-bot fort (mais HTML vide)
|
||||
✓ **Données embarquées** → DCData JSON avec images
|
||||
✓ **SKU simple** → ID numérique depuis URL
|
||||
|
||||
## Inconvénients
|
||||
|
||||
✗ **SPA client-side** → Playwright obligatoire avec wait (~3-5s)
|
||||
✗ **Pas de JSON-LD** → Extraction moins fiable
|
||||
✗ **Prix par regex** → Fragile, peut casser
|
||||
✗ **Classes CSS instables** → Générées aléatoirement (hachées)
|
||||
✗ **Temps de chargement** → 3-5s avec Playwright + wait
|
||||
✗ **Specs mal structurées** → Souvent dans des onglets/modals
|
||||
863
pricewatch/app/stores/aliexpress/fixtures/aliexpress_1005007187023722.html
Executable file
863
pricewatch/app/stores/aliexpress/fixtures/aliexpress_1005007187023722.html
Executable file
File diff suppressed because one or more lines are too long
79
pricewatch/app/stores/aliexpress/selectors.yml
Executable file
79
pricewatch/app/stores/aliexpress/selectors.yml
Executable file
@@ -0,0 +1,79 @@
|
||||
# Sélecteurs CSS/XPath pour AliExpress.com
|
||||
# Mis à jour le 2026-01-13 après analyse du HTML réel
|
||||
|
||||
# ⚠️ IMPORTANT: AliExpress utilise un rendu client-side (SPA React/Vue)
|
||||
# - HTTP fonctionne mais retourne un HTML minimal (75KB)
|
||||
# - Playwright OBLIGATOIRE pour obtenir le contenu rendu
|
||||
# - Attendre le sélecteur '.product-title' ou ajouter un délai (~3s)
|
||||
# - Les données sont chargées dynamiquement via AJAX
|
||||
|
||||
# ⚠️ Extraction prioritaire:
|
||||
# 1. Titre: h1 ou meta[property="og:title"]
|
||||
# 2. Prix: Regex dans le HTML (aucun sélecteur stable)
|
||||
# 3. Images: window._d_c_.DCData.imagePathList (JSON embarqué)
|
||||
# 4. SKU: Depuis l'URL /item/{ID}.html
|
||||
|
||||
# Titre du produit
|
||||
# Le h1 apparaît après chargement AJAX
|
||||
title:
|
||||
- "h1"
|
||||
- "meta[property='og:title']" # Fallback dans meta tags
|
||||
|
||||
# Prix principal
|
||||
# ⚠️ AUCUN SÉLECTEUR STABLE - Utiliser regex sur le HTML
|
||||
# Pattern: ([0-9]+[.,][0-9]{2})\s*€ ou €\s*([0-9]+[.,][0-9]{2})
|
||||
price:
|
||||
- "span[class*='price']"
|
||||
- "div[class*='price']"
|
||||
- "span.product-price"
|
||||
# Ces sélecteurs ne fonctionnent PAS - prix extrait par regex
|
||||
|
||||
# Devise
|
||||
# Toujours EUR pour fr.aliexpress.com
|
||||
currency:
|
||||
- "meta[property='og:price:currency']"
|
||||
# Fallback: détecter depuis l'URL (fr = EUR)
|
||||
|
||||
# Images produit
|
||||
# ⚠️ Les images sont dans window._d_c_.DCData.imagePathList
|
||||
# Format: https://ae01.alicdn.com/kf/{hash}.jpg
|
||||
images:
|
||||
- "img[alt]"
|
||||
# Extraction depuis DCData JSON plus fiable
|
||||
|
||||
# Catégorie / breadcrumb
|
||||
category:
|
||||
- "nav[aria-label='breadcrumb'] a"
|
||||
- ".breadcrumb a"
|
||||
|
||||
# Caractéristiques techniques
|
||||
# Peuvent être dans des onglets ou sections dépliables
|
||||
specs_table:
|
||||
- "div[class*='specification']"
|
||||
- "div[class*='properties']"
|
||||
- "dl"
|
||||
|
||||
# SKU / référence produit
|
||||
# Extraction depuis l'URL plus fiable
|
||||
# URL pattern: /item/{ID}.html
|
||||
# SKU = ID numérique long (13–16 chiffres, ex. 1005007187023722)
|
||||
sku:
|
||||
- "meta[property='product:retailer_item_id']"
|
||||
- "span[data-spm-anchor-id]"
|
||||
|
||||
# Stock / Disponibilité
|
||||
stock_status:
|
||||
- "button[class*='add-to-cart']"
|
||||
- "button[class*='addtocart']"
|
||||
- "div[class*='availability']"
|
||||
|
||||
# Notes importantes:
|
||||
# 1. ⚠️ Playwright OBLIGATOIRE avec wait - HTML minimal sinon
|
||||
# 2. Attendre le sélecteur '.product-title' avant de parser
|
||||
# 3. Prix: REGEX obligatoire - aucun sélecteur CSS stable
|
||||
# 4. Images: Extraire depuis window._d_c_.DCData (JSON)
|
||||
# 5. SKU: Extraire depuis URL /item/{ID}.html → ID = SKU
|
||||
# 6. Devise: EUR pour France (fr.aliexpress.com)
|
||||
# 7. Classes CSS générées aléatoirement (hachées) - TRÈS INSTABLES
|
||||
# 8. Pas de JSON-LD schema.org disponible
|
||||
# 9. Temps de chargement: ~3-5s avec Playwright + wait
|
||||
350
pricewatch/app/stores/aliexpress/store.py
Executable file
350
pricewatch/app/stores/aliexpress/store.py
Executable file
@@ -0,0 +1,350 @@
|
||||
"""
|
||||
Store AliExpress - Parsing de produits AliExpress.com.
|
||||
|
||||
Supporte l'extraction de: titre, prix, SKU, images, etc.
|
||||
Spécificité: Rendu client-side (SPA) - nécessite Playwright avec attente.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import (
|
||||
DebugInfo,
|
||||
DebugStatus,
|
||||
FetchMethod,
|
||||
ProductSnapshot,
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
|
||||
logger = get_logger("stores.aliexpress")
|
||||
|
||||
|
||||
class AliexpressStore(BaseStore):
    """Store for AliExpress.com (Chinese marketplace).

    AliExpress product pages are client-side rendered (SPA), so this parser
    relies on meta tags, regexes over the raw HTML, and an embedded JSON
    blob (``window._d_c_.DCData``) rather than stable CSS selectors.
    """

    def __init__(self):
        """Initialize the AliExpress store with its selector file."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="aliexpress", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """
        Detect whether the URL belongs to AliExpress.

        Returns:
            0.9 for an aliexpress.com / aliexpress.fr product page,
            0.5 for an AliExpress URL that is not a product page,
            0.0 otherwise.
        """
        if not url:
            return 0.0

        url_lower = url.lower()

        if "aliexpress.com" in url_lower or "aliexpress.fr" in url_lower:
            # Check that this is actually a product page
            if "/item/" in url_lower:
                return 0.9
            else:
                return 0.5  # It is AliExpress, but not a product page

        return 0.0

    def canonicalize(self, url: str) -> str:
        """
        Normalize an AliExpress URL.

        AliExpress URLs usually look like:
            https://fr.aliexpress.com/item/{ID}.html?params...

        We keep only: https://fr.aliexpress.com/item/{ID}.html
        """
        if not url:
            return url

        parsed = urlparse(url)

        # Extract the base path (without query params)
        path = parsed.path

        # Keep only /item/{ID}.html
        match = re.search(r"(/item/\d+\.html)", path)
        if match:
            clean_path = match.group(1)
            return f"{parsed.scheme}://{parsed.netloc}{clean_path}"

        # If the pattern does not match, just strip the query params
        return f"{parsed.scheme}://{parsed.netloc}{path}"

    def extract_reference(self, url: str) -> Optional[str]:
        """
        Extract the SKU (product ID) from the URL.

        Typical format: /item/{ID}.html
        Example: /item/1005007187023722.html → "1005007187023722"
        """
        if not url:
            return None

        # Pattern: /item/{ID}.html
        match = re.search(r"/item/(\d+)\.html", url, re.IGNORECASE)
        if match:
            return match.group(1)

        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse AliExpress HTML into a ProductSnapshot.

        Because AliExpress is client-side rendered (SPA):
        - fields are extracted primarily from meta tags (og:title, og:image),
        - the price is extracted by regex (no stable selector exists),
        - images come from the window._d_c_.DCData embedded JSON.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Updated by the caller with the real method
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Field extraction
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(html, soup, debug_info)
        currency = self._extract_currency(url, soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(html, soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        reference = self.extract_reference(url)

        # Heuristic for client-side rendering: a fully rendered page is large
        if len(html) < 200000:  # HTML too small = page probably not rendered
            debug_info.notes.append(
                "HTML court (<200KB) - possiblement non rendu. Utiliser Playwright avec wait."
            )

        # Determine the final status
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            images=images,
            specs=specs,
            debug=debug_info,
        )

        logger.info(
            f"[AliExpress] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title, or None (an error is then recorded)."""
        # Priority 1: h1 (appears after the AJAX render)
        h1 = soup.find("h1")
        if h1:
            title = h1.get_text(strip=True)
            if title and len(title) > 10:  # Heuristic: very short h1 is noise
                return title

        # Priority 2: og:title (present in the static meta tags)
        og_title = soup.find("meta", property="og:title")
        if og_title:
            title = og_title.get("content", "")
            if title:
                # Strip the trailing " - AliExpress" suffix
                title = re.sub(r"\s*-\s*AliExpress.*$", "", title)
                return title.strip()

        debug.errors.append("Titre non trouvé")
        return None

    def _extract_price(
        self, html: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> Optional[float]:
        """
        Extract the price.

        AliExpress has NO stable CSS selector for the price, so we run
        regexes over the raw HTML, then fall back to meta tags.
        """
        # Pattern 1: price before € (e.g. "136,69 €")
        match = re.search(r"([0-9]+[.,][0-9]{2})\s*€", html)
        if match:
            price_str = match.group(1).replace(",", ".")
            try:
                return float(price_str)
            except ValueError:
                pass

        # Pattern 2: € before price (e.g. "€ 136.69")
        match = re.search(r"€\s*([0-9]+[.,][0-9]{2})", html)
        if match:
            price_str = match.group(1).replace(",", ".")
            try:
                return float(price_str)
            except ValueError:
                pass

        # Pattern 3: look in the meta tags (less reliable)
        og_price = soup.find("meta", property="og:price:amount")
        if og_price:
            price_str = og_price.get("content", "")
            try:
                return float(price_str)
            except ValueError:
                pass

        debug.errors.append("Prix non trouvé")
        return None

    def _extract_currency(
        self, url: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> str:
        """Extract the currency code (ISO, e.g. "EUR"); defaults to EUR."""
        # Priority 1: og:price:currency meta tag
        og_currency = soup.find("meta", property="og:price:currency")
        if og_currency:
            currency = og_currency.get("content", "")
            if currency:
                return currency.upper()

        # Priority 2: infer from the URL's subdomain
        if "fr.aliexpress" in url.lower():
            return "EUR"
        elif "aliexpress.com" in url.lower():
            return "USD"

        # Default
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Extract the stock status from buttons and page text."""
        # Look for an enabled "Add to cart" / "Ajouter au panier" button
        buttons = soup.find_all("button")
        for btn in buttons:
            text = btn.get_text(strip=True).lower()
            if any(
                keyword in text
                for keyword in ["add to cart", "ajouter", "buy now", "acheter"]
            ):
                # Button found and not disabled → product is purchasable
                if not btn.get("disabled"):
                    return StockStatus.IN_STOCK

        # Fallback: look for availability wording anywhere in the page text
        text_lower = soup.get_text().lower()
        if "out of stock" in text_lower or "rupture" in text_lower:
            return StockStatus.OUT_OF_STOCK

        return StockStatus.UNKNOWN

    def _extract_images(
        self, html: str, soup: BeautifulSoup, debug: DebugInfo
    ) -> list[str]:
        """
        Extract image URLs.

        Priority: window._d_c_.DCData.imagePathList (embedded JSON),
        then og:image, then <img> tags served from alicdn.com.
        """
        images = []

        # Priority 1: extract from the DCData JSON blob
        # NOTE(review): the [^;]* pattern stops at the first ';' — a JSON
        # payload containing a semicolon would be truncated; verify on fixtures.
        match = re.search(
            r"window\._d_c_\.DCData\s*=\s*(\{[^;]*\});", html, re.DOTALL
        )
        if match:
            try:
                data = json.loads(match.group(1))
                if "imagePathList" in data:
                    image_list = data["imagePathList"]
                    if isinstance(image_list, list):
                        images.extend(image_list)
                        debug.notes.append(
                            f"Images extraites depuis DCData: {len(images)}"
                        )
            except (json.JSONDecodeError, KeyError):
                pass

        # Priority 2: og:image meta tag
        if not images:
            og_image = soup.find("meta", property="og:image")
            if og_image:
                img_url = og_image.get("content", "")
                if img_url:
                    images.append(img_url)

        # Priority 3: <img> tags served from alicdn.com (skipping UI assets)
        if not images:
            img_elems = soup.find_all("img", src=True)
            for img in img_elems:
                src = img.get("src", "")
                if "alicdn.com" in src and not any(
                    x in src for x in ["logo", "icon", "avatar"]
                ):
                    if src not in images:
                        images.append(src)

        return images

    def _extract_category(
        self, soup: BeautifulSoup, debug: DebugInfo
    ) -> Optional[str]:
        """Extract the category from the breadcrumb (deepest entry wins)."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                # Take the last (deepest) breadcrumb entry
                categories = [
                    elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)
                ]
                if categories:
                    return categories[-1]

        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract technical specifications from <dl> definition lists."""
        specs = {}

        # Look for <dl> (definition lists); pair each <dt> with its <dd>
        dls = soup.find_all("dl")
        for dl in dls:
            dts = dl.find_all("dt")
            dds = dl.find_all("dd")

            for dt, dd in zip(dts, dds):
                key = dt.get_text(strip=True)
                value = dd.get_text(strip=True)
                if key and value:
                    specs[key] = value

        return specs
|
||||
0
pricewatch/app/stores/amazon/__init__.py
Executable file
0
pricewatch/app/stores/amazon/__init__.py
Executable file
BIN
pricewatch/app/stores/amazon/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/amazon/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/amazon/__pycache__/store.cpython-313.pyc
Executable file
Binary file not shown.
54
pricewatch/app/stores/amazon/fixtures/README.md
Executable file
54
pricewatch/app/stores/amazon/fixtures/README.md
Executable file
@@ -0,0 +1,54 @@
|
||||
# Fixtures Amazon
|
||||
|
||||
Ce dossier contient des fichiers HTML réels capturés depuis Amazon.fr pour les tests.
|
||||
|
||||
## Fichiers
|
||||
|
||||
### amazon_B0D4DX8PH3.html
|
||||
- **Produit**: elago MS1 Station de Charge Compatible avec Le Chargeur MagSafe
|
||||
- **ASIN**: B0D4DX8PH3
|
||||
- **URL**: https://www.amazon.fr/dp/B0D4DX8PH3
|
||||
- **Taille**: ~2.4 MB
|
||||
- **Lignes**: 11151
|
||||
- **Date capture**: 2026-01-13
|
||||
- **Usage**: Test complet parsing avec images, specs, prix
|
||||
|
||||
### amazon_B0F6MWNJ6J.html
|
||||
- **Produit**: Baseus Docking Station, Nomos Air 12 in 1
|
||||
- **ASIN**: B0F6MWNJ6J
|
||||
- **URL**: https://www.amazon.fr/dp/B0F6MWNJ6J
|
||||
- **Taille**: ~2.3 MB
|
||||
- **Lignes**: 11168
|
||||
- **Date capture**: 2026-01-13
|
||||
- **Usage**: Test complet parsing produit tech complexe
|
||||
|
||||
### captcha.html
|
||||
- **Contenu**: Page captcha Amazon
|
||||
- **Taille**: 5.1 KB
|
||||
- **Lignes**: 115
|
||||
- **Usage**: Test détection captcha et gestion erreurs
|
||||
|
||||
## Utilisation
|
||||
|
||||
Les tests utilisent ces fixtures avec pytest:
|
||||
|
||||
```python
|
||||
@pytest.fixture
|
||||
def amazon_fixture_b0d4dx8ph3():
|
||||
fixture_path = Path(__file__).parent.parent / "pricewatch/app/stores/amazon/fixtures/amazon_B0D4DX8PH3.html"
|
||||
with open(fixture_path, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
def test_parse_real_fixture(store, amazon_fixture_b0d4dx8ph3):
|
||||
url = "https://www.amazon.fr/dp/B0D4DX8PH3"
|
||||
snapshot = store.parse(amazon_fixture_b0d4dx8ph3, url)
|
||||
assert snapshot.reference == "B0D4DX8PH3"
|
||||
assert snapshot.price is not None
|
||||
# ...
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Ces fichiers sont de vraies pages HTML capturées, ils peuvent contenir beaucoup de JavaScript et de métadonnées
|
||||
- Les tests doivent se concentrer sur l'extraction des données essentielles (titre, prix, ASIN, stock)
|
||||
- Ne pas tester les données qui peuvent changer (prix exact, nombre d'avis, etc.) mais plutôt le format
|
||||
11151
pricewatch/app/stores/amazon/fixtures/amazon_B0D4DX8PH3.html
Executable file
11151
pricewatch/app/stores/amazon/fixtures/amazon_B0D4DX8PH3.html
Executable file
File diff suppressed because one or more lines are too long
11168
pricewatch/app/stores/amazon/fixtures/amazon_B0F6MWNJ6J.html
Executable file
11168
pricewatch/app/stores/amazon/fixtures/amazon_B0F6MWNJ6J.html
Executable file
File diff suppressed because one or more lines are too long
115
pricewatch/app/stores/amazon/fixtures/captcha.html
Executable file
115
pricewatch/app/stores/amazon/fixtures/captcha.html
Executable file
@@ -0,0 +1,115 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if lt IE 7]> <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
|
||||
<!--[if IE 7]> <html lang="fr" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
|
||||
<!--[if IE 8]> <html lang="fr" class="a-no-js a-lt-ie9"> <![endif]-->
|
||||
<!--[if gt IE 8]><!-->
|
||||
<html class="a-no-js" lang="fr"><!--<![endif]--><head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
|
||||
<title dir="ltr">Amazon.fr</title>
|
||||
<meta name="viewport" content="width=device-width">
|
||||
<link rel="stylesheet" href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css">
|
||||
<script>
|
||||
|
||||
if (true === true) {
|
||||
var ue_t0 = (+ new Date()),
|
||||
ue_csm = window,
|
||||
ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
|
||||
ue_furl = "fls-eu.amazon.fr",
|
||||
ue_mid = "A13V1IB3VIYZZH",
|
||||
ue_sid = (document.cookie.match(/session-id=([0-9-]+)/) || [])[1],
|
||||
ue_sn = "opfcaptcha.amazon.fr",
|
||||
ue_id = 'V1R3HCVDQ573ZEMZKZQD';
|
||||
}
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<!--
|
||||
To discuss automated access to Amazon data please contact api-services-support@amazon.com.
|
||||
For information about migrating to our APIs refer to our Marketplace APIs at https://developer.amazonservices.fr/ref=rm_c_sv, or our Product Advertising API at https://partenaires.amazon.fr/gp/advertising/api/detail/main.html/ref=rm_c_ac for advertising use cases.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Correios.DoNotSend
|
||||
-->
|
||||
|
||||
<div class="a-container a-padding-double-large" style="min-width:350px;padding:44px 0 !important">
|
||||
|
||||
<div class="a-row a-spacing-double-large" style="width: 350px; margin: 0 auto">
|
||||
|
||||
<div class="a-row a-spacing-medium a-text-center"><i class="a-icon a-logo" alt="Logo d'Amazon"></i></div>
|
||||
|
||||
<div class="a-box a-alert a-alert-info a-spacing-base">
|
||||
<div class="a-box-inner">
|
||||
<i class="a-icon a-icon-alert" alt="Icône d'alerte"></i>
|
||||
<h4>Cliquez sur le bouton ci-dessous pour continuer vos achats</h4>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="a-section">
|
||||
|
||||
<div class="a-box a-color-offset-background">
|
||||
<div class="a-box-inner a-padding-extra-large">
|
||||
|
||||
<form method="get" action="/errors/validateCaptcha" name="">
|
||||
<input type=hidden name="amzn" value="2W5U2H7MWJXqdgImnmg0CQ==" /><input type=hidden name="amzn-r" value="/dp/B0DFWRHZ7L" />
|
||||
<input type=hidden name="field-keywords" value="ELFGJB" />
|
||||
<div class="a-section a-spacing-extra-large">
|
||||
|
||||
<div class="a-row">
|
||||
<span class="a-button a-button-primary a-span12">
|
||||
<span class="a-button-inner">
|
||||
<button type="submit" class="a-button-text" alt="Continuer les achats">Continuer les achats</button>
|
||||
</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</form>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="a-divider a-divider-section"><div class="a-divider-inner"></div></div>
|
||||
|
||||
<div class="a-text-center a-spacing-small a-size-mini">
|
||||
<a href="https://www.amazon.fr/gp/help/customer/display.html/ref=footer_cou?ie=UTF8&nodeId=548524">Conditions générales de vente</a>
|
||||
<span class="a-letter-space"></span>
|
||||
<span class="a-letter-space"></span>
|
||||
<span class="a-letter-space"></span>
|
||||
<span class="a-letter-space"></span>
|
||||
<a href="https://www.amazon.fr/gp/help/customer/display.html/ref=footer_privacy?ie=UTF8&nodeId=3329781">Vos informations personnelles</a>
|
||||
</div>
|
||||
|
||||
<div class="a-text-center a-size-mini a-color-base">
|
||||
© 1996-2025, Amazon.com, Inc. ou ses filiales.
|
||||
<script>
|
||||
if (true === true) {
|
||||
document.write('<img src="https://fls-eu.amaz'+'on.fr/'+'1/oc-csi/1/OP/requestId=V1R3HCVDQ573ZEMZKZQD&js=1" alt=""/>');
|
||||
};
|
||||
</script>
|
||||
<noscript>
|
||||
<img src="https://fls-eu.amazon.fr/1/oc-csi/1/OP/requestId=V1R3HCVDQ573ZEMZKZQD&js=0" alt=""/>
|
||||
</noscript>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
if (true === true) {
|
||||
var head = document.getElementsByTagName('head')[0],
|
||||
prefix = "https://images-eu.ssl-images-amazon.com/images/G/01/csminstrumentation/",
|
||||
elem = document.createElement("script");
|
||||
elem.src = prefix + "csm-captcha-instrumentation.min.js";
|
||||
head.appendChild(elem);
|
||||
|
||||
elem = document.createElement("script");
|
||||
elem.src = prefix + "rd-script-6d68177fa6061598e9509dc4b5bdd08d.js";
|
||||
head.appendChild(elem);
|
||||
}
|
||||
</script>
|
||||
</body></html>
|
||||
69
pricewatch/app/stores/amazon/selectors.yml
Executable file
69
pricewatch/app/stores/amazon/selectors.yml
Executable file
@@ -0,0 +1,69 @@
|
||||
# Sélecteurs CSS/XPath pour Amazon
|
||||
# Ces sélecteurs sont à ajuster selon l'évolution du site
|
||||
|
||||
# Titre du produit
|
||||
title:
|
||||
- "#productTitle"
|
||||
- "#title"
|
||||
- "h1.product-title"
|
||||
|
||||
# Prix principal
|
||||
price:
|
||||
- "span.a-price-whole"
|
||||
- ".a-price .a-offscreen"
|
||||
- "#priceblock_ourprice"
|
||||
- "#priceblock_dealprice"
|
||||
- ".a-price-range .a-price .a-offscreen"
|
||||
|
||||
# Devise (généralement dans le symbole)
|
||||
currency:
|
||||
- "span.a-price-symbol"
|
||||
- ".a-price-symbol"
|
||||
|
||||
# Frais de port
|
||||
shipping_cost:
|
||||
- "#ourprice_shippingmessage"
|
||||
- "#price-shipping-message"
|
||||
- "#deliveryMessageMirId"
|
||||
|
||||
# Statut de stock
|
||||
stock_status:
|
||||
- "#availability span"
|
||||
- "#availability"
|
||||
- ".a-declarative .a-size-medium"
|
||||
|
||||
# Images produit
|
||||
images:
|
||||
- "#landingImage"
|
||||
- "#imgBlkFront"
|
||||
- ".a-dynamic-image"
|
||||
- "#main-image"
|
||||
|
||||
# Catégorie / breadcrumb
|
||||
category:
|
||||
- "#wayfinding-breadcrumbs_feature_div"
|
||||
- ".a-breadcrumb"
|
||||
|
||||
# Caractéristiques techniques (table specs)
|
||||
specs_table:
|
||||
- "#productDetails_techSpec_section_1"
|
||||
- "#productDetails_detailBullets_sections1"
|
||||
- ".prodDetTable"
|
||||
- "#product-specification-table"
|
||||
|
||||
# ASIN (parfois dans les métadonnées)
|
||||
asin:
|
||||
- "input[name='ASIN']"
|
||||
- "th:contains('ASIN') + td"
|
||||
|
||||
# Messages captcha / robot check
|
||||
captcha_indicators:
|
||||
- "form[action*='validateCaptcha']"
|
||||
- "p.a-last:contains('Sorry')"
|
||||
- "img[alt*='captcha']"
|
||||
|
||||
# Notes pour le parsing:
|
||||
# - Amazon change fréquemment ses sélecteurs
|
||||
# - Plusieurs fallbacks sont fournis pour chaque champ
|
||||
# - Le parsing doit tester tous les sélecteurs dans l'ordre
|
||||
# - En cas d'échec, marquer le champ comme null dans ProductSnapshot
|
||||
330
pricewatch/app/stores/amazon/store.py
Executable file
330
pricewatch/app/stores/amazon/store.py
Executable file
@@ -0,0 +1,330 @@
|
||||
"""
|
||||
Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
|
||||
|
||||
Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import (
|
||||
DebugInfo,
|
||||
DebugStatus,
|
||||
FetchMethod,
|
||||
ProductSnapshot,
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
|
||||
logger = get_logger("stores.amazon")
|
||||
|
||||
|
||||
class AmazonStore(BaseStore):
    """Store for Amazon.fr and Amazon.com.

    Parsing is selector-driven: every field reads its CSS selector list
    from selectors.yml and tries each selector in order until one matches.
    """

    def __init__(self):
        """Initialize the Amazon store with its selector file."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="amazon", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """
        Detect whether the URL belongs to Amazon.

        Returns:
            0.9 for amazon.fr,
            0.8 for amazon.com / amazon.co*,
            0.7 for any other amazon.* domain,
            0.0 otherwise.
        """
        if not url:
            return 0.0

        url_lower = url.lower()

        if "amazon.fr" in url_lower:
            return 0.9
        elif "amazon.com" in url_lower or "amazon.co" in url_lower:
            return 0.8
        elif "amazon." in url_lower:
            return 0.7

        return 0.0

    def canonicalize(self, url: str) -> str:
        """
        Normalize an Amazon URL to /dp/{ASIN}.

        Example:
            https://www.amazon.fr/product-name/dp/B08N5WRWNW/ref=...
            → https://www.amazon.fr/dp/B08N5WRWNW

        Rationale: the ASIN is the unique identifier; everything else
        (slug, ref tags, query params) is tracking noise.
        """
        if not url:
            return url

        # Extract the ASIN
        asin = self.extract_reference(url)
        if not asin:
            # No ASIN found: return the URL without query params
            parsed = urlparse(url)
            return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

        # Rebuild the canonical URL
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/dp/{asin}"

    def extract_reference(self, url: str) -> Optional[str]:
        """
        Extract the ASIN from the URL.

        The ASIN usually follows /dp/ or /gp/product/ and is exactly
        10 alphanumeric characters.

        Examples:
            /dp/B08N5WRWNW → B08N5WRWNW
            /gp/product/B08N5WRWNW → B08N5WRWNW
        """
        if not url:
            return None

        # Pattern: /dp/{ASIN} or /gp/product/{ASIN}
        # The ASIN must be followed by /, ?, #, or end of string
        match = re.search(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:/|\?|#|$)", url)
        if match:
            return match.group(1)

        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse Amazon HTML into a ProductSnapshot.

        Uses BeautifulSoup with the CSS selectors from selectors.yml.
        Detects captcha/robot-check pages and marks the snapshot FAILED.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Updated by the caller with the real method
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Check for captcha / robot check first
        if self._detect_captcha(soup):
            debug_info.errors.append("Captcha ou robot check détecté")
            debug_info.status = DebugStatus.FAILED
            logger.warning(f"[Amazon] Captcha détecté pour: {url}")

        # Field extraction
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(soup, debug_info)
        currency = self._extract_currency(soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        reference = self.extract_reference(url) or self._extract_asin_from_html(soup)

        # Determine the final status (never downgrade a FAILED status)
        if debug_info.status != DebugStatus.FAILED:
            if not title or price is None:
                debug_info.status = DebugStatus.PARTIAL
                debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency or "EUR",
            shipping_cost=None,  # Hard to extract reliably
            stock_status=stock_status,
            reference=reference,
            category=category,
            images=images,
            specs=specs,
            debug=debug_info,
        )

        logger.info(
            f"[Amazon] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot

    def _detect_captcha(self, soup: BeautifulSoup) -> bool:
        """Detect whether the page is a captcha/robot-check interstitial."""
        captcha_selectors = self.get_selector("captcha_indicators", [])
        if isinstance(captcha_selectors, str):
            captcha_selectors = [captcha_selectors]

        for selector in captcha_selectors:
            if soup.select(selector):
                return True

        # Fallback: look for captcha wording in the page text.
        # NOTE(review): the bare "sorry" check is very broad and may
        # false-positive on product pages containing that word — confirm
        # against fixtures before tightening.
        text = soup.get_text().lower()
        if "captcha" in text or "robot check" in text or "sorry" in text:
            return True

        return False

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title, or None (an error is then recorded)."""
        selectors = self.get_selector("title", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                title = element.get_text(strip=True)
                if title:
                    return title

        debug.errors.append("Titre non trouvé")
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the price as a float, or None (an error is then recorded)."""
        selectors = self.get_selector("price", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text(strip=True)
                # Extract a number (formats: "299,99" or "299.99")
                match = re.search(r"(\d+)[.,](\d+)", text)
                if match:
                    price_str = f"{match.group(1)}.{match.group(2)}"
                    try:
                        return float(price_str)
                    except ValueError:
                        continue

        debug.errors.append("Prix non trouvé")
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the ISO currency code; defaults to EUR."""
        selectors = self.get_selector("currency", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                symbol = element.get_text(strip=True)
                # Map currency symbols to ISO codes
                currency_map = {"€": "EUR", "$": "USD", "£": "GBP"}
                return currency_map.get(symbol, "EUR")

        # Default (amazon.fr being the primary target domain)
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Extract the stock status from availability wording."""
        selectors = self.get_selector("stock_status", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(strip=True).lower()
                if "en stock" in text or "available" in text or "in stock" in text:
                    return StockStatus.IN_STOCK
                elif (
                    "rupture" in text
                    or "indisponible" in text
                    or "out of stock" in text
                ):
                    return StockStatus.OUT_OF_STOCK

        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Extract product image URLs, deduplicated, in selector-priority order."""
        images = []
        selectors = self.get_selector("images", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # src attribute, or data-src for lazy-loaded images
                url = element.get("src") or element.get("data-src")
                if url and url.startswith("http"):
                    images.append(url)

        # Deduplicate while preserving first-seen order.
        # (list(set(...)) would scramble the order non-deterministically.)
        return list(dict.fromkeys(images))

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the category from the breadcrumbs (deepest entry wins)."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # Take the last (deepest) breadcrumb link
                links = element.select("a")
                if links:
                    return links[-1].get_text(strip=True)

        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract technical specifications from the specs table."""
        specs = {}
        selectors = self.get_selector("specs_table", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            table = soup.select_one(selector)
            if table:
                # Parse <th>/<td> rows of the table
                rows = table.select("tr")
                for row in rows:
                    th = row.select_one("th")
                    td = row.select_one("td")
                    if th and td:
                        key = th.get_text(strip=True)
                        value = td.get_text(strip=True)
                        if key and value:
                            specs[key] = value

        return specs

    def _extract_asin_from_html(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the ASIN from the HTML (fallback when the URL has none)."""
        selectors = self.get_selector("asin", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # Hidden <input> carries the ASIN in its value attribute
                if element.name == "input":
                    return element.get("value")
                # Otherwise a <td> cell in a details table
                else:
                    return element.get_text(strip=True)

        return None
|
||||
0
pricewatch/app/stores/backmarket/__init__.py
Executable file
0
pricewatch/app/stores/backmarket/__init__.py
Executable file
BIN
pricewatch/app/stores/backmarket/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/backmarket/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/stores/backmarket/__pycache__/store.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/backmarket/__pycache__/store.cpython-313.pyc
Executable file
Binary file not shown.
143
pricewatch/app/stores/backmarket/fixtures/README.md
Executable file
143
pricewatch/app/stores/backmarket/fixtures/README.md
Executable file
@@ -0,0 +1,143 @@
|
||||
# Fixtures Backmarket
|
||||
|
||||
Ce dossier contient des fichiers HTML réels capturés depuis Backmarket.fr pour les tests.
|
||||
|
||||
## ⚠️ Note importante sur Backmarket
|
||||
|
||||
Backmarket utilise une **protection anti-bot**:
|
||||
- HTTP simple retourne **403 Forbidden**
|
||||
- **Playwright est OBLIGATOIRE** pour récupérer le contenu
|
||||
- Temps de chargement: ~2-3 secondes
|
||||
|
||||
## Spécificité Backmarket
|
||||
|
||||
Backmarket vend des **produits reconditionnés**:
|
||||
- Prix variable selon la **condition** (Correct, Bon, Excellent, etc.)
|
||||
- Chaque produit a plusieurs offres avec des états différents
|
||||
- Le prix extrait correspond à l'offre sélectionnée par défaut
|
||||
|
||||
## Fichiers
|
||||
|
||||
### backmarket_iphone15pro.html
|
||||
- **Produit**: iPhone 15 Pro (reconditionné)
|
||||
- **SKU**: iphone-15-pro
|
||||
- **URL**: https://www.backmarket.fr/fr-fr/p/iphone-15-pro
|
||||
- **Taille**: ~1.5 MB
|
||||
- **Date capture**: 2026-01-13
|
||||
- **Prix capturé**: 571 EUR (prix de l'offre par défaut)
|
||||
- **Usage**: Test complet parsing smartphone reconditionné
|
||||
|
||||
## Structure HTML Backmarket
|
||||
|
||||
### JSON-LD Schema.org ✓
|
||||
Backmarket utilise **JSON-LD structuré** (contrairement à Cdiscount):
|
||||
```json
|
||||
{
|
||||
"@type": "Product",
|
||||
"name": "iPhone 15 Pro",
|
||||
"offers": {
|
||||
"@type": "Offer",
|
||||
"price": "571.00",
|
||||
"priceCurrency": "EUR"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Sélecteurs identifiés
|
||||
|
||||
#### Titre
|
||||
```css
|
||||
h1.heading-1
|
||||
```
|
||||
Classes stables, simple et propre.
|
||||
|
||||
#### Prix
|
||||
Priorité: **JSON-LD** (source la plus fiable)
|
||||
Fallback: `div[data-test='price']`
|
||||
|
||||
#### Images
|
||||
```css
|
||||
img[alt]
|
||||
```
|
||||
URLs CDN: `https://d2e6ccujb3mkqf.cloudfront.net/...`
|
||||
|
||||
#### SKU
|
||||
Extraction depuis l'URL:
|
||||
```regex
|
||||
/p/([a-z0-9-]+)
|
||||
```
|
||||
Exemple: `/p/iphone-15-pro` → SKU = "iphone-15-pro"
|
||||
|
||||
#### Condition (État du reconditionné)
|
||||
```css
|
||||
button[data-test='condition-button']
|
||||
div[class*='condition']
|
||||
```
|
||||
Valeurs possibles: Correct, Bon, Très bon, Excellent, Comme neuf
|
||||
|
||||
## Comparaison avec autres stores
|
||||
|
||||
| Aspect | Amazon | Cdiscount | Backmarket |
|
||||
|--------|--------|-----------|------------|
|
||||
| **Anti-bot** | Faible | Fort | Fort |
|
||||
| **Méthode** | HTTP OK | Playwright | Playwright |
|
||||
| **JSON-LD** | Partiel | ✗ Non | ✓ Oui (complet) |
|
||||
| **Sélecteurs** | Stables (IDs) | Instables | Stables (classes) |
|
||||
| **SKU format** | `/dp/{ASIN}` | `/f-{cat}-{SKU}` | `/p/{slug}` |
|
||||
| **Particularité** | - | Prix dynamiques | Reconditionné (condition) |
|
||||
|
||||
## Utilisation dans les tests
|
||||
|
||||
```python
|
||||
@pytest.fixture
|
||||
def backmarket_fixture_iphone15pro():
|
||||
fixture_path = Path(__file__).parent.parent.parent / \
|
||||
"pricewatch/app/stores/backmarket/fixtures/backmarket_iphone15pro.html"
|
||||
with open(fixture_path, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
def test_parse_real_fixture(store, backmarket_fixture_iphone15pro):
|
||||
url = "https://www.backmarket.fr/fr-fr/p/iphone-15-pro"
|
||||
snapshot = store.parse(backmarket_fixture_iphone15pro, url)
|
||||
|
||||
assert snapshot.title == "iPhone 15 Pro"
|
||||
assert snapshot.price == 571.0  # valeur figée de la fixture capturée — sur du HTML live, ne pas tester le prix exact (cf. point 3 ci-dessous)
|
||||
assert snapshot.reference == "iphone-15-pro"
|
||||
assert snapshot.currency == "EUR"
|
||||
```
|
||||
|
||||
## Points d'attention pour les tests
|
||||
|
||||
1. **JSON-LD prioritaire** - Le prix vient du JSON-LD, pas du HTML visible
|
||||
2. **Prix variable** - Change selon la condition sélectionnée
|
||||
3. **Ne pas tester le prix exact** - Il varie avec les offres disponibles
|
||||
4. **Tester le format** et la présence des données
|
||||
5. Backmarket = **produits reconditionnés** uniquement
|
||||
|
||||
## Comment capturer une nouvelle fixture
|
||||
|
||||
```python
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
|
||||
url = "https://www.backmarket.fr/fr-fr/p/..."
|
||||
result = fetch_playwright(url, headless=True, timeout_ms=60000)
|
||||
|
||||
if result.success:
|
||||
with open("fixture.html", "w", encoding="utf-8") as f:
|
||||
f.write(result.html)
|
||||
```
|
||||
|
||||
⚠️ **N'utilisez JAMAIS** `fetch_http()` pour Backmarket - cela retournera 403!
|
||||
|
||||
## Avantages de Backmarket
|
||||
|
||||
✓ **JSON-LD structuré** → Parsing très fiable
|
||||
✓ **Classes CSS stables** → Moins de casse que Cdiscount
|
||||
✓ **URL propre** → SKU facile à extraire
|
||||
✓ **Schema.org complet** → Prix, nom, images dans JSON
|
||||
|
||||
## Inconvénients
|
||||
|
||||
✗ **Protection anti-bot** → Playwright obligatoire (lent)
|
||||
✗ **Prix multiples** → Un produit = plusieurs offres selon état
|
||||
✗ **Stock complexe** → Dépend de l'offre et de la condition
|
||||
325
pricewatch/app/stores/backmarket/fixtures/backmarket_iphone15pro.html
Executable file
325
pricewatch/app/stores/backmarket/fixtures/backmarket_iphone15pro.html
Executable file
File diff suppressed because one or more lines are too long
72
pricewatch/app/stores/backmarket/selectors.yml
Executable file
72
pricewatch/app/stores/backmarket/selectors.yml
Executable file
@@ -0,0 +1,72 @@
|
||||
# Sélecteurs CSS/XPath pour Backmarket.fr
|
||||
# Mis à jour le 2026-01-13 après analyse du HTML réel
|
||||
|
||||
# ⚠️ IMPORTANT: Backmarket utilise une protection anti-bot
|
||||
# - HTTP simple ne fonctionne PAS (retourne 403 Forbidden)
|
||||
# - Playwright est OBLIGATOIRE pour récupérer le contenu
|
||||
# - Les classes CSS sont relativement stables (heading-1, etc.)
|
||||
|
||||
# Titre du produit
|
||||
# Classes simples et stables
|
||||
title:
|
||||
- "h1.heading-1"
|
||||
- "h1" # Fallback
|
||||
|
||||
# Prix principal
|
||||
# ✓ JSON-LD schema.org disponible (prioritaire)
|
||||
# Les prix sont dans <script type="application/ld+json">
|
||||
price:
|
||||
- "div[data-test='price']" # Fallback si JSON-LD n'est pas disponible
|
||||
- "span[class*='price']"
|
||||
|
||||
# Devise
|
||||
# Toujours EUR pour Backmarket France
|
||||
currency:
|
||||
- "meta[property='og:price:currency']"
|
||||
# Fallback: statique EUR
|
||||
|
||||
# État / Condition (spécifique aux produits reconditionnés)
|
||||
# Backmarket vend du reconditionné, donc il y a des grades (Correct, Bon, Excellent, etc.)
|
||||
condition:
|
||||
- "button[data-test='condition-button']"
|
||||
- "div[class*='condition']"
|
||||
- "span[class*='grade']"
|
||||
|
||||
# Images produit
|
||||
images:
|
||||
- "img[alt]" # Toutes les images avec alt
|
||||
# Filtrer celles qui contiennent le nom du produit
|
||||
|
||||
# Catégorie / breadcrumb
|
||||
category:
|
||||
- "nav[aria-label='breadcrumb'] a"
|
||||
- ".breadcrumb a"
|
||||
|
||||
# Caractéristiques techniques
|
||||
# Peuvent être dans des sections dépliables
|
||||
specs_table:
|
||||
- "div[class*='specification']"
|
||||
- "div[class*='technical']"
|
||||
- "dl"
|
||||
|
||||
# SKU / référence produit
|
||||
# Extraction depuis l'URL plus fiable
|
||||
# URL pattern: /fr-fr/p/{slug}
|
||||
# SKU = slug
|
||||
sku:
|
||||
- "meta[property='product:retailer_item_id']"
|
||||
- "span[data-test='sku']"
|
||||
|
||||
# Stock / Disponibilité
|
||||
stock_status:
|
||||
- "button[data-test='add-to-cart']" # Si présent = en stock
|
||||
- "div[class*='availability']"
|
||||
|
||||
# Notes importantes:
|
||||
# 1. ⚠️ Playwright OBLIGATOIRE - HTTP retourne 403 Forbidden
|
||||
# 2. JSON-LD schema.org disponible → prioritaire pour prix/titre
|
||||
# 3. Classes CSS relativement stables (heading-1, etc.)
|
||||
# 4. SKU: extraire depuis URL /fr-fr/p/{slug}
|
||||
# 5. Condition (grade) important pour Backmarket (Correct/Bon/Excellent)
|
||||
# 6. Prix varie selon la condition choisie
|
||||
# 7. Devise: toujours EUR pour France (static fallback OK)
|
||||
358
pricewatch/app/stores/backmarket/store.py
Executable file
358
pricewatch/app/stores/backmarket/store.py
Executable file
@@ -0,0 +1,358 @@
|
||||
"""
|
||||
Store Backmarket - Parsing de produits Backmarket.fr.
|
||||
|
||||
Supporte l'extraction de: titre, prix, SKU, images, condition (état), etc.
|
||||
Spécificité: Backmarket vend du reconditionné, donc prix variable selon condition.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import (
|
||||
DebugInfo,
|
||||
DebugStatus,
|
||||
FetchMethod,
|
||||
ProductSnapshot,
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
|
||||
logger = get_logger("stores.backmarket")
|
||||
|
||||
|
||||
class BackmarketStore(BaseStore):
    """Store for Backmarket.fr (refurbished products).

    Extraction relies primarily on the JSON-LD schema.org <script> blocks
    (the most reliable source on Backmarket), with CSS selectors from
    selectors.yml as fallback. Backmarket sells refurbished goods, so the
    price varies with the selected condition/grade.
    """

    def __init__(self):
        """Initialize the Backmarket store with its selectors.yml file."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="backmarket", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """Score how likely *url* is a Backmarket URL.

        Returns:
            0.9 for backmarket.fr, 0.8 for backmarket.com,
            0.0 otherwise (including empty URLs).
        """
        if not url:
            return 0.0

        url_lower = url.lower()

        if "backmarket.fr" in url_lower:
            return 0.9
        elif "backmarket.com" in url_lower:
            return 0.8  # .com serves non-French markets

        return 0.0

    def canonicalize(self, url: str) -> str:
        """Normalize a Backmarket URL.

        Backmarket product URLs generally look like:
        https://www.backmarket.fr/fr-fr/p/{slug}

        Query parameters and fragment are stripped; scheme, host and path
        are kept unchanged.
        """
        if not url:
            return url

        parsed = urlparse(url)
        # Drop query params and fragment (tracking noise, offer selection).
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    def extract_reference(self, url: str) -> Optional[str]:
        """Extract the SKU (slug) from the URL.

        Typical format: /fr-fr/p/{slug}
        Example: /fr-fr/p/iphone-15-pro -> "iphone-15-pro"
        """
        if not url:
            return None

        # Pattern: /p/{slug} (locale prefix varies: /fr-fr/p/, /en-us/p/, ...)
        match = re.search(r"/p/([a-z0-9-]+)", url, re.IGNORECASE)
        if match:
            return match.group(1)

        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """Parse Backmarket HTML into a ProductSnapshot.

        JSON-LD schema.org data is used first, then BeautifulSoup with the
        configured CSS selectors as fallback.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Updated by the caller if Playwright was used
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # JSON-LD first: most reliable source on Backmarket.
        json_ld_data = self._extract_json_ld(soup)

        title = json_ld_data.get("name") or self._extract_title(soup, debug_info)
        # Explicit None check: a price of 0.0 is falsy, so using `or` would
        # silently discard a valid JSON-LD price and re-parse the HTML.
        price = json_ld_data.get("price")
        if price is None:
            price = self._extract_price(soup, debug_info)
        currency = (
            json_ld_data.get("priceCurrency") or self._extract_currency(soup, debug_info) or "EUR"
        )
        stock_status = self._extract_stock(soup, debug_info)
        images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        reference = self.extract_reference(url)

        # Backmarket-specific: condition (grade) of the refurbished item.
        condition = self._extract_condition(soup, debug_info)
        if condition:
            specs["Condition"] = condition
            debug_info.notes.append(f"Produit reconditionné: {condition}")

        # Final status: partial when a key field is missing.
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            images=images,
            specs=specs,
            debug=debug_info,
        )

        logger.info(
            f"[Backmarket] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot

    def _extract_json_ld(self, soup: BeautifulSoup) -> dict:
        """Extract product data from the JSON-LD schema.org scripts.

        Backmarket embeds a schema.org Product block; it is the most
        reliable source for name/price/images.

        Fixes over the original version:
        - empty <script> tags (script.string is None) made json.loads raise
          an uncaught TypeError; it is now caught alongside JSONDecodeError.
        - "@type" may be a list per schema.org (e.g. ["Product", "Thing"]).
        - "offers" may be a list of Offer objects; the first dict is used.
        """
        json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})

        for script in json_ld_scripts:
            try:
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError, AttributeError):
                continue

            if not isinstance(data, dict):
                continue
            type_field = data.get("@type")
            types = type_field if isinstance(type_field, list) else [type_field]
            if "Product" not in types:
                continue

            result = {
                "name": data.get("name"),
                "priceCurrency": None,
                "price": None,
                "images": [],
            }

            # Price comes from the "offers" node (dict, or list of dicts).
            offers = data.get("offers", {})
            if isinstance(offers, list):
                offers = next((o for o in offers if isinstance(o, dict)), {})
            if isinstance(offers, dict):
                result["price"] = offers.get("price")
                result["priceCurrency"] = offers.get("priceCurrency")

            # Normalize a string price ("571.00", tolerating "571,00") to float.
            if isinstance(result["price"], str):
                try:
                    result["price"] = float(result["price"].replace(",", "."))
                except ValueError:
                    result["price"] = None

            # Images: schema.org allows a single URL or a list of URLs.
            image_data = data.get("image")
            if isinstance(image_data, str):
                result["images"] = [image_data]
            elif isinstance(image_data, list):
                result["images"] = image_data

            return result

        return {}

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title via the configured CSS selectors."""
        selectors = self.get_selector("title", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                title = element.get_text(strip=True)
                if title:
                    return title

        debug.errors.append("Titre non trouvé")
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the price from the HTML (fallback when JSON-LD is absent)."""
        selectors = self.get_selector("price", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # schema.org "content" attribute first, then visible text.
                price_text = element.get("content") or element.get_text(strip=True)

                # Accept "299,99", "299.99" or "299".
                match = re.search(r"(\d+)[.,]?(\d*)", price_text)
                if match:
                    integer_part = match.group(1)
                    decimal_part = match.group(2) or "00"
                    price_str = f"{integer_part}.{decimal_part}"
                    try:
                        return float(price_str)
                    except ValueError:
                        continue

        debug.errors.append("Prix non trouvé")
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the currency; defaults to EUR (Backmarket France)."""
        selectors = self.get_selector("currency", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # Meta tags carry the currency in their "content" attribute.
                currency = element.get("content")
                if currency:
                    return currency.upper()

        # Static fallback: Backmarket France always quotes EUR.
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Infer the stock status from the add-to-cart button or text hints."""
        # An enabled "add to cart" button means the offer is purchasable.
        add_to_cart = soup.find("button", attrs={"data-test": "add-to-cart"})
        if add_to_cart and not add_to_cart.get("disabled"):
            return StockStatus.IN_STOCK

        # Fallback: look for availability wording in configured elements.
        selectors = self.get_selector("stock_status", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(strip=True).lower()

                if "en stock" in text or "disponible" in text or "ajouter" in text:
                    return StockStatus.IN_STOCK
                elif (
                    "rupture" in text
                    or "indisponible" in text
                    or "épuisé" in text
                ):
                    return StockStatus.OUT_OF_STOCK

        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Extract absolute image URLs, de-duplicated, in document order."""
        images = []
        selectors = self.get_selector("images", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # Lazy-loaded images use data-src instead of src.
                img_url = element.get("src") or element.get("data-src")
                if img_url and img_url.startswith("http"):
                    # Skip duplicates while preserving order.
                    if img_url not in images:
                        images.append(img_url)

        return images

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the category from the breadcrumb (most specific entry)."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                # Last breadcrumb entry = most specific category.
                categories = [elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)]
                if categories:
                    return categories[-1]

        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract the technical specifications from <dl> definition lists."""
        specs = {}

        # Backmarket renders specs as <dl> with <dt> labels / <dd> values.
        dls = soup.find_all("dl")
        for dl in dls:
            dts = dl.find_all("dt")
            dds = dl.find_all("dd")

            for dt, dd in zip(dts, dds):
                key = dt.get_text(strip=True)
                value = dd.get_text(strip=True)
                if key and value:
                    specs[key] = value

        return specs

    def _extract_condition(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the refurbished condition/grade of the product.

        Backmarket-specific grades: Correct, Bon, Très bon, Excellent,
        Comme neuf. Returns the first matching element's text, or None.
        """
        selectors = self.get_selector("condition", [])
        if isinstance(selectors, str):
            selectors = [selectors]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text(strip=True)
                # Keep only texts that mention a known Backmarket grade.
                if any(grade in text for grade in ["Correct", "Bon", "Très bon", "Excellent", "Comme neuf"]):
                    return text

        return None
|
||||
156
pricewatch/app/stores/base.py
Executable file
156
pricewatch/app/stores/base.py
Executable file
@@ -0,0 +1,156 @@
|
||||
"""
|
||||
Classe abstraite BaseStore définissant l'interface des stores.
|
||||
|
||||
Tous les stores (Amazon, Cdiscount, etc.) doivent hériter de BaseStore
|
||||
et implémenter ses méthodes abstraites.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import ProductSnapshot
|
||||
|
||||
logger = get_logger("stores.base")
|
||||
|
||||
|
||||
class BaseStore(ABC):
    """Abstract base class defining the interface of a store.

    Every store (Amazon, Cdiscount, etc.) must implement:
    - match(): detect whether a URL belongs to the store
    - canonicalize(): normalize the URL
    - extract_reference(): extract the product reference
    - parse(): parse HTML into a ProductSnapshot

    CSS/XPath selectors live in selectors.yml so they can be maintained
    without touching the Python code.
    """

    def __init__(self, store_id: str, selectors_path: Optional[Path] = None):
        """Initialize the store.

        Args:
            store_id: Unique store identifier (e.g. 'amazon', 'cdiscount').
            selectors_path: Path to the selectors.yml file; when missing or
                nonexistent, the store starts with an empty selector map.
        """
        self.store_id = store_id
        # Mapping of selector key -> selector string or list of fallbacks.
        self.selectors: dict = {}

        if selectors_path and selectors_path.exists():
            self._load_selectors(selectors_path)

    def _load_selectors(self, path: Path) -> None:
        """Load the selectors from the YAML file.

        Any load error is logged and leaves self.selectors empty rather
        than propagating (a store without selectors can still run its
        URL-based methods).

        Args:
            path: Path to selectors.yml.
        """
        try:
            with open(path, "r", encoding="utf-8") as f:
                self.selectors = yaml.safe_load(f) or {}
            logger.debug(f"[{self.store_id}] Sélecteurs chargés: {len(self.selectors)} entrées")
        except Exception as e:
            logger.warning(f"[{self.store_id}] Erreur chargement sélecteurs: {e}")
            self.selectors = {}

    @abstractmethod
    def match(self, url: str) -> float:
        """Return a match score between the URL and this store.

        Args:
            url: URL to test.

        Returns:
            Score between 0.0 (no match) and 1.0 (perfect match).

        Example:
            - 'amazon.fr' in the URL  -> 0.9
            - 'amazon.com' in the URL -> 0.8
            - other domains           -> 0.0

        Rationale:
            - A score rather than a boolean handles ambiguities
              (e.g. subdomains).
            - The registry picks the store with the best score.
        """
        pass

    @abstractmethod
    def canonicalize(self, url: str) -> str:
        """Normalize the URL to its canonical form.

        Args:
            url: Raw URL (may contain query params, ref, etc.).

        Returns:
            Canonical URL (e.g. https://www.amazon.fr/dp/B08N5WRWNW).

        Rationale:
            - Avoids duplicates in the database.
            - Makes it easy to track the same product over time.
            - Strips tracking parameters.
        """
        pass

    @abstractmethod
    def extract_reference(self, url: str) -> Optional[str]:
        """Extract the product reference from the URL.

        Args:
            url: Product URL.

        Returns:
            Reference (ASIN for Amazon, SKU for others) or None.

        Example:
            - Amazon: https://amazon.fr/dp/B08N5WRWNW -> "B08N5WRWNW"
            - Cdiscount: https://cdiscount.com/.../f-123-sku.html -> "123-sku"
        """
        pass

    @abstractmethod
    def parse(self, html: str, url: str) -> ProductSnapshot:
        """Parse the HTML and return a ProductSnapshot.

        Args:
            html: HTML content of the product page.
            url: Canonical URL of the product.

        Returns:
            ProductSnapshot with all extracted data.

        Raises:
            Exception: If parsing fails completely.

        Rationale:
            - Uses self.selectors (loaded from YAML) to extract the data.
            - On partial failure, returns a snapshot with debug.status=partial.
            - On total failure, raises so the caller can fall back to Playwright.
        """
        pass

    def get_selector(
        self, key: str, default: str | list[str] | None = None
    ) -> str | list[str] | None:
        """Return a selector entry from self.selectors.

        The YAML may map a key to a single selector string or to a list
        of fallback selectors, hence the union annotation (the previous
        Optional[str] hint did not match how subclasses use it, e.g.
        get_selector("title", [])).

        Args:
            key: Selector key (e.g. 'title', 'price').
            default: Value returned when the key is missing.

        Returns:
            The selector(s) for *key*, or *default*.
        """
        return self.selectors.get(key, default)

    def __repr__(self) -> str:
        """Concise debug representation, e.g. <AmazonStore id=amazon>."""
        return f"<{self.__class__.__name__} id={self.store_id}>"
|
||||
0
pricewatch/app/stores/cdiscount/__init__.py
Executable file
0
pricewatch/app/stores/cdiscount/__init__.py
Executable file
BIN
pricewatch/app/stores/cdiscount/__pycache__/__init__.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/cdiscount/__pycache__/__init__.cpython-313.pyc
Executable file
Binary file not shown.
BIN
pricewatch/app/stores/cdiscount/__pycache__/store.cpython-313.pyc
Executable file
BIN
pricewatch/app/stores/cdiscount/__pycache__/store.cpython-313.pyc
Executable file
Binary file not shown.
115
pricewatch/app/stores/cdiscount/fixtures/README.md
Executable file
115
pricewatch/app/stores/cdiscount/fixtures/README.md
Executable file
@@ -0,0 +1,115 @@
|
||||
# Fixtures Cdiscount
|
||||
|
||||
Ce dossier contient des fichiers HTML réels capturés depuis Cdiscount.com pour les tests.
|
||||
|
||||
## ⚠️ Note importante sur Cdiscount
|
||||
|
||||
Cdiscount utilise une **protection anti-bot forte** (Cloudflare/Baleen):
|
||||
- HTTP simple retourne une page de protection JavaScript (~14 KB)
|
||||
- **Playwright est OBLIGATOIRE** pour récupérer le vrai contenu
|
||||
- Temps de chargement: ~2-3 secondes
|
||||
|
||||
## Fichiers
|
||||
|
||||
### cdiscount_tuf608umrv004_pw.html
|
||||
- **Produit**: PC Portable Gamer ASUS TUF Gaming A16
|
||||
- **SKU**: tuf608umrv004
|
||||
- **URL**: https://www.cdiscount.com/informatique/ordinateurs-pc-portables/pc-portable-gamer-asus-tuf-gaming-a16-sans-windo/f-10709-tuf608umrv004.html
|
||||
- **Taille**: ~310 KB
|
||||
- **Lignes**: 399
|
||||
- **Méthode**: Playwright (headless)
|
||||
- **Date capture**: 2026-01-13
|
||||
- **Usage**: Test complet parsing produit tech
|
||||
|
||||
## Différences avec Amazon
|
||||
|
||||
| Aspect | Amazon | Cdiscount |
|
||||
|--------|--------|-----------|
|
||||
| **Anti-bot** | Faible (HTTP OK) | ✗ Fort (Playwright requis) |
|
||||
| **Sélecteurs** | Stables (IDs) | Instables (classes générées) |
|
||||
| **Structure** | `#productTitle`, `.a-price` | `data-e2e="title"`, classes dynamiques |
|
||||
| **Prix** | 3 parties (whole+fraction+symbol) | Texte direct "1499,99 €" |
|
||||
| **Référence** | ASIN dans URL `/dp/{ASIN}` | SKU dans URL `/f-{cat}-{SKU}.html` |
|
||||
| **Catégorie** | Breadcrumb HTML | Dans l'URL path |
|
||||
| **Specs** | Tables HTML | Peut être dans onglets cachés |
|
||||
|
||||
## Sélecteurs identifiés
|
||||
|
||||
### Titre
|
||||
```css
|
||||
h1[data-e2e="title"]
|
||||
```
|
||||
Exemple: "PC Portable Gamer ASUS TUF Gaming A16 | Sans Windows - 16" WUXGA..."
|
||||
|
||||
### Prix
|
||||
Classes CSS instables. Utiliser **regex sur le texte**:
|
||||
```regex
|
||||
(\d+[,\.]\d+)\s*€
|
||||
```
|
||||
Exemple: "1199,99 €" → 1199.99
|
||||
|
||||
### Images
|
||||
```css
|
||||
img[alt]
|
||||
```
|
||||
Filtrer celles dont `alt` contient le titre du produit.
|
||||
Format URL: `https://www.cdiscount.com/pdt2/0/0/4/{num}/700x700/{sku}/rw/...`
|
||||
|
||||
### SKU
|
||||
Extraction depuis l'URL:
|
||||
```regex
|
||||
/f-\d+-([a-z0-9]+)\.html
|
||||
```
|
||||
Groupe 1 = SKU (ex: `tuf608umrv004`)
|
||||
|
||||
### Catégorie
|
||||
Extraction depuis l'URL path:
|
||||
```
|
||||
/informatique/ordinateurs-pc-portables/...
|
||||
^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^
|
||||
catégorie1 catégorie2
|
||||
```
|
||||
|
||||
## Utilisation dans les tests
|
||||
|
||||
```python
|
||||
@pytest.fixture
|
||||
def cdiscount_fixture_tuf608umrv004():
|
||||
fixture_path = Path(__file__).parent.parent.parent / \
|
||||
"pricewatch/app/stores/cdiscount/fixtures/cdiscount_tuf608umrv004_pw.html"
|
||||
with open(fixture_path, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
def test_parse_real_fixture(store, cdiscount_fixture_tuf608umrv004):
|
||||
url = "https://www.cdiscount.com/informatique/.../f-10709-tuf608umrv004.html"
|
||||
snapshot = store.parse(cdiscount_fixture_tuf608umrv004, url)
|
||||
|
||||
assert snapshot.title is not None
|
||||
assert "ASUS" in snapshot.title
|
||||
assert snapshot.price == 1199.99
|
||||
assert snapshot.reference == "tuf608umrv004"
|
||||
```
|
||||
|
||||
## Points d'attention pour les tests
|
||||
|
||||
1. **Ne pas tester les valeurs exactes** (prix, nombre d'avis) car elles changent
|
||||
2. **Tester le format** et la présence des données
|
||||
3. **Prévoir des fallbacks** pour chaque champ (sélecteurs instables)
|
||||
4. Les classes CSS peuvent changer à tout moment
|
||||
5. Utiliser `data-e2e` attributes quand disponibles (plus stables)
|
||||
6. Parser le prix par regex plutôt que par sélecteurs CSS
|
||||
|
||||
## Comment capturer une nouvelle fixture
|
||||
|
||||
```python
|
||||
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
||||
|
||||
url = "https://www.cdiscount.com/..."
|
||||
result = fetch_playwright(url, headless=True, timeout_ms=60000)
|
||||
|
||||
if result.success:
|
||||
with open("fixture.html", "w", encoding="utf-8") as f:
|
||||
f.write(result.html)
|
||||
```
|
||||
|
||||
⚠️ **N'utilisez JAMAIS** `fetch_http()` pour Cdiscount - cela ne fonctionnera pas!
|
||||
382
pricewatch/app/stores/cdiscount/fixtures/cdiscount_a128902_pw.html
Executable file
382
pricewatch/app/stores/cdiscount/fixtures/cdiscount_a128902_pw.html
Executable file
File diff suppressed because one or more lines are too long
400
pricewatch/app/stores/cdiscount/fixtures/cdiscount_tuf608umrv004_pw.html
Executable file
400
pricewatch/app/stores/cdiscount/fixtures/cdiscount_tuf608umrv004_pw.html
Executable file
File diff suppressed because one or more lines are too long
83
pricewatch/app/stores/cdiscount/selectors.yml
Executable file
83
pricewatch/app/stores/cdiscount/selectors.yml
Executable file
@@ -0,0 +1,83 @@
|
||||
# CSS/XPath selectors for Cdiscount
# Updated 2026-01-13 after analysing real page HTML

# ⚠️ IMPORTANT: Cdiscount uses strong anti-bot protection
# - Plain HTTP does NOT work (it returns a JavaScript challenge page)
# - Playwright is REQUIRED to fetch the real content
# - CSS classes are generated dynamically and may change at any time

# Product title
# Prefer data-e2e attributes: more stable than CSS classes
title:
  - "h1[data-e2e='title']"
  - "h1" # Fallback: first h1

# Main price
# CSS classes are unstable (sc-83lijy-0, kwssIa, etc.)
# Best approach: extract the number by regex from the text
# Pattern: (\d+[,\.]\d+)\s*€
price:
  - "div[data-e2e='price']" # New layout (2026)
  - "div[class*='SecondaryPrice-price']"
  - "div[class*='price']"
  - ".fpPrice"

# Comparison price (crossed-out price)
price_compare:
  - "div[class*='SecondaryPrice-wrapper']"

# Currency
# Always EUR for Cdiscount France
currency:
  - "meta[itemprop='priceCurrency']"
  # Fallback: static EUR

# Shipping cost
shipping_cost:
  - ".fpDeliveryInfo"
  - "div[class*='delivery']"

# Stock status
# Not found in the analysed HTML - may be rendered dynamically
stock_status:
  - "link[itemprop='availability']"
  - "div[class*='availability']"
  - ".fpAvailability"

# Product images
# Filter by alt attribute containing the product title
images:
  - "img[alt]" # All images with an alt attribute
  # URL format: https://www.cdiscount.com/pdt2/0/0/4/X/700x700/SKU/rw/...

# Category / breadcrumb
# Not found in the analysed HTML
# Can be derived from the URL path: /informatique/ordinateurs-pc-portables/...
category:
  - ".breadcrumb"
  - "nav[class*='breadcrumb']"

# Technical specifications
# Not found during analysis - may live in hidden tabs
specs_table:
  - "table[class*='characteristic']"
  - ".fpCharacteristics"
  - "div[class*='specs']"

# SKU / product reference
# Extracting from the URL is more reliable than the HTML
# URL pattern: /f-(\d+)-([a-z0-9]+)\.html
# Regex: /f-(\d+)-([a-z0-9]+)\.html
# SKU = group 2
sku:
  - "span[itemprop='sku']"
  - "meta[itemprop='productID']"

# Key notes:
# 1. ⚠️ Playwright REQUIRED - plain HTTP does not work
# 2. CSS classes are unstable - use data-e2e when available
# 3. Price: parse by regex (\d+[,\.]\d+)\s*€ rather than CSS
# 4. SKU: extract from URL /f-\d+-([a-z0-9]+)\.html
# 5. Category: derive from URL path /category1/category2/
# 6. Images: keep those whose alt contains the product title
# 7. Currency: always EUR for France (static fallback OK)
317
pricewatch/app/stores/cdiscount/store.py
Executable file
317
pricewatch/app/stores/cdiscount/store.py
Executable file
@@ -0,0 +1,317 @@
|
||||
"""
|
||||
Store Cdiscount - Parsing de produits Cdiscount.com.
|
||||
|
||||
Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from pricewatch.app.core.logging import get_logger
|
||||
from pricewatch.app.core.schema import (
|
||||
DebugInfo,
|
||||
DebugStatus,
|
||||
FetchMethod,
|
||||
ProductSnapshot,
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
|
||||
logger = get_logger("stores.cdiscount")
|
||||
|
||||
|
||||
class CdiscountStore(BaseStore):
    """Store for Cdiscount.com product pages.

    Extracts title, price, SKU, images, specs, etc. from product HTML
    using the CSS selectors declared in ``selectors.yml``.

    NOTE: per selectors.yml, Cdiscount pages must be fetched with
    Playwright — plain HTTP returns an anti-bot JavaScript challenge.
    """

    def __init__(self):
        """Initialise the Cdiscount store with its selector file."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="cdiscount", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """Return the confidence that *url* belongs to Cdiscount.

        Returns:
            0.9 for cdiscount.com URLs, 0.0 otherwise (including empty URLs).
        """
        if not url:
            return 0.0
        return 0.9 if "cdiscount.com" in url.lower() else 0.0

    def canonicalize(self, url: str) -> str:
        """Normalise a Cdiscount URL.

        Typical shape:
        ``https://www.cdiscount.com/category/product-name/f-{ID}-{SKU}.html``

        Keeps scheme, host and path; drops query parameters and fragment.
        """
        if not url:
            return url

        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    def extract_reference(self, url: str) -> Optional[str]:
        """Extract the product SKU from the URL.

        URL pattern: ``/f-{ID}-{SKU}.html``. Per the convention documented
        in selectors.yml (regex ``/f-(\\d+)-([a-z0-9]+)\\.html``, SKU = group 2),
        only the SKU part after the numeric ID is returned:
        ``/f-10709-tuf608umrv004.html`` → ``"tuf608umrv004"``.

        (The previous implementation also captured the numeric ID, which
        disagreed with selectors.yml and the fixture tests.)
        """
        if not url:
            return None

        # Preferred pattern: /f-{ID}-{SKU}.html — keep only the SKU part.
        match = re.search(r"/f-\d+-([\w-]+)\.html", url)
        if match:
            return match.group(1)

        # Fallback: anything after /f- (no trailing .html, ID not separable).
        match = re.search(r"/f-([\w-]+)", url)
        if match:
            return match.group(1)

        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """Parse Cdiscount product HTML into a :class:`ProductSnapshot`.

        Uses BeautifulSoup with the selectors from the YAML file. The
        snapshot status is downgraded to PARTIAL when the essential
        fields (title, price) are missing.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Overwritten by the caller with the real method
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Field extraction
        title = self._extract_title(soup, debug_info)
        price = self._extract_price(soup, debug_info)
        currency = self._extract_currency(soup, debug_info)
        stock_status = self._extract_stock(soup, debug_info)
        images = self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        # URL-based SKU extraction is more reliable than the HTML markup.
        reference = self.extract_reference(url) or self._extract_sku_from_html(soup)

        # Determine the final status: title and price are mandatory.
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            # NOTE(review): naive local time — confirm whether UTC is expected.
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency or "EUR",
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            images=images,
            specs=specs,
            debug=debug_info,
        )

        logger.info(
            f"[Cdiscount] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot

    def _selector_list(self, key: str) -> list[str]:
        """Return the selectors for *key* as a list (YAML may yield a bare string)."""
        selectors = self.get_selector(key, [])
        return [selectors] if isinstance(selectors, str) else selectors

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title, trying each selector in order."""
        for selector in self._selector_list("title"):
            element = soup.select_one(selector)
            if element:
                title = element.get_text(strip=True)
                if title:
                    return title

        debug.errors.append("Titre non trouvé")
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the main price as a float.

        Handles French formats: "299,99", "299.99", "1 199,99"
        (regular/no-break/narrow space thousands separators) and
        "1199€99" (euro sign used as decimal mark). The previous regex
        stopped at the first digit group and parsed "1 199,99" as 1.0.
        """
        for selector in self._selector_list("price"):
            for element in soup.select(selector):
                # schema.org "content" attribute first, else the visible text.
                price_text = element.get("content") or element.get_text(strip=True)
                price = self._parse_price_text(price_text)
                if price is not None:
                    return price

        debug.errors.append("Prix non trouvé")
        return None

    @staticmethod
    def _parse_price_text(price_text: str) -> Optional[float]:
        """Parse a price string into a float; return None when no number is found."""
        # Drop regular, no-break and narrow no-break spaces (thousands separators).
        cleaned = re.sub(r"[\s\u00a0\u202f]", "", price_text)

        # Decimal separator may be ',', '.' or '€' (e.g. "1199€99").
        match = re.search(r"(\d+)[.,€](\d{1,2})(?!\d)", cleaned)
        if match:
            return float(f"{match.group(1)}.{match.group(2)}")

        # Integer-only price (e.g. "299" or "299€").
        match = re.search(r"(\d+)", cleaned)
        if match:
            return float(match.group(1))

        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the currency code; defaults to EUR (Cdiscount France)."""
        for selector in self._selector_list("currency"):
            element = soup.select_one(selector)
            if element:
                # schema.org "content" attribute (e.g. <meta content="EUR">).
                currency = element.get("content")
                if currency:
                    return currency.upper()

        # Static fallback, as documented in selectors.yml.
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Extract the stock status from schema.org availability or visible text."""
        for selector in self._selector_list("stock_status"):
            element = soup.select_one(selector)
            if element:
                # schema.org href (e.g. ".../InStock") or the element's text.
                href = element.get("href", "").lower()
                text = element.get_text(strip=True).lower()
                combined = href + " " + text

                if "instock" in combined or "en stock" in combined:
                    return StockStatus.IN_STOCK
                if any(
                    marker in combined
                    for marker in ("outofstock", "rupture", "indisponible")
                ):
                    return StockStatus.OUT_OF_STOCK

        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Extract product image URLs, deduplicated in first-seen order."""
        images: list[str] = []
        for selector in self._selector_list("images"):
            for element in soup.select(selector):
                # src, lazy-loading data-src, or schema.org content attribute.
                src = (
                    element.get("src")
                    or element.get("data-src")
                    or element.get("content")
                )
                if src and ("http" in src or src.startswith("//")):
                    # Normalise protocol-relative URLs to https.
                    if src.startswith("//"):
                        src = f"https:{src}"
                    images.append(src)

        # dict.fromkeys deduplicates while keeping a deterministic order
        # (the previous list(set(...)) shuffled results between runs).
        return list(dict.fromkeys(images))

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the deepest category from the breadcrumb trail."""
        for selector in self._selector_list("category"):
            element = soup.select_one(selector)
            if element:
                # Prefer the last breadcrumb link.
                links = element.select("a")
                if links:
                    return links[-1].get_text(strip=True)

                # Fallback: split "Cat1 > Cat2 > Cat3" and keep the last part.
                text = element.get_text(strip=True)
                if text:
                    parts = [part.strip() for part in text.split(">")]
                    if parts:
                        return parts[-1]

        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract technical specifications as a {key: value} mapping."""
        specs: dict[str, str] = {}
        for selector in self._selector_list("specs_table"):
            container = soup.select_one(selector)
            if container:
                # Rows are often divs or list items; look for "Key: Value" lines.
                for line in container.get_text(separator="\n").split("\n"):
                    key, sep, value = line.partition(":")
                    if sep:
                        key = key.strip()
                        value = value.strip()
                        if key and value:
                            specs[key] = value

        return specs

    def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the SKU from HTML markup (fallback when the URL yields none)."""
        for selector in self._selector_list("sku"):
            element = soup.select_one(selector)
            if element:
                # schema.org "content" attribute, else the visible text.
                sku = element.get("content") or element.get_text(strip=True)
                if sku:
                    return sku

        return None
Reference in New Issue
Block a user