Files
scrap/pricewatch/app/stores/backmarket/store.py
Gilles Soulier d0b73b9319 codex2
2026-01-14 21:54:55 +01:00

384 lines
13 KiB
Python
Executable File

"""
Store Backmarket - Parsing de produits Backmarket.fr.
Supporte l'extraction de: titre, prix, SKU, images, condition (état), etc.
Spécificité: Backmarket vend du reconditionné, donc prix variable selon condition.
"""
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.schema import (
DebugInfo,
DebugStatus,
FetchMethod,
ProductSnapshot,
StockStatus,
)
from pricewatch.app.stores.base import BaseStore
from pricewatch.app.stores.price_parser import parse_price_text
logger = get_logger("stores.backmarket")
class BackmarketStore(BaseStore):
    """Store for Backmarket.fr (refurbished products)."""

    def __init__(self):
        """Initialize the Backmarket store with its selectors file."""
        selectors_path = Path(__file__).parent / "selectors.yml"
        super().__init__(store_id="backmarket", selectors_path=selectors_path)

    def match(self, url: str) -> float:
        """
        Detect whether the URL belongs to Backmarket.

        Returns:
            0.9 for backmarket.fr, 0.8 for backmarket.com (other countries),
            0.0 otherwise.
        """
        if not url:
            return 0.0
        url_lower = url.lower()
        if "backmarket.fr" in url_lower:
            return 0.9
        elif "backmarket.com" in url_lower:
            return 0.8  # .com serves non-French countries
        return 0.0

    def canonicalize(self, url: str) -> str:
        """
        Normalize a Backmarket URL.

        Backmarket product URLs generally look like:
            https://www.backmarket.fr/fr-fr/p/{slug}
        The canonical form keeps scheme, host and path, and strips the
        query string and fragment.
        """
        if not url:
            return url
        parsed = urlparse(url)
        # Drop query params and fragment
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    def extract_reference(self, url: str) -> Optional[str]:
        """
        Extract the SKU (slug) from the URL.

        Typical format: /fr-fr/p/{slug}
        Example: /fr-fr/p/iphone-15-pro -> "iphone-15-pro"
        """
        if not url:
            return None
        # Pattern: /p/{slug} (the locale prefix varies: /fr-fr/p/, /en-us/p/, ...)
        match = re.search(r"/p/([a-z0-9-]+)", url, re.IGNORECASE)
        if match:
            return match.group(1)
        return None

    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse Backmarket HTML into a ProductSnapshot.

        JSON-LD schema.org data is used first (most reliable), then
        BeautifulSoup with the configured CSS selectors as a fallback.
        """
        soup = BeautifulSoup(html, "lxml")
        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Updated by the caller
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )
        # Preferred extraction from JSON-LD
        json_ld_data = self._extract_json_ld(soup)
        # Field extraction
        title = json_ld_data.get("name") or self._extract_title(soup, debug_info)
        # Explicit None check: a 0.0 price is falsy, so `or` would silently
        # discard a value that JSON-LD actually provided.
        price = json_ld_data.get("price")
        if price is None:
            price = self._extract_price(soup, debug_info)
        currency = (
            json_ld_data.get("priceCurrency") or self._extract_currency(soup, debug_info) or "EUR"
        )
        stock_status = self._extract_stock(soup, debug_info)
        images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        description = self._extract_description(soup, debug_info)
        msrp = self._extract_msrp(soup, debug_info)
        reference = self.extract_reference(url)
        # Backmarket specific: condition (grade of the refurbished unit)
        condition = self._extract_condition(soup, debug_info)
        if condition:
            specs["Condition"] = condition
            debug_info.notes.append(f"Produit reconditionné: {condition}")
        # Determine the final status
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")
        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            description=description,
            images=images,
            specs=specs,
            msrp=msrp,
            debug=debug_info,
        )
        logger.info(
            f"[Backmarket] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )
        return snapshot

    def _extract_json_ld(self, soup: BeautifulSoup) -> dict:
        """
        Extract product data from JSON-LD schema.org blocks.

        Backmarket exposes a schema.org Product node; it is the most reliable
        source. Handles a top-level dict, a top-level list, and an ``@graph``
        wrapper; ``@type`` given as a string or a list; ``offers`` given as a
        dict (Offer/AggregateOffer) or a list of offers.
        """
        json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
        for script in json_ld_scripts:
            raw = script.string
            if not raw:
                # Empty <script> tags yield None; json.loads(None) raises
                # TypeError, which the except below would not catch.
                continue
            try:
                data = json.loads(raw)
            except (json.JSONDecodeError, AttributeError):
                continue
            # Normalize the payload to a flat list of candidate nodes.
            if isinstance(data, dict) and isinstance(data.get("@graph"), list):
                candidates = data["@graph"]
            elif isinstance(data, list):
                candidates = data
            else:
                candidates = [data]
            for node in candidates:
                if not isinstance(node, dict):
                    continue
                node_type = node.get("@type")
                types = node_type if isinstance(node_type, list) else [node_type]
                if "Product" not in types:
                    continue
                result = {
                    "name": node.get("name"),
                    "priceCurrency": None,
                    "price": None,
                    "images": [],
                }
                # Price from offers (dict, or list of offers per grade)
                offers = node.get("offers", {})
                if isinstance(offers, list):
                    offers = next((o for o in offers if isinstance(o, dict)), {})
                if isinstance(offers, dict):
                    # AggregateOffer exposes lowPrice instead of price.
                    result["price"] = offers.get("price", offers.get("lowPrice"))
                    result["priceCurrency"] = offers.get("priceCurrency")
                    # Convert to float if the value is a string
                    if isinstance(result["price"], str):
                        try:
                            result["price"] = float(result["price"])
                        except ValueError:
                            result["price"] = None
                # Images: schema.org allows a single URL or a list
                image_data = node.get("image")
                if isinstance(image_data, str):
                    result["images"] = [image_data]
                elif isinstance(image_data, list):
                    result["images"] = image_data
                return result
        return {}

    def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the product title via the configured selectors."""
        selectors = self.get_selector("title", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                title = element.get_text(strip=True)
                if title:
                    return title
        debug.errors.append("Titre non trouvé")
        return None

    def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the description from meta tags (og:description, then name=description)."""
        meta = soup.find("meta", property="og:description") or soup.find(
            "meta", attrs={"name": "description"}
        )
        if meta:
            description = meta.get("content", "").strip()
            if description:
                return description
        return None

    def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the price via the configured selectors."""
        selectors = self.get_selector("price", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # `content` attribute (schema.org microdata) or visible text
                price_text = element.get("content") or element.get_text(strip=True)
                price = parse_price_text(price_text)
                if price is not None:
                    return price
        debug.errors.append("Prix non trouvé")
        return None

    def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
        """Extract the list price (MSRP), usually shown struck-through."""
        selectors = [
            ".price--old",
            ".price--striked",
            ".price__old",
            "del",
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                price = parse_price_text(element.get_text(strip=True))
                if price is not None:
                    return price
        return None

    def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the currency code; falls back to EUR (Backmarket France)."""
        selectors = self.get_selector("currency", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                # `content` attribute (schema.org microdata)
                currency = element.get("content")
                if currency:
                    return currency.upper()
        # Default to EUR for Backmarket France
        return "EUR"

    def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
        """Extract the stock status."""
        # Look for the "Add to cart" button. Use has_attr: a boolean HTML
        # attribute like `disabled` parses as "" in bs4, which is falsy, so
        # `get("disabled")` would wrongly report a disabled button as in stock.
        add_to_cart = soup.find("button", attrs={"data-test": "add-to-cart"})
        if add_to_cart and not add_to_cart.has_attr("disabled"):
            return StockStatus.IN_STOCK
        # Fallback: look for availability wording
        selectors = self.get_selector("stock_status", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                text = element.get_text(strip=True).lower()
                if "en stock" in text or "disponible" in text or "ajouter" in text:
                    return StockStatus.IN_STOCK
                elif (
                    "rupture" in text
                    or "indisponible" in text
                    or "épuisé" in text
                ):
                    return StockStatus.OUT_OF_STOCK
        return StockStatus.UNKNOWN

    def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
        """Extract image URLs (absolute http(s) only, de-duplicated, order kept)."""
        images = []
        selectors = self.get_selector("images", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # src or data-src (lazy-loaded images)
                img_url = element.get("src") or element.get("data-src")
                if img_url and img_url.startswith("http"):
                    # Avoid duplicates
                    if img_url not in images:
                        images.append(img_url)
        return images

    def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """Extract the category from the breadcrumb."""
        selectors = self.get_selector("category", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                # Keep the last breadcrumb entry (most specific category)
                categories = [elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)]
                if categories:
                    return categories[-1]
        return None

    def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
        """Extract technical specifications from <dl> definition lists."""
        specs = {}
        dls = soup.find_all("dl")
        for dl in dls:
            dts = dl.find_all("dt")
            dds = dl.find_all("dd")
            # Pair each <dt> term with its <dd> value
            for dt, dd in zip(dts, dds):
                key = dt.get_text(strip=True)
                value = dd.get_text(strip=True)
                if key and value:
                    specs[key] = value
        return specs

    def _extract_condition(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
        """
        Extract the condition/grade of the refurbished product.

        Backmarket specific grades: Correct, Bon, Très bon, Excellent, etc.
        """
        selectors = self.get_selector("condition", [])
        if isinstance(selectors, str):
            selectors = [selectors]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text(strip=True)
                # Look for Backmarket grade keywords
                if any(grade in text for grade in ["Correct", "Bon", "Très bon", "Excellent", "Comme neuf"]):
                    return text
        return None