"""
Store Backmarket - Parsing de produits Backmarket.fr.

Supporte l'extraction de: titre, prix, SKU, images, condition (état), etc.
Spécificité: Backmarket vend du reconditionné, donc prix variable selon condition.
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from pricewatch.app.core.logging import get_logger
|
|
from pricewatch.app.core.schema import (
|
|
DebugInfo,
|
|
DebugStatus,
|
|
FetchMethod,
|
|
ProductSnapshot,
|
|
StockStatus,
|
|
)
|
|
from pricewatch.app.stores.base import BaseStore
|
|
from pricewatch.app.stores.price_parser import parse_price_text
|
|
|
|
# Module-level logger, namespaced under the project's stores hierarchy.
logger = get_logger("stores.backmarket")
|
class BackmarketStore(BaseStore):
    """Store for Backmarket.fr (refurbished products)."""

    def __init__(self):
        """Initialise the Backmarket store with its selector definitions."""
        # selectors.yml lives next to this module.
        super().__init__(
            store_id="backmarket",
            selectors_path=Path(__file__).parent / "selectors.yml",
        )
|
def match(self, url: str) -> float:
|
|
"""
|
|
Détecte si l'URL est Backmarket.
|
|
|
|
Returns:
|
|
0.9 pour backmarket.fr/backmarket.com
|
|
0.0 sinon
|
|
"""
|
|
if not url:
|
|
return 0.0
|
|
|
|
url_lower = url.lower()
|
|
|
|
if "backmarket.fr" in url_lower:
|
|
return 0.9
|
|
elif "backmarket.com" in url_lower:
|
|
return 0.8 # .com pour autres pays
|
|
|
|
return 0.0
|
|
|
|
def canonicalize(self, url: str) -> str:
|
|
"""
|
|
Normalise l'URL Backmarket.
|
|
|
|
Les URLs Backmarket ont généralement la forme:
|
|
https://www.backmarket.fr/fr-fr/p/{slug}
|
|
|
|
On garde l'URL complète sans query params.
|
|
"""
|
|
if not url:
|
|
return url
|
|
|
|
parsed = urlparse(url)
|
|
# Retirer query params et fragment
|
|
return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
|
|
|
|
def extract_reference(self, url: str) -> Optional[str]:
|
|
"""
|
|
Extrait le SKU (slug) depuis l'URL.
|
|
|
|
Format typique: /fr-fr/p/{slug}
|
|
Exemple: /fr-fr/p/iphone-15-pro → "iphone-15-pro"
|
|
"""
|
|
if not url:
|
|
return None
|
|
|
|
# Pattern: /p/{slug} (peut être /fr-fr/p/ ou /en-us/p/ etc.)
|
|
match = re.search(r"/p/([a-z0-9-]+)", url, re.IGNORECASE)
|
|
if match:
|
|
return match.group(1)
|
|
|
|
return None
|
|
|
|
    def parse(self, html: str, url: str) -> ProductSnapshot:
        """
        Parse Backmarket HTML into a ProductSnapshot.

        For each field the JSON-LD schema.org data is tried first, with
        BeautifulSoup + configured CSS selectors as fallback.
        """
        soup = BeautifulSoup(html, "lxml")

        debug_info = DebugInfo(
            method=FetchMethod.HTTP,  # Will be updated by the caller
            status=DebugStatus.SUCCESS,
            errors=[],
            notes=[],
        )

        # Prefer structured JSON-LD (schema.org Product) data
        json_ld_data = self._extract_json_ld(soup)

        # Field extraction: JSON-LD value first, selector fallback second.
        # NOTE(review): `or` falls back on any falsy value, so a JSON-LD
        # price of 0 would also trigger the selector fallback.
        title = json_ld_data.get("name") or self._extract_title(soup, debug_info)
        price = json_ld_data.get("price") or self._extract_price(soup, debug_info)
        currency = (
            json_ld_data.get("priceCurrency") or self._extract_currency(soup, debug_info) or "EUR"
        )
        stock_status = self._extract_stock(soup, debug_info)
        images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
        category = self._extract_category(soup, debug_info)
        specs = self._extract_specs(soup, debug_info)
        description = self._extract_description(soup, debug_info)
        msrp = self._extract_msrp(soup, debug_info)
        reference = self.extract_reference(url)

        # Backmarket-specific: condition (refurbishment grade) is surfaced
        # both in the specs dict and as a debug note.
        condition = self._extract_condition(soup, debug_info)
        if condition:
            specs["Condition"] = condition
            debug_info.notes.append(f"Produit reconditionné: {condition}")

        # Downgrade status when either essential field is missing.
        if not title or price is None:
            debug_info.status = DebugStatus.PARTIAL
            debug_info.notes.append("Parsing incomplet: titre ou prix manquant")

        snapshot = ProductSnapshot(
            source=self.store_id,
            url=self.canonicalize(url),
            fetched_at=datetime.now(),  # NOTE(review): naive local time — confirm UTC is not expected
            title=title,
            price=price,
            currency=currency,
            shipping_cost=None,
            stock_status=stock_status,
            reference=reference,
            category=category,
            description=description,
            images=images,
            specs=specs,
            msrp=msrp,
            debug=debug_info,
        )

        logger.info(
            f"[Backmarket] Parsing {'réussi' if snapshot.is_complete() else 'partiel'}: "
            f"title={bool(title)}, price={price is not None}"
        )

        return snapshot
|
def _extract_json_ld(self, soup: BeautifulSoup) -> dict:
|
|
"""
|
|
Extrait les données depuis JSON-LD schema.org.
|
|
|
|
Backmarket utilise schema.org Product, c'est la source la plus fiable.
|
|
"""
|
|
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
|
|
|
|
for script in json_ld_scripts:
|
|
try:
|
|
data = json.loads(script.string)
|
|
if isinstance(data, dict) and data.get("@type") == "Product":
|
|
result = {
|
|
"name": data.get("name"),
|
|
"priceCurrency": None,
|
|
"price": None,
|
|
"images": [],
|
|
}
|
|
|
|
# Prix depuis offers
|
|
offers = data.get("offers", {})
|
|
if isinstance(offers, dict):
|
|
result["price"] = offers.get("price")
|
|
result["priceCurrency"] = offers.get("priceCurrency")
|
|
|
|
# Convertir en float si c'est une string
|
|
if isinstance(result["price"], str):
|
|
try:
|
|
result["price"] = float(result["price"])
|
|
except ValueError:
|
|
result["price"] = None
|
|
|
|
# Images
|
|
image_data = data.get("image")
|
|
if isinstance(image_data, str):
|
|
result["images"] = [image_data]
|
|
elif isinstance(image_data, list):
|
|
result["images"] = image_data
|
|
|
|
return result
|
|
except (json.JSONDecodeError, AttributeError):
|
|
continue
|
|
|
|
return {}
|
|
|
|
def _extract_title(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""Extrait le titre du produit."""
|
|
selectors = self.get_selector("title", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
element = soup.select_one(selector)
|
|
if element:
|
|
title = element.get_text(strip=True)
|
|
if title:
|
|
return title
|
|
|
|
debug.errors.append("Titre non trouvé")
|
|
return None
|
|
|
|
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""Extrait la description (meta tags)."""
|
|
meta = soup.find("meta", property="og:description") or soup.find(
|
|
"meta", attrs={"name": "description"}
|
|
)
|
|
if meta:
|
|
description = meta.get("content", "").strip()
|
|
if description:
|
|
return description
|
|
return None
|
|
|
|
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
|
"""Extrait le prix."""
|
|
selectors = self.get_selector("price", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
elements = soup.select(selector)
|
|
for element in elements:
|
|
# Attribut content (schema.org) ou texte
|
|
price_text = element.get("content") or element.get_text(strip=True)
|
|
|
|
price = parse_price_text(price_text)
|
|
if price is not None:
|
|
return price
|
|
|
|
debug.errors.append("Prix non trouvé")
|
|
return None
|
|
|
|
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
|
"""Extrait le prix conseille."""
|
|
selectors = [
|
|
".price--old",
|
|
".price--striked",
|
|
".price__old",
|
|
"del",
|
|
]
|
|
for selector in selectors:
|
|
element = soup.select_one(selector)
|
|
if element:
|
|
price = parse_price_text(element.get_text(strip=True))
|
|
if price is not None:
|
|
return price
|
|
return None
|
|
|
|
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""Extrait la devise."""
|
|
selectors = self.get_selector("currency", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
element = soup.select_one(selector)
|
|
if element:
|
|
# Attribut content
|
|
currency = element.get("content")
|
|
if currency:
|
|
return currency.upper()
|
|
|
|
# Défaut EUR pour Backmarket France
|
|
return "EUR"
|
|
|
|
def _extract_stock(self, soup: BeautifulSoup, debug: DebugInfo) -> StockStatus:
|
|
"""Extrait le statut de stock."""
|
|
# Chercher le bouton "Ajouter au panier"
|
|
add_to_cart = soup.find("button", attrs={"data-test": "add-to-cart"})
|
|
if add_to_cart and not add_to_cart.get("disabled"):
|
|
return StockStatus.IN_STOCK
|
|
|
|
# Fallback: chercher textes indiquant la disponibilité
|
|
selectors = self.get_selector("stock_status", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
element = soup.select_one(selector)
|
|
if element:
|
|
text = element.get_text(strip=True).lower()
|
|
|
|
if "en stock" in text or "disponible" in text or "ajouter" in text:
|
|
return StockStatus.IN_STOCK
|
|
elif (
|
|
"rupture" in text
|
|
or "indisponible" in text
|
|
or "épuisé" in text
|
|
):
|
|
return StockStatus.OUT_OF_STOCK
|
|
|
|
return StockStatus.UNKNOWN
|
|
|
|
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
|
|
"""Extrait les URLs d'images."""
|
|
images = []
|
|
selectors = self.get_selector("images", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
elements = soup.select(selector)
|
|
for element in elements:
|
|
# src ou data-src
|
|
img_url = element.get("src") or element.get("data-src")
|
|
if img_url and img_url.startswith("http"):
|
|
# Éviter les doublons
|
|
if img_url not in images:
|
|
images.append(img_url)
|
|
|
|
return images
|
|
|
|
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""Extrait la catégorie depuis le breadcrumb."""
|
|
selectors = self.get_selector("category", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
elements = soup.select(selector)
|
|
if elements:
|
|
# Prendre le dernier élément du breadcrumb (catégorie la plus spécifique)
|
|
categories = [elem.get_text(strip=True) for elem in elements if elem.get_text(strip=True)]
|
|
if categories:
|
|
return categories[-1]
|
|
|
|
return None
|
|
|
|
def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
|
|
"""Extrait les caractéristiques techniques."""
|
|
specs = {}
|
|
|
|
# Chercher les dl (definition lists)
|
|
dls = soup.find_all("dl")
|
|
for dl in dls:
|
|
dts = dl.find_all("dt")
|
|
dds = dl.find_all("dd")
|
|
|
|
for dt, dd in zip(dts, dds):
|
|
key = dt.get_text(strip=True)
|
|
value = dd.get_text(strip=True)
|
|
if key and value:
|
|
specs[key] = value
|
|
|
|
return specs
|
|
|
|
def _extract_condition(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
|
"""
|
|
Extrait la condition/état du produit reconditionné.
|
|
|
|
Spécifique à Backmarket: Correct, Bon, Très bon, Excellent, etc.
|
|
"""
|
|
selectors = self.get_selector("condition", [])
|
|
if isinstance(selectors, str):
|
|
selectors = [selectors]
|
|
|
|
for selector in selectors:
|
|
elements = soup.select(selector)
|
|
for element in elements:
|
|
text = element.get_text(strip=True)
|
|
# Chercher les grades Backmarket
|
|
if any(grade in text for grade in ["Correct", "Bon", "Très bon", "Excellent", "Comme neuf"]):
|
|
return text
|
|
|
|
return None
|