#!/usr/bin/env python3
"""Analyse du HTML Backmarket pour identifier les sélecteurs."""
import json
import re

from bs4 import BeautifulSoup
# Read the saved HTML capture
with open("scraped/backmarket_pw.html", "r", encoding="utf-8") as f:
html = f.read()
soup = BeautifulSoup(html, "lxml")
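# NOTE: the "lxml" parser is assumed to be installed; BeautifulSoup's built-in
# "html.parser" backend is a drop-in fallback if it is not.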
print("=" * 80)
print("ANALYSE HTML BACKMARKET.FR")
print("=" * 80)
# 1. Title
print("\n1. TITLE")
print("-" * 80)
h1_tags = soup.find_all("h1")
print(f"Nombre de h1: {len(h1_tags)}")
for i, h1 in enumerate(h1_tags[:3]):
print(f" [{i+1}] Classes: {h1.get('class')}")
print(f" Texte: {h1.get_text().strip()[:100]}")
# 2. Price
print("\n2. PRICE")
print("-" * 80)
# Look in the JSON-LD structured data first
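# JSON-LD (schema.org markup in <script type="application/ld+json"> tags) is
# usually the most stable source for name/price, since it does not depend on
# CSS class names that may be minified or change between deploys.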
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
print(f"Scripts JSON-LD trouvés: {len(json_ld_scripts)}")
for i, script in enumerate(json_ld_scripts[:3]):
try:
data = json.loads(script.string)
if isinstance(data, dict):
print(f"\n Script [{i+1}] @type: {data.get('@type')}")
if data.get("@type") == "Product":
print(f" name: {data.get('name')}")
offers = data.get('offers', {})
if isinstance(offers, dict):
print(f" price: {offers.get('price')}")
print(f" priceCurrency: {offers.get('priceCurrency')}")
except Exception as e:
print(f" Script [{i+1}] Erreur parsing JSON: {e}")
# Look for divs/spans whose class name contains "price"
price_elements = soup.find_all(["div", "span", "p"], class_=lambda x: x and ("price" in str(x).lower()))
print(f"\nElements with 'price' in their class: {len(price_elements)}")
for i, elem in enumerate(price_elements[:5]):
    text = elem.get_text().strip()[:80]
    print(f"  [{i+1}] {elem.name} {elem.get('class')} → {text}")
# Price regex fallback (looks for patterns like "249,00 €")
matches = re.findall(r'(\d{2,4})[,\s]?(\d{2})\s*€', html)
print(f"\nPrices found by regex: {len(matches)} matches")
# Deduplicate while keeping order of appearance
unique_prices = list(dict.fromkeys(f"{m[0]},{m[1]} €" for m in matches))
for price in unique_prices[:5]:
    print(f"  - {price}")
# 3. Images
print("\n3. IMAGES")
print("-" * 80)
img_product = soup.find_all("img", alt=True)
print(f"Images with an alt attribute: {len(img_product)}")
# Keep only images whose alt text looks like the product (iPhone / Apple)
product_imgs = [img for img in img_product
                if "iphone" in img.get("alt", "").lower() or "apple" in img.get("alt", "").lower()]
for i, img in enumerate(product_imgs[:5]):
    print(f"  [{i+1}] alt: {img.get('alt', '')[:60]}")
    print(f"        src: {img.get('src', '')[:80]}")
# 4. Condition / grade
print("\n4. CONDITION / GRADE")
print("-" * 80)
condition_elements = soup.find_all(["div", "span", "button"], class_=lambda x: x and ("condition" in str(x).lower() or "grade" in str(x).lower() or "état" in str(x).lower()))
print(f"Éléments avec condition/grade/état: {len(condition_elements)}")
for i, elem in enumerate(condition_elements[:5]):
text = elem.get_text().strip()[:80]
print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")
# 5. SKU / product reference
print("\n5. SKU / PRODUCT REFERENCE")
print("-" * 80)
# Look in the URL (hard-coded from this capture)
print("In the URL: /fr-fr/p/iphone-15-pro")
print("Possible SKU: iphone-15-pro")
# Look in the JSON-LD structured data
for script in json_ld_scripts:
    try:
        data = json.loads(script.string)
        if isinstance(data, dict) and data.get("@type") == "Product":
            print("\nIn JSON-LD:")
            print(f"  sku: {data.get('sku')}")
            print(f"  mpn: {data.get('mpn')}")
            print(f"  productID: {data.get('productID')}")
    except (TypeError, ValueError):
        # None or malformed JSON-LD payload: skip it
        continue
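# NOTE (assumption): schema.org Product nodes sometimes also expose "gtin13"
# or "gtin"; worth printing those too if sku/mpn come back as None.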
# 6. Breadcrumb / category
print("\n6. CATEGORY / BREADCRUMB")
print("-" * 80)
breadcrumbs = soup.find_all(["nav", "ol", "ul"], class_=lambda x: x and "breadcrumb" in str(x).lower())
for bc in breadcrumbs[:2]:
print(f"Tag: {bc.name}, Classes: {bc.get('class')}")
links = bc.find_all("a")
for link in links[:5]:
print(f" - {link.get_text().strip()}")
# 7. Specifications
print("\n7. TECHNICAL SPECIFICATIONS")
print("-" * 80)
specs_sections = soup.find_all(["div", "dl", "table"], class_=lambda x: x and ("spec" in str(x).lower() or "characteristic" in str(x).lower() or "feature" in str(x).lower()))
print(f"Sections de specs: {len(specs_sections)}")
for i, section in enumerate(specs_sections[:3]):
print(f" [{i+1}] {section.name} {section.get('class')}")
text = section.get_text().strip()[:150]
print(f" {text}")
# 8. Meta tags
print("\n8. META TAGS")
print("-" * 80)
og_title = soup.find("meta", property="og:title")
og_price = soup.find("meta", property="og:price:amount")
if og_title:
    print(f"og:title: {og_title.get('content')}")
if og_price:
    print(f"og:price:amount: {og_price.get('content')}")
print("\n" + "=" * 80)
print("FIN DE L'ANALYSE")
print("=" * 80)