#!/usr/bin/env python3
"""Analyse Backmarket product-page HTML to identify scraping selectors.

Reads a previously captured page (scraped/backmarket_pw.html) and prints a
section-by-section report — title, price (JSON-LD, class-based, regex),
images, condition, SKU, breadcrumb, technical specs and OpenGraph meta —
so that reliable CSS selectors can be chosen for the real scraper.
"""

import json
import re

from bs4 import BeautifulSoup

# Page captured beforehand (e.g. via Playwright); this script never hits the network.
HTML_PATH = "scraped/backmarket_pw.html"


def _parse_json_ld(script):
    """Parse one JSON-LD <script> tag.

    Returns the decoded object, or raises (json.JSONDecodeError, TypeError) —
    TypeError covers ``script.string`` being None on empty/commented tags.
    """
    return json.loads(script.string)


def _analyse_title(soup):
    """Section 1: locate <h1> candidates for the product title."""
    print("\n1. TITRE")
    print("-" * 80)
    h1_tags = soup.find_all("h1")
    print(f"Nombre de h1: {len(h1_tags)}")
    for i, h1 in enumerate(h1_tags[:3]):
        print(f" [{i+1}] Classes: {h1.get('class')}")
        print(f" Texte: {h1.get_text().strip()[:100]}")


def _analyse_price(soup, html, json_ld_scripts):
    """Section 2: find the price via JSON-LD, class names, then a raw regex."""
    print("\n2. PRIX")
    print("-" * 80)
    # JSON-LD "Product" blocks are the most reliable structured price source.
    print(f"Scripts JSON-LD trouvés: {len(json_ld_scripts)}")
    for i, script in enumerate(json_ld_scripts[:3]):
        try:
            data = _parse_json_ld(script)
        except (json.JSONDecodeError, TypeError) as e:
            print(f" Script [{i+1}] Erreur parsing JSON: {e}")
            continue
        if not isinstance(data, dict):
            continue
        print(f"\n Script [{i+1}] @type: {data.get('@type')}")
        if data.get("@type") == "Product":
            print(f" name: {data.get('name')}")
            offers = data.get("offers", {})
            # "offers" may also be a list per schema.org; only the dict form
            # carries a single price we can print directly.
            if isinstance(offers, dict):
                print(f" price: {offers.get('price')}")
                print(f" priceCurrency: {offers.get('priceCurrency')}")

    # Fallback 1: any element whose class mentions "price".
    price_elements = soup.find_all(
        ["div", "span", "p"], class_=lambda x: x and ("price" in str(x).lower())
    )
    print(f"\nÉléments avec 'price' dans la classe: {len(price_elements)}")
    for i, elem in enumerate(price_elements[:5]):
        text = elem.get_text().strip()[:80]
        print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")

    # Fallback 2: raw regex over the page for "123,45 €"-style amounts.
    matches = re.findall(r'(\d{2,4})[,\s]?(\d{2})\s*€', html)
    print(f"\nPrix trouvés par regex: {len(matches)} matches")
    # dict.fromkeys dedupes while keeping first-seen order (a plain set()
    # would make the printed order non-deterministic between runs).
    unique_prices = list(dict.fromkeys(f"{m[0]},{m[1]} €" for m in matches[:10]))
    for price in unique_prices[:5]:
        print(f" - {price}")


def _analyse_images(soup):
    """Section 3: list product images (alt mentioning iphone/apple)."""
    print("\n3. IMAGES")
    print("-" * 80)
    img_product = soup.find_all("img", alt=True)
    print(f"Images avec alt: {len(img_product)}")
    for i, img in enumerate(img_product[:5]):
        alt = img.get("alt", "")
        src = img.get("src", "")
        # Only product shots (alt mentions the phone brand), not UI icons.
        if "iphone" in alt.lower() or "apple" in alt.lower():
            print(f" [{i+1}] alt: {alt[:60]}")
            print(f" src: {src[:80]}")


def _analyse_condition(soup):
    """Section 4: elements whose class suggests a condition/grade selector."""
    print("\n4. ÉTAT / CONDITION")
    print("-" * 80)
    condition_elements = soup.find_all(
        ["div", "span", "button"],
        class_=lambda x: x
        and (
            "condition" in str(x).lower()
            or "grade" in str(x).lower()
            or "état" in str(x).lower()
        ),
    )
    print(f"Éléments avec condition/grade/état: {len(condition_elements)}")
    for i, elem in enumerate(condition_elements[:5]):
        text = elem.get_text().strip()[:80]
        print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")


def _analyse_sku(json_ld_scripts):
    """Section 5: product reference from the URL slug and JSON-LD fields."""
    print("\n5. SKU / RÉFÉRENCE PRODUIT")
    print("-" * 80)
    # The URL slug is the cheapest stable identifier.
    print("Dans l'URL: /fr-fr/p/iphone-15-pro")
    print("Possible SKU: iphone-15-pro")

    for script in json_ld_scripts:
        try:
            data = _parse_json_ld(script)
        except (json.JSONDecodeError, TypeError):
            # Non-JSON or empty script tags are expected noise here.
            continue
        if isinstance(data, dict) and data.get("@type") == "Product":
            print("\nDans JSON-LD:")
            print(f" sku: {data.get('sku')}")
            print(f" mpn: {data.get('mpn')}")
            print(f" productID: {data.get('productID')}")


def _analyse_breadcrumb(soup):
    """Section 6: breadcrumb containers and their category links."""
    print("\n6. CATÉGORIE / BREADCRUMB")
    print("-" * 80)
    breadcrumbs = soup.find_all(
        ["nav", "ol", "ul"], class_=lambda x: x and "breadcrumb" in str(x).lower()
    )
    for bc in breadcrumbs[:2]:
        print(f"Tag: {bc.name}, Classes: {bc.get('class')}")
        links = bc.find_all("a")
        for link in links[:5]:
            print(f" - {link.get_text().strip()}")


def _analyse_specs(soup):
    """Section 7: containers whose class suggests technical specifications."""
    print("\n7. CARACTÉRISTIQUES TECHNIQUES")
    print("-" * 80)
    specs_sections = soup.find_all(
        ["div", "dl", "table"],
        class_=lambda x: x
        and (
            "spec" in str(x).lower()
            or "characteristic" in str(x).lower()
            or "feature" in str(x).lower()
        ),
    )
    print(f"Sections de specs: {len(specs_sections)}")
    for i, section in enumerate(specs_sections[:3]):
        print(f" [{i+1}] {section.name} {section.get('class')}")
        text = section.get_text().strip()[:150]
        print(f" {text}")


def _analyse_meta(soup):
    """Section 8: OpenGraph meta tags (title, price)."""
    print("\n8. META TAGS")
    print("-" * 80)
    og_title = soup.find("meta", property="og:title")
    og_price = soup.find("meta", property="og:price:amount")
    if og_title:
        print(f"og:title: {og_title.get('content')}")
    if og_price:
        print(f"og:price:amount: {og_price.get('content')}")


def main():
    """Run the full selector-discovery report over the captured page."""
    with open(HTML_PATH, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, "lxml")

    print("=" * 80)
    print("ANALYSE HTML BACKMARKET.FR")
    print("=" * 80)

    # Computed once, shared by the price (section 2) and SKU (section 5) passes.
    json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})

    _analyse_title(soup)
    _analyse_price(soup, html, json_ld_scripts)
    _analyse_images(soup)
    _analyse_condition(soup)
    _analyse_sku(json_ld_scripts)
    _analyse_breadcrumb(soup)
    _analyse_specs(soup)
    _analyse_meta(soup)

    print("\n" + "=" * 80)
    print("FIN DE L'ANALYSE")
    print("=" * 80)


if __name__ == "__main__":
    main()
|