#!/usr/bin/env python3
"""Analyze the Backmarket HTML dump to identify usable selectors."""

import json
import re

from bs4 import BeautifulSoup

# Read the HTML
with open("scraped/backmarket_pw.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "lxml")

print("=" * 80)
print("BACKMARKET.FR HTML ANALYSIS")
print("=" * 80)

# 1. Title
print("\n1. TITLE")
print("-" * 80)
h1_tags = soup.find_all("h1")
print(f"Number of h1 tags: {len(h1_tags)}")
for i, h1 in enumerate(h1_tags[:3]):
    print(f"  [{i+1}] Classes: {h1.get('class')}")
    print(f"      Text: {h1.get_text().strip()[:100]}")

# 2. Price
print("\n2. PRICE")
print("-" * 80)
# Look in JSON-LD first
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
print(f"JSON-LD scripts found: {len(json_ld_scripts)}")
for i, script in enumerate(json_ld_scripts[:3]):
    try:
        data = json.loads(script.string)
        if isinstance(data, dict):
            print(f"\n  Script [{i+1}] @type: {data.get('@type')}")
            if data.get("@type") == "Product":
                print(f"    name: {data.get('name')}")
                offers = data.get("offers", {})
                if isinstance(offers, dict):
                    print(f"    price: {offers.get('price')}")
                    print(f"    priceCurrency: {offers.get('priceCurrency')}")
    except (TypeError, ValueError) as e:  # empty <script> tag or invalid JSON
        print(f"  Script [{i+1}] JSON parsing error: {e}")

# Look for divs/spans carrying a price-related class
price_elements = soup.find_all(
    ["div", "span", "p"], class_=lambda x: x and "price" in str(x).lower()
)
print(f"\nElements with 'price' in their class: {len(price_elements)}")
for i, elem in enumerate(price_elements[:5]):
    text = elem.get_text().strip()[:80]
    print(f"  [{i+1}] {elem.name} {elem.get('class')} → {text}")

# Price regex over the raw HTML
matches = re.findall(r"(\d{2,4})[,\s]?(\d{2})\s*€", html)
print(f"\nPrices found by regex: {len(matches)} matches")
unique_prices = list({f"{m[0]},{m[1]} €" for m in matches[:10]})
for price in unique_prices[:5]:
    print(f"  - {price}")

# 3. Images
print("\n3. IMAGES")
print("-" * 80)
img_product = soup.find_all("img", alt=True)
print(f"Images with an alt attribute: {len(img_product)}")
for i, img in enumerate(img_product[:5]):
    alt = img.get("alt", "")
    src = img.get("src", "")
    if "iphone" in alt.lower() or "apple" in alt.lower():
        print(f"  [{i+1}] alt: {alt[:60]}")
        print(f"      src: {src[:80]}")

# 4. Condition / grade
print("\n4. CONDITION / GRADE")
print("-" * 80)
condition_elements = soup.find_all(
    ["div", "span", "button"],
    class_=lambda x: x
    and (
        "condition" in str(x).lower()
        or "grade" in str(x).lower()
        or "état" in str(x).lower()
    ),
)
print(f"Elements with condition/grade/état in their class: {len(condition_elements)}")
for i, elem in enumerate(condition_elements[:5]):
    text = elem.get_text().strip()[:80]
    print(f"  [{i+1}] {elem.name} {elem.get('class')} → {text}")

# 5. SKU / product reference
print("\n5. SKU / PRODUCT REFERENCE")
print("-" * 80)
# From the URL
print("In the URL: /fr-fr/p/iphone-15-pro")
print("Possible SKU: iphone-15-pro")
# From JSON-LD
for script in json_ld_scripts:
    try:
        data = json.loads(script.string)
        if isinstance(data, dict) and data.get("@type") == "Product":
            print("\nIn JSON-LD:")
            print(f"  sku: {data.get('sku')}")
            print(f"  mpn: {data.get('mpn')}")
            print(f"  productID: {data.get('productID')}")
    except (TypeError, ValueError):
        pass

# 6. Breadcrumb / category
print("\n6. CATEGORY / BREADCRUMB")
print("-" * 80)
breadcrumbs = soup.find_all(
    ["nav", "ol", "ul"], class_=lambda x: x and "breadcrumb" in str(x).lower()
)
for bc in breadcrumbs[:2]:
    print(f"Tag: {bc.name}, Classes: {bc.get('class')}")
    links = bc.find_all("a")
    for link in links[:5]:
        print(f"  - {link.get_text().strip()}")

# 7. Specifications
print("\n7. TECHNICAL SPECIFICATIONS")
print("-" * 80)
specs_sections = soup.find_all(
    ["div", "dl", "table"],
    class_=lambda x: x
    and (
        "spec" in str(x).lower()
        or "characteristic" in str(x).lower()
        or "feature" in str(x).lower()
    ),
)
print(f"Spec sections: {len(specs_sections)}")
for i, section in enumerate(specs_sections[:3]):
    print(f"  [{i+1}] {section.name} {section.get('class')}")
    text = section.get_text().strip()[:150]
    print(f"      {text}")

# 8. Meta tags
print("\n8. META TAGS")
print("-" * 80)
og_title = soup.find("meta", property="og:title")
og_price = soup.find("meta", property="og:price:amount")
if og_title:
    print(f"og:title: {og_title.get('content')}")
if og_price:
    print(f"og:price:amount: {og_price.get('content')}")

print("\n" + "=" * 80)
print("END OF ANALYSIS")
print("=" * 80)