#!/usr/bin/env python3
"""Analyze the Cdiscount HTML to identify scraping selectors."""
import json

from bs4 import BeautifulSoup

# Read the HTML
with open("scraped/cdiscount_tuf608umrv004_pw.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "lxml")

print("=" * 80)
print("CDISCOUNT HTML ANALYSIS")
print("=" * 80)

# 1. Title
print("\n1. TITLE")
print("-" * 80)
h1 = soup.find("h1")
if h1:
    print(f"Tag: {h1.name}")
    print(f"Classes: {h1.get('class')}")
    print(f"data-e2e: {h1.get('data-e2e')}")
    print(f"Text: {h1.get_text().strip()[:100]}")

# 2. Price
print("\n2. PRICE")
print("-" * 80)
# Look for a schema.org Product in the JSON-LD scripts
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
for script in json_ld_scripts:
    try:
        data = json.loads(script.string)
    except (TypeError, json.JSONDecodeError):
        continue
    if isinstance(data, dict) and data.get("@type") == "Product":
        offers = data.get("offers") or {}
        if isinstance(offers, list):  # JSON-LD sometimes lists several offers
            offers = offers[0] if offers else {}
        print("schema.org Product found!")
        print(f"  name: {data.get('name')}")
        print(f"  price: {offers.get('price')}")
        print(f"  currency: {offers.get('priceCurrency')}")
        print(f"  availability: {offers.get('availability')}")
        break

# Look for price-related CSS classes
price_divs = soup.find_all("div", class_=lambda x: x and "price" in x.lower())
print(f"\nDivs with 'price' in their class: {len(price_divs)}")
for div in price_divs[:3]:
    print(f"  - {div.get('class')} : {div.get_text().strip()[:50]}")

# 3. Images
print("\n3. IMAGES")
print("-" * 80)
img_product = soup.find_all("img", alt=True)
print(f"Images with an alt attribute: {len(img_product)}")
for img in img_product[:5]:
    alt = img.get("alt", "")
    src = img.get("src", "")
    if "TUF" in alt or "ASUS" in alt:
        print(f"  - alt: {alt[:60]}")
        print(f"    src: {src[:80]}")

# 4. Stock
print("\n4. AVAILABILITY")
print("-" * 80)
availability_divs = soup.find_all(
    ["div", "span"],
    class_=lambda x: x and ("availability" in str(x).lower() or "stock" in str(x).lower()),
)
print(f"Elements with 'availability'/'stock' in their class: {len(availability_divs)}")
for elem in availability_divs[:5]:
    print(f"  - {elem.name} {elem.get('class')} : {elem.get_text().strip()[:60]}")

# 5. Category / Breadcrumb
print("\n5. CATEGORY / BREADCRUMB")
print("-" * 80)
breadcrumbs = soup.find_all(
    ["nav", "ol", "ul"],
    class_=lambda x: x and "breadcrumb" in str(x).lower(),
)
for bc in breadcrumbs[:2]:
    print(f"Tag: {bc.name}, Classes: {bc.get('class')}")
    links = bc.find_all("a")
    for link in links:
        print(f"  - {link.get_text().strip()}")

# 6. Technical specifications
print("\n6. TECHNICAL SPECIFICATIONS")
print("-" * 80)
specs_sections = soup.find_all(
    ["table", "dl", "div"],
    class_=lambda x: x and (
        "spec" in str(x).lower()
        or "characteristic" in str(x).lower()
        or "feature" in str(x).lower()
    ),
)
print(f"Spec sections: {len(specs_sections)}")
for section in specs_sections[:3]:
    print(f"  - {section.name} {section.get('class')}")
    text = section.get_text().strip()[:200]
    print(f"    {text}")

# 7. SKU / Product reference
print("\n7. SKU / PRODUCT REFERENCE")
print("-" * 80)
# Reuse the JSON-LD Product data for the SKU and MPN
for script in json_ld_scripts:
    try:
        data = json.loads(script.string)
    except (TypeError, json.JSONDecodeError):
        continue
    if isinstance(data, dict) and data.get("@type") == "Product":
        print(f"SKU: {data.get('sku')}")
        print(f"mpn: {data.get('mpn')}")
        break

# Look in the URL or the data attributes
print("\nIn the URL: f-10709-tuf608umrv004.html")
print("Possible SKU: tuf608umrv004")

print("\n" + "=" * 80)
print("END OF ANALYSIS")
print("=" * 80)
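
# --- Optional: derive the SKU from the URL ---
# A minimal sketch, not part of the analysis above. It assumes Cdiscount product
# URLs end with an "f-<category>-<sku>.html" segment, as suggested by the page
# name f-10709-tuf608umrv004.html; the regex and the example URL below are
# illustrative assumptions, not a confirmed Cdiscount URL contract.
import re


def sku_from_url(url):
    """Return the trailing SKU of a Cdiscount-style product URL, or None."""
    match = re.search(r"/f-\d+-([a-z0-9]+)\.html", url.lower())
    return match.group(1) if match else None


# Example with a hypothetical URL shape:
# sku_from_url("https://www.cdiscount.com/informatique/f-10709-tuf608umrv004.html")
# -> "tuf608umrv004"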