chore: sync project files
This commit is contained in:
111
analyze_cdiscount.py
Executable file
111
analyze_cdiscount.py
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyse du HTML Cdiscount pour identifier les sélecteurs."""
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
|
||||
# Load the scraped Cdiscount HTML file and parse it with the lxml backend.
with open("scraped/cdiscount_tuf608umrv004_pw.html", encoding="utf-8") as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, "lxml")

# Report banner: a title framed by two 80-character rules.
banner_rule = "=" * 80
print(banner_rule, "ANALYSE HTML CDISCOUNT", banner_rule, sep="\n")
|
||||
|
||||
# 1. Title: describe the first <h1> of the page, if there is one.
print("\n1. TITRE")
print("-" * 80)
title_tag = soup.find("h1")
if title_tag is not None:
    # Only the first 100 characters of the heading text are shown.
    title_text = title_tag.get_text().strip()
    title_report = (
        ("Tag", title_tag.name),
        ("Classes", title_tag.get("class")),
        ("data-e2e", title_tag.get("data-e2e")),
        ("Texte", title_text[:100]),
    )
    for label, value in title_report:
        print(f"{label}: {value}")
|
||||
|
||||
# 2. Price
print("\n2. PRIX")
print("-" * 80)
# Look for a schema.org Product description in the JSON-LD <script> tags and
# print the fields of the first one found.
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
for script in json_ld_scripts:
    # .string is None for an empty tag or a tag with several children.
    raw_json = script.string
    if not raw_json:
        continue
    try:
        data = json.loads(raw_json)
    except ValueError:
        # Malformed JSON-LD (json.JSONDecodeError is a ValueError): skip this
        # tag only, instead of hiding every possible error.
        continue
    # A JSON-LD document may be a single object or an array of objects.
    candidates = data if isinstance(data, list) else [data]
    product = next(
        (
            item
            for item in candidates
            if isinstance(item, dict) and item.get("@type") == "Product"
        ),
        None,
    )
    if product is None:
        continue
    # "offers" may be a single object or a list of them; use the first object
    # and fall back to an empty mapping so the lookups below print None.
    offers = product.get("offers")
    if isinstance(offers, list):
        offers = next((offer for offer in offers if isinstance(offer, dict)), None)
    if not isinstance(offers, dict):
        offers = {}
    print("Schema.org Product trouvé!")
    print(f" name: {product.get('name')}")
    print(f" price: {offers.get('price')}")
    print(f" currency: {offers.get('priceCurrency')}")
    print(f" availability: {offers.get('availability')}")
    break
|
||||
|
||||
# Look for <div> elements whose CSS class mentions "price".
def _class_mentions_price(css_class):
    """Tell whether a class value contains "price" (case-insensitive)."""
    return css_class and "price" in css_class.lower()


price_divs = soup.find_all("div", class_=_class_mentions_price)
print(f"\nDivs avec 'price' dans la classe: {len(price_divs)}")
# Show the first three matches with a 50-character text preview.
for price_div in price_divs[:3]:
    div_classes = price_div.get("class")
    div_preview = price_div.get_text().strip()[:50]
    print(f" - {div_classes} : {div_preview}")
|
||||
|
||||
# 3. Images: count the <img> tags carrying an alt attribute, then inspect
# the first five and print those whose alt text mentions TUF or ASUS.
print("\n3. IMAGES")
print("-" * 80)
img_product = soup.find_all("img", alt=True)
print(f"Images avec alt: {len(img_product)}")
for image in img_product[:5]:
    alt_text = image.get("alt", "")
    if not any(marker in alt_text for marker in ("TUF", "ASUS")):
        continue
    source_url = image.get("src", "")
    print(f" - alt: {alt_text[:60]}")
    print(f" src: {source_url[:80]}")
|
||||
|
||||
# 4. Stock: <div>/<span> elements whose CSS class mentions availability or stock.
print("\n4. DISPONIBILITÉ")
print("-" * 80)


def _class_mentions_stock(css_class):
    """Tell whether a class value contains "availability" or "stock"."""
    if not css_class:
        return css_class
    lowered = str(css_class).lower()
    return "availability" in lowered or "stock" in lowered


availability_divs = soup.find_all(["div", "span"], class_=_class_mentions_stock)
print(f"Éléments avec 'availability'/'stock': {len(availability_divs)}")
# Show the first five matches with a 60-character text preview.
for stock_elem in availability_divs[:5]:
    stock_preview = stock_elem.get_text().strip()[:60]
    print(f" - {stock_elem.name} {stock_elem.get('class')} : {stock_preview}")
|
||||
|
||||
# 5. Category / breadcrumb: list the link texts of the first two
# <nav>/<ol>/<ul> elements whose CSS class mentions "breadcrumb".
print("\n5. CATÉGORIE / BREADCRUMB")
print("-" * 80)


def _class_mentions_breadcrumb(css_class):
    """Tell whether a class value contains "breadcrumb" (case-insensitive)."""
    return css_class and "breadcrumb" in str(css_class).lower()


breadcrumbs = soup.find_all(["nav", "ol", "ul"], class_=_class_mentions_breadcrumb)
for crumb_container in breadcrumbs[:2]:
    container_classes = crumb_container.get("class")
    print(f"Tag: {crumb_container.name}, Classes: {container_classes}")
    for anchor in crumb_container.find_all("a"):
        print(f" - {anchor.get_text().strip()}")
|
||||
|
||||
# 6. Specs / technical characteristics: <table>/<dl>/<div> elements whose
# CSS class mentions one of the spec-related keywords.
print("\n6. CARACTÉRISTIQUES TECHNIQUES")
print("-" * 80)


def _class_mentions_specs(css_class):
    """Tell whether a class value contains "spec", "characteristic" or "feature"."""
    if not css_class:
        return css_class
    lowered = str(css_class).lower()
    return any(keyword in lowered for keyword in ("spec", "characteristic", "feature"))


specs_sections = soup.find_all(["table", "dl", "div"], class_=_class_mentions_specs)
print(f"Sections de specs: {len(specs_sections)}")
# Show the first three matches with a 200-character text preview.
for spec_block in specs_sections[:3]:
    print(f" - {spec_block.name} {spec_block.get('class')}")
    spec_preview = spec_block.get_text().strip()[:200]
    print(f" {spec_preview}")
|
||||
|
||||
# 7. SKU / product reference
print("\n7. SKU / RÉFÉRENCE PRODUIT")
print("-" * 80)
# Read the identifiers from the first schema.org Product found in the
# JSON-LD <script> tags.
for script in json_ld_scripts:
    # .string is None for an empty tag or a tag with several children.
    raw_json = script.string
    if not raw_json:
        continue
    try:
        data = json.loads(raw_json)
    except ValueError:
        # Malformed JSON-LD (json.JSONDecodeError is a ValueError): skip this
        # tag only, instead of hiding every possible error.
        continue
    # A JSON-LD document may be a single object or an array of objects.
    candidates = data if isinstance(data, list) else [data]
    product = next(
        (
            item
            for item in candidates
            if isinstance(item, dict) and item.get("@type") == "Product"
        ),
        None,
    )
    if product is None:
        continue
    print(f"SKU: {product.get('sku')}")
    print(f"mpn: {product.get('mpn')}")
    break
|
||||
|
||||
# Hard-coded hint: the SKU as it appears in the product URL.
print(
    "\nDans l'URL: f-10709-tuf608umrv004.html",
    "Possible SKU: tuf608umrv004",
    sep="\n",
)

# Closing banner, preceded by a blank line.
closing_rule = "=" * 80
print("\n" + closing_rule, "FIN DE L'ANALYSE", closing_rule, sep="\n")
|
||||
Reference in New Issue
Block a user