#!/usr/bin/env python3
"""Analyse Backmarket product-page HTML to identify scraping selectors.

Reads a previously captured page (scraped/backmarket_pw.html) and prints a
section-by-section report — title, price (JSON-LD, class-based, regex),
images, condition, SKU, breadcrumb, technical specs and OpenGraph meta —
so that reliable CSS selectors can be chosen for the real scraper.
"""

import json
import re

from bs4 import BeautifulSoup

# Page captured beforehand (e.g. via Playwright); this script never hits the network.
HTML_PATH = "scraped/backmarket_pw.html"


def _parse_json_ld(script):
    """Parse one JSON-LD <script> tag.

    Returns the decoded object, or raises (json.JSONDecodeError, TypeError) —
    TypeError covers ``script.string`` being None on empty/commented tags.
    """
    return json.loads(script.string)


def _analyse_title(soup):
    """Section 1: locate <h1> candidates for the product title."""
    print("\n1. TITRE")
    print("-" * 80)
    h1_tags = soup.find_all("h1")
    print(f"Nombre de h1: {len(h1_tags)}")
    for i, h1 in enumerate(h1_tags[:3]):
        print(f" [{i+1}] Classes: {h1.get('class')}")
        print(f" Texte: {h1.get_text().strip()[:100]}")


def _analyse_price(soup, html, json_ld_scripts):
    """Section 2: find the price via JSON-LD, class names, then a raw regex."""
    print("\n2. PRIX")
    print("-" * 80)
    # JSON-LD "Product" blocks are the most reliable structured price source.
    print(f"Scripts JSON-LD trouvés: {len(json_ld_scripts)}")
    for i, script in enumerate(json_ld_scripts[:3]):
        try:
            data = _parse_json_ld(script)
        except (json.JSONDecodeError, TypeError) as e:
            print(f" Script [{i+1}] Erreur parsing JSON: {e}")
            continue
        if not isinstance(data, dict):
            continue
        print(f"\n Script [{i+1}] @type: {data.get('@type')}")
        if data.get("@type") == "Product":
            print(f" name: {data.get('name')}")
            offers = data.get("offers", {})
            # "offers" may also be a list per schema.org; only the dict form
            # carries a single price we can print directly.
            if isinstance(offers, dict):
                print(f" price: {offers.get('price')}")
                print(f" priceCurrency: {offers.get('priceCurrency')}")

    # Fallback 1: any element whose class mentions "price".
    price_elements = soup.find_all(
        ["div", "span", "p"], class_=lambda x: x and ("price" in str(x).lower())
    )
    print(f"\nÉléments avec 'price' dans la classe: {len(price_elements)}")
    for i, elem in enumerate(price_elements[:5]):
        text = elem.get_text().strip()[:80]
        print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")

    # Fallback 2: raw regex over the page for "123,45 €"-style amounts.
    matches = re.findall(r'(\d{2,4})[,\s]?(\d{2})\s*€', html)
    print(f"\nPrix trouvés par regex: {len(matches)} matches")
    # dict.fromkeys dedupes while keeping first-seen order (a plain set()
    # would make the printed order non-deterministic between runs).
    unique_prices = list(dict.fromkeys(f"{m[0]},{m[1]} €" for m in matches[:10]))
    for price in unique_prices[:5]:
        print(f" - {price}")


def _analyse_images(soup):
    """Section 3: list product images (alt mentioning iphone/apple)."""
    print("\n3. IMAGES")
    print("-" * 80)
    img_product = soup.find_all("img", alt=True)
    print(f"Images avec alt: {len(img_product)}")
    for i, img in enumerate(img_product[:5]):
        alt = img.get("alt", "")
        src = img.get("src", "")
        # Only product shots (alt mentions the phone brand), not UI icons.
        if "iphone" in alt.lower() or "apple" in alt.lower():
            print(f" [{i+1}] alt: {alt[:60]}")
            print(f" src: {src[:80]}")


def _analyse_condition(soup):
    """Section 4: elements whose class suggests a condition/grade selector."""
    print("\n4. ÉTAT / CONDITION")
    print("-" * 80)
    condition_elements = soup.find_all(
        ["div", "span", "button"],
        class_=lambda x: x
        and (
            "condition" in str(x).lower()
            or "grade" in str(x).lower()
            or "état" in str(x).lower()
        ),
    )
    print(f"Éléments avec condition/grade/état: {len(condition_elements)}")
    for i, elem in enumerate(condition_elements[:5]):
        text = elem.get_text().strip()[:80]
        print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")


def _analyse_sku(json_ld_scripts):
    """Section 5: product reference from the URL slug and JSON-LD fields."""
    print("\n5. SKU / RÉFÉRENCE PRODUIT")
    print("-" * 80)
    # The URL slug is the cheapest stable identifier.
    print("Dans l'URL: /fr-fr/p/iphone-15-pro")
    print("Possible SKU: iphone-15-pro")

    for script in json_ld_scripts:
        try:
            data = _parse_json_ld(script)
        except (json.JSONDecodeError, TypeError):
            # Non-JSON or empty script tags are expected noise here.
            continue
        if isinstance(data, dict) and data.get("@type") == "Product":
            print("\nDans JSON-LD:")
            print(f" sku: {data.get('sku')}")
            print(f" mpn: {data.get('mpn')}")
            print(f" productID: {data.get('productID')}")


def _analyse_breadcrumb(soup):
    """Section 6: breadcrumb containers and their category links."""
    print("\n6. CATÉGORIE / BREADCRUMB")
    print("-" * 80)
    breadcrumbs = soup.find_all(
        ["nav", "ol", "ul"], class_=lambda x: x and "breadcrumb" in str(x).lower()
    )
    for bc in breadcrumbs[:2]:
        print(f"Tag: {bc.name}, Classes: {bc.get('class')}")
        links = bc.find_all("a")
        for link in links[:5]:
            print(f" - {link.get_text().strip()}")


def _analyse_specs(soup):
    """Section 7: containers whose class suggests technical specifications."""
    print("\n7. CARACTÉRISTIQUES TECHNIQUES")
    print("-" * 80)
    specs_sections = soup.find_all(
        ["div", "dl", "table"],
        class_=lambda x: x
        and (
            "spec" in str(x).lower()
            or "characteristic" in str(x).lower()
            or "feature" in str(x).lower()
        ),
    )
    print(f"Sections de specs: {len(specs_sections)}")
    for i, section in enumerate(specs_sections[:3]):
        print(f" [{i+1}] {section.name} {section.get('class')}")
        text = section.get_text().strip()[:150]
        print(f" {text}")


def _analyse_meta(soup):
    """Section 8: OpenGraph meta tags (title, price)."""
    print("\n8. META TAGS")
    print("-" * 80)
    og_title = soup.find("meta", property="og:title")
    og_price = soup.find("meta", property="og:price:amount")
    if og_title:
        print(f"og:title: {og_title.get('content')}")
    if og_price:
        print(f"og:price:amount: {og_price.get('content')}")


def main():
    """Run the full selector-discovery report over the captured page."""
    with open(HTML_PATH, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, "lxml")

    print("=" * 80)
    print("ANALYSE HTML BACKMARKET.FR")
    print("=" * 80)

    # Computed once, shared by the price (section 2) and SKU (section 5) passes.
    json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})

    _analyse_title(soup)
    _analyse_price(soup, html, json_ld_scripts)
    _analyse_images(soup)
    _analyse_condition(soup)
    _analyse_sku(json_ld_scripts)
    _analyse_breadcrumb(soup)
    _analyse_specs(soup)
    _analyse_meta(soup)

    print("\n" + "=" * 80)
    print("FIN DE L'ANALYSE")
    print("=" * 80)


if __name__ == "__main__":
    main()
|