#!/usr/bin/env python3
"""Analyze a scraped Backmarket HTML page to identify scraping selectors.

Reads a saved product-page snapshot (scraped/backmarket_pw.html) and prints
candidate sources for each field of interest: title (h1 tags), price
(JSON-LD, class names, raw-HTML regex), images, condition/grade, SKU,
breadcrumb/category, technical specs and Open Graph meta tags. Output is
diagnostic text meant to be read by a developer picking selectors.
"""

import json
import re

from bs4 import BeautifulSoup

# Read the locally saved HTML snapshot (produced earlier by the scraper).
with open("scraped/backmarket_pw.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "lxml")

print("=" * 80)
print("ANALYSE HTML BACKMARKET.FR")
print("=" * 80)

# 1. Title candidates: list every <h1> with its classes and a text preview.
print("\n1. TITRE")
print("-" * 80)
h1_tags = soup.find_all("h1")
print(f"Nombre de h1: {len(h1_tags)}")
for i, h1 in enumerate(h1_tags[:3]):
    print(f" [{i+1}] Classes: {h1.get('class')}")
    print(f" Texte: {h1.get_text().strip()[:100]}")

# 2. Price candidates.
print("\n2. PRIX")
print("-" * 80)
# Structured data first: JSON-LD Product/Offer blocks are the most reliable
# source when present.
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
print(f"Scripts JSON-LD trouvés: {len(json_ld_scripts)}")
for i, script in enumerate(json_ld_scripts[:3]):
    try:
        # script.string is None for empty tags; "" makes json.loads raise
        # JSONDecodeError, which is reported below instead of crashing.
        data = json.loads(script.string or "")
        if isinstance(data, dict):
            print(f"\n Script [{i+1}] @type: {data.get('@type')}")
            if data.get("@type") == "Product":
                print(f" name: {data.get('name')}")
                offers = data.get('offers', {})
                # 'offers' may be a dict or a list per schema.org; only the
                # dict shape is inspected here.
                if isinstance(offers, dict):
                    print(f" price: {offers.get('price')}")
                    print(f" priceCurrency: {offers.get('priceCurrency')}")
    except (json.JSONDecodeError, TypeError) as e:
        # Narrowed from a blanket Exception: only parse failures are expected.
        print(f" Script [{i+1}] Erreur parsing JSON: {e}")

# Fallback: elements whose CSS class mentions "price".
price_elements = soup.find_all(["div", "span", "p"], class_=lambda x: x and ("price" in str(x).lower()))
print(f"\nÉléments avec 'price' dans la classe: {len(price_elements)}")
for i, elem in enumerate(price_elements[:5]):
    text = elem.get_text().strip()[:80]
    print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")

# Last resort: regex scan of the raw HTML for euro amounts like "1234,56 €".
matches = re.findall(r'(\d{2,4})[,\s]?(\d{2})\s*€', html)
print(f"\nPrix trouvés par regex: {len(matches)} matches")
# sorted() makes the printed order deterministic (plain set iteration order
# varies across runs because of string-hash randomization).
unique_prices = sorted(set(f"{m[0]},{m[1]} €" for m in matches[:10]))
for price in unique_prices[:5]:
    print(f" - {price}")

# 3. Product images: among the first few <img alt=...>, keep only alts that
# look like the product (contain "iphone" or "apple").
print("\n3. IMAGES")
print("-" * 80)
img_product = soup.find_all("img", alt=True)
print(f"Images avec alt: {len(img_product)}")
for i, img in enumerate(img_product[:5]):
    alt = img.get("alt", "")
    src = img.get("src", "")
    if "iphone" in alt.lower() or "apple" in alt.lower():
        print(f" [{i+1}] alt: {alt[:60]}")
        print(f" src: {src[:80]}")

# 4. Condition / grade selectors (class mentions condition, grade or état).
print("\n4. ÉTAT / CONDITION")
print("-" * 80)
condition_elements = soup.find_all(["div", "span", "button"], class_=lambda x: x and ("condition" in str(x).lower() or "grade" in str(x).lower() or "état" in str(x).lower()))
print(f"Éléments avec condition/grade/état: {len(condition_elements)}")
for i, elem in enumerate(condition_elements[:5]):
    text = elem.get_text().strip()[:80]
    print(f" [{i+1}] {elem.name} {elem.get('class')} → {text}")

# 5. SKU / product reference.
print("\n5. SKU / RÉFÉRENCE PRODUIT")
print("-" * 80)
# The URL slug itself is a usable identifier.
print("Dans l'URL: /fr-fr/p/iphone-15-pro")
print("Possible SKU: iphone-15-pro")

# JSON-LD Product blocks may also expose sku / mpn / productID.
for script in json_ld_scripts:
    try:
        data = json.loads(script.string or "")
        if isinstance(data, dict) and data.get("@type") == "Product":
            print("\nDans JSON-LD:")
            print(f" sku: {data.get('sku')}")
            print(f" mpn: {data.get('mpn')}")
            print(f" productID: {data.get('productID')}")
    except (json.JSONDecodeError, TypeError):
        # Non-JSON or empty script tags are expected here; skip them.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        pass

# 6. Breadcrumb / category navigation.
print("\n6. CATÉGORIE / BREADCRUMB")
print("-" * 80)
breadcrumbs = soup.find_all(["nav", "ol", "ul"], class_=lambda x: x and "breadcrumb" in str(x).lower())
for bc in breadcrumbs[:2]:
    print(f"Tag: {bc.name}, Classes: {bc.get('class')}")
    links = bc.find_all("a")
    for link in links[:5]:
        print(f" - {link.get_text().strip()}")

# 7. Technical-specification sections.
print("\n7. CARACTÉRISTIQUES TECHNIQUES")
print("-" * 80)
specs_sections = soup.find_all(["div", "dl", "table"], class_=lambda x: x and ("spec" in str(x).lower() or "characteristic" in str(x).lower() or "feature" in str(x).lower()))
print(f"Sections de specs: {len(specs_sections)}")
for i, section in enumerate(specs_sections[:3]):
    print(f" [{i+1}] {section.name} {section.get('class')}")
    text = section.get_text().strip()[:150]
    print(f" {text}")

# 8. Open Graph meta tags (title / price amount).
print("\n8. META TAGS")
print("-" * 80)
og_title = soup.find("meta", property="og:title")
og_price = soup.find("meta", property="og:price:amount")
if og_title:
    print(f"og:title: {og_title.get('content')}")
if og_price:
    print(f"og:price:amount: {og_price.get('content')}")

print("\n" + "=" * 80)
print("FIN DE L'ANALYSE")
print("=" * 80)
|
||||