181 lines
5.6 KiB
Python
Executable File
181 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python3
"""Fetch an AliExpress product page with Playwright to obtain the JS-rendered content.

Diagnostic script: downloads the fully rendered HTML, saves it under ``scraped/``,
then probes the document for product data (JSON-LD, title, price, images,
embedded <script> data, frequent CSS classes) to help choose scraper selectors.
All user-facing output is in French, matching the rest of the tooling.
"""

import json
import os
import re
import sys
from collections import Counter

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch avec Playwright")
print("=" * 80)
print(f"\nURL: {url}\n")

print("Récupération avec Playwright (headless)...")
result = fetch_playwright(url, headless=True, timeout_ms=60000)

if not result.success:
    print(f"❌ ÉCHEC: {result.error}")
    # sys.exit is the reliable way to exit a script; builtin exit() comes from
    # the site module and is not guaranteed to exist.
    sys.exit(1)

print(f"✓ Page récupérée: {len(result.html):,} caractères")
print(f" Durée: {result.duration_ms}ms")

# Sauvegarder
html_file = "scraped/aliexpress_pw.html"
# Ensure the output directory exists so the script works on a fresh checkout.
os.makedirs(os.path.dirname(html_file), exist_ok=True)
with open(html_file, "w", encoding="utf-8") as f:
    f.write(result.html)
print(f"✓ HTML sauvegardé: {html_file}\n")

# Analyse détaillée
print("=" * 80)
print("ANALYSE DU CONTENU RENDU")
print("=" * 80)

soup = BeautifulSoup(result.html, "lxml")

# [1] JSON-LD: structured product data is the most reliable source when present.
print("\n[1] JSON-LD Schema.org")
print("-" * 80)
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"✓ {len(json_ld_scripts)} bloc(s) JSON-LD trouvé(s)")
    for i, script in enumerate(json_ld_scripts, 1):
        try:
            # script.string may be None for an empty tag; "" then raises
            # JSONDecodeError, which lands in the error branch below.
            data = json.loads(script.string or "")
        except json.JSONDecodeError as e:
            print(f" Bloc {i}: Erreur parsing - {e}")
            continue
        # JSON-LD is often a top-level *list* of entities; inspect the first
        # dict instead of crashing on data.get().
        if isinstance(data, list) and data and isinstance(data[0], dict):
            data = data[0]
        if isinstance(data, dict):
            print(f"\n Bloc {i}: @type = {data.get('@type', 'N/A')}")
            # Show only the first few keys to keep the report readable.
            for key, value in list(data.items())[:5]:
                if isinstance(value, str):
                    print(f" → {key}: {value[:80]}")
                else:
                    print(f" → {key}: {type(value).__name__}")
        else:
            print(f"\n Bloc {i}: @type = N/A")
else:
    print("✗ Pas de JSON-LD")

# [2] Product title: try progressively broader CSS selectors, then og:title.
print("\n[2] Titre du produit")
print("-" * 80)
title_selectors = [
    "h1",
    "h1.product-title-text",
    "div.product-title",
    "span[class*='title']",
    "div[class*='ProductTitle']",
    "span[class*='ProductTitle']",
]
for selector in title_selectors:
    elem = soup.select_one(selector)
    if elem:
        text = elem.get_text(strip=True)
        # Require a minimum length to skip icons/labels matched by accident.
        if text and len(text) > 10:
            print(f"✓ Trouvé avec '{selector}':")
            print(f" {text[:150]}")
            break
else:
    # for/else: no selector produced a plausible title.
    print("✗ Titre non trouvé - essai avec og:title")
    og_title = soup.find("meta", property="og:title")
    if og_title:
        print(f"✓ og:title: {og_title.get('content', 'N/A')[:150]}")

# [3] Price: CSS selectors first, raw-HTML regex as a fallback.
print("\n[3] Prix")
print("-" * 80)
price_selectors = [
    "span.product-price-value",
    "div.product-price-current",
    "span[class*='price']",
    "div[class*='Price']",
    "span[class*='Price']",
]
found_price = False
for selector in price_selectors:
    elems = soup.select(selector)
    if elems:
        print(f"✓ Trouvé {len(elems)} élément(s) avec '{selector}':")
        for elem in elems[:5]:
            text = elem.get_text(strip=True)
            if text:
                print(f" → {text}")
                found_price = True
    if found_price:
        break

if not found_price:
    print("✗ Prix non trouvé avec sélecteurs CSS")
    # Chercher dans le texte brut (ex: "12,34 €")
    price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€', result.html)
    if price_match:
        print(f"✓ Prix trouvé par regex: {price_match.group(0)}")

# [4] Product images: alicdn.com hosts product media; filter out UI assets.
print("\n[4] Images produit")
print("-" * 80)
img_elems = soup.find_all("img", src=True)
product_images = []
for img in img_elems:
    src = img.get("src", "")
    if "alicdn.com" in src and not any(x in src for x in ["logo", "icon", "avatar", "seller"]):
        if src not in product_images:  # de-duplicate while keeping order
            product_images.append(src)

if product_images:
    print(f"✓ {len(product_images)} image(s) trouvée(s):")
    for i, img_url in enumerate(product_images[:5], 1):
        print(f" [{i}] {img_url[:80]}...")
else:
    print("✗ Aucune image trouvée")

# [5] Inline <script> payloads: AliExpress embeds product data in JS globals.
print("\n[5] Data embarquée dans <script>")
print("-" * 80)
scripts = soup.find_all("script", type=None)
print(f"✓ {len(scripts)} scripts trouvés")

# Chercher window.runParams ou window.__INITIAL_STATE__
for script in scripts:
    if script.string and ("runParams" in script.string or "__INITIAL_STATE__" in script.string or "window.pageData" in script.string):
        print("✓ Script avec données trouvé:")
        print(f" Taille: {len(script.string):,} caractères")

        # Essayer d'extraire des infos
        if "runParams" in script.string:
            print(" → Contient 'runParams'")
        if "__INITIAL_STATE__" in script.string:
            print(" → Contient '__INITIAL_STATE__'")
        if "pageData" in script.string:
            print(" → Contient 'pageData'")

        # Chercher le titre dans le script (>= 20 chars to skip short labels)
        title_match = re.search(r'"title":\s*"([^"]{20,})"', script.string)
        if title_match:
            print(f" → Titre extrait: {title_match.group(1)[:100]}")

        # Chercher le prix dans le script (quoted or bare numeric value)
        price_match = re.search(r'"(minPrice|maxPrice|price|currentPrice)":\s*"?([0-9.]+)"?', script.string)
        if price_match:
            print(f" → Prix extrait: {price_match.group(2)}")

# [6] Most frequent CSS classes — a hint at the page's component structure.
print("\n[6] Classes CSS Fréquentes (indice de structure)")
print("-" * 80)
all_classes = []
for elem in soup.find_all(class_=True):
    if isinstance(elem["class"], list):
        all_classes.extend(elem["class"])

common_classes = Counter(all_classes).most_common(15)
if common_classes:
    print("Classes les plus fréquentes:")
    for cls, count in common_classes:
        print(f" • {cls}: {count}x")

print("\n" + "=" * 80)
print("FIN DE L'ANALYSE")
print("=" * 80)