# Files
# scrap/fetch_aliexpress_pw.py
# 2026-01-13 19:49:04 +01:00
#
# 181 lines
# 5.6 KiB
# Python
# Executable File
#!/usr/bin/env python3
"""Fetch AliExpress avec Playwright pour obtenir le contenu rendu."""
import json
import re
import sys
from collections import Counter
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright
url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch avec Playwright")
print("=" * 80)
print(f"\nURL: {url}\n")

print("Récupération avec Playwright (headless)...")
# Render the page in a headless browser so JS-injected content is present
# (AliExpress builds the product page client-side).
result = fetch_playwright(url, headless=True, timeout_ms=60000)
if not result.success:
    print(f"❌ ÉCHEC: {result.error}")
    # sys.exit instead of the site.py builtin exit(): works under -S and
    # in frozen/embedded interpreters as well.
    sys.exit(1)
print(f"✓ Page récupérée: {len(result.html):,} caractères")
print(f" Durée: {result.duration_ms}ms")
# Persist the rendered HTML for offline inspection.
html_file = "scraped/aliexpress_pw.html"
out_path = Path(html_file)
# Create the target directory first — a bare open() would raise
# FileNotFoundError when scraped/ does not exist yet.
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(result.html, encoding="utf-8")
print(f"✓ HTML sauvegardé: {html_file}\n")
# Detailed analysis of the rendered content.
print("=" * 80)
print("ANALYSE DU CONTENU RENDU")
print("=" * 80)

soup = BeautifulSoup(result.html, "lxml")

# [1] Schema.org JSON-LD blocks.
print("\n[1] JSON-LD Schema.org")
print("-" * 80)
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"{len(json_ld_scripts)} bloc(s) JSON-LD trouvé(s)")
    for i, script in enumerate(json_ld_scripts, 1):
        try:
            # script.string is None for empty tags; feed "" so the failure
            # surfaces as a JSONDecodeError below instead of a TypeError.
            data = json.loads(script.string or "")
            # JSON-LD may be a single object or a list of objects; only a
            # dict exposes @type and key/value pairs, so check first
            # (previously .get() ran before the isinstance check and a
            # list payload fell into the error branch).
            if isinstance(data, dict):
                print(f"\n Bloc {i}: @type = {data.get('@type', 'N/A')}")
                for key, value in list(data.items())[:5]:
                    if isinstance(value, str):
                        print(f"{key}: {value[:80]}")
                    else:
                        print(f"{key}: {type(value).__name__}")
            else:
                print(f"\n Bloc {i}: {type(data).__name__}")
        except Exception as e:
            print(f" Bloc {i}: Erreur parsing - {e}")
else:
    print("✗ Pas de JSON-LD")
# [2] Product title: try a series of CSS selectors, then fall back to the
# og:title meta tag.
print("\n[2] Titre du produit")
print("-" * 80)
title_selectors = [
    "h1",
    "h1.product-title-text",
    "div.product-title",
    "span[class*='title']",
    "div[class*='ProductTitle']",
    "span[class*='ProductTitle']",
]
for selector in title_selectors:
    elem = soup.select_one(selector)
    if elem:
        text = elem.get_text(strip=True)
        # Require a minimum length to skip stray one-word matches.
        if text and len(text) > 10:
            print(f"✓ Trouvé avec '{selector}':")
            print(f" {text[:150]}")
            break
else:
    # for/else: no selector produced a plausible title.
    print("✗ Titre non trouvé - essai avec og:title")
    og_title = soup.find("meta", property="og:title")
    if og_title:
        print(f"✓ og:title: {og_title.get('content', 'N/A')[:150]}")
    else:
        # Previously a missing og:title failed silently.
        print("✗ og:title absent")
# [3] Price: CSS selectors first, then a regex sweep over the raw HTML.
print("\n[3] Prix")
print("-" * 80)
price_selectors = [
    "span.product-price-value",
    "div.product-price-current",
    "span[class*='price']",
    "div[class*='Price']",
    "span[class*='Price']",
]
found_price = False
for selector in price_selectors:
    elems = soup.select(selector)
    if elems:
        print(f"✓ Trouvé {len(elems)} élément(s) avec '{selector}':")
        for elem in elems[:5]:
            text = elem.get_text(strip=True)
            if text:
                print(f"{text}")
                found_price = True
        # Stop at the first selector that yielded non-empty text.
        if found_price:
            break
if not found_price:
    print("✗ Prix non trouvé avec sélecteurs CSS")
    # Fallback: look for a "12,34 €" style amount anywhere in the raw HTML
    # (\s also matches the non-breaking space often used before €).
    price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€', result.html)
    if price_match:
        print(f"✓ Prix trouvé par regex: {price_match.group(0)}")
# [4] Product images: alicdn.com assets, excluding obvious UI chrome,
# deduplicated while preserving first-seen order.
print("\n[4] Images produit")
print("-" * 80)
seen_srcs = set()
product_images = []
for img in soup.find_all("img", src=True):
    src = img.get("src", "")
    if "alicdn.com" in src and not any(x in src for x in ("logo", "icon", "avatar", "seller")):
        # Set-based dedupe: O(1) membership vs the O(n) list scan.
        if src not in seen_srcs:
            seen_srcs.add(src)
            product_images.append(src)
if product_images:
    print(f"{len(product_images)} image(s) trouvée(s):")
    for i, img_url in enumerate(product_images[:5], 1):
        print(f" [{i}] {img_url[:80]}...")
else:
    print("✗ Aucune image trouvée")
# [5] Data embedded in inline <script> tags — AliExpress ships product data
# as JS globals (window.runParams, window.__INITIAL_STATE__, window.pageData).
print("\n[5] Data embarquée dans <script>")
print("-" * 80)
# type=None matches <script> tags WITHOUT a type attribute (inline JS).
scripts = soup.find_all("script", type=None)
print(f"{len(scripts)} scripts trouvés")

markers = ("runParams", "__INITIAL_STATE__", "window.pageData")
for script in scripts:
    body = script.string
    # body is None when the tag has no single text child; guard once and
    # reuse the local instead of re-reading script.string per check.
    if body and any(marker in body for marker in markers):
        print("✓ Script avec données trouvé:")
        print(f" Taille: {len(body):,} caractères")
        if "runParams" in body:
            print(" → Contient 'runParams'")
        if "__INITIAL_STATE__" in body:
            print(" → Contient '__INITIAL_STATE__'")
        if "pageData" in body:
            print(" → Contient 'pageData'")
        # Best-effort extraction of title/price from the embedded JSON.
        title_match = re.search(r'"title":\s*"([^"]{20,})"', body)
        if title_match:
            print(f" → Titre extrait: {title_match.group(1)[:100]}")
        price_match = re.search(r'"(minPrice|maxPrice|price|currentPrice)":\s*"?([0-9.]+)"?', body)
        if price_match:
            print(f" → Prix extrait: {price_match.group(2)}")
# [6] Most frequent CSS classes — a hint at the page's DOM structure,
# useful for writing future selectors.
print("\n[6] Classes CSS Fréquentes (indice de structure)")
print("-" * 80)
all_classes = []
for elem in soup.find_all(class_=True):
    # bs4 normally returns class as a list of tokens; skip odd cases.
    if isinstance(elem["class"], list):
        all_classes.extend(elem["class"])
common_classes = Counter(all_classes).most_common(15)
if common_classes:
    print("Classes les plus fréquentes:")
    for cls, count in common_classes:
        print(f"{cls}: {count}x")

print("\n" + "=" * 80)
print("FIN DE L'ANALYSE")
print("=" * 80)