99 lines
2.6 KiB
Python
Executable File
99 lines
2.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Test AliExpress avec attente du chargement dynamique."""
|
|
|
|
import json
import re
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
|
|
|
# Product page requested by every fetch attempt in this script.
url = "https://fr.aliexpress.com/item/1005007187023722.html"

# Banner: an 80-column rule above and below the title, then the target URL
# followed by a blank line.
print("=" * 80, "ALIEXPRESS - Fetch avec wait", "=" * 80, sep="\n")
print(f"URL: {url}\n")
|
|
|
|
# Try several wait selectors: each CSS selector is paired with the label
# printed next to it in the log. Order is preserved, so the attempts run
# top to bottom.
wait_selectors = list(
    {
        "h1": "Titre h1",
        ".product-title": "Product title class",
        "img[alt]": "Image avec alt",
        ".product-price": "Prix",
    }.items()
)
|
|
|
|
# Track the successful fetch that returned the largest HTML document.
best_result = None
best_size = 0

for selector, label in wait_selectors:
    print(f"\nTest avec wait_for_selector='{selector}' ({label})...")

    result = fetch_playwright(
        url,
        headless=True,
        timeout_ms=15000,  # 15s timeout
        wait_for_selector=selector,
    )

    # Report failures first so the success path stays unindented.
    if not result.success:
        print(f"✗ Échec: {result.error}")
        continue

    size = len(result.html)
    print(f"✓ Succès: {size:,} chars ({result.duration_ms}ms)")

    # Strictly larger only: on a tie the earlier attempt is kept.
    if size > best_size:
        best_size, best_result = size, result
|
|
|
|
# Use the best result; stop with a non-zero exit status when no attempt
# produced one.
if best_result is None:
    print("\n❌ Aucun résultat valide")
    # The bare `exit()` helper is injected by the `site` module for interactive
    # sessions and is not available under `python -S`; raising SystemExit is
    # the equivalent that always works in a script.
    raise SystemExit(1)
|
|
|
|
print("\n" + "=" * 80)
print("ANALYSE DU MEILLEUR RÉSULTAT")
print("=" * 80)
print(f"Taille HTML: {len(best_result.html):,} chars")

# Save the page for offline inspection. The target directory is created first
# so that a checkout without `scraped/` does not fail with FileNotFoundError.
output_path = Path("scraped/aliexpress_wait.html")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(best_result.html, encoding="utf-8")
# as_posix() keeps the forward-slash path in the message on every platform.
print(f"✓ Sauvegardé: {output_path.as_posix()}")
|
|
|
|
# Quick analysis of the saved HTML.
soup = BeautifulSoup(best_result.html, "lxml")

print("\n[Titre]")

# Open Graph title: printed only when the <meta> tag exists; a missing
# `content` attribute yields an empty string.
og_title = soup.find("meta", property="og:title")
if og_title:
    print(f"✓ og:title: {og_title.get('content', '')[:100]}")

# First <h1> on the page, truncated to 100 characters like the og:title.
h1 = soup.find("h1")
h1_report = f"✓ h1: {h1.get_text(strip=True)[:100]}" if h1 else "✗ Pas de h1"
print(h1_report)
|
|
|
|
print("\n[Prix]")

# Two alternatives: amount followed by the euro sign (group 1), or the euro
# sign followed by the amount (group 2). Only one group is set per match.
price_pattern = re.compile(r'([0-9]+[.,][0-9]{2})\s*€|€\s*([0-9]+[.,][0-9]{2})')
price_match = price_pattern.search(best_result.html)

if price_match is None:
    print("✗ Prix non trouvé")
else:
    amount_before_sign, amount_after_sign = price_match.groups()
    price = amount_before_sign or amount_after_sign
    print(f"✓ Prix trouvé par regex: {price} €")
|
|
|
|
print("\n[Images]")

# Locate the opening brace of the `window._d_c_.DCData = {...}` assignment.
# The object is then decoded with raw_decode, which stops at the end of the
# JSON value on its own, so a ';' inside a JSON string no longer cuts the
# payload short the way a `[^;]*` capture does.
dcdata_match = re.search(r'window\._d_c_\.DCData\s*=\s*(\{)', best_result.html)

if dcdata_match is None:
    print("✗ DCData non trouvé")
else:
    try:
        data, _end = json.JSONDecoder().raw_decode(
            best_result.html, dcdata_match.start(1)
        )
    except json.JSONDecodeError as exc:
        # Only a decoding failure is expected here; anything else is a real
        # bug and should surface instead of being swallowed.
        print(f"✗ DCData illisible (JSON invalide): {exc}")
    else:
        # Decoding started on '{', so `data` is a dict.
        images = data.get("imagePathList")
        if isinstance(images, list):
            print(f"✓ {len(images)} images trouvées dans DCData")
            # Show the first three entries only, truncated to 70 characters.
            for i, img in enumerate(images[:3], 1):
                print(f" [{i}] {str(img)[:70]}...")
        else:
            print("✗ Pas de imagePathList dans DCData")

print("\n" + "=" * 80)
|