scrap/fetch_aliexpress_wait.py

#!/usr/bin/env python3
"""Test AliExpress fetch, waiting for dynamically loaded content."""
import json
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch with wait")
print("=" * 80)
print(f"URL: {url}\n")
# Try different wait selectors
wait_selectors = [
    ("h1", "h1 title"),
    (".product-title", "Product title class"),
    ("img[alt]", "Image with alt attribute"),
    (".product-price", "Price"),
]
best_result = None
best_size = 0
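
# Heuristic: keep whichever successful fetch yields the largest HTML,
# assuming a bigger document means more of the dynamic page rendered.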
for selector, desc in wait_selectors:
    print(f"\nTrying wait_for_selector='{selector}' ({desc})...")
    result = fetch_playwright(
        url,
        headless=True,
        timeout_ms=15000,  # 15 s timeout
        wait_for_selector=selector,
    )
    if result.success:
        size = len(result.html)
        print(f"✓ Success: {size:,} chars ({result.duration_ms}ms)")
        if size > best_size:
            best_size = size
            best_result = result
    else:
        print(f"✗ Failed: {result.error}")
# Use the best result
if not best_result:
    print("\n❌ No valid result")
    sys.exit(1)
print("\n" + "=" * 80)
print("ANALYSE DU MEILLEUR RÉSULTAT")
print("=" * 80)
print(f"Taille HTML: {len(best_result.html):,} chars")
# Save the raw HTML (create the output directory if needed)
Path("scraped").mkdir(parents=True, exist_ok=True)
with open("scraped/aliexpress_wait.html", "w", encoding="utf-8") as f:
    f.write(best_result.html)
print("✓ Saved: scraped/aliexpress_wait.html")
# Quick analysis
soup = BeautifulSoup(best_result.html, "lxml")

print("\n[Title]")
og_title = soup.find("meta", property="og:title")
if og_title:
    title = og_title.get("content", "")
    print(f"✓ og:title: {title[:100]}")
h1 = soup.find("h1")
if h1:
    print(f"✓ h1: {h1.get_text(strip=True)[:100]}")
else:
    print("✗ No h1")
print("\n[Prix]")
price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€|€\s*([0-9]+[.,][0-9]{2})', best_result.html)
if price_match:
price = price_match.group(1) or price_match.group(2)
print(f"✓ Prix trouvé par regex: {price}")
else:
print("✗ Prix non trouvé")
print("\n[Images]")
dcdata_match = re.search(r'window\._d_c_\.DCData\s*=\s*(\{[^;]*\});', best_result.html, re.DOTALL)
if dcdata_match:
try:
data = json.loads(dcdata_match.group(1))
if "imagePathList" in data:
images = data["imagePathList"]
print(f"{len(images)} images trouvées dans DCData")
for i, img in enumerate(images[:3], 1):
print(f" [{i}] {img[:70]}...")
except:
pass
print("\n" + "=" * 80)