scrap/fetch_aliexpress_wait.py

#!/usr/bin/env python3
"""Test AliExpress fetch, waiting for dynamically loaded content."""
import json
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch with wait")
print("=" * 80)
print(f"URL: {url}\n")
# Try different wait selectors
wait_selectors = [
    ("h1", "h1 title"),
    (".product-title", "Product title class"),
    ("img[alt]", "Image with alt attribute"),
    (".product-price", "Price"),
]
best_result = None
best_size = 0
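
# Heuristic: keep whichever successful fetch yields the largest HTML,
# assuming a bigger document means more of the dynamic page rendered.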
for selector, desc in wait_selectors:
    print(f"\nTrying wait_for_selector='{selector}' ({desc})...")
    result = fetch_playwright(
        url,
        headless=True,
        timeout_ms=15000,  # 15 s timeout
        wait_for_selector=selector,
    )
    if result.success:
        size = len(result.html)
        print(f"✓ Success: {size:,} chars ({result.duration_ms}ms)")
        if size > best_size:
            best_size = size
            best_result = result
    else:
        print(f"✗ Failed: {result.error}")
# Use the best result
if not best_result:
    print("\n❌ No valid result")
    sys.exit(1)
print("\n" + "=" * 80)
print("ANALYSE DU MEILLEUR RÉSULTAT")
print("=" * 80)
print(f"Taille HTML: {len(best_result.html):,} chars")
# Save the raw HTML (create the output directory if needed)
Path("scraped").mkdir(parents=True, exist_ok=True)
with open("scraped/aliexpress_wait.html", "w", encoding="utf-8") as f:
    f.write(best_result.html)
print("✓ Saved: scraped/aliexpress_wait.html")
# Quick analysis
soup = BeautifulSoup(best_result.html, "lxml")

print("\n[Title]")
og_title = soup.find("meta", property="og:title")
if og_title:
    title = og_title.get("content", "")
    print(f"✓ og:title: {title[:100]}")
h1 = soup.find("h1")
if h1:
    print(f"✓ h1: {h1.get_text(strip=True)[:100]}")
else:
    print("✗ No h1")
print("\n[Prix]")
price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€|€\s*([0-9]+[.,][0-9]{2})', best_result.html)
if price_match:
price = price_match.group(1) or price_match.group(2)
print(f"✓ Prix trouvé par regex: {price}")
else:
print("✗ Prix non trouvé")
print("\n[Images]")
dcdata_match = re.search(r'window\._d_c_\.DCData\s*=\s*(\{[^;]*\});', best_result.html, re.DOTALL)
if dcdata_match:
try:
data = json.loads(dcdata_match.group(1))
if "imagePathList" in data:
images = data["imagePathList"]
print(f"{len(images)} images trouvées dans DCData")
for i, img in enumerate(images[:3], 1):
print(f" [{i}] {img[:70]}...")
except:
pass
print("\n" + "=" * 80)