#!/usr/bin/env python3
"""Test AliExpress avec attente du chargement dynamique."""
import json
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

# Product page used as a fixed test target.
URL = "https://fr.aliexpress.com/item/1005007187023722.html"

# Per-attempt Playwright timeout (ms).
TIMEOUT_MS = 15000

# Candidate selectors to wait for before capturing the DOM; the page is
# rendered client-side, so waiting on the right node yields a fuller HTML.
WAIT_SELECTORS = [
    ("h1", "Titre h1"),
    (".product-title", "Product title class"),
    ("img[alt]", "Image avec alt"),
    (".product-price", "Prix"),
]

OUTPUT_PATH = Path("scraped") / "aliexpress_wait.html"


def _fetch_best_result():
    """Try each wait selector and return the fetch result with the largest HTML.

    Returns None when every attempt fails.  Larger HTML is used as a proxy
    for "more of the dynamic content finished rendering".
    """
    best_result = None
    best_size = 0
    for selector, desc in WAIT_SELECTORS:
        print(f"\nTest avec wait_for_selector='{selector}' ({desc})...")
        result = fetch_playwright(
            URL,
            headless=True,
            timeout_ms=TIMEOUT_MS,
            wait_for_selector=selector,
        )
        if result.success:
            size = len(result.html)
            print(f"✓ Succès: {size:,} chars ({result.duration_ms}ms)")
            if size > best_size:
                best_size = size
                best_result = result
        else:
            print(f"✗ Échec: {result.error}")
    return best_result


def _analyze_title(soup):
    """Print the og:title meta and first h1, when present."""
    print("\n[Titre]")
    og_title = soup.find("meta", property="og:title")
    if og_title:
        title = og_title.get("content", "")
        print(f"✓ og:title: {title[:100]}")

    h1 = soup.find("h1")
    if h1:
        print(f"✓ h1: {h1.get_text(strip=True)[:100]}")
    else:
        print("✗ Pas de h1")


def _analyze_price(html):
    """Look for a euro price (before or after the € sign) via regex."""
    print("\n[Prix]")
    price_match = re.search(
        r'([0-9]+[.,][0-9]{2})\s*€|€\s*([0-9]+[.,][0-9]{2})', html
    )
    if price_match:
        # Only one of the two alternation groups matches.
        price = price_match.group(1) or price_match.group(2)
        print(f"✓ Prix trouvé par regex: {price} €")
    else:
        print("✗ Prix non trouvé")


def _analyze_images(html):
    """Extract the gallery image list from the inline DCData JSON blob."""
    print("\n[Images]")
    dcdata_match = re.search(
        r'window\._d_c_\.DCData\s*=\s*(\{[^;]*\});', html, re.DOTALL
    )
    if not dcdata_match:
        return
    try:
        data = json.loads(dcdata_match.group(1))
    except json.JSONDecodeError as e:
        # Previously a bare `except: pass` — surface the parse failure
        # instead of silently hiding why no images were reported.
        print(f"✗ DCData JSON invalide: {e}")
        return
    if "imagePathList" in data:
        images = data["imagePathList"]
        print(f"✓ {len(images)} images trouvées dans DCData")
        for i, img in enumerate(images[:3], 1):
            print(f"    [{i}] {img[:70]}...")


def main():
    """Fetch the page with several wait strategies and analyze the best HTML."""
    print("=" * 80)
    print("ALIEXPRESS - Fetch avec wait")
    print("=" * 80)
    print(f"URL: {URL}\n")

    best_result = _fetch_best_result()
    if not best_result:
        print("\n❌ Aucun résultat valide")
        sys.exit(1)  # fix: `exit` is a site builtin, not guaranteed; use sys.exit

    print("\n" + "=" * 80)
    print("ANALYSE DU MEILLEUR RÉSULTAT")
    print("=" * 80)
    print(f"Taille HTML: {len(best_result.html):,} chars")

    # fix: ensure the output directory exists before writing.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.write_text(best_result.html, encoding="utf-8")
    print("✓ Sauvegardé: scraped/aliexpress_wait.html")

    soup = BeautifulSoup(best_result.html, "lxml")
    _analyze_title(soup)
    _analyze_price(best_result.html)
    _analyze_images(best_result.html)

    print("\n" + "=" * 80)


if __name__ == "__main__":
    main()