#!/usr/bin/env python3
"""Fetch an AliExpress product page with Playwright and inspect the rendered HTML.

The script downloads one hard-coded product URL through the project's
``fetch_playwright`` helper, saves the returned HTML to disk, and prints a
diagnostic report: JSON-LD blocks, candidate title, candidate prices and
product images. All results stay in module-level variables so later
sections of the script can reuse them.
"""

import json
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch avec Playwright")
print("=" * 80)
print(f"\nURL: {url}\n")

print("Récupération avec Playwright (headless)...")
result = fetch_playwright(url, headless=True, timeout_ms=60000)

if not result.success:
    print(f"❌ ÉCHEC: {result.error}")
    # sys.exit is always available; the bare exit() helper is injected by
    # the `site` module and may be missing (e.g. under `python -S`).
    sys.exit(1)

print(f"✓ Page récupérée: {len(result.html):,} caractères")
print(f" Durée: {result.duration_ms}ms")

# Save the rendered HTML; create the target directory first so a fresh
# checkout without "scraped/" does not fail with FileNotFoundError.
html_file = "scraped/aliexpress_pw.html"
Path(html_file).parent.mkdir(parents=True, exist_ok=True)
with open(html_file, "w", encoding="utf-8") as f:
    f.write(result.html)
print(f"✓ HTML sauvegardé: {html_file}\n")

# Detailed analysis
print("=" * 80)
print("ANALYSE DU CONTENU RENDU")
print("=" * 80)

soup = BeautifulSoup(result.html, "lxml")

# JSON-LD
print("\n[1] JSON-LD Schema.org")
print("-" * 80)

json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"✓ {len(json_ld_scripts)} bloc(s) JSON-LD trouvé(s)")
    for i, script in enumerate(json_ld_scripts, 1):
        # `.string` is None for an empty tag; an empty string makes
        # json.loads raise JSONDecodeError, which is reported below.
        try:
            data = json.loads(script.string or "")
        except json.JSONDecodeError as e:
            print(f" Bloc {i}: Erreur parsing - {e}")
            continue

        # A JSON-LD document may be a single object or an array of objects;
        # normalise to a list so both shapes are reported the same way.
        nodes = data if isinstance(data, list) else [data]
        for node in nodes:
            if not isinstance(node, dict):
                print(f"\n Bloc {i}: @type = N/A")
                continue
            print(f"\n Bloc {i}: @type = {node.get('@type', 'N/A')}")
            # Only the first five keys are shown to keep the report short.
            for key, value in list(node.items())[:5]:
                if isinstance(value, str):
                    print(f" → {key}: {value[:80]}")
                else:
                    print(f" → {key}: {type(value).__name__}")
else:
    print("✗ Pas de JSON-LD")

# Title
print("\n[2] Titre du produit")
print("-" * 80)

title_selectors = [
    "h1",
    "h1.product-title-text",
    "div.product-title",
    "span[class*='title']",
    "div[class*='ProductTitle']",
    "span[class*='ProductTitle']",
]

for selector in title_selectors:
    elem = soup.select_one(selector)
    if not elem:
        continue
    text = elem.get_text(strip=True)
    # Very short matches are skipped and the next selector is tried.
    if len(text) > 10:
        print(f"✓ Trouvé avec '{selector}':")
        print(f" {text[:150]}")
        break
else:
    # Reached only when no selector produced a usable title (no `break`).
    print("✗ Titre non trouvé - essai avec og:title")
    og_title = soup.find("meta", property="og:title")
    if og_title:
        print(f"✓ og:title: {og_title.get('content', 'N/A')[:150]}")

# Price
print("\n[3] Prix")
print("-" * 80)

price_selectors = [
    "span.product-price-value",
    "div.product-price-current",
    "span[class*='price']",
    "div[class*='Price']",
    "span[class*='Price']",
]

found_price = False
for selector in price_selectors:
    elems = soup.select(selector)
    if not elems:
        continue
    print(f"✓ Trouvé {len(elems)} élément(s) avec '{selector}':")
    # Inspect at most five matches; elements with empty text do not count.
    for elem in elems[:5]:
        text = elem.get_text(strip=True)
        if text:
            print(f" → {text}")
            found_price = True
    if found_price:
        break

if not found_price:
    print("✗ Prix non trouvé avec sélecteurs CSS")
    # Fall back to searching the raw HTML text for an amount in euros.
    price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€', result.html)
    if price_match:
        print(f"✓ Prix trouvé par regex: {price_match.group(0)}")

# Images
print("\n[4] Images produit")
print("-" * 80)

img_elems = soup.find_all("img", src=True)
excluded_markers = ("logo", "icon", "avatar", "seller")
# dict.fromkeys removes duplicates in O(n) while keeping first-seen order.
product_images = list(
    dict.fromkeys(
        src
        for src in (img.get("src", "") for img in img_elems)
        if "alicdn.com" in src
        and not any(marker in src for marker in excluded_markers)
    )
)

if product_images:
    print(f"✓ {len(product_images)} image(s) trouvée(s):")
    for i, img_url in enumerate(product_images[:5], 1):
        print(f" [{i}] {img_url[:80]}...")
else:
    print("✗ Aucune image trouvée")

# Data embedded in scripts
# NOTE(review): the header statement of section "[5]" is cut off at the end of
# the visible source and is deliberately not completed here. Visible fragment:
#     print("\n[5] Data embarquée dans