#!/usr/bin/env python3
"""Fetch and initial analysis of an AliExpress product page."""
import json
import sys
from collections import Counter
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

# Make sure the output directory exists before writing snapshots.
Path("scraped").mkdir(exist_ok=True)

print("=" * 80)
print("ALIEXPRESS ANALYSIS - Fetch & HTML structure")
print("=" * 80)
print(f"\nURL: {url}\n")

# Test 1: try plain HTTP first
print("[TEST 1] Trying plain HTTP...")
result_http = fetch_http(url, timeout=30)

if result_http.success:
    print(f"✓ HTTP works: {len(result_http.html):,} characters")
    print(f"  Duration: {result_http.duration_ms}ms")
    html_to_use = result_http.html
    method = "http"
    # Save a snapshot for offline inspection
    with open("scraped/aliexpress_http.html", "w", encoding="utf-8") as f:
        f.write(result_http.html)
    print("✓ Saved: scraped/aliexpress_http.html")
else:
    print(f"✗ HTTP failed: {result_http.error}")
    print("\n[TEST 2] Trying Playwright...")
    result_pw = fetch_playwright(url, headless=True, timeout_ms=60000)
    if not result_pw.success:
        print(f"❌ Playwright FAILED: {result_pw.error}")
        sys.exit(1)
    print(f"✓ Playwright works: {len(result_pw.html):,} characters")
    print(f"  Duration: {result_pw.duration_ms}ms")
    html_to_use = result_pw.html
    method = "playwright"
    # Save a snapshot for offline inspection
    with open("scraped/aliexpress_pw.html", "w", encoding="utf-8") as f:
        f.write(result_pw.html)
    print("✓ Saved: scraped/aliexpress_pw.html")

# HTML structure analysis
print("\n" + "=" * 80)
print("HTML STRUCTURE ANALYSIS")
print("=" * 80)

soup = BeautifulSoup(html_to_use, "lxml")

# [1] Structured data: JSON-LD Schema.org blocks
print("\n[1] JSON-LD Schema.org")
print("-" * 80)
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"✓ {len(json_ld_scripts)} JSON-LD block(s) found")
    for i, script in enumerate(json_ld_scripts[:2], 1):
        try:
            data = json.loads(script.string)
            print(f"\n  Block {i}: @type = {data.get('@type', 'N/A')}")
            if data.get("@type") == "Product":
                print(f"    → name: {data.get('name', 'N/A')}")
                print(f"    → offers: {data.get('offers', {}).get('price', 'N/A')}")
        except (json.JSONDecodeError, TypeError, AttributeError):
            # script.string can be None, and the payload may be a list
            # rather than a dict; treat both as a parse failure.
            print(f"  Block {i}: JSON parsing error")
else:
    print("✗ No JSON-LD found")

# [2] Product title
print("\n[2] Product title")
print("-" * 80)
title_selectors = [
    "h1",
    "div.product-title-text",
    "span.product-title",
    "div[class*='title']",
]
for selector in title_selectors:
    elem = soup.select_one(selector)
    if elem:
        text = elem.get_text(strip=True)[:100]
        print(f"✓ Found with '{selector}': {text}")
        break
else:
    print("✗ Title not found with basic selectors")

# [3] Price
print("\n[3] Price")
print("-" * 80)
price_selectors = [
    "span[class*='price']",
    "div[class*='price']",
    "span.product-price-value",
    "div.product-price",
]
for selector in price_selectors:
    elems = soup.select(selector)
    if elems:
        print(f"✓ Found {len(elems)} element(s) with '{selector}':")
        for elem in elems[:3]:
            text = elem.get_text(strip=True)
            print(f"  → {text}")
        break
else:
    print("✗ Price not found with basic selectors")

# [4] Product images: keep alicdn.com assets, drop obvious page chrome
print("\n[4] Product images")
print("-" * 80)
img_elems = soup.find_all("img", src=True)
product_images = [
    img["src"]
    for img in img_elems
    if "alicdn.com" in img["src"]
    and not any(x in img["src"] for x in ["logo", "icon", "avatar"])
][:5]
if product_images:
    print(f"✓ {len(product_images)} product image(s) found:")
    for i, img_url in enumerate(product_images, 1):
        print(f"  [{i}] {img_url[:80]}...")
else:
    print("✗ No product images found")

# [5] Open Graph meta tags
print("\n[5] Meta tags")
print("-" * 80)
meta_tags = {
    "og:title": soup.find("meta", property="og:title"),
    "og:price:amount": soup.find("meta", property="og:price:amount"),
    "og:price:currency": soup.find("meta", property="og:price:currency"),
    "og:image": soup.find("meta", property="og:image"),
}
for key, elem in meta_tags.items():
    if elem:
        content = elem.get("content", "N/A")[:80]
        print(f"✓ {key}: {content}")
    else:
        print(f"✗ {key}: not found")

# [6] Data attributes (to identify stable selectors)
print("\n[6] Data attributes (for selectors)")
print("-" * 80)
data_elems = soup.find_all(attrs={"data-pl": True})[:5]
if data_elems:
    print(f"✓ {len(data_elems)} element(s) with data-pl:")
    for elem in data_elems:
        print(f"  → {elem.name} data-pl='{elem.get('data-pl')}'")
else:
    print("✗ No data-pl attributes")

# [7] Most frequent CSS classes, as selector candidates
print("\n[7] Frequent CSS classes")
print("-" * 80)
all_classes = []
for elem in soup.find_all(class_=True):
    if isinstance(elem["class"], list):
        all_classes.extend(elem["class"])

common_classes = Counter(all_classes).most_common(10)
if common_classes:
    print("Most frequent classes:")
    for cls, count in common_classes:
        print(f"  • {cls}: {count} occurrences")

print("\n" + "=" * 80)
print("RECOMMENDATIONS")
print("=" * 80)
print(f"✓ Fetch method: {method.upper()}")
if method == "http":
    print("  → HTTP works, prefer fetch_http()")
else:
    print("  → Playwright required (anti-bot)")

print("\n✓ Analysis complete - files saved in scraped/")
print("=" * 80)
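
# --- [8] Inline JS payload (hedged follow-up sketch) -------------------------
# AliExpress pages frequently embed product data in an inline script instead
# of (or in addition to) JSON-LD. The `window.runParams` name below is an
# assumption based on commonly observed AliExpress page builds, not something
# this script has verified; check the saved HTML snapshot before relying on
# it. This sketch only detects such a script and reports its size; a real
# extractor would have to slice the JS assignment out and parse it separately.
print("\n[8] Inline JS payload (window.runParams?)")
print("-" * 80)
inline_scripts = [
    s for s in soup.find_all("script")
    if s.string and "window.runParams" in s.string
]
if inline_scripts:
    print(f"✓ {len(inline_scripts)} inline script(s) mentioning window.runParams")
    for s in inline_scripts[:2]:
        print(f"  → script length: {len(s.string):,} characters")
else:
    print("✗ No window.runParams found (the assumption may not hold here)")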