#!/usr/bin/env python3
"""Analyze the JavaScript data embedded in a scraped AliExpress page.

The script reads a saved HTML page, then reports (on stdout) four things:

1. the ``window.runParams`` object, also dumped to a JSON file;
2. the ``window._d_c_.DCData`` object, also dumped to a JSON file;
3. small flat JSON objects that carry product-like keys;
4. price-looking values found by a handful of regular expressions.
"""

import json
import re
from itertools import islice
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

HTML_FILE = Path("scraped/aliexpress_pw.html")

# Kept as plain strings (not Path) so the paths echoed in the report are
# printed exactly as written here on every platform.
RUN_PARAMS_OUTPUT = "scraped/aliexpress_runParams.json"
DC_DATA_OUTPUT = "scraped/aliexpress_dcdata.json"

# Flat (brace-free) objects that mention at least one product-like key.
PRODUCT_OBJECT_RE = re.compile(
    r'\{[^{}]*(?:"(?:title|price|product|sku|id)"[^{}]*)+\}',
    re.IGNORECASE,
)
INTERESTING_KEYS = frozenset({"title", "price", "product", "sku", "id", "name"})
MAX_CANDIDATES = 50  # only the first 50 regex candidates are parsed

# (compiled pattern, human-readable description) pairs, reported in order.
PRICE_PATTERNS = (
    (re.compile(r'"price":\s*"?([0-9]+\.?[0-9]*)"?'), "price key"),
    (re.compile(r'"minPrice":\s*"?([0-9]+\.?[0-9]*)"?'), "minPrice key"),
    (re.compile(r'"maxPrice":\s*"?([0-9]+\.?[0-9]*)"?'), "maxPrice key"),
    (re.compile(r'€\s*([0-9]+[.,][0-9]{2})'), "Euro symbol"),
    (re.compile(r'([0-9]+[.,][0-9]{2})\s*€'), "Price before Euro"),
)


def extract_js_object(html: str, js_name: str) -> Optional[dict]:
    """Return the JSON object assigned to *js_name* in *html*.

    The first ``<js_name> = {`` assignment is located and the object that
    starts at that brace is decoded with the JSON decoder itself, so
    semicolons or braces inside string values do not truncate it.

    Returns ``None`` when no such assignment exists.
    Raises ``json.JSONDecodeError`` when the assigned value is not valid JSON.
    """
    assignment = re.search(re.escape(js_name) + r"\s*=\s*(?=\{)", html)
    if assignment is None:
        return None
    # raw_decode stops at the end of the first JSON document and ignores
    # whatever follows it (the trailing ";" and the rest of the page).
    data, _consumed = json.JSONDecoder().raw_decode(html[assignment.end():])
    return data


def save_json(data: object, path: str) -> None:
    """Write *data* to *path* as indented JSON, encoded as UTF-8."""
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be explicit rather than left to the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def find_product_objects(
    html: str, limit: int = MAX_CANDIDATES
) -> List[Tuple[Set[str], dict]]:
    """Return ``(matching_keys, object)`` for flat JSON objects in *html*.

    Only the first *limit* regex candidates are examined; candidates that
    are not valid JSON are skipped.
    """
    found = []
    for candidate in islice(PRODUCT_OBJECT_RE.finditer(html), limit):
        try:
            data = json.loads(candidate.group())
        except json.JSONDecodeError:
            continue  # looked like an object but is not strict JSON
        hits = data.keys() & INTERESTING_KEYS
        if hits:
            found.append((hits, data))
    return found


def find_prices(html: str) -> List[Tuple[str, List[str]]]:
    """Return ``(description, captured_values)`` for each price pattern."""
    return [(desc, pattern.findall(html)) for pattern, desc in PRICE_PATTERNS]


def unique_in_order(values: List[str]) -> List[str]:
    """Return *values* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(values))


def report_run_params(html: str) -> None:
    """Print section 1 (``window.runParams``) and save the parsed object."""
    print("\n[1] window.runParams")
    print("-" * 80)
    try:
        data = extract_js_object(html, "window.runParams")
        if data is None:
            print("✗ Non trouvé")
            return
        print(f"✓ Trouvé et parsé: {len(json.dumps(data))} chars")
        print(f"Keys: {list(data.keys())}")
        # Save for inspection
        save_json(data, RUN_PARAMS_OUTPUT)
        print(f"✓ Sauvegardé: {RUN_PARAMS_OUTPUT}")
    except (json.JSONDecodeError, OSError) as e:
        print(f"✗ Erreur parsing: {e}")


def report_dc_data(html: str) -> None:
    """Print section 2 (``window._d_c_.DCData``) and save the parsed object."""
    print("\n[2] window._d_c_.DCData")
    print("-" * 80)
    try:
        data = extract_js_object(html, "window._d_c_.DCData")
        if data is None:
            print("✗ Non trouvé")
            return
        print("✓ Trouvé et parsé")
        print(f"Keys: {list(data.keys())}")
        images = data.get("imagePathList")
        if isinstance(images, list):
            print(f" → {len(images)} images")
            for i, img in enumerate(images[:3], 1):
                # str() guards against non-string entries in the list
                print(f" [{i}] {str(img)[:70]}...")
        save_json(data, DC_DATA_OUTPUT)
        print(f"✓ Sauvegardé: {DC_DATA_OUTPUT}")
    except (json.JSONDecodeError, OSError) as e:
        print(f"✗ Erreur: {e}")


def report_product_objects(html: str) -> None:
    """Print section 3: up to five objects carrying product-like keys."""
    print("\n[3] Recherche de données produit dans tous les scripts")
    print("-" * 80)
    found = find_product_objects(html)
    if not found:
        print("✗ Aucun objet intéressant trouvé")
        return
    print(f"✓ Trouvé {len(found)} objet(s) avec données intéressantes:")
    for i, (keys, data) in enumerate(found[:5], 1):
        print(f"\n Objet {i}: keys = {keys}")
        # sorted() keeps the per-key lines in a stable order between runs
        for key in sorted(keys):
            value = str(data.get(key, ""))[:60]
            print(f" → {key}: {value}")


def report_prices(html: str) -> None:
    """Print section 4: match counts and sample values per price pattern."""
    print("\n[4] Recherche de patterns de prix")
    print("-" * 80)
    for desc, matches in find_prices(html):
        if not matches:
            print(f"✗ {desc}: Aucun")
            continue
        print(f"✓ {desc}: {len(matches)} match(es)")
        # Distinct values among the first five matches, in first-seen order
        for price in unique_in_order(matches[:5]):
            print(f" → {price}")


def main(html_file: Path = HTML_FILE) -> None:
    """Read *html_file* and print the full extraction report.

    Raises ``OSError`` (e.g. ``FileNotFoundError``) if the page cannot be read.
    """
    html = html_file.read_text(encoding="utf-8")

    print("=" * 80)
    print("EXTRACTION DES DONNÉES ALIEXPRESS")
    print("=" * 80)

    report_run_params(html)
    report_dc_data(html)
    report_product_objects(html)
    report_prices(html)

    print("\n" + "=" * 80)
    print("FIN DE L'EXTRACTION")
    print("=" * 80)


if __name__ == "__main__":
    main()