113 lines
3.5 KiB
Python
Executable File
113 lines
3.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Analyse des données JavaScript d'AliExpress."""
|
|
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Read the saved AliExpress page as UTF-8 text; every section below searches
# this single string.
html_file = Path("scraped/aliexpress_pw.html")
with html_file.open(encoding="utf-8") as page:
    html = page.read()

# Opening banner: an 80-column rule above and below the report title.
print("=" * 80, "EXTRACTION DES DONNÉES ALIEXPRESS", "=" * 80, sep="\n")
|
|
|
|
# 1. Extract window.runParams
print("\n[1] window.runParams")
print("-" * 80)

# The regex only locates the assignment and checks that an object literal
# follows.  Finding the END of the object is left to the JSON decoder: a
# pattern such as "{[^;]*};" cannot match any payload that contains a
# semicolon, which is common inside URLs and text values.
match = re.search(r'window\.runParams\s*=\s*(?=\{)', html)
if match:
    try:
        # raw_decode parses exactly one JSON value starting at the given
        # index and ignores whatever JavaScript follows it.  An object that
        # is not strict JSON raises JSONDecodeError and is reported below.
        data, _end = json.JSONDecoder().raw_decode(html, match.end())
        print(f"✓ Trouvé et parsé: {len(json.dumps(data))} chars")
        print(f"Keys: {list(data.keys())}")

        # Save for inspection.  ensure_ascii=False writes non-ASCII
        # characters verbatim, so the file encoding is pinned to UTF-8
        # instead of being left to the platform default.
        with open(
            "scraped/aliexpress_runParams.json", "w", encoding="utf-8"
        ) as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("✓ Sauvegardé: scraped/aliexpress_runParams.json")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure (parse or
        # write) and let the remaining sections of the report still run.
        print(f"✗ Erreur parsing: {e}")
else:
    print("✗ Non trouvé")
|
|
|
|
# 2. Extract DCData
print("\n[2] window._d_c_.DCData")
print("-" * 80)

# The regex only locates the assignment and checks that an object literal
# follows.  The extent of the object is determined by the JSON decoder, so
# payloads containing semicolons (URLs, text values) are no longer missed.
match = re.search(r'window\._d_c_\.DCData\s*=\s*(?=\{)', html)
if match:
    try:
        # Parse one JSON value starting at the opening brace; trailing
        # JavaScript after the object is ignored.
        data, _end = json.JSONDecoder().raw_decode(html, match.end())
        print("✓ Trouvé et parsé")
        print(f"Keys: {list(data.keys())}")

        # Show how many image paths are present and preview the first three,
        # each truncated to 70 characters.
        if "imagePathList" in data:
            print(f" → {len(data['imagePathList'])} images")
            for i, img in enumerate(data['imagePathList'][:3], 1):
                print(f" [{i}] {img[:70]}...")

        # Save.  ensure_ascii=False writes non-ASCII characters verbatim, so
        # the file encoding is pinned to UTF-8 instead of being left to the
        # platform default.
        with open(
            "scraped/aliexpress_dcdata.json", "w", encoding="utf-8"
        ) as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("✓ Sauvegardé: scraped/aliexpress_dcdata.json")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure (parse,
        # unexpected value type, or write) and keep the report going.
        print(f"✗ Erreur: {e}")
else:
    print("✗ Non trouvé")
|
|
|
|
# 3. Search for any object with product/price/title keys
print("\n[3] Recherche de données produit dans tous les scripts")
print("-" * 80)

# Find all potential JSON objects: flat "{...}" spans (no nested braces) that
# contain at least one of the quoted candidate words, in any letter case.
json_pattern = re.compile(r'\{[^{}]*(?:"(?:title|price|product|sku|id)"[^{}]*)+\}', re.IGNORECASE)
matches = json_pattern.findall(html)

# Key names worth reporting, compared in lower case so the filter agrees with
# the case-insensitive regex above.
interesting = {"title", "price", "product", "sku", "id", "name"}

found_data = []
for match_text in matches[:50]:  # Limit to first 50
    try:
        data = json.loads(match_text)
    except ValueError:
        # The span is not valid JSON (JSONDecodeError is a ValueError):
        # skip it.  Anything else, e.g. Ctrl-C, is allowed to propagate.
        continue

    # Keep the object only if it has at least one interesting key; the keys
    # are stored as they appear in the object so data.get() finds them.
    keys = {key for key in data if key.lower() in interesting}
    if keys:
        found_data.append((keys, data))

if found_data:
    print(f"✓ Trouvé {len(found_data)} objet(s) avec données intéressantes:")
    # Preview at most five objects, each value truncated to 60 characters.
    for i, (keys, data) in enumerate(found_data[:5], 1):
        print(f"\n Objet {i}: keys = {keys}")
        for key in keys:
            value = str(data.get(key, ""))[:60]
            print(f" → {key}: {value}")
else:
    print("✗ Aucun objet intéressant trouvé")
|
|
|
|
# 4. Search for price patterns in text
print("\n[4] Recherche de patterns de prix")
print("-" * 80)

# (regex, label) pairs.  The first three capture the number that follows a
# JSON-style key, optionally quoted; the last two capture an amount with two
# decimals written next to a euro sign.
price_patterns = [
    (r'"price":\s*"?([0-9]+\.?[0-9]*)"?', "price key"),
    (r'"minPrice":\s*"?([0-9]+\.?[0-9]*)"?', "minPrice key"),
    (r'"maxPrice":\s*"?([0-9]+\.?[0-9]*)"?', "maxPrice key"),
    (r'€\s*([0-9]+[.,][0-9]{2})', "Euro symbol"),
    (r'([0-9]+[.,][0-9]{2})\s*€', "Price before Euro"),
]

for pattern, desc in price_patterns:
    matches = re.findall(pattern, html)
    if not matches:
        print(f"✗ {desc}: Aucun")
        continue

    print(f"✓ {desc}: {len(matches)} match(es)")
    # Show the distinct values among the first five matches.  dict.fromkeys
    # de-duplicates while keeping first-seen order, so the sample is printed
    # in the same order on every run (a set's order varies with hashing).
    for price in dict.fromkeys(matches[:5]):
        print(f" → {price}")
|
|
|
|
# Closing banner: a blank line, then the end-of-report title framed by two
# 80-column rules.
print("\n".join(["", "=" * 80, "FIN DE L'EXTRACTION", "=" * 80]))
|