chore: sync project files
This commit is contained in:
112
analyze_aliexpress_data.py
Executable file
112
analyze_aliexpress_data.py
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyse des données JavaScript d'AliExpress."""
|
||||
|
||||
import re
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Load the saved HTML page once; the extraction steps all search this text.
html_file = Path("scraped/aliexpress_pw.html")
with html_file.open(encoding="utf-8") as fh:
    html = fh.read()

# Opening banner: rule, title, rule - one per line.
print("=" * 80, "EXTRACTION DES DONNÉES ALIEXPRESS", "=" * 80, sep="\n")
|
||||
|
||||
# 1. Extract window.runParams
#
# The regex only locates the assignment; the object itself is decoded with
# JSONDecoder.raw_decode(), which stops at the end of the first complete JSON
# value. Unlike a "[^;]*" capture, this copes with ';' inside string values
# and does not require a trailing ';' after the object.
print("\n[1] window.runParams")
print("-" * 80)
match = re.search(r'window\.runParams\s*=\s*(?=\{)', html)
if match:
    try:
        # match.end() sits exactly on the opening '{' (zero-width lookahead).
        data, _end = json.JSONDecoder().raw_decode(html, match.end())
    except json.JSONDecodeError as e:
        # e.g. the value is a JavaScript object literal, not strict JSON.
        print(f"✗ Erreur parsing: {e}")
    else:
        print(f"✓ Trouvé et parsé: {len(json.dumps(data))} chars")
        print(f"Keys: {list(data.keys())}")

        # Save for inspection. ensure_ascii=False writes non-ASCII characters
        # verbatim, so the file encoding is pinned instead of left to the
        # platform default.
        try:
            with open("scraped/aliexpress_runParams.json", "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeEncodeError (e.g. lone surrogates).
            print(f"✗ Erreur écriture: {e}")
        else:
            print("✓ Sauvegardé: scraped/aliexpress_runParams.json")
else:
    print("✗ Non trouvé")
|
||||
|
||||
# 2. Extract DCData
#
# The regex only locates the assignment; JSONDecoder.raw_decode() then decodes
# the first complete JSON value starting at the opening brace, so ';' inside
# string values no longer breaks the extraction.
print("\n[2] window._d_c_.DCData")
print("-" * 80)
match = re.search(r'window\._d_c_\.DCData\s*=\s*(?=\{)', html)
if match:
    try:
        # match.end() sits exactly on the opening '{' (zero-width lookahead).
        data, _end = json.JSONDecoder().raw_decode(html, match.end())
    except json.JSONDecodeError as e:
        print(f"✗ Erreur: {e}")
    else:
        print("✓ Trouvé et parsé")
        print(f"Keys: {list(data.keys())}")

        # Preview at most three image entries; str() keeps the preview from
        # failing if an entry is not a plain string.
        images = data.get("imagePathList")
        if isinstance(images, list):
            print(f" → {len(images)} images")
            for i, img in enumerate(images[:3], 1):
                print(f" [{i}] {str(img)[:70]}...")

        # Save. ensure_ascii=False writes non-ASCII characters verbatim, so
        # the file encoding is pinned instead of left to the platform default.
        try:
            with open("scraped/aliexpress_dcdata.json", "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeEncodeError (e.g. lone surrogates).
            print(f"✗ Erreur: {e}")
        else:
            print("✓ Sauvegardé: scraped/aliexpress_dcdata.json")
else:
    print("✗ Non trouvé")
|
||||
|
||||
# 3. Search for any object with product/price/title keys
print("\n[3] Recherche de données produit dans tous les scripts")
print("-" * 80)

# Matches flat {...} fragments (no nested braces) that mention at least one of
# the quoted target keys, case-insensitively.
json_pattern = re.compile(r'\{[^{}]*(?:"(?:title|price|product|sku|id)"[^{}]*)+\}', re.IGNORECASE)

# Exact (case-sensitive) key names that make a decoded object worth reporting.
interesting = {"title", "price", "product", "sku", "id", "name"}

found_data = []
# Only the first 50 candidates are examined. Putting range(50) first in zip()
# ends the loop before a 51st match is requested, so the regex scan stops
# early instead of walking the entire page.
for _, candidate in zip(range(50), json_pattern.finditer(html)):
    match_text = candidate.group(0)
    try:
        data = json.loads(match_text)
    except json.JSONDecodeError:
        # The fragment looked like an object but is not valid JSON; skip it.
        continue

    # Keep the object only if it carries at least one interesting key.
    keys = set(data.keys())
    if keys & interesting:
        found_data.append((keys & interesting, data))

if found_data:
    print(f"✓ Trouvé {len(found_data)} objet(s) avec données intéressantes:")
    for i, (keys, data) in enumerate(found_data[:5], 1):
        print(f"\n Objet {i}: keys = {keys}")
        # Sorted so the per-key lines come out in the same order on every run.
        for key in sorted(keys):
            value = str(data.get(key, ""))[:60]
            print(f" → {key}: {value}")
else:
    print("✗ Aucun objet intéressant trouvé")
|
||||
|
||||
# 4. Search for price patterns in text
print("\n[4] Recherche de patterns de prix")
print("-" * 80)

# (regex, label) pairs; each regex captures the numeric part of a price.
price_patterns = [
    (r'"price":\s*"?([0-9]+\.?[0-9]*)"?', "price key"),
    (r'"minPrice":\s*"?([0-9]+\.?[0-9]*)"?', "minPrice key"),
    (r'"maxPrice":\s*"?([0-9]+\.?[0-9]*)"?', "maxPrice key"),
    (r'€\s*([0-9]+[.,][0-9]{2})', "Euro symbol"),
    (r'([0-9]+[.,][0-9]{2})\s*€', "Price before Euro"),
]

for pattern, desc in price_patterns:
    matches = re.findall(pattern, html)
    if matches:
        print(f"✓ {desc}: {len(matches)} match(es)")
        # De-duplicate the first five hits while keeping their order of
        # appearance; iterating a set would print them in an order that can
        # change from one run to the next.
        for price in dict.fromkeys(matches[:5]):
            print(f" → {price}")
    else:
        print(f"✗ {desc}: Aucun")

# Closing banner.
print("\n" + "=" * 80)
print("FIN DE L'EXTRACTION")
print("=" * 80)
|
||||
Reference in New Issue
Block a user