chore: sync project files

Gilles Soulier
2026-01-13 19:49:04 +01:00
parent 53f8227941
commit ecda149a4b
149 changed files with 65272 additions and 1 deletions

analyze_aliexpress_data.py Executable file

@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""Analyse des données JavaScript d'AliExpress."""
import re
import json
from pathlib import Path
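# Input: HTML snapshot saved earlier (presumably by a Playwright-based scraper,
# given the "_pw" suffix); read_text() raises FileNotFoundError if it is missing.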
html_file = Path("scraped/aliexpress_pw.html")
html = html_file.read_text(encoding="utf-8")
print("=" * 80)
print("EXTRACTION DES DONNÉES ALIEXPRESS")
print("=" * 80)
# 1. Extract window.runParams
print("\n[1] window.runParams")
print("-" * 80)
match = re.search(r'window\.runParams\s*=\s*(\{[^;]*\});', html, re.DOTALL)
if match:
    try:
        data = json.loads(match.group(1))
        print(f"✓ Found and parsed: {len(json.dumps(data))} chars")
        print(f"Keys: {list(data.keys())}")
        # Save for inspection
        with open("scraped/aliexpress_runParams.json", "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("✓ Saved: scraped/aliexpress_runParams.json")
    except Exception as e:
        print(f"✗ Parse error: {e}")
else:
    print("✗ Not found")
# 2. Extract DCData
print("\n[2] window._d_c_.DCData")
print("-" * 80)
match = re.search(r'window\._d_c_\.DCData\s*=\s*(\{[^;]*\});', html, re.DOTALL)
if match:
    try:
        data = json.loads(match.group(1))
        print("✓ Found and parsed")
        print(f"Keys: {list(data.keys())}")
        if "imagePathList" in data:
            print(f"{len(data['imagePathList'])} images")
            for i, img in enumerate(data['imagePathList'][:3], 1):
                print(f" [{i}] {img[:70]}...")
        # Save
        with open("scraped/aliexpress_dcdata.json", "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print("✓ Saved: scraped/aliexpress_dcdata.json")
    except Exception as e:
        print(f"✗ Error: {e}")
else:
    print("✗ Not found")
# 3. Search for any object with product/price/title keys
print("\n[3] Recherche de données produit dans tous les scripts")
print("-" * 80)
# Find all potential JSON objects
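# This pattern only matches "flat" objects (no nested braces) that contain at
# least one of the listed key names; deeply nested product objects are missed.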
json_pattern = re.compile(r'\{[^{}]*(?:"(?:title|price|product|sku|id)"[^{}]*)+\}', re.IGNORECASE)
matches = json_pattern.findall(html)
found_data = []
for match_text in matches[:50]:  # Limit to first 50
    try:
        # Try to parse the candidate object as JSON
        data = json.loads(match_text)
        # Check if it has interesting keys
        keys = set(data.keys())
        interesting = {"title", "price", "product", "sku", "id", "name"}
        if keys & interesting:
            found_data.append((keys & interesting, data))
    except json.JSONDecodeError:
        # Not valid JSON; skip it
        pass
if found_data:
    print(f"✓ Found {len(found_data)} object(s) with interesting data:")
    for i, (keys, data) in enumerate(found_data[:5], 1):
        print(f"\n Object {i}: keys = {keys}")
        for key in keys:
            value = str(data.get(key, ""))[:60]
            print(f"{key}: {value}")
else:
    print("✗ No interesting objects found")
# 4. Search for price patterns in text
print("\n[4] Recherche de patterns de prix")
print("-" * 80)
price_patterns = [
    (r'"price":\s*"?([0-9]+\.?[0-9]*)"?', "price key"),
    (r'"minPrice":\s*"?([0-9]+\.?[0-9]*)"?', "minPrice key"),
    (r'"maxPrice":\s*"?([0-9]+\.?[0-9]*)"?', "maxPrice key"),
    (r'€\s*([0-9]+[.,][0-9]{2})', "Euro symbol"),
    (r'([0-9]+[.,][0-9]{2})\s*€', "Price before Euro"),
]
for pattern, desc in price_patterns:
    matches = re.findall(pattern, html)
    if matches:
        print(f"{desc}: {len(matches)} match(es)")
        for price in set(matches[:5]):
            print(f"{price}")
    else:
        print(f"{desc}: None")
print("\n" + "=" * 80)
print("FIN DE L'EXTRACTION")
print("=" * 80)