chore: sync project files
fetch_aliexpress_wait.py (new executable file, +98 lines)
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""Test AliExpress fetch with a wait for dynamic content to load."""

import json
import os
import re
import sys

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch with wait")
print("=" * 80)
print(f"URL: {url}\n")

# Try several different wait selectors
wait_selectors = [
    ("h1", "h1 title"),
    (".product-title", "Product title class"),
    ("img[alt]", "Image with alt"),
    (".product-price", "Price"),
]
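
# Heuristic (assumption): the render that yields the largest HTML payload is
# treated as the most fully loaded page, so the loop keeps the biggest response.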
best_result = None
best_size = 0
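
# fetch_playwright (project helper) is assumed to block until the selector
# appears or the timeout expires, and to return a result object exposing
# .success, .html, .duration_ms and .error.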
for selector, desc in wait_selectors:
    print(f"\nTesting with wait_for_selector='{selector}' ({desc})...")

    result = fetch_playwright(
        url,
        headless=True,
        timeout_ms=15000,  # 15s timeout
        wait_for_selector=selector,
    )

    if result.success:
        size = len(result.html)
        print(f"✓ Success: {size:,} chars ({result.duration_ms}ms)")

        if size > best_size:
            best_size = size
            best_result = result
    else:
        print(f"✗ Failure: {result.error}")

# Use the best result
if not best_result:
    print("\n❌ No valid result")
    sys.exit(1)

print("\n" + "=" * 80)
print("BEST RESULT ANALYSIS")
print("=" * 80)
print(f"HTML size: {len(best_result.html):,} chars")

# Save the HTML
os.makedirs("scraped", exist_ok=True)  # ensure the output directory exists
with open("scraped/aliexpress_wait.html", "w", encoding="utf-8") as f:
    f.write(best_result.html)
print("✓ Saved: scraped/aliexpress_wait.html")

# Quick analysis
soup = BeautifulSoup(best_result.html, "lxml")

print("\n[Title]")
og_title = soup.find("meta", property="og:title")
if og_title:
    title = og_title.get("content", "")
    print(f"✓ og:title: {title[:100]}")

h1 = soup.find("h1")
if h1:
    print(f"✓ h1: {h1.get_text(strip=True)[:100]}")
else:
    print("✗ No h1")

print("\n[Price]")
price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€|€\s*([0-9]+[.,][0-9]{2})', best_result.html)
if price_match:
    price = price_match.group(1) or price_match.group(2)
    print(f"✓ Price found by regex: {price} €")
else:
    print("✗ Price not found")

print("\n[Images]")
dcdata_match = re.search(r'window\._d_c_\.DCData\s*=\s*(\{[^;]*\});', best_result.html, re.DOTALL)
if dcdata_match:
    try:
        data = json.loads(dcdata_match.group(1))
        if "imagePathList" in data:
            images = data["imagePathList"]
            print(f"✓ {len(images)} images found in DCData")
            for i, img in enumerate(images[:3], 1):
                print(f"  [{i}] {img[:70]}...")
    except (json.JSONDecodeError, TypeError):  # tolerate a malformed or unexpected blob
        pass

print("\n" + "=" * 80)