chore: sync project files

Gilles Soulier
2026-01-13 19:49:04 +01:00
parent 53f8227941
commit ecda149a4b
149 changed files with 65272 additions and 1 deletion

fetch_aliexpress.py Executable file

@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""Fetch et analyse initiale d'une page produit AliExpress."""
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from bs4 import BeautifulSoup
import json
url = "https://fr.aliexpress.com/item/1005007187023722.html"
print("=" * 80)
print("ANALYSE ALIEXPRESS - Fetch & Structure HTML")
print("=" * 80)
print(f"\nURL: {url}\n")
# Test 1: try plain HTTP first
print("[TEST 1] Trying plain HTTP...")
# Make sure the output directory exists before writing snapshots into it.
Path("scraped").mkdir(exist_ok=True)

result_http = fetch_http(url, timeout=30)
if result_http.success:
    print(f"✓ HTTP works: {len(result_http.html):,} characters")
    print(f"  Duration: {result_http.duration_ms}ms")
    html_to_use = result_http.html
    method = "http"
    # Save the snapshot
    with open("scraped/aliexpress_http.html", "w", encoding="utf-8") as f:
        f.write(result_http.html)
    print("✓ Saved: scraped/aliexpress_http.html")
else:
    print(f"✗ HTTP failed: {result_http.error}")
    print("\n[TEST 2] Trying Playwright...")
    result_pw = fetch_playwright(url, headless=True, timeout_ms=60000)
    if not result_pw.success:
        print(f"❌ Playwright FAILED: {result_pw.error}")
        sys.exit(1)
    print(f"✓ Playwright works: {len(result_pw.html):,} characters")
    print(f"  Duration: {result_pw.duration_ms}ms")
    html_to_use = result_pw.html
    method = "playwright"
    # Save the snapshot
    with open("scraped/aliexpress_pw.html", "w", encoding="utf-8") as f:
        f.write(result_pw.html)
    print("✓ Saved: scraped/aliexpress_pw.html")
# Analyze the HTML structure
print("\n" + "=" * 80)
print("HTML STRUCTURE ANALYSIS")
print("=" * 80)
soup = BeautifulSoup(html_to_use, "lxml")
# Any JSON-LD?
print("\n[1] JSON-LD Schema.org")
print("-" * 80)
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"{len(json_ld_scripts)} JSON-LD block(s) found")
    for i, script in enumerate(json_ld_scripts[:2], 1):
        try:
            data = json.loads(script.string)
            print(f"\n  Block {i}: @type = {data.get('@type', 'N/A')}")
            if data.get("@type") == "Product":
                print(f"  → name: {data.get('name', 'N/A')}")
                print(f"  → offers: {data.get('offers', {}).get('price', 'N/A')}")
        except (TypeError, AttributeError, json.JSONDecodeError):
            # script.string may be None, the payload may not be valid JSON,
            # and "offers" may be a list rather than a dict
            print(f"  Block {i}: JSON parsing error")
else:
    print("✗ No JSON-LD found")
# Product title
print("\n[2] Product title")
print("-" * 80)
title_selectors = [
    "h1",
    "div.product-title-text",
    "span.product-title",
    "div[class*='title']",
]
for selector in title_selectors:
    elem = soup.select_one(selector)
    if elem:
        text = elem.get_text(strip=True)[:100]
        print(f"✓ Found with '{selector}': {text}")
        break
else:
    print("✗ Title not found with basic selectors")
# Price
print("\n[3] Price")
print("-" * 80)
price_selectors = [
    "span[class*='price']",
    "div[class*='price']",
    "span.product-price-value",
    "div.product-price",
]
for selector in price_selectors:
    elems = soup.select(selector)
    if elems:
        print(f"✓ Found {len(elems)} element(s) with '{selector}':")
        for elem in elems[:3]:
            text = elem.get_text(strip=True)
            print(f"  {text}")
        break
else:
    print("✗ Price not found with basic selectors")
# Product images
print("\n[4] Product images")
print("-" * 80)
img_elems = soup.find_all("img", src=True)
product_images = [
    img["src"] for img in img_elems
    if "alicdn.com" in img.get("src", "")
    and not any(x in img["src"] for x in ["logo", "icon", "avatar"])
][:5]
if product_images:
    print(f"{len(product_images)} product image(s) found:")
    for i, img_url in enumerate(product_images, 1):
        print(f"  [{i}] {img_url[:80]}...")
else:
    print("✗ No product images found")
# Meta tags
print("\n[5] Meta Tags")
print("-" * 80)
meta_tags = {
    "og:title": soup.find("meta", property="og:title"),
    "og:price:amount": soup.find("meta", property="og:price:amount"),
    "og:price:currency": soup.find("meta", property="og:price:currency"),
    "og:image": soup.find("meta", property="og:image"),
}
for key, elem in meta_tags.items():
    if elem:
        content = elem.get("content", "N/A")[:80]
        print(f"{key}: {content}")
    else:
        print(f"{key}: not found")
# Data attributes (to identify selectors)
print("\n[6] Data attributes (for selectors)")
print("-" * 80)
data_elems = soup.find_all(attrs={"data-pl": True})[:5]
if data_elems:
    print(f"{len(data_elems)} elements with data-pl:")
    for elem in data_elems:
        print(f"  {elem.name} data-pl='{elem.get('data-pl')}'")
else:
    print("✗ No data-pl attributes found")
# Frequent CSS classes (candidate selectors)
print("\n[7] Most frequent CSS classes")
print("-" * 80)
all_classes = []
for elem in soup.find_all(class_=True):
    if isinstance(elem["class"], list):
        all_classes.extend(elem["class"])
common_classes = Counter(all_classes).most_common(10)
if common_classes:
    print("Most frequent classes:")
    for cls, count in common_classes:
        print(f"  {cls}: {count} occurrences")
print("\n" + "=" * 80)
print("RECOMMANDATIONS")
print("=" * 80)
print(f"✓ Méthode de fetch: {method.upper()}")
if method == "http":
print(" → HTTP fonctionne, utiliser fetch_http() prioritaire")
else:
print(" → Playwright requis (anti-bot)")
print("\n✓ Analyse terminée - Fichiers sauvegardés dans scraped/")
print("=" * 80)