181 lines
5.6 KiB
Python
Executable File
181 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python3
"""Fetch an AliExpress product page with Playwright to obtain the JS-rendered content.

Diagnostic script: downloads the fully rendered HTML, saves it under ``scraped/``,
then probes the document for product data (JSON-LD, title, price, images,
embedded <script> data, frequent CSS classes) to help choose scraper selectors.
All user-facing output is in French, matching the rest of the tooling.
"""

import json
import os
import re
import sys
from collections import Counter

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch avec Playwright")
print("=" * 80)
print(f"\nURL: {url}\n")

print("Récupération avec Playwright (headless)...")
result = fetch_playwright(url, headless=True, timeout_ms=60000)

if not result.success:
    print(f"❌ ÉCHEC: {result.error}")
    # sys.exit is the reliable way to exit a script; builtin exit() comes from
    # the site module and is not guaranteed to exist.
    sys.exit(1)

print(f"✓ Page récupérée: {len(result.html):,} caractères")
print(f" Durée: {result.duration_ms}ms")

# Sauvegarder
html_file = "scraped/aliexpress_pw.html"
# Ensure the output directory exists so the script works on a fresh checkout.
os.makedirs(os.path.dirname(html_file), exist_ok=True)
with open(html_file, "w", encoding="utf-8") as f:
    f.write(result.html)
print(f"✓ HTML sauvegardé: {html_file}\n")

# Analyse détaillée
print("=" * 80)
print("ANALYSE DU CONTENU RENDU")
print("=" * 80)

soup = BeautifulSoup(result.html, "lxml")

# [1] JSON-LD: structured product data is the most reliable source when present.
print("\n[1] JSON-LD Schema.org")
print("-" * 80)
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"✓ {len(json_ld_scripts)} bloc(s) JSON-LD trouvé(s)")
    for i, script in enumerate(json_ld_scripts, 1):
        try:
            # script.string may be None for an empty tag; "" then raises
            # JSONDecodeError, which lands in the error branch below.
            data = json.loads(script.string or "")
        except json.JSONDecodeError as e:
            print(f" Bloc {i}: Erreur parsing - {e}")
            continue
        # JSON-LD is often a top-level *list* of entities; inspect the first
        # dict instead of crashing on data.get().
        if isinstance(data, list) and data and isinstance(data[0], dict):
            data = data[0]
        if isinstance(data, dict):
            print(f"\n Bloc {i}: @type = {data.get('@type', 'N/A')}")
            # Show only the first few keys to keep the report readable.
            for key, value in list(data.items())[:5]:
                if isinstance(value, str):
                    print(f" → {key}: {value[:80]}")
                else:
                    print(f" → {key}: {type(value).__name__}")
        else:
            print(f"\n Bloc {i}: @type = N/A")
else:
    print("✗ Pas de JSON-LD")

# [2] Product title: try progressively broader CSS selectors, then og:title.
print("\n[2] Titre du produit")
print("-" * 80)
title_selectors = [
    "h1",
    "h1.product-title-text",
    "div.product-title",
    "span[class*='title']",
    "div[class*='ProductTitle']",
    "span[class*='ProductTitle']",
]
for selector in title_selectors:
    elem = soup.select_one(selector)
    if elem:
        text = elem.get_text(strip=True)
        # Require a minimum length to skip icons/labels matched by accident.
        if text and len(text) > 10:
            print(f"✓ Trouvé avec '{selector}':")
            print(f" {text[:150]}")
            break
else:
    # for/else: no selector produced a plausible title.
    print("✗ Titre non trouvé - essai avec og:title")
    og_title = soup.find("meta", property="og:title")
    if og_title:
        print(f"✓ og:title: {og_title.get('content', 'N/A')[:150]}")

# [3] Price: CSS selectors first, raw-HTML regex as a fallback.
print("\n[3] Prix")
print("-" * 80)
price_selectors = [
    "span.product-price-value",
    "div.product-price-current",
    "span[class*='price']",
    "div[class*='Price']",
    "span[class*='Price']",
]
found_price = False
for selector in price_selectors:
    elems = soup.select(selector)
    if elems:
        print(f"✓ Trouvé {len(elems)} élément(s) avec '{selector}':")
        for elem in elems[:5]:
            text = elem.get_text(strip=True)
            if text:
                print(f" → {text}")
                found_price = True
    if found_price:
        break

if not found_price:
    print("✗ Prix non trouvé avec sélecteurs CSS")
    # Chercher dans le texte brut (ex: "12,34 €")
    price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€', result.html)
    if price_match:
        print(f"✓ Prix trouvé par regex: {price_match.group(0)}")

# [4] Product images: alicdn.com hosts product media; filter out UI assets.
print("\n[4] Images produit")
print("-" * 80)
img_elems = soup.find_all("img", src=True)
product_images = []
for img in img_elems:
    src = img.get("src", "")
    if "alicdn.com" in src and not any(x in src for x in ["logo", "icon", "avatar", "seller"]):
        if src not in product_images:  # de-duplicate while keeping order
            product_images.append(src)

if product_images:
    print(f"✓ {len(product_images)} image(s) trouvée(s):")
    for i, img_url in enumerate(product_images[:5], 1):
        print(f" [{i}] {img_url[:80]}...")
else:
    print("✗ Aucune image trouvée")

# [5] Inline <script> payloads: AliExpress embeds product data in JS globals.
print("\n[5] Data embarquée dans <script>")
print("-" * 80)
scripts = soup.find_all("script", type=None)
print(f"✓ {len(scripts)} scripts trouvés")

# Chercher window.runParams ou window.__INITIAL_STATE__
for script in scripts:
    if script.string and ("runParams" in script.string or "__INITIAL_STATE__" in script.string or "window.pageData" in script.string):
        print("✓ Script avec données trouvé:")
        print(f" Taille: {len(script.string):,} caractères")

        # Essayer d'extraire des infos
        if "runParams" in script.string:
            print(" → Contient 'runParams'")
        if "__INITIAL_STATE__" in script.string:
            print(" → Contient '__INITIAL_STATE__'")
        if "pageData" in script.string:
            print(" → Contient 'pageData'")

        # Chercher le titre dans le script (>= 20 chars to skip short labels)
        title_match = re.search(r'"title":\s*"([^"]{20,})"', script.string)
        if title_match:
            print(f" → Titre extrait: {title_match.group(1)[:100]}")

        # Chercher le prix dans le script (quoted or bare numeric value)
        price_match = re.search(r'"(minPrice|maxPrice|price|currentPrice)":\s*"?([0-9.]+)"?', script.string)
        if price_match:
            print(f" → Prix extrait: {price_match.group(2)}")

# [6] Most frequent CSS classes — a hint at the page's component structure.
print("\n[6] Classes CSS Fréquentes (indice de structure)")
print("-" * 80)
all_classes = []
for elem in soup.find_all(class_=True):
    if isinstance(elem["class"], list):
        all_classes.extend(elem["class"])

common_classes = Counter(all_classes).most_common(15)
if common_classes:
    print("Classes les plus fréquentes:")
    for cls, count in common_classes:
        print(f" • {cls}: {count}x")

print("\n" + "=" * 80)
print("FIN DE L'ANALYSE")
print("=" * 80)