#!/usr/bin/env python3
|
|
"""Fetch et analyse initiale d'une page produit AliExpress."""
|
|
|
|
import json
from collections import Counter
from pathlib import Path

from bs4 import BeautifulSoup

from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pw_fetch import fetch_playwright
|
|
|
|
# Target product page for this one-off structural analysis.
url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ANALYSE ALIEXPRESS - Fetch & Structure HTML")
print("=" * 80)
print(f"\nURL: {url}\n")

# Make sure the output directory exists before saving snapshots;
# open() below would otherwise raise FileNotFoundError.
Path("scraped").mkdir(parents=True, exist_ok=True)

# Test 1: try plain HTTP first (cheapest fetch method).
print("[TEST 1] Tentative avec HTTP simple...")
result_http = fetch_http(url, timeout=30)

if result_http.success:
    print(f"✓ HTTP fonctionne: {len(result_http.html):,} caractères")
    print(f" Durée: {result_http.duration_ms}ms")
    html_to_use = result_http.html
    method = "http"

    # Save the raw HTML for offline inspection.
    with open("scraped/aliexpress_http.html", "w", encoding="utf-8") as f:
        f.write(result_http.html)
    print("✓ Sauvegardé: scraped/aliexpress_http.html")
else:
    print(f"✗ HTTP échoue: {result_http.error}")
    print("\n[TEST 2] Tentative avec Playwright...")

    # Fallback: a real browser gets past the anti-bot layer.
    result_pw = fetch_playwright(url, headless=True, timeout_ms=60000)

    if not result_pw.success:
        print(f"❌ ÉCHEC Playwright: {result_pw.error}")
        # raise SystemExit instead of exit(): the exit() builtin comes from
        # the site module and is not guaranteed outside interactive use.
        raise SystemExit(1)

    print(f"✓ Playwright fonctionne: {len(result_pw.html):,} caractères")
    print(f" Durée: {result_pw.duration_ms}ms")
    html_to_use = result_pw.html
    method = "playwright"

    # Save the rendered HTML for offline inspection.
    with open("scraped/aliexpress_pw.html", "w", encoding="utf-8") as f:
        f.write(result_pw.html)
    print("✓ Sauvegardé: scraped/aliexpress_pw.html")
|
|
|
|
# ---- HTML structure analysis ----
print("\n" + "=" * 80)
print("ANALYSE DE LA STRUCTURE HTML")
print("=" * 80)

soup = BeautifulSoup(html_to_use, "lxml")

# [1] JSON-LD: Schema.org structured data is the most reliable source
# for name/price when the page provides it.
print("\n[1] JSON-LD Schema.org")
print("-" * 80)
json_ld_scripts = soup.find_all("script", {"type": "application/ld+json"})
if json_ld_scripts:
    print(f"✓ {len(json_ld_scripts)} bloc(s) JSON-LD trouvé(s)")
    for i, script in enumerate(json_ld_scripts[:2], 1):
        try:
            data = json.loads(script.string)
            print(f"\n Bloc {i}: @type = {data.get('@type', 'N/A')}")
            if data.get("@type") == "Product":
                print(f" → name: {data.get('name', 'N/A')}")
                print(f" → offers: {data.get('offers', {}).get('price', 'N/A')}")
        except (TypeError, AttributeError, json.JSONDecodeError):
            # TypeError: empty <script> tag (script.string is None);
            # JSONDecodeError: malformed payload;
            # AttributeError: payload is a JSON array, so .get() fails.
            print(f" Bloc {i}: Erreur de parsing JSON")
else:
    print("✗ Pas de JSON-LD trouvé")
|
|
|
|
# [2] Product title: probe a few generic selectors, stop at the first hit.
print("\n[2] Titre du produit")
print("-" * 80)
title_selectors = [
    "h1",
    "div.product-title-text",
    "span.product-title",
    "div[class*='title']",
]
for candidate in title_selectors:
    node = soup.select_one(candidate)
    if node is None:
        continue
    snippet = node.get_text(strip=True)[:100]
    print(f"✓ Trouvé avec '{candidate}': {snippet}")
    break
else:
    # for/else: no selector matched anything.
    print("✗ Titre non trouvé avec sélecteurs basiques")
|
|
|
|
# [3] Price: probe generic price-ish selectors, show up to three matches.
print("\n[3] Prix")
print("-" * 80)
price_selectors = [
    "span[class*='price']",
    "div[class*='price']",
    "span.product-price-value",
    "div.product-price",
]
for candidate in price_selectors:
    matches = soup.select(candidate)
    if not matches:
        continue
    print(f"✓ Trouvé {len(matches)} élément(s) avec '{candidate}':")
    for node in matches[:3]:
        print(f" → {node.get_text(strip=True)}")
    break
else:
    # for/else: no selector matched anything.
    print("✗ Prix non trouvé avec sélecteurs basiques")
|
|
|
|
# [4] Product images: alicdn-hosted <img> tags, excluding obvious UI chrome,
# capped at five URLs.
print("\n[4] Images produit")
print("-" * 80)
excluded_markers = ("logo", "icon", "avatar")
product_images = []
for tag in soup.find_all("img", src=True):
    src = tag["src"]
    if "alicdn.com" not in src:
        continue
    if any(marker in src for marker in excluded_markers):
        continue
    product_images.append(src)
    if len(product_images) == 5:
        break

if product_images:
    print(f"✓ {len(product_images)} image(s) produit trouvée(s):")
    for i, img_url in enumerate(product_images, 1):
        print(f" [{i}] {img_url[:80]}...")
else:
    print("✗ Aucune image produit trouvée")
|
|
|
|
# [5] Open Graph meta tags: often carry title/price/image even on
# otherwise script-rendered pages.
print("\n[5] Meta Tags")
print("-" * 80)
og_properties = (
    "og:title",
    "og:price:amount",
    "og:price:currency",
    "og:image",
)
for prop in og_properties:
    tag = soup.find("meta", property=prop)
    if tag:
        print(f"✓ {prop}: {tag.get('content', 'N/A')[:80]}")
    else:
        print(f"✗ {prop}: Non trouvé")
|
|
|
|
# [6] data-pl attributes: site-specific hooks that make stable selectors.
print("\n[6] Data Attributes (pour sélecteurs)")
print("-" * 80)
pl_nodes = soup.find_all(attrs={"data-pl": True})[:5]
if not pl_nodes:
    print("✗ Pas d'attributs data-pl")
else:
    print(f"✓ {len(pl_nodes)} éléments avec data-pl:")
    for node in pl_nodes:
        print(f" → {node.name} data-pl='{node.get('data-pl')}'")
|
|
|
|
# [7] Most frequent CSS classes: a hint at the site's naming scheme
# for building robust selectors later.
print("\n[7] Classes CSS Fréquentes")
print("-" * 80)
class_counts = Counter()
for elem in soup.find_all(class_=True):
    # BeautifulSoup returns multi-valued class attributes as a list;
    # guard in case a parser yields a plain string instead.
    if isinstance(elem["class"], list):
        class_counts.update(elem["class"])

common_classes = class_counts.most_common(10)
if common_classes:
    print("Classes les plus fréquentes:")
    for cls, count in common_classes:
        print(f" • {cls}: {count} occurrences")
|
|
|
|
# ---- Summary: which fetch method to wire into the scraper ----
print("\n" + "=" * 80)
print("RECOMMANDATIONS")
print("=" * 80)
print(f"✓ Méthode de fetch: {method.upper()}")
recommendation = (
    " → HTTP fonctionne, utiliser fetch_http() prioritaire"
    if method == "http"
    else " → Playwright requis (anti-bot)"
)
print(recommendation)

print("\n✓ Analyse terminée - Fichiers sauvegardés dans scraped/")
print("=" * 80)
|