chore: sync project files
fetch_aliexpress_wait.py (new executable file, +98 lines)
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""Test AliExpress fetch with a wait for dynamic content to load."""

import json
import os
import re
import sys

from bs4 import BeautifulSoup

from pricewatch.app.scraping.pw_fetch import fetch_playwright

url = "https://fr.aliexpress.com/item/1005007187023722.html"

print("=" * 80)
print("ALIEXPRESS - Fetch with wait")
print("=" * 80)
print(f"URL: {url}\n")

# Try several different wait selectors
wait_selectors = [
    ("h1", "h1 title"),
    (".product-title", "Product title class"),
    ("img[alt]", "Image with alt"),
    (".product-price", "Price"),
]
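
# Heuristic (assumption): the render that yields the largest HTML payload is
# treated as the most fully loaded page, so the loop keeps the biggest response.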
best_result = None
best_size = 0
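
# fetch_playwright (project helper) is assumed to block until the selector
# appears or the timeout expires, and to return a result object exposing
# .success, .html, .duration_ms and .error.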
for selector, desc in wait_selectors:
    print(f"\nTesting with wait_for_selector='{selector}' ({desc})...")

    result = fetch_playwright(
        url,
        headless=True,
        timeout_ms=15000,  # 15s timeout
        wait_for_selector=selector,
    )

    if result.success:
        size = len(result.html)
        print(f"✓ Success: {size:,} chars ({result.duration_ms}ms)")

        if size > best_size:
            best_size = size
            best_result = result
    else:
        print(f"✗ Failure: {result.error}")

# Use the best result
if not best_result:
    print("\n❌ No valid result")
    sys.exit(1)

print("\n" + "=" * 80)
print("BEST RESULT ANALYSIS")
print("=" * 80)
print(f"HTML size: {len(best_result.html):,} chars")

# Save the HTML
os.makedirs("scraped", exist_ok=True)  # ensure the output directory exists
with open("scraped/aliexpress_wait.html", "w", encoding="utf-8") as f:
    f.write(best_result.html)
print("✓ Saved: scraped/aliexpress_wait.html")

# Quick analysis
soup = BeautifulSoup(best_result.html, "lxml")

print("\n[Title]")
og_title = soup.find("meta", property="og:title")
if og_title:
    title = og_title.get("content", "")
    print(f"✓ og:title: {title[:100]}")

h1 = soup.find("h1")
if h1:
    print(f"✓ h1: {h1.get_text(strip=True)[:100]}")
else:
    print("✗ No h1")

print("\n[Price]")
price_match = re.search(r'([0-9]+[.,][0-9]{2})\s*€|€\s*([0-9]+[.,][0-9]{2})', best_result.html)
if price_match:
    price = price_match.group(1) or price_match.group(2)
    print(f"✓ Price found by regex: {price} €")
else:
    print("✗ Price not found")

print("\n[Images]")
dcdata_match = re.search(r'window\._d_c_\.DCData\s*=\s*(\{[^;]*\});', best_result.html, re.DOTALL)
if dcdata_match:
    try:
        data = json.loads(dcdata_match.group(1))
        if "imagePathList" in data:
            images = data["imagePathList"]
            print(f"✓ {len(images)} images found in DCData")
            for i, img in enumerate(images[:3], 1):
                print(f"  [{i}] {img[:70]}...")
    except (json.JSONDecodeError, TypeError):  # tolerate a malformed or unexpected blob
        pass

print("\n" + "=" * 80)