1
This commit is contained in:
Binary file not shown.
@@ -214,15 +214,17 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
|
||||
if not price_text:
|
||||
price_text = _safe_attr_soup(soup, "#twister-plus-price-data-price", "value")
|
||||
|
||||
price_list_text = _safe_text_soup(
|
||||
soup, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen"
|
||||
)
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text_soup(soup, "#priceblock_strikeprice")
|
||||
# prix conseillé (srpPriceBlock = "Prix conseillé : XXX €")
|
||||
price_list_text = _safe_text_soup(soup, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text_soup(soup, ".srpPriceBlock .a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text_soup(soup, ".srpPriceBlockAUI .a-offscreen")
|
||||
price_list_text = _safe_text_soup(soup, "#priceblock_strikeprice")
|
||||
# fallback sur corePriceDisplay (prix barré) si pas de srpPriceBlock
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text_soup(
|
||||
soup, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen"
|
||||
)
|
||||
|
||||
stock_text = _safe_text_soup(soup, "#availability span")
|
||||
if not stock_text:
|
||||
@@ -252,25 +254,33 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
|
||||
prime_eligible = True
|
||||
amazon_exclusive = "Exclusivité Amazon" if "Exclusivité Amazon" in soup.get_text() else None
|
||||
|
||||
# prix plus bas 30 jours (basisPrice avec mention "30 jours")
|
||||
lowest_30d_text = _extract_lowest_30d_text_soup(soup)
|
||||
lowest_30d_price = None
|
||||
if lowest_30d_text:
|
||||
lowest_30d_price = parse_price_fr(lowest_30d_text)
|
||||
if lowest_30d_price is not None:
|
||||
candidate_list = parse_price_fr(price_list_text)
|
||||
if candidate_list == lowest_30d_price:
|
||||
price_list_text = None
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text_soup(soup, ".srpPriceBlock .a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text_soup(soup, ".srpPriceBlockAUI .a-offscreen")
|
||||
|
||||
# si le prix conseillé == prix min 30j, c'est une erreur de détection
|
||||
# (le prix barré dans corePriceDisplay est en fait le prix min 30j, pas le conseillé)
|
||||
price_list_value = parse_price_fr(price_list_text)
|
||||
if price_list_value is not None and lowest_30d_price is not None and price_list_value == lowest_30d_price:
|
||||
price_list_text = None
|
||||
price_list_value = None
|
||||
|
||||
# réductions
|
||||
reduction_savings_text = _safe_text_soup(
|
||||
soup, "#corePriceDisplay_desktop_feature_div .savingsPercentage"
|
||||
)
|
||||
reduction_conseille_text = _safe_text_soup(soup, ".srpSavingsPercentageBlock")
|
||||
reduction_min_30j = _parse_percent(reduction_savings_text)
|
||||
reduction_conseille = _parse_percent(reduction_conseille_text)
|
||||
|
||||
# attribuer correctement les réductions selon ce qui est présent
|
||||
# - si prix min 30j présent, savingsPercentage = réduction par rapport au min 30j
|
||||
# - si prix conseillé présent (srpPriceBlock), srpSavingsPercentageBlock = réduction par rapport au conseillé
|
||||
reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
|
||||
reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
|
||||
# si pas de srpSavingsPercentageBlock mais un savingsPercentage et un prix conseillé (sans min 30j)
|
||||
if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
|
||||
reduction_conseille = _parse_percent(reduction_savings_text)
|
||||
|
||||
a_propos = _extract_about_bullets(soup)
|
||||
description = _extract_description(soup)
|
||||
@@ -285,7 +295,7 @@ def extract_product_data_from_html(html: str, url: str) -> dict[str, Any]:
|
||||
"titre": title,
|
||||
"url_image_principale": image_main_url,
|
||||
"prix_actuel": parse_price_fr(price_text),
|
||||
"prix_conseille": parse_price_fr(price_list_text),
|
||||
"prix_conseille": price_list_value,
|
||||
"prix_min_30j": lowest_30d_price,
|
||||
"prix_conseille_reduction": reduction_conseille,
|
||||
"prix_min_30j_reduction": reduction_min_30j,
|
||||
@@ -333,14 +343,15 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
|
||||
if not price_text:
|
||||
price_text = _safe_attr(page, "#twister-plus-price-data-price", "value")
|
||||
|
||||
# prix barré / conseillé
|
||||
price_list_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text(page, "#priceblock_strikeprice")
|
||||
# prix conseillé (srpPriceBlock = "Prix conseillé : XXX €")
|
||||
price_list_text = _safe_text(page, ".srpPriceBlock .srpPriceBlockAUI .a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text(page, ".srpPriceBlock .a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text(page, ".srpPriceBlockAUI .a-offscreen")
|
||||
price_list_text = _safe_text(page, "#priceblock_strikeprice")
|
||||
# fallback sur corePriceDisplay (prix barré) si pas de srpPriceBlock
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price span.a-offscreen")
|
||||
|
||||
# stock
|
||||
stock_text = _safe_text(page, "#availability span")
|
||||
@@ -374,34 +385,44 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
|
||||
|
||||
amazon_exclusive = _safe_text(page, "text=Exclusivité Amazon")
|
||||
|
||||
# prix plus bas 30 jours
|
||||
# prix plus bas 30 jours (basisPrice ou corePriceDisplay avec mention "30 jours")
|
||||
lowest_30d_text = None
|
||||
lowest_30d_price = None
|
||||
if page.locator(".basisPrice").count() > 0:
|
||||
basis_text = page.locator(".basisPrice").first.inner_text()
|
||||
if basis_text and re.search(r"prix.+(30|trente).+jour", basis_text.lower()):
|
||||
lowest_30d_text = _safe_text(page, ".basisPrice .a-offscreen") or basis_text
|
||||
if not lowest_30d_text and page.locator("#priceBadging_feature_div").count() > 0:
|
||||
lowest_30d_text = _safe_text(page, ".basisPrice .a-price .a-offscreen") or basis_text
|
||||
lowest_30d_price = parse_price_fr(lowest_30d_text)
|
||||
# fallback sur corePriceDisplay si contient mention 30 jours
|
||||
if lowest_30d_price is None and page.locator("#corePriceDisplay_desktop_feature_div .a-text-price").count() > 0:
|
||||
core_text = page.locator("#corePriceDisplay_desktop_feature_div").first.inner_text()
|
||||
if core_text and re.search(r"prix.+(30|trente).+jour", core_text.lower()):
|
||||
lowest_30d_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .a-text-price .a-offscreen")
|
||||
lowest_30d_price = parse_price_fr(lowest_30d_text)
|
||||
if not lowest_30d_price and page.locator("#priceBadging_feature_div").count() > 0:
|
||||
badging_text = page.locator("#priceBadging_feature_div").first.inner_text()
|
||||
if badging_text and re.search(r"prix.+(30|trente).+jour", badging_text.lower()):
|
||||
lowest_30d_text = _safe_text(page, "#priceBadging_feature_div .a-offscreen") or badging_text
|
||||
if lowest_30d_text and not re.search(r"prix.+(30|trente).+jour", lowest_30d_text.lower()):
|
||||
lowest_30d_text = None
|
||||
lowest_30d_price = None
|
||||
if lowest_30d_text and "prix" in lowest_30d_text.lower():
|
||||
lowest_30d_price = parse_price_fr(lowest_30d_text)
|
||||
if lowest_30d_price is not None:
|
||||
candidate_list = parse_price_fr(price_list_text)
|
||||
if candidate_list == lowest_30d_price:
|
||||
price_list_text = None
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text(page, ".srpPriceBlock .a-offscreen")
|
||||
if not price_list_text:
|
||||
price_list_text = _safe_text(page, ".srpPriceBlockAUI .a-offscreen")
|
||||
lowest_30d_price = parse_price_fr(lowest_30d_text)
|
||||
|
||||
# si le prix conseillé == prix min 30j, c'est une erreur de détection
|
||||
price_list_value = parse_price_fr(price_list_text)
|
||||
if price_list_value is not None and lowest_30d_price is not None and price_list_value == lowest_30d_price:
|
||||
price_list_text = None
|
||||
price_list_value = None
|
||||
|
||||
# réductions
|
||||
# savingsPercentage dans corePriceDisplay = réduction par rapport au prix min 30j (si présent)
|
||||
# srpSavingsPercentageBlock = réduction par rapport au prix conseillé
|
||||
reduction_savings_text = _safe_text(page, "#corePriceDisplay_desktop_feature_div .savingsPercentage")
|
||||
reduction_conseille_text = _safe_text(page, ".srpSavingsPercentageBlock")
|
||||
reduction_min_30j = _parse_percent(reduction_savings_text)
|
||||
reduction_conseille = _parse_percent(reduction_conseille_text)
|
||||
|
||||
# attribuer correctement les réductions selon ce qui est présent
|
||||
reduction_min_30j = _parse_percent(reduction_savings_text) if lowest_30d_price is not None else None
|
||||
reduction_conseille = _parse_percent(reduction_conseille_text) if price_list_value is not None else None
|
||||
# si pas de srpSavingsPercentageBlock mais un savingsPercentage et un prix conseillé (sans min 30j)
|
||||
if reduction_conseille is None and price_list_value is not None and lowest_30d_price is None:
|
||||
reduction_conseille = _parse_percent(reduction_savings_text)
|
||||
|
||||
asin = _safe_attr(page, "input#ASIN", "value") or _extract_asin_from_url(url)
|
||||
|
||||
@@ -417,7 +438,7 @@ def extract_product_data(page: Page, url: str) -> dict[str, Any]:
|
||||
"titre": title,
|
||||
"url_image_principale": image_main_url,
|
||||
"prix_actuel": parse_price_fr(price_text),
|
||||
"prix_conseille": parse_price_fr(price_list_text),
|
||||
"prix_conseille": price_list_value,
|
||||
"prix_min_30j": lowest_30d_price,
|
||||
"prix_conseille_reduction": reduction_conseille,
|
||||
"prix_min_30j_reduction": reduction_min_30j,
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.async_api import async_playwright, Browser, BrowserContext
|
||||
|
||||
|
||||
|
||||
@@ -13,11 +13,11 @@ PROJECT_ROOT = Path(__file__).resolve().parents[3]
|
||||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from loguru import logger
|
||||
from playwright.sync_api import sync_playwright
|
||||
from loguru import logger # noqa: E402
|
||||
from playwright.sync_api import sync_playwright # noqa: E402
|
||||
|
||||
from backend.app.core.config import load_config
|
||||
from backend.app.scraper.amazon.parser import extract_product_data
|
||||
from backend.app.core.config import load_config # noqa: E402
|
||||
from backend.app.scraper.amazon.parser import extract_product_data # noqa: E402
|
||||
|
||||
SAMPLES_DIR = Path(__file__).resolve().parent.parent / "samples"
|
||||
TESTS_PATH = SAMPLES_DIR / "scrape_test.json"
|
||||
|
||||
@@ -153,7 +153,7 @@ def scrape_product(product_id: int) -> None:
|
||||
# fermeture propre du navigateur
|
||||
context.close()
|
||||
browser.close()
|
||||
except Exception as exc: # pragma: no cover
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception("Erreur pendant le scraping de %s", product_id)
|
||||
_finalize_run(run, session, "erreur")
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user