codex2
This commit is contained in:
Binary file not shown.
@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.aliexpress")
|
||||
|
||||
@@ -126,6 +127,8 @@ class AliexpressStore(BaseStore):
|
||||
images = self._extract_images(html, soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(html, debug_info)
|
||||
reference = self.extract_reference(url)
|
||||
|
||||
# Note sur le rendu client-side
|
||||
@@ -150,8 +153,10 @@ class AliexpressStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -183,6 +188,17 @@ class AliexpressStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Return the product description read from the page meta tags.

    The Open Graph ``og:description`` tag is looked up first; the plain
    ``<meta name="description">`` tag is only queried when the first
    lookup yields nothing. ``debug`` is accepted but not used here.

    Returns:
        The stripped ``content`` attribute, or None when no tag is found
        or its content is empty.
    """
    tag = soup.find("meta", property="og:description")
    if not tag:
        tag = soup.find("meta", attrs={"name": "description"})
    if not tag:
        return None

    content = tag.get("content", "").strip()
    return content or None
|
||||
|
||||
def _extract_price(
|
||||
self, html: str, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> Optional[float]:
|
||||
@@ -193,35 +209,39 @@ class AliexpressStore(BaseStore):
|
||||
On utilise regex sur le HTML brut.
|
||||
"""
|
||||
# Pattern 1: Prix avant € (ex: "136,69 €")
|
||||
match = re.search(r"([0-9]+[.,][0-9]{2})\s*€", html)
|
||||
match = re.search(r"([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)\\s*€", html)
|
||||
if match:
|
||||
price_str = match.group(1).replace(",", ".")
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(match.group(1))
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
# Pattern 2: € avant prix (ex: "€ 136.69")
|
||||
match = re.search(r"€\s*([0-9]+[.,][0-9]{2})", html)
|
||||
match = re.search(r"€\\s*([0-9][0-9\\s.,\\u00a0\\u202f\\u2009]*)", html)
|
||||
if match:
|
||||
price_str = match.group(1).replace(",", ".")
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(match.group(1))
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
# Pattern 3: Chercher dans meta tags (moins fiable)
|
||||
og_price = soup.find("meta", property="og:price:amount")
|
||||
if og_price:
|
||||
price_str = og_price.get("content", "")
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(price_str)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, html: str, debug: DebugInfo) -> Optional[float]:
    """Return the recommended retail price found in the raw HTML, if any.

    Searches the page text for the first ``"originalPrice": "<amount>"``
    pair and converts the captured amount with ``parse_price_text``.
    ``debug`` is accepted but not used here.

    Returns:
        The parsed amount, or None when the key is absent or the amount
        cannot be parsed.
    """
    # Single backslashes on purpose: in a raw string, "\\s" is the regex for
    # a literal backslash followed by "s", which never matches plain JSON.
    match = re.search(r'originalPrice"\s*:\s*"([0-9\s.,]+)"', html)
    if match is None:
        return None
    return parse_price_text(match.group(1))
|
||||
|
||||
def _extract_currency(
|
||||
self, url: str, soup: BeautifulSoup, debug: DebugInfo
|
||||
) -> str:
|
||||
|
||||
Executable → Regular
@@ -54,12 +54,12 @@ specs_table:
|
||||
# ASIN (parfois dans les métadonnées)
|
||||
asin:
|
||||
- "input[name='ASIN']"
|
||||
- "th:contains('ASIN') + td"
|
||||
- "th:-soup-contains('ASIN') + td"
|
||||
|
||||
# Messages captcha / robot check
|
||||
captcha_indicators:
|
||||
- "form[action*='validateCaptcha']"
|
||||
- "p.a-last:contains('Sorry')"
|
||||
- "p.a-last:-soup-contains('Sorry')"
|
||||
- "img[alt*='captcha']"
|
||||
|
||||
# Notes pour le parsing:
|
||||
|
||||
@@ -4,7 +4,9 @@ Store Amazon - Parsing de produits Amazon.fr et Amazon.com.
|
||||
Supporte l'extraction de: titre, prix, ASIN, images, specs, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from html import unescape
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
@@ -21,6 +23,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.amazon")
|
||||
|
||||
@@ -131,6 +134,8 @@ class AmazonStore(BaseStore):
|
||||
images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url) or self._extract_asin_from_html(soup)
|
||||
|
||||
# Déterminer le statut final (ne pas écraser FAILED)
|
||||
@@ -150,8 +155,10 @@ class AmazonStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -195,6 +202,17 @@ class AmazonStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Return the product description taken from the meta tags.

    ``og:description`` has priority; ``<meta name="description">`` is the
    fallback. ``debug`` is accepted but not used here.

    Returns:
        The stripped ``content`` attribute, or None when neither tag is
        present or the content is empty.
    """
    meta_tag = soup.find("meta", property="og:description")
    if not meta_tag:
        meta_tag = soup.find("meta", attrs={"name": "description"})

    if meta_tag:
        text = meta_tag.get("content", "").strip()
        if text:
            return text
    return None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
selectors = self.get_selector("price", [])
|
||||
@@ -205,14 +223,9 @@ class AmazonStore(BaseStore):
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
text = element.get_text(strip=True)
|
||||
# Extraire nombre (format: "299,99" ou "299.99")
|
||||
match = re.search(r"(\d+)[.,](\d+)", text)
|
||||
if match:
|
||||
price_str = f"{match.group(1)}.{match.group(2)}"
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
price = parse_price_text(text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
# Fallback: chercher les spans séparés a-price-whole et a-price-fraction
|
||||
whole = soup.select_one("span.a-price-whole")
|
||||
@@ -220,15 +233,24 @@ class AmazonStore(BaseStore):
|
||||
if whole and fraction:
|
||||
whole_text = whole.get_text(strip=True)
|
||||
fraction_text = fraction.get_text(strip=True)
|
||||
try:
|
||||
price_str = f"{whole_text}.{fraction_text}"
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
pass
|
||||
price = parse_price_text(f"{whole_text}.{fraction_text}")
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Return the recommended (struck-through) price, if one is shown.

    Two CSS selectors are tried in order; the text of the first element
    found is handed to ``parse_price_text``. When that first element does
    not parse, the second selector is not consulted. ``debug`` is accepted
    but not used here.

    Returns:
        The parsed amount, or None when no element matches or its text is
        not a price.
    """
    strike_selectors = (
        "span.priceBlockStrikePriceString",
        "span.a-text-price span.a-offscreen",
    )
    for css in strike_selectors:
        node = soup.select_one(css)
        if node:
            # Only the first matching element is considered.
            return parse_price_text(node.get_text(strip=True))
    return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la devise."""
|
||||
selectors = self.get_selector("currency", [])
|
||||
@@ -270,6 +292,7 @@ class AmazonStore(BaseStore):
|
||||
def _extract_images(self, soup: BeautifulSoup, debug: DebugInfo) -> list[str]:
|
||||
"""Extrait les URLs d'images."""
|
||||
images = []
|
||||
seen = set()
|
||||
selectors = self.get_selector("images", [])
|
||||
if isinstance(selectors, str):
|
||||
selectors = [selectors]
|
||||
@@ -278,19 +301,57 @@ class AmazonStore(BaseStore):
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Attribut src ou data-src
|
||||
url = element.get("src") or element.get("data-src")
|
||||
url = element.get("src") or element.get("data-src") or element.get("data-old-hires")
|
||||
if url and url.startswith("http"):
|
||||
images.append(url)
|
||||
if self._is_product_image(url) and url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
dynamic = element.get("data-a-dynamic-image")
|
||||
if dynamic:
|
||||
urls = self._extract_dynamic_images(dynamic)
|
||||
for dyn_url in urls:
|
||||
if self._is_product_image(dyn_url) and dyn_url not in seen:
|
||||
images.append(dyn_url)
|
||||
seen.add(dyn_url)
|
||||
|
||||
# Fallback: chercher tous les img tags si aucune image trouvée
|
||||
if not images:
|
||||
all_imgs = soup.find_all("img")
|
||||
for img in all_imgs:
|
||||
url = img.get("src") or img.get("data-src")
|
||||
if url and url.startswith("http"):
|
||||
images.append(url)
|
||||
if url and url.startswith("http") and self._is_product_image(url):
|
||||
if url not in seen:
|
||||
images.append(url)
|
||||
seen.add(url)
|
||||
|
||||
return list(set(images)) # Dédupliquer
|
||||
return images
|
||||
|
||||
def _extract_dynamic_images(self, raw: str) -> list[str]:
|
||||
"""Extrait les URLs du JSON data-a-dynamic-image."""
|
||||
try:
|
||||
data = json.loads(unescape(raw))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
return []
|
||||
|
||||
urls = []
|
||||
if isinstance(data, dict):
|
||||
candidates = []
|
||||
for url, dims in data.items():
|
||||
if not isinstance(url, str) or not url.startswith("http"):
|
||||
continue
|
||||
size = dims[0] if isinstance(dims, list) and dims else 0
|
||||
candidates.append((size, url))
|
||||
candidates.sort(key=lambda item: item[0], reverse=True)
|
||||
for _, url in candidates:
|
||||
urls.append(url)
|
||||
return urls
|
||||
|
||||
def _is_product_image(self, url: str) -> bool:
|
||||
"""Filtre basique pour eviter les logos et sprites."""
|
||||
lowered = url.lower()
|
||||
if "prime_logo" in lowered or "sprite" in lowered:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la catégorie depuis les breadcrumbs."""
|
||||
|
||||
@@ -23,6 +23,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.backmarket")
|
||||
|
||||
@@ -116,6 +117,8 @@ class BackmarketStore(BaseStore):
|
||||
images = json_ld_data.get("images") or self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url)
|
||||
|
||||
# Spécifique Backmarket: condition (état du reconditionné)
|
||||
@@ -140,8 +143,10 @@ class BackmarketStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -213,6 +218,17 @@ class BackmarketStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Return the description exposed by the page meta tags.

    Looks up ``og:description`` first and falls back to
    ``<meta name="description">``. ``debug`` is accepted but not used here.

    Returns:
        The stripped ``content`` attribute, or None when no tag is found
        or the content is blank.
    """
    candidate = soup.find("meta", property="og:description")
    if not candidate:
        candidate = soup.find("meta", attrs={"name": "description"})
    if not candidate:
        return None

    value = candidate.get("content", "").strip()
    if not value:
        return None
    return value
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
selectors = self.get_selector("price", [])
|
||||
@@ -225,20 +241,29 @@ class BackmarketStore(BaseStore):
|
||||
# Attribut content (schema.org) ou texte
|
||||
price_text = element.get("content") or element.get_text(strip=True)
|
||||
|
||||
# Extraire nombre (format: "299,99" ou "299.99" ou "299")
|
||||
match = re.search(r"(\d+)[.,]?(\d*)", price_text)
|
||||
if match:
|
||||
integer_part = match.group(1)
|
||||
decimal_part = match.group(2) or "00"
|
||||
price_str = f"{integer_part}.{decimal_part}"
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
price = parse_price_text(price_text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Return the recommended (struck-through) price, if one is shown.

    Each selector is tried in order; for every selector the first matching
    element's text is passed to ``parse_price_text``. The first value that
    parses wins. ``debug`` is accepted but not used here.

    Returns:
        The parsed amount, or None when no selector yields a price.
    """
    for css in (".price--old", ".price--striked", ".price__old", "del"):
        node = soup.select_one(css)
        if not node:
            continue
        amount = parse_price_text(node.get_text(strip=True))
        if amount is None:
            # Unparseable text: move on to the next selector.
            continue
        return amount
    return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la devise."""
|
||||
selectors = self.get_selector("currency", [])
|
||||
|
||||
@@ -4,6 +4,7 @@ Store Cdiscount - Parsing de produits Cdiscount.com.
|
||||
Supporte l'extraction de: titre, prix, SKU, images, specs, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
@@ -21,6 +22,7 @@ from pricewatch.app.core.schema import (
|
||||
StockStatus,
|
||||
)
|
||||
from pricewatch.app.stores.base import BaseStore
|
||||
from pricewatch.app.stores.price_parser import parse_price_text
|
||||
|
||||
logger = get_logger("stores.cdiscount")
|
||||
|
||||
@@ -112,6 +114,8 @@ class CdiscountStore(BaseStore):
|
||||
images = self._extract_images(soup, debug_info)
|
||||
category = self._extract_category(soup, debug_info)
|
||||
specs = self._extract_specs(soup, debug_info)
|
||||
description = self._extract_description(soup, debug_info)
|
||||
msrp = self._extract_msrp(soup, debug_info)
|
||||
reference = self.extract_reference(url) or self._extract_sku_from_html(soup)
|
||||
|
||||
# Déterminer le statut final
|
||||
@@ -130,8 +134,10 @@ class CdiscountStore(BaseStore):
|
||||
stock_status=stock_status,
|
||||
reference=reference,
|
||||
category=category,
|
||||
description=description,
|
||||
images=images,
|
||||
specs=specs,
|
||||
msrp=msrp,
|
||||
debug=debug_info,
|
||||
)
|
||||
|
||||
@@ -158,6 +164,21 @@ class CdiscountStore(BaseStore):
|
||||
debug.errors.append("Titre non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_description(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
    """Return the product description from meta tags, then from JSON-LD.

    Meta tags are tried first (``og:description``, then
    ``<meta name="description">``). When they give no non-empty content,
    the ``description`` field of the object returned by
    ``_find_product_ld`` is used instead. ``debug`` is accepted but not
    used here.

    Returns:
        The stripped description, or None when no source provides one.
    """
    tag = soup.find("meta", property="og:description")
    if not tag:
        tag = soup.find("meta", attrs={"name": "description"})
    if tag:
        from_meta = tag.get("content", "").strip()
        if from_meta:
            return from_meta

    # Fallback: structured data embedded in the page.
    ld = self._find_product_ld(soup)
    from_ld = ld.get("description") if ld else None
    if not isinstance(from_ld, str):
        return None
    return from_ld.strip() or None
|
||||
|
||||
def _extract_price(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
|
||||
"""Extrait le prix."""
|
||||
selectors = self.get_selector("price", [])
|
||||
@@ -170,20 +191,29 @@ class CdiscountStore(BaseStore):
|
||||
# Attribut content (schema.org) ou texte
|
||||
price_text = element.get("content") or element.get_text(strip=True)
|
||||
|
||||
# Extraire nombre (format: "299,99" ou "299.99")
|
||||
match = re.search(r"(\d+)[.,]?(\d*)", price_text)
|
||||
if match:
|
||||
integer_part = match.group(1)
|
||||
decimal_part = match.group(2) or "00"
|
||||
price_str = f"{integer_part}.{decimal_part}"
|
||||
try:
|
||||
return float(price_str)
|
||||
except ValueError:
|
||||
continue
|
||||
price = parse_price_text(price_text)
|
||||
if price is not None:
|
||||
return price
|
||||
|
||||
debug.errors.append("Prix non trouvé")
|
||||
return None
|
||||
|
||||
def _extract_msrp(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[float]:
    """Return the recommended (struck-through) price, if one is shown.

    The selectors are tried in order and the first element text that
    ``parse_price_text`` can convert is returned. ``debug`` is accepted
    but not used here.

    Returns:
        The parsed amount, or None when no selector yields a price.
    """
    strike_selectors = (
        ".jsStrikePrice",
        ".price__old",
        ".c-price__strike",
        ".price-strike",
    )
    for css in strike_selectors:
        node = soup.select_one(css)
        amount = parse_price_text(node.get_text(strip=True)) if node else None
        if amount is not None:
            return amount
    return None
|
||||
|
||||
def _extract_currency(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la devise."""
|
||||
selectors = self.get_selector("currency", [])
|
||||
@@ -249,7 +279,14 @@ class CdiscountStore(BaseStore):
|
||||
url = f"https:{url}"
|
||||
images.append(url)
|
||||
|
||||
return list(set(images)) # Dédupliquer
|
||||
ld_images = self._extract_ld_images(self._find_product_ld(soup))
|
||||
for url in ld_images:
|
||||
if url and url not in images:
|
||||
if url.startswith("//"):
|
||||
url = f"https:{url}"
|
||||
images.append(url)
|
||||
|
||||
return list(dict.fromkeys(images)) # Préserver l’ordre
|
||||
|
||||
def _extract_category(self, soup: BeautifulSoup, debug: DebugInfo) -> Optional[str]:
|
||||
"""Extrait la catégorie depuis les breadcrumbs."""
|
||||
@@ -275,6 +312,53 @@ class CdiscountStore(BaseStore):
|
||||
|
||||
return None
|
||||
|
||||
def _extract_json_ld_entries(self, soup: BeautifulSoup) -> list[dict]:
    """Parse the JSON-LD scripts of the page and return their objects.

    Every ``<script type="application/ld+json">`` is decoded. A top-level
    JSON array contributes each of its items; any other payload is added
    as a single entry. Empty or undecodable scripts are skipped.

    Returns:
        The decoded entries in document order (items are not type-checked).
    """
    collected = []
    for node in soup.find_all("script", type="application/ld+json"):
        source = node.string or node.text
        if source:
            try:
                decoded = json.loads(source.strip())
            except (json.JSONDecodeError, TypeError):
                continue
            # Flatten top-level arrays, wrap everything else.
            collected.extend(decoded if isinstance(decoded, list) else [decoded])
    return collected
|
||||
|
||||
def _find_product_ld(self, soup: BeautifulSoup) -> dict:
|
||||
"""Retourne l’objet Product JSON-LD si présent."""
|
||||
for entry in self._extract_json_ld_entries(soup):
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
type_field = entry.get("@type") or entry.get("type")
|
||||
if isinstance(type_field, str) and "product" in type_field.lower():
|
||||
return entry
|
||||
return {}
|
||||
|
||||
def _extract_ld_images(self, product_ld: dict) -> list[str]:
|
||||
"""Récupère les images listées dans le JSON-LD."""
|
||||
if not product_ld:
|
||||
return []
|
||||
images = product_ld.get("image") or product_ld.get("images")
|
||||
if not images:
|
||||
return []
|
||||
if isinstance(images, str):
|
||||
images = [images]
|
||||
extracted = []
|
||||
for item in images:
|
||||
if isinstance(item, str):
|
||||
extracted.append(item)
|
||||
elif isinstance(item, dict):
|
||||
url = item.get("url")
|
||||
if isinstance(url, str):
|
||||
extracted.append(url)
|
||||
return extracted
|
||||
|
||||
def _extract_specs(self, soup: BeautifulSoup, debug: DebugInfo) -> dict[str, str]:
|
||||
"""Extrait les caractéristiques techniques."""
|
||||
specs = {}
|
||||
@@ -298,6 +382,19 @@ class CdiscountStore(BaseStore):
|
||||
if key and value:
|
||||
specs[key] = value
|
||||
|
||||
product_ld = self._find_product_ld(soup)
|
||||
additional = product_ld.get("additionalProperty") if product_ld else None
|
||||
if isinstance(additional, dict):
|
||||
additional = [additional]
|
||||
if isinstance(additional, list):
|
||||
for item in additional:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
key = item.get("name") or item.get("propertyID")
|
||||
value = item.get("value") or item.get("valueReference")
|
||||
if key and value:
|
||||
specs[key] = value
|
||||
|
||||
return specs
|
||||
|
||||
def _extract_sku_from_html(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Helpers pour parser des prix avec separateurs de milliers.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def parse_price_text(text: str) -> Optional[float]:
    """
    Parse a price string into a float.

    Handles space, dot, comma and non-breaking-space separators, as well as
    the "12€99" notation where the euro sign stands in for the decimal mark.

    Separator rules:
    - both "," and "." present: the one appearing last is the decimal mark,
      the other is a grouping separator;
    - one kind repeated, with every group after the first exactly three
      digits long ("1.234.567"): all of them are grouping separators;
    - otherwise the last separator is the decimal mark. A single separator
      followed by three digits ("1.299") is therefore read as a decimal.

    Returns:
        The parsed amount, or None when the text is empty or holds no
        parseable number.
    """
    if not text:
        return None

    # "12€99" -> "12,99": the currency sign is used as the decimal mark.
    text = re.sub(r"(\d)\s*€\s*(\d)", r"\1,\2", text)
    # Keep decimal digits and separators only. isdecimal() (unlike isdigit())
    # rejects characters such as "²" that float() cannot convert; every kind
    # of whitespace is dropped by the same filter.
    cleaned = "".join(ch for ch in text if ch.isdecimal() or ch in ".,")
    if not cleaned:
        return None

    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")

    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            cleaned = cleaned.replace(".", "").replace(",", ".")
        else:
            cleaned = cleaned.replace(",", "")
    elif last_comma != -1 or last_dot != -1:
        separator = "," if last_comma != -1 else "."
        *head, tail = cleaned.split(separator)
        if len(head) > 1 and all(len(group) == 3 for group in (*head[1:], tail)):
            # A number cannot have several decimal marks: pure grouping.
            cleaned = "".join(head) + tail
        else:
            integer = "".join(head)
            cleaned = f"{integer}.{tail}" if tail else integer

    try:
        return float(cleaned)
    except ValueError:
        return None
|
||||
Reference in New Issue
Block a user