1er
This commit is contained in:
61
backend/app/scraper/normalize.py
Normal file
61
backend/app/scraper/normalize.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def parse_price_fr(text: str | None) -> Optional[float]:
    """Parse a French-formatted price out of *text*.

    The first number found in *text* is used. Whitespace of any kind is
    treated as a thousands separator, a comma is the decimal separator, and
    a dot is a thousands separator unless it is the only dot and is followed
    by one or two digits (in which case it is the decimal separator).

    Examples: ``"1 249,99 €"`` -> ``1249.99``, ``"1.249,99"`` -> ``1249.99``,
    ``"1.249"`` -> ``1249.0``, ``"19.99"`` -> ``19.99``.

    Returns ``None`` when *text* is empty/``None`` or contains no digit.
    """
    if not text:
        return None
    # Example: "1 249,99 €" -> 1249.99 (handles non-breaking spaces).
    # The optional tail accepts one or two decimals after a comma so that
    # "12,5" is not truncated to 12.
    match = re.search(r"([0-9][0-9\s\.\u00a0\u202f]*(?:,[0-9]{1,2})?)", text)
    if not match:
        return None
    # Remove every whitespace character the pattern is able to capture (not
    # just the three most common spaces), then drop a trailing sentence
    # period such as the final "." in "12.50.".
    cleaned = re.sub(r"\s+", "", match.group(1)).rstrip(".")
    if "," in cleaned:
        # Comma is the decimal separator; any dot is a thousands separator.
        cleaned = cleaned.replace(".", "").replace(",", ".")
    else:
        head, sep, fraction = cleaned.rpartition(".")
        # A thousands group always has three digits, so a single dot
        # followed by one or two digits can only be a decimal separator.
        is_decimal_dot = bool(sep) and "." not in head and len(fraction) in (1, 2)
        if not is_decimal_dot:
            cleaned = cleaned.replace(".", "")
    try:
        return float(cleaned)
    except ValueError:
        return None
|
||||
|
||||
|
||||
def parse_rating_value(text: str | None) -> Optional[float]:
    """Extract the first number in *text* as a rating value.

    Both ``,`` and ``.`` are accepted as the decimal separator, so
    ``"4,5 sur 5"`` and ``"4.5/5"`` both give ``4.5``.

    Returns ``None`` when *text* is empty/``None`` or contains no digit.
    """
    if not text:
        return None
    match = re.search(r"([0-9]+(?:[\.,][0-9]+)?)", text)
    if not match:
        return None
    try:
        # Normalize a decimal comma to a dot before converting.
        return float(match.group(1).replace(",", "."))
    except ValueError:
        return None
|
||||
|
||||
|
||||
def parse_rating_count(text: str | None) -> Optional[int]:
    """Extract a review count from *text* by keeping only its ASCII digits.

    Every non-digit character (spaces, separators, words, brackets) is
    discarded and the remaining digits are read as one integer, so
    ``"(1 234 avis)"`` gives ``1234``.

    Returns ``None`` when *text* is empty/``None`` or contains no digit.
    """
    if not text:
        return None
    digits = re.sub(r"[^0-9]", "", text)
    if not digits:
        return None
    try:
        return int(digits)
    except ValueError:
        # int() can reject extremely long digit strings.
        return None
|
||||
|
||||
|
||||
def parse_stock_status(text: str | None) -> tuple[Optional[bool], Optional[str]]:
    """Classify a French stock label as in stock, out of stock, or unknown.

    Returns a ``(in_stock, label)`` pair where *label* is *text* with runs
    of whitespace collapsed to single spaces:

    * ``(False, label)`` if the label contains "indisponible" or "rupture";
    * ``(True, label)`` if it contains "en stock" or "disponible";
    * ``(None, label)`` if neither marker is present;
    * ``(None, None)`` if *text* is empty or ``None``.
    """
    if not text:
        return None, None
    cleaned = " ".join(text.split())
    lowered = cleaned.lower()
    # The unavailable markers must be tested first: "indisponible" contains
    # "disponible", so checking availability first would report it as in stock.
    if "indisponible" in lowered or "rupture" in lowered:
        return False, cleaned
    if "en stock" in lowered or "disponible" in lowered:
        return True, cleaned
    return None, cleaned
|
||||
Reference in New Issue
Block a user