Files
jardin/calendrier_lunaire/saints_dictons/saint_dicton_year_scraper.py
2026-02-22 15:05:40 +01:00

172 lines
5.3 KiB
Python

#!/usr/bin/env python3
import argparse
import json
import re
import time
from datetime import datetime
from datetime import date, timedelta
from html import unescape
from urllib.request import Request, urlopen
# French month names keyed by month number (1-12), used to build the
# human-readable "date" label (e.g. "01 janvier") in scrape_day().
MONTHS_FR = {
    1: "janvier", 2: "février", 3: "mars", 4: "avril", 5: "mai", 6: "juin",
    7: "juillet", 8: "août", 9: "septembre", 10: "octobre", 11: "novembre", 12: "décembre",
}
def fetch_html(url: str) -> str:
    """Download *url* and return the response body decoded to text.

    The charset advertised by the server is used when available, falling
    back to UTF-8; undecodable bytes are replaced rather than raised.
    """
    request = Request(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"})
    with urlopen(request, timeout=25) as response:
        body = response.read()
        encoding = (response.headers.get_content_charset() or "utf-8").lower()
        try:
            return body.decode(encoding, errors="replace")
        except Exception:
            # Unknown/bogus charset label from the server: retry as UTF-8.
            return body.decode("utf-8", errors="replace")
def clean_html_text(s: str) -> str:
    """Strip tags and entities from an HTML fragment; collapse whitespace."""
    # <br> becomes a space so adjacent words don't fuse together.
    without_breaks = re.sub(r"<br\s*/?>", " ", s, flags=re.I)
    # Drop every remaining tag, then resolve HTML entities.
    text = unescape(re.sub(r"<[^>]+>", "", without_breaks))
    # Treat non-breaking spaces as plain spaces before collapsing runs.
    text = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()
def parse_saints(html: str) -> list[str]:
    """Extract saint names from the page's "sd-name" paragraphs.

    Returns cleaned text in page order with duplicates removed.
    """
    fragments = re.findall(r'<p[^>]*class="sd-name"[^>]*>(.*?)</p>', html, flags=re.I | re.S)
    # A dict preserves insertion order, giving an ordered de-duplication.
    unique: dict[str, None] = {}
    for fragment in fragments:
        name = clean_html_text(fragment)
        if name:
            unique.setdefault(name, None)
    return list(unique)
def parse_dictons(html: str) -> list[str]:
    """Extract every dicton ("dict" paragraphs) as cleaned text, in page order."""
    fragments = re.findall(r'<p[^>]*class="dict"[^>]*>(.*?)</p>', html, flags=re.I | re.S)
    cleaned = (clean_html_text(fragment) for fragment in fragments)
    return [text for text in cleaned if text]
def parse_prenoms(html: str) -> list[str]:
    """Extract the <li> items under the "Prénoms à fêter" heading.

    Returns cleaned names in page order, duplicates removed; empty list
    when the section is absent.
    """
    # Accented letters in the heading are matched loosely ([^<]*) so the
    # pattern survives entity-encoded or differently-encoded pages.
    section = re.search(
        r'<h2[^>]*>[^<]*Pr[^<]*noms[^<]*f[^<]*ter[^<]*</h2>.*?<ul[^>]*>(.*?)</ul>',
        html,
        flags=re.I | re.S,
    )
    if section is None:
        return []
    names: list[str] = []
    seen: set[str] = set()
    for item in re.findall(r'<li[^>]*>(.*?)</li>', section.group(1), flags=re.I | re.S):
        name = clean_html_text(item)
        if name and name not in seen:
            seen.add(name)
            names.append(name)
    return names
def iter_mmdd_full_year(year: int):
    """Yield ("MMDD", date) tuples for every day of *year*, in order.

    For non-leap years an extra ("0229", None) tuple is appended at the
    end so the output always contains a February 29 entry.
    """
    current = date(year, 1, 1)
    last_day = date(year, 12, 31)
    one_day = timedelta(days=1)
    while current <= last_day:
        yield current.strftime("%m%d"), current
        current += one_day
    # Gregorian leap rule; equivalent to the original negated test.
    is_leap = (year % 4 == 0 and year % 100 != 0) or year % 400 == 0
    if not is_leap:
        yield "0229", None
def scrape_day(base_url: str, mmdd: str, d: date | None) -> dict:
    """Fetch and parse one day page; return the day's JSON-ready record.

    *d* is None only for the synthetic Feb 29 of non-leap years; in that
    case the ISO date is None and the label is fixed.
    """
    url = f"{base_url.rstrip('/')}/{mmdd}.html"
    page = fetch_html(url)
    if d is None:
        label = "29 février"
        iso = None
    else:
        label = f"{d.day:02d} {MONTHS_FR[d.month]}"
        iso = d.isoformat()
    return {
        "date": label,
        "date_iso": iso,
        "mmdd": mmdd,
        "saints": parse_saints(page),
        "dictons": parse_dictons(page),
        "prenoms_a_feter": parse_prenoms(page),
        "source_url": url,
    }
def _ts() -> str:
return datetime.now().strftime("%H:%M:%S")
def _log(message: str, enabled: bool) -> None:
if enabled:
print(f"[{_ts()}] {message}", flush=True)
def main() -> int:
    """CLI entry point: scrape every day of one year into a single JSON payload.

    Per-day failures are recorded as error dicts in the output rather than
    aborting the run. Returns 0 on completion.
    """
    ap = argparse.ArgumentParser(description="Scrape saints/dictons pour toute une année (inclut 29 février)")
    ap.add_argument("--year", type=int, default=date.today().year)
    ap.add_argument("--base", default="https://www.saint-dicton.com")
    ap.add_argument("--sleep-ms", type=int, default=150, help="Pause entre requêtes")
    ap.add_argument("--limit", type=int, default=0, help="Limiter le nb de jours (test rapide)")
    ap.add_argument("--out", default="", help="Fichier de sortie JSON (sinon stdout)")
    ap.add_argument("--log-every", type=int, default=10, help="Affiche un log de progression tous les N jours")
    ap.add_argument("--quiet", action="store_true", help="Réduit les logs")
    args = ap.parse_args()
    results = []  # one dict per day, in calendar order (error dicts included)
    count = 0
    verbose = not args.quiet
    # Guard against --log-every 0, which would divide by zero below.
    log_every = max(1, args.log_every)
    _log(f"Démarrage scrape année={args.year}, base={args.base}", verbose)
    for mmdd, d in iter_mmdd_full_year(args.year):
        # URL rebuilt here (scrape_day builds the same one) so the error
        # record below can reference it even if scrape_day raises early.
        url = f"{args.base.rstrip('/')}/{mmdd}.html"
        _log(f"[{count + 1}] fetch {mmdd} -> {url}", verbose)
        try:
            results.append(scrape_day(args.base, mmdd, d))
            # Success is only logged on the first day and every log_every days.
            _log(f"[{count + 1}] ok {mmdd}", verbose and ((count + 1) % log_every == 0 or count == 0))
        except Exception as e:
            # Best-effort: record the failure and continue; errors always log,
            # even with --quiet (second arg hard-coded True).
            results.append({
                "mmdd": mmdd,
                "date_iso": d.isoformat() if d else None,
                "error": str(e),
                "source_url": url,
            })
            _log(f"[{count + 1}] erreur {mmdd}: {e}", True)
        count += 1
        if args.limit and count >= args.limit:
            _log(f"Arrêt par --limit={args.limit}", verbose)
            break
        if args.sleep_ms > 0:
            # Polite delay between requests to the remote site.
            time.sleep(args.sleep_ms / 1000)
    payload = {
        "year": args.year,
        "count": len(results),
        # True when a Feb 29 entry (real or the synthetic non-leap one) is present.
        "includes_feb29": any(r.get("mmdd") == "0229" for r in results),
        "data": results,
    }
    txt = json.dumps(payload, ensure_ascii=False, indent=2)
    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(txt)
        _log(f"Fichier écrit: {args.out}", True)
    else:
        print(txt)
    _log(f"Terminé: {len(results)} jours", verbose)
    return 0
if __name__ == "__main__":
    # SystemExit propagates main()'s return code as the process exit status.
    raise SystemExit(main())