#!/usr/bin/env python3 import argparse import json import re import time from datetime import datetime from datetime import date, timedelta from html import unescape from urllib.request import Request, urlopen MONTHS_FR = { 1: "janvier", 2: "février", 3: "mars", 4: "avril", 5: "mai", 6: "juin", 7: "juillet", 8: "août", 9: "septembre", 10: "octobre", 11: "novembre", 12: "décembre", } def fetch_html(url: str) -> str: req = Request(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}) with urlopen(req, timeout=25) as resp: raw = resp.read() charset = (resp.headers.get_content_charset() or "utf-8").lower() try: return raw.decode(charset, errors="replace") except Exception: return raw.decode("utf-8", errors="replace") def clean_html_text(s: str) -> str: s = re.sub(r"", " ", s, flags=re.I) s = re.sub(r"<[^>]+>", "", s) s = unescape(s) s = s.replace("\xa0", " ") return re.sub(r"\s+", " ", s).strip() def parse_saints(html: str) -> list[str]: rows = re.findall(r']*class="sd-name"[^>]*>(.*?)

', html, flags=re.I | re.S) out, seen = [], set() for row in rows: txt = clean_html_text(row) if txt and txt not in seen: out.append(txt) seen.add(txt) return out def parse_dictons(html: str) -> list[str]: rows = re.findall(r']*class="dict"[^>]*>(.*?)

', html, flags=re.I | re.S) out = [] for row in rows: txt = clean_html_text(row) if txt: out.append(txt) return out def parse_prenoms(html: str) -> list[str]: block = re.search( r']*>[^<]*Pr[^<]*noms[^<]*f[^<]*ter[^<]*.*?]*>(.*?)', html, flags=re.I | re.S, ) target = block.group(1) if block else "" rows = re.findall(r']*>(.*?)', target, flags=re.I | re.S) out, seen = [], set() for row in rows: txt = clean_html_text(row) if txt and txt not in seen: out.append(txt) seen.add(txt) return out def iter_mmdd_full_year(year: int): d = date(year, 1, 1) end = date(year, 12, 31) while d <= end: yield d.strftime("%m%d"), d d += timedelta(days=1) # assure 29 février même année non bissextile if year % 4 != 0 or (year % 100 == 0 and year % 400 != 0): yield "0229", None def scrape_day(base_url: str, mmdd: str, d: date | None) -> dict: url = f"{base_url.rstrip('/')}/{mmdd}.html" html = fetch_html(url) if d: label = f"{d.day:02d} {MONTHS_FR[d.month]}" iso = d.isoformat() else: label = "29 février" iso = None return { "date": label, "date_iso": iso, "mmdd": mmdd, "saints": parse_saints(html), "dictons": parse_dictons(html), "prenoms_a_feter": parse_prenoms(html), "source_url": url, } def _ts() -> str: return datetime.now().strftime("%H:%M:%S") def _log(message: str, enabled: bool) -> None: if enabled: print(f"[{_ts()}] {message}", flush=True) def main() -> int: ap = argparse.ArgumentParser(description="Scrape saints/dictons pour toute une année (inclut 29 février)") ap.add_argument("--year", type=int, default=date.today().year) ap.add_argument("--base", default="https://www.saint-dicton.com") ap.add_argument("--sleep-ms", type=int, default=150, help="Pause entre requêtes") ap.add_argument("--limit", type=int, default=0, help="Limiter le nb de jours (test rapide)") ap.add_argument("--out", default="", help="Fichier de sortie JSON (sinon stdout)") ap.add_argument("--log-every", type=int, default=10, help="Affiche un log de progression tous les N jours") ap.add_argument("--quiet", action="store_true", help="Réduit les logs") args = ap.parse_args() results = [] count = 0 verbose = not args.quiet log_every = max(1, args.log_every) _log(f"Démarrage scrape année={args.year}, base={args.base}", verbose) for mmdd, d in iter_mmdd_full_year(args.year): url = f"{args.base.rstrip('/')}/{mmdd}.html" _log(f"[{count + 1}] fetch {mmdd} -> {url}", verbose) try: results.append(scrape_day(args.base, mmdd, d)) _log(f"[{count + 1}] ok {mmdd}", verbose and ((count + 1) % log_every == 0 or count == 0)) except Exception as e: results.append({ "mmdd": mmdd, "date_iso": d.isoformat() if d else None, "error": str(e), "source_url": url, }) _log(f"[{count + 1}] erreur {mmdd}: {e}", True) count += 1 if args.limit and count >= args.limit: _log(f"Arrêt par --limit={args.limit}", verbose) break if args.sleep_ms > 0: time.sleep(args.sleep_ms / 1000) payload = { "year": args.year, "count": len(results), "includes_feb29": any(r.get("mmdd") == "0229" for r in results), "data": results, } txt = json.dumps(payload, ensure_ascii=False, indent=2) if args.out: with open(args.out, "w", encoding="utf-8") as f: f.write(txt) _log(f"Fichier écrit: {args.out}", True) else: print(txt) _log(f"Terminé: {len(results)} jours", verbose) return 0 if __name__ == "__main__": raise SystemExit(main())