# Yearly scraper for saint-dicton.com (saints, proverbs, name days).
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
import re
|
|
import time
|
|
from datetime import datetime
|
|
from datetime import date, timedelta
|
|
from html import unescape
|
|
from urllib.request import Request, urlopen
|
|
|
|
# French month names keyed by month number (1-12); used to build day labels.
MONTHS_FR = dict(enumerate((
    "janvier", "février", "mars", "avril", "mai", "juin",
    "juillet", "août", "septembre", "octobre", "novembre", "décembre",
), start=1))
|
|
|
|
|
|
def fetch_html(url: str) -> str:
    """Download *url* and return its body decoded to text.

    Sends a desktop User-Agent, honours the charset announced in the
    response headers (defaulting to UTF-8), and never raises on decode
    problems thanks to errors="replace".
    """
    request = Request(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"})
    with urlopen(request, timeout=25) as response:
        body = response.read()
        encoding = (response.headers.get_content_charset() or "utf-8").lower()
    try:
        return body.decode(encoding, errors="replace")
    except Exception:
        # Server announced an unknown/bogus charset name: fall back to UTF-8.
        return body.decode("utf-8", errors="replace")
|
|
|
|
|
|
def clean_html_text(s: str) -> str:
    """Turn an HTML fragment into plain text with collapsed whitespace.

    <br> tags become spaces (they separate words), all other tags are
    dropped, entities are unescaped, and non-breaking spaces normalized.
    """
    # Replace <br> first so adjacent words don't get glued together.
    without_breaks = re.sub(r"<br\s*/?>", " ", s, flags=re.I)
    without_tags = re.sub(r"<[^>]+>", "", without_breaks)
    text = unescape(without_tags).replace("\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
|
|
def parse_saints(html: str) -> list[str]:
    """Extract saint names (<p class="sd-name"> blocks), in page order, deduplicated."""
    fragments = re.findall(r'<p[^>]*class="sd-name"[^>]*>(.*?)</p>', html, flags=re.I | re.S)
    cleaned = (clean_html_text(fragment) for fragment in fragments)
    # dict.fromkeys keeps first-occurrence order while dropping duplicates.
    return list(dict.fromkeys(text for text in cleaned if text))
|
|
|
|
|
|
def parse_dictons(html: str) -> list[str]:
    """Extract every proverb (<p class="dict"> blocks) from the page, in order."""
    fragments = re.findall(r'<p[^>]*class="dict"[^>]*>(.*?)</p>', html, flags=re.I | re.S)
    cleaned = (clean_html_text(fragment) for fragment in fragments)
    return [text for text in cleaned if text]
|
|
|
|
|
|
def parse_prenoms(html: str) -> list[str]:
    """Extract first names to celebrate from the <ul> following the
    "Prénoms à fêter" heading.

    The heading pattern matches accented characters loosely ([^<]*) so it
    survives whatever encoding the page was served in. Returns [] when the
    section is absent; names are deduplicated in first-seen order.
    """
    heading = re.search(
        r'<h2[^>]*>[^<]*Pr[^<]*noms[^<]*f[^<]*ter[^<]*</h2>.*?<ul[^>]*>(.*?)</ul>',
        html,
        flags=re.I | re.S,
    )
    if heading is None:
        return []
    items = re.findall(r'<li[^>]*>(.*?)</li>', heading.group(1), flags=re.I | re.S)
    names = (clean_html_text(item) for item in items)
    return list(dict.fromkeys(name for name in names if name))
|
|
|
|
|
|
def iter_mmdd_full_year(year: int):
    """Yield ("MMDD", date) pairs for every day of *year*.

    For non-leap years an extra ("0229", None) pair is appended at the end,
    so the February 29 page is always covered regardless of the year chosen.
    """
    current = date(year, 1, 1)
    last = date(year, 12, 31)
    one_day = timedelta(days=1)
    while current <= last:
        yield current.strftime("%m%d"), current
        current += one_day
    # Gregorian rule: leap iff divisible by 4, except centuries not divisible by 400.
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    if not is_leap:
        # No real date exists for Feb 29 this year, hence the None.
        yield "0229", None
|
|
|
|
|
|
def scrape_day(base_url: str, mmdd: str, d: date | None) -> dict:
    """Fetch and parse one day page ({base}/{MMDD}.html) into a record dict.

    *d* is None only for the synthetic Feb 29 of non-leap years; that record
    gets a French label but no ISO date.
    """
    url = f"{base_url.rstrip('/')}/{mmdd}.html"
    page = fetch_html(url)
    if d is None:
        label, iso = "29 février", None
    else:
        label, iso = f"{d.day:02d} {MONTHS_FR[d.month]}", d.isoformat()
    return {
        "date": label,
        "date_iso": iso,
        "mmdd": mmdd,
        "saints": parse_saints(page),
        "dictons": parse_dictons(page),
        "prenoms_a_feter": parse_prenoms(page),
        "source_url": url,
    }
|
|
|
|
|
|
def _ts() -> str:
|
|
return datetime.now().strftime("%H:%M:%S")
|
|
|
|
|
|
def _log(message: str, enabled: bool) -> None:
    """Print a timestamped progress line, but only when *enabled* is true."""
    if not enabled:
        return
    print(f"[{_ts()}] {message}", flush=True)
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: scrape every day page of a year and emit JSON.

    Always returns 0; per-day failures are recorded as error entries in the
    output instead of aborting the run.
    """
    ap = argparse.ArgumentParser(description="Scrape saints/dictons pour toute une année (inclut 29 février)")
    ap.add_argument("--year", type=int, default=date.today().year)
    ap.add_argument("--base", default="https://www.saint-dicton.com")
    ap.add_argument("--sleep-ms", type=int, default=150, help="Pause entre requêtes")
    ap.add_argument("--limit", type=int, default=0, help="Limiter le nb de jours (test rapide)")
    ap.add_argument("--out", default="", help="Fichier de sortie JSON (sinon stdout)")
    ap.add_argument("--log-every", type=int, default=10, help="Affiche un log de progression tous les N jours")
    ap.add_argument("--quiet", action="store_true", help="Réduit les logs")
    args = ap.parse_args()

    results = []
    count = 0
    verbose = not args.quiet
    # Clamp to >= 1 so the modulo below can never divide by zero.
    log_every = max(1, args.log_every)
    _log(f"Démarrage scrape année={args.year}, base={args.base}", verbose)
    for mmdd, d in iter_mmdd_full_year(args.year):
        # URL rebuilt here for logging and for error records; scrape_day
        # derives the same URL internally for the actual fetch.
        url = f"{args.base.rstrip('/')}/{mmdd}.html"
        _log(f"[{count + 1}] fetch {mmdd} -> {url}", verbose)
        try:
            results.append(scrape_day(args.base, mmdd, d))
            # Success lines are throttled: first day, then every log_every-th day.
            _log(f"[{count + 1}] ok {mmdd}", verbose and ((count + 1) % log_every == 0 or count == 0))
        except Exception as e:
            # Best-effort: keep scraping on network/parse failure and store
            # the error message in place of the day's data.
            results.append({
                "mmdd": mmdd,
                "date_iso": d.isoformat() if d else None,
                "error": str(e),
                "source_url": url,
            })
            # Errors are always logged, even under --quiet.
            _log(f"[{count + 1}] erreur {mmdd}: {e}", True)
        count += 1
        if args.limit and count >= args.limit:
            _log(f"Arrêt par --limit={args.limit}", verbose)
            break
        # Politeness delay between requests (skipped after the last day via break).
        if args.sleep_ms > 0:
            time.sleep(args.sleep_ms / 1000)

    payload = {
        "year": args.year,
        "count": len(results),
        "includes_feb29": any(r.get("mmdd") == "0229" for r in results),
        "data": results,
    }

    # ensure_ascii=False keeps accented French text readable in the output.
    txt = json.dumps(payload, ensure_ascii=False, indent=2)
    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            f.write(txt)
        _log(f"Fichier écrit: {args.out}", True)
    else:
        print(txt)
    _log(f"Terminé: {len(results)} jours", verbose)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # SystemExit propagates main()'s return value as the process exit code.
    raise SystemExit(main())
|