Files
serv_benchmark/backend/app/utils/md_parser.py
Gilles Soulier c67befc549 addon
2026-01-05 16:08:01 +01:00

323 lines
15 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Markdown specification file parser for peripherals.
Parses .md files containing USB device specifications.
"""
import re
from typing import Dict, Any, Optional
# Keyword patterns searched (case-insensitively, in this order) in description
# lines to guess the device type: (pattern, type_principal override, sous_type).
_DEVICE_TYPE_HINTS = (
    (r'souris|mouse', None, "Souris"),
    (r'clavier|keyboard', None, "Clavier"),
    (r'wi-?fi|wireless', "WiFi", "Adaptateur WiFi"),
    (r'bluetooth', "Bluetooth", "Adaptateur Bluetooth"),
    (r'usb\s+flash|clé\s+usb|flash\s+drive', None, "Clé USB"),
    (r'dongle', None, "Dongle"),
)

# H2 section names (after numbering removal) handled by each section parser.
_IDENTIFICATION_SECTIONS = ("Identification", "Identification USB", "Identification générale")
_DEVICE_CLASS_SECTIONS = ("Device Class", "Classe et interface USB")
_NOTE_SECTIONS = (
    "Performance Notes", "Power & Stability Considerations",
    "Recommended USB Port Placement", "Typical Use Cases",
    "Operating System Support", "Pilotes et compatibilité système",
    "Contraintes et limitations", "Placement USB recommandé",
    "Cas d'usage typiques", "Fonction réseau", "Résumé synthétique",
)

# Sections made only of "**Label**: value" lines that are copied verbatim into
# caracteristiques_specifiques: section -> ((label regex, flags, target key), ...).
_SECTION_FIELDS = {
    "USB Characteristics": (
        (r'USB\s+version|Version\s+USB', re.IGNORECASE, "usb_version"),
        (r'Negotiated\s+speed|Vitesse\s+négociée', re.IGNORECASE, "usb_speed"),
        (r'bcdUSB', 0, "bcdUSB"),
        (r'Max\s+power\s+draw|Consommation\s+maximale', re.IGNORECASE, "max_power"),
    ),
    "Classification Summary": (
        (r'Category', 0, "category"),
        (r'Subcategory', 0, "subcategory"),
    ),
    "Caractéristiques WiFi": (
        (r'Norme\s+WiFi', 0, "wifi_standard"),
        (r'Bande\s+de\s+fréquence', 0, "wifi_frequency"),
        (r'Débit\s+théorique\s+maximal', 0, "wifi_max_speed"),
    ),
}

# Interface class / subclass / protocol lines:
# (key prefix, EN pattern "code — name", FR pattern "name (code)").
_DEVICE_CLASS_PATTERNS = (
    ("interface_class",
     r'\*\*Interface\s+class\*\*\s*:\s*(\d+)\s*—\s*(.+?)$',
     r'\*\*Classe\s+USB\*\*\s*:\s*(.+?)\s*\((\d+)\)'),
    ("interface_subclass",
     r'\*\*Subclass\*\*\s*:\s*(\d+)\s*—\s*(.+?)$',
     r'\*\*Sous-classe\*\*\s*:\s*(.+?)\s*\((\d+)\)'),
    ("interface_protocol",
     r'\*\*Protocol\*\*\s*:\s*(\d+|[0-9a-fA-F]{2})\s*—\s*(.+?)$',
     r'\*\*Protocole\*\*\s*:\s*(.+?)\s*\((\d+)\)'),
)

# Brands searched for in the description when no brand was found elsewhere.
_KNOWN_BRANDS = (
    "Logitech", "SanDisk", "Ralink", "Broadcom", "ASUS", "Realtek",
    "TP-Link", "Intel", "Samsung", "Kingston", "Corsair",
)


def _labelled_value(label: str, line: str, flags: int = 0) -> Optional[str]:
    """Return the stripped value of a ``**label**: value`` line, or None.

    ``label`` is a regex fragment (alternatives allowed). Whitespace is
    tolerated on both sides of the colon so "**X**: v" and "**X** : v" both match.
    """
    match = re.search(rf'\*\*(?:{label})\*\*\s*:\s*(.+?)$', line, flags)
    return match.group(1).strip() if match else None


def _guess_device_type(line: str, result: Dict[str, Any]) -> None:
    """Set sous_type (and possibly type_principal) from the first keyword hit in line."""
    for pattern, main_type, sub_type in _DEVICE_TYPE_HINTS:
        if re.search(pattern, line, re.IGNORECASE):
            if main_type is not None:
                result["type_principal"] = main_type
            result["sous_type"] = sub_type
            return


def _parse_identification_line(line: str, result: Dict[str, Any]) -> None:
    """Extract identification fields (IDs, names, serial, category...) from one line."""
    specs = result["caracteristiques_specifiques"]

    # Vendor ID, optionally followed by the vendor name in parentheses.
    vendor = re.search(r'\*\*Vendor\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})\s*(?:\((.+?)\))?', line)
    if vendor:
        # Lower-cased so IDs have the same form as those taken from the title.
        specs["vendor_id"] = f"0x{vendor.group(1).lower()}"
        if vendor.group(2):
            result["marque"] = vendor.group(2).strip()

    product = re.search(r'\*\*Product\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})', line)
    if product:
        specs["product_id"] = f"0x{product.group(1).lower()}"

    # An explicit commercial name always wins over the product string.
    name = _labelled_value(r'Commercial\s+name|Désignation\s+USB', line, re.IGNORECASE)
    if name is not None:
        result["nom"] = name

    manufacturer = _labelled_value(r'Manufacturer\s+string', line)
    if manufacturer is not None and not result["marque"]:
        result["marque"] = manufacturer

    product_string = _labelled_value(r'Product\s+string', line)
    if product_string is not None and not result["nom"]:
        result["nom"] = product_string

    serial = _labelled_value(r'Serial\s+number', line)
    if serial is not None:
        result["numero_serie"] = serial

    # French-format fields.
    category = _labelled_value(r'Catégorie', line)
    if category is not None and 'réseau' in category.lower():
        result["type_principal"] = "Réseau"

    subcategory = _labelled_value(r'Sous-catégorie', line)
    if subcategory is not None:
        result["sous_type"] = subcategory

    common_name = _labelled_value(r'Nom\s+courant', line)
    if common_name is not None and not result.get("modele"):
        result["modele"] = common_name

    # USB details that the French format lists under "Identification USB".
    for label, key in (
        (r'Version\s+USB', "usb_version"),
        (r'Vitesse\s+négociée', "usb_speed"),
        (r'Consommation\s+maximale', "max_power"),
    ):
        value = _labelled_value(label, line)
        if value is not None:
            specs[key] = value


def _parse_device_class_line(line: str, specs: Dict[str, Any]) -> None:
    """Extract interface class/subclass/protocol code and name (EN or FR format)."""
    for key, en_pattern, fr_pattern in _DEVICE_CLASS_PATTERNS:
        en_match = re.search(en_pattern, line)
        if en_match:
            specs[key] = en_match.group(1)
            specs[f"{key}_name"] = en_match.group(2).strip()
        fr_match = re.search(fr_pattern, line)
        if fr_match:
            # FR format puts the name first and the numeric code in parentheses.
            specs[key] = fr_match.group(2)
            specs[f"{key}_name"] = fr_match.group(1).strip()


def _collect_note(section: str, line: str, notes_lines: list) -> None:
    """Append a bullet, bold or quote line of a free-text section to notes_lines."""
    if not line or line.startswith('#'):
        return
    if line.startswith('- '):
        notes_lines.append(f"{section}: {line[2:]}")
    elif line.startswith('**'):
        notes_lines.append(f"{section}: {line}")
    elif line.startswith('>'):
        notes_lines.append(f"{section}: {line[1:].strip()}")
    elif section == "Résumé synthétique":
        # The summary section is kept in full, including plain paragraphs.
        notes_lines.append(line)


def parse_md_specification(md_content: str) -> Dict[str, Any]:
    """
    Parse a markdown specification file and extract peripheral information.

    Supports two formats:
    1. Simple format: Title + Description
    2. Detailed format: Full USB specification with vendor/product IDs,
       characteristics, etc. (English or French section/field names)

    The content is scanned line by line; the current H2 section (leading
    "N. " numbering removed) decides how each line is interpreted. USB
    vendor/product IDs are always stored lower-cased as "0xXXXX".

    Args:
        md_content: Raw markdown content

    Returns:
        Dictionary with peripheral data ready for database insertion.
        Keys whose value is None are omitted, and so is
        "caracteristiques_specifiques" when nothing was extracted into it.
        "nom" is always present (falls back to the description, then the
        title, then "Périphérique importé").
    """
    result = {
        "nom": None,
        "type_principal": "USB",
        "sous_type": None,
        "marque": None,
        "modele": None,
        "numero_serie": None,
        "description": None,
        "synthese": md_content,  # Store complete markdown content
        "caracteristiques_specifiques": {},
        "notes": None
    }
    specs = result["caracteristiques_specifiques"]

    # Extract title (first H1). Only spaces/tabs may separate '#' from the
    # text, so an empty "#" line cannot swallow the following line.
    title = None
    title_match = re.search(r'^#[ \t]+(.+?)$', md_content, re.MULTILINE)
    if title_match:
        title = title_match.group(1).strip()
        # Extract USB IDs from title if present
        id_match = re.search(r'(?:ID\s+)?([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})', title)
        if id_match:
            specs["vendor_id"] = f"0x{id_match.group(1).lower()}"
            specs["product_id"] = f"0x{id_match.group(2).lower()}"

    current_section = None
    description_lines = []
    notes_lines = []

    for line in md_content.strip().split('\n'):
        line = line.strip()

        # Section headers (H2); remove numbering (e.g., "1. ", "10. ")
        if line.startswith('## '):
            current_section = re.sub(r'^\d+\.\s*', '', line[3:].strip())
            continue

        if current_section == "Description":
            if line and not line.startswith('#'):
                description_lines.append(line)
                # Try to extract device type from description
                if not result["sous_type"]:
                    _guess_device_type(line, result)
        elif current_section in _IDENTIFICATION_SECTIONS:
            _parse_identification_line(line, result)
        elif current_section in _SECTION_FIELDS:
            for label, flags, key in _SECTION_FIELDS[current_section]:
                value = _labelled_value(label, line, flags)
                if value is not None:
                    specs[key] = value
        elif current_section in _DEVICE_CLASS_SECTIONS:
            _parse_device_class_line(line, specs)
        elif current_section == "Functional Role":
            if line.startswith('- '):
                notes_lines.append(line[2:])
        elif current_section in _NOTE_SECTIONS:
            _collect_note(current_section, line, notes_lines)

    if description_lines:
        result["description"] = " ".join(description_lines)
    if notes_lines:
        result["notes"] = "\n".join(notes_lines)

    # Fallback for nom if not found: description (max 100 chars), then title.
    if not result["nom"]:
        if result["description"]:
            result["nom"] = result["description"][:100]
        elif title:
            result["nom"] = title
        else:
            result["nom"] = "Périphérique importé"

    # Extract brand from description if not found
    if not result["marque"] and result["description"]:
        for brand in _KNOWN_BRANDS:
            # re.escape keeps characters such as '-' or '.' in a brand literal.
            if re.search(rf'\b{re.escape(brand)}\b', result["description"], re.IGNORECASE):
                result["marque"] = brand
                break

    # Clean up None values and empty dicts
    result = {k: v for k, v in result.items() if v is not None}
    if not result.get("caracteristiques_specifiques"):
        result.pop("caracteristiques_specifiques", None)
    return result
def extract_usb_ids_from_filename(filename: str) -> Optional[Dict[str, str]]:
    """
    Pull the USB vendor and product IDs out of a specification filename.

    The name must contain "ID" or "id" followed by underscores/whitespace,
    then two 4-digit hexadecimal groups separated by "_" or ":".

    Examples:
        ID_0781_55ab.md -> {"vendor_id": "0x0781", "product_id": "0x55ab"}
        id_0b05_17cb.md -> {"vendor_id": "0x0b05", "product_id": "0x17cb"}

    Args:
        filename: Name of the file

    Returns:
        Dict with lower-cased, "0x"-prefixed vendor_id and product_id,
        or None when the filename does not contain such a pair.
    """
    found = re.search(r'(?:ID|id)[_\s]+([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})', filename)
    if found is None:
        return None

    vendor_hex, product_hex = (group.lower() for group in found.groups())
    ids = {}
    ids["vendor_id"] = f"0x{vendor_hex}"
    ids["product_id"] = f"0x{product_hex}"
    return ids