This commit is contained in:
Gilles Soulier
2026-01-05 16:08:01 +01:00
parent dcba044cd6
commit c67befc549
2215 changed files with 26743 additions and 329 deletions

322
backend/app/utils/md_parser.py Executable file
View File

@@ -0,0 +1,322 @@
"""
Markdown specification file parser for peripherals.
Parses .md files containing USB device specifications.
"""
import re
from typing import Dict, Any, Optional
# Section titles (numbering already removed) that carry identification fields.
_IDENTIFICATION_SECTIONS = ("Identification", "Identification USB", "Identification générale")

# Section titles that describe the USB class / subclass / protocol triple.
_DEVICE_CLASS_SECTIONS = ("Device Class", "Classe et interface USB")

# Section titles whose bullet, bold and quote lines are gathered into the notes.
_NOTE_SECTIONS = (
    "Performance Notes", "Power & Stability Considerations",
    "Recommended USB Port Placement", "Typical Use Cases",
    "Operating System Support", "Pilotes et compatibilité système",
    "Contraintes et limitations", "Placement USB recommandé",
    "Cas d'usage typiques", "Fonction réseau", "Résumé synthétique",
)

# (label regex, regex flags, key in "caracteristiques_specifiques") read from
# the identification sections.
_IDENTIFICATION_SPEC_FIELDS = (
    (r'Version\s+USB', 0, "usb_version"),
    (r'Vitesse\s+négociée', 0, "usb_speed"),
    (r'Consommation\s+maximale', 0, "max_power"),
)

# Same triple layout, keyed by the exact section title the fields belong to.
_SECTION_SPEC_FIELDS = {
    "USB Characteristics": (
        (r'USB\s+version|Version\s+USB', re.IGNORECASE, "usb_version"),
        (r'Negotiated\s+speed|Vitesse\s+négociée', re.IGNORECASE, "usb_speed"),
        (r'bcdUSB', 0, "bcdUSB"),
        (r'Max\s+power\s+draw|Consommation\s+maximale', re.IGNORECASE, "max_power"),
    ),
    "Classification Summary": (
        (r'Category', 0, "category"),
        (r'Subcategory', 0, "subcategory"),
    ),
    "Caractéristiques WiFi": (
        (r'Norme\s+WiFi', 0, "wifi_standard"),
        (r'Bande\s+de\s+fréquence', 0, "wifi_frequency"),
        (r'Débit\s+théorique\s+maximal', 0, "wifi_max_speed"),
    ),
}

# English layout: "**Label**: <code> — <name>"  (group 1 = code, group 2 = name).
_CLASS_FIELDS_EN = (
    (r'\*\*Interface\s+class\*\*\s*:\s*(\d+)\s*—\s*(.+?)$', "interface_class"),
    (r'\*\*Subclass\*\*\s*:\s*(\d+)\s*—\s*(.+?)$', "interface_subclass"),
    (r'\*\*Protocol\*\*\s*:\s*(\d+|[0-9a-fA-F]{2})\s*—\s*(.+?)$', "interface_protocol"),
)

# French layout: "**Label** : <name> (<code>)"  (group 1 = name, group 2 = code).
_CLASS_FIELDS_FR = (
    (r'\*\*Classe\s+USB\*\*\s*:\s*(.+?)\s*\((\d+)\)', "interface_class"),
    (r'\*\*Sous-classe\*\*\s*:\s*(.+?)\s*\((\d+)\)', "interface_subclass"),
    (r'\*\*Protocole\*\*\s*:\s*(.+?)\s*\((\d+)\)', "interface_protocol"),
)

# (keyword regex, type_principal override or None, sous_type). First hit wins,
# so e.g. "wireless mouse" is classified as a mouse, not as a WiFi adapter.
_DEVICE_TYPE_RULES = (
    (r'souris|mouse', None, "Souris"),
    (r'clavier|keyboard', None, "Clavier"),
    (r'wi-?fi|wireless', "WiFi", "Adaptateur WiFi"),
    (r'bluetooth', "Bluetooth", "Adaptateur Bluetooth"),
    (r'usb\s+flash|clé\s+usb|flash\s+drive', None, "Clé USB"),
    (r'dongle', None, "Dongle"),
)

# Brands searched for in the description when no vendor name was parsed.
_KNOWN_BRANDS = (
    "Logitech", "SanDisk", "Ralink", "Broadcom", "ASUS", "Realtek",
    "TP-Link", "Intel", "Samsung", "Kingston", "Corsair",
)


def _field_value(label: str, line: str, flags: int = 0) -> Optional[str]:
    """Return the stripped value of a ``**label**: value`` line, or None.

    ``label`` is a regex fragment (alternations allowed). Whitespace is
    accepted on both sides of the colon so that French-style ``label : value``
    lines are parsed the same way as ``label: value``.
    """
    match = re.search(rf'\*\*(?:{label})\*\*\s*:\s*(.+?)$', line, flags)
    return match.group(1).strip() if match else None


def _store_spec_fields(fields, line: str, specs: Dict[str, Any]) -> None:
    """Copy every ``(label, flags, key)`` field found in ``line`` into ``specs``."""
    for label, flags, key in fields:
        value = _field_value(label, line, flags)
        if value is not None:
            specs[key] = value


def _detect_device_type(line: str, result: Dict[str, Any]) -> None:
    """Set ``sous_type`` (and maybe ``type_principal``) from keywords in ``line``."""
    for pattern, main_type, sub_type in _DEVICE_TYPE_RULES:
        if re.search(pattern, line, re.IGNORECASE):
            if main_type is not None:
                result["type_principal"] = main_type
            result["sous_type"] = sub_type
            return


def _parse_identification(line: str, result: Dict[str, Any]) -> None:
    """Extract IDs, names, serial number and categories from an identification line."""
    specs = result["caracteristiques_specifiques"]

    vendor = re.search(
        r'\*\*Vendor\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})\s*(?:\((.+?)\))?', line)
    if vendor:
        # Lower-cased so the value matches IDs taken from the title.
        specs["vendor_id"] = f"0x{vendor.group(1).lower()}"
        if vendor.group(2):
            # Optional "(Vendor name)" suffix after the hexadecimal ID.
            result["marque"] = vendor.group(2).strip()

    product = re.search(r'\*\*Product\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})', line)
    if product:
        specs["product_id"] = f"0x{product.group(1).lower()}"

    # An explicit commercial name always wins over the USB product string.
    name = _field_value(r'Commercial\s+name|Désignation\s+USB', line, re.IGNORECASE)
    if name is not None:
        result["nom"] = name

    manufacturer = _field_value(r'Manufacturer\s+string', line)
    if manufacturer is not None and not result["marque"]:
        result["marque"] = manufacturer

    product_string = _field_value(r'Product\s+string', line)
    if product_string is not None and not result["nom"]:
        result["nom"] = product_string

    serial = _field_value(r'Serial\s+number', line)
    if serial is not None:
        result["numero_serie"] = serial

    category = _field_value(r'Catégorie', line)
    if category is not None and 'réseau' in category.lower():
        result["type_principal"] = "Réseau"

    subcategory = _field_value(r'Sous-catégorie', line)
    if subcategory is not None:
        result["sous_type"] = subcategory

    common_name = _field_value(r'Nom\s+courant', line)
    if common_name is not None and not result.get("modele"):
        result["modele"] = common_name

    _store_spec_fields(_IDENTIFICATION_SPEC_FIELDS, line, specs)


def _parse_device_class(line: str, specs: Dict[str, Any]) -> None:
    """Extract class/subclass/protocol codes and names (English or French layout)."""
    for pattern, key in _CLASS_FIELDS_EN:
        match = re.search(pattern, line)
        if match:
            specs[key] = match.group(1)
            specs[f"{key}_name"] = match.group(2).strip()
    for pattern, key in _CLASS_FIELDS_FR:
        match = re.search(pattern, line)
        if match:
            specs[key] = match.group(2)
            specs[f"{key}_name"] = match.group(1).strip()


def _collect_note(section: str, line: str, notes: list) -> None:
    """Append ``line`` to ``notes`` when it is a bullet, bold or quote line.

    Plain text lines are kept only inside the "Résumé synthétique" section,
    and those are stored without the section prefix.
    """
    if not line or line.startswith('#'):
        return
    if line.startswith('- '):
        notes.append(f"{section}: {line[2:]}")
    elif line.startswith('**'):
        notes.append(f"{section}: {line}")
    elif line.startswith('>'):
        notes.append(f"{section}: {line[1:].strip()}")
    elif section == "Résumé synthétique":
        notes.append(line)


def parse_md_specification(md_content: str) -> Dict[str, Any]:
    """
    Parse a markdown specification file and extract peripheral information.

    Supports two formats:
    1. Simple format: Title + Description
    2. Detailed format: Full USB specification with vendor/product IDs,
       characteristics, etc. (English or French section and field names)

    The document is scanned line by line; each ``## `` heading (an optional
    leading "N. " numbering is ignored) selects how the following lines are
    interpreted. Vendor and product IDs are always stored lower-cased with a
    "0x" prefix, whether they come from the title or from the body.

    Args:
        md_content: Raw markdown content

    Returns:
        Dictionary with peripheral data ready for database insertion. Keys
        whose value is None are omitted, and "caracteristiques_specifiques"
        is omitted when nothing was extracted into it. "nom",
        "type_principal" and "synthese" are always present.
    """
    result: Dict[str, Any] = {
        "nom": None,
        "type_principal": "USB",
        "sous_type": None,
        "marque": None,
        "modele": None,
        "numero_serie": None,
        "description": None,
        "synthese": md_content,  # Store complete markdown content
        "caracteristiques_specifiques": {},
        "notes": None
    }
    specs = result["caracteristiques_specifiques"]

    # Extract title (first H1) and the USB IDs it may embed ("ID 0781:55ab").
    title = None
    title_match = re.search(r'^#\s+(.+?)$', md_content, re.MULTILINE)
    if title_match:
        title = title_match.group(1).strip()
        id_match = re.search(r'(?:ID\s+)?([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})', title)
        if id_match:
            specs["vendor_id"] = f"0x{id_match.group(1).lower()}"
            specs["product_id"] = f"0x{id_match.group(2).lower()}"

    current_section = None
    description_lines = []
    notes_lines = []

    for raw_line in md_content.strip().split('\n'):
        line = raw_line.strip()

        # Section headers (H2): remember the title without its numbering.
        if line.startswith('## '):
            current_section = re.sub(r'^\d+\.\s*', '', line[3:].strip())
            continue

        if current_section == "Description":
            if line and not line.startswith('#'):
                description_lines.append(line)
                # Only the first recognised keyword decides the device type.
                if not result["sous_type"]:
                    _detect_device_type(line, result)
        elif current_section in _IDENTIFICATION_SECTIONS:
            _parse_identification(line, result)
        elif current_section in _SECTION_SPEC_FIELDS:
            _store_spec_fields(_SECTION_SPEC_FIELDS[current_section], line, specs)
        elif current_section in _DEVICE_CLASS_SECTIONS:
            _parse_device_class(line, specs)
        elif current_section == "Functional Role":
            # Bullets of this section go to the notes without a prefix.
            if line.startswith('- '):
                notes_lines.append(line[2:])
        elif current_section in _NOTE_SECTIONS:
            _collect_note(current_section, line, notes_lines)

    if description_lines:
        result["description"] = " ".join(description_lines)
    if notes_lines:
        result["notes"] = "\n".join(notes_lines)

    # Fallback for nom: description (capped at 100 chars), then title, then default.
    if not result["nom"]:
        if result["description"]:
            result["nom"] = result["description"].split('\n')[0][:100]
        elif title_match:
            result["nom"] = title
        else:
            result["nom"] = "Périphérique importé"

    # Guess the brand from the description when the document did not state it.
    if not result["marque"] and result["description"]:
        for brand in _KNOWN_BRANDS:
            if re.search(rf'\b{re.escape(brand)}\b', result["description"], re.IGNORECASE):
                result["marque"] = brand
                break

    # Drop None values and an empty characteristics dict.
    cleaned = {key: value for key, value in result.items() if value is not None}
    if not cleaned.get("caracteristiques_specifiques"):
        cleaned.pop("caracteristiques_specifiques", None)
    return cleaned
def extract_usb_ids_from_filename(filename: str) -> Optional[Dict[str, str]]:
    """
    Read the USB vendor and product IDs embedded in a file name.

    The name must contain "ID" or "id", followed by underscores/whitespace,
    then two 4-digit hexadecimal groups separated by "_" or ":".

    Examples:
        ID_0781_55ab.md -> {"vendor_id": "0x0781", "product_id": "0x55ab"}
        id_0b05_17cb.md -> {"vendor_id": "0x0b05", "product_id": "0x17cb"}

    Args:
        filename: Name of the file

    Returns:
        Dict with lower-cased, "0x"-prefixed vendor_id and product_id,
        or None when the file name does not contain such a pair
    """
    found = re.search(r'(?:ID|id)[_\s]+([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})', filename)
    if found is None:
        return None

    vendor_hex, product_hex = (digits.lower() for digits in found.groups())
    ids = {}
    ids["vendor_id"] = f"0x{vendor_hex}"
    ids["product_id"] = f"0x{product_hex}"
    return ids