323 lines
15 KiB
Python
Executable File
323 lines
15 KiB
Python
Executable File
"""
|
||
Markdown specification file parser for peripherals.
|
||
Parses .md files containing USB device specifications.
|
||
"""
|
||
import re
|
||
from typing import Dict, Any, Optional
|
||
|
||
|
||
# Typographic hyphen variants (U+2010..U+2013) that editors and exporters
# substitute for the ASCII "-", e.g. the non-breaking hyphen often used in "Wi-Fi".
_HYPHEN_VARIANTS = "\u2010\u2011\u2012\u2013"
_HYPHEN_CLASS = f"[-{_HYPHEN_VARIANTS}]"
_HYPHEN_TO_ASCII = str.maketrans(_HYPHEN_VARIANTS, "-" * len(_HYPHEN_VARIANTS))

# Separator between a numeric class code and its name: hyphen, en dash or em dash.
_CODE_NAME_SEP = "[-\u2013\u2014]"


def _labelled(label, flags=0):
    """Compile a pattern for a ``**label**: value`` line; group 1 is the value.

    Whitespace is tolerated on both sides of the colon so that French
    typography (``**Catégorie** : Réseau``) is accepted as well.
    """
    return re.compile(rf'\*\*(?:{label})\*\*\s*:\s*(.+?)$', flags)


_TITLE_RE = re.compile(r'^#\s+(.+?)$', re.MULTILINE)
_TITLE_IDS_RE = re.compile(r'(?:ID\s+)?([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})')
_SECTION_NUMBER_RE = re.compile(r'^\d+\.\s*')

_IDENTIFICATION_SECTIONS = frozenset({
    "Identification", "Identification USB", "Identification générale",
})
_DEVICE_CLASS_SECTIONS = frozenset({"Device Class", "Classe et interface USB"})
_NOTE_SECTIONS = frozenset({
    "Performance Notes", "Power & Stability Considerations",
    "Recommended USB Port Placement", "Typical Use Cases",
    "Operating System Support", "Pilotes et compatibilité système",
    "Contraintes et limitations", "Placement USB recommandé",
    "Cas d'usage typiques", "Fonction réseau", "Résumé synthétique",
})

# (pattern, type_principal override or None, sous_type); first match wins.
_DEVICE_TYPE_RULES = (
    (re.compile(r'souris|mouse', re.IGNORECASE), None, "Souris"),
    (re.compile(r'clavier|keyboard', re.IGNORECASE), None, "Clavier"),
    (re.compile(rf'wi{_HYPHEN_CLASS}?fi|wireless', re.IGNORECASE), "WiFi", "Adaptateur WiFi"),
    (re.compile(r'bluetooth', re.IGNORECASE), "Bluetooth", "Adaptateur Bluetooth"),
    (re.compile(r'usb\s+flash|clé\s+usb|flash\s+drive', re.IGNORECASE), None, "Clé USB"),
    (re.compile(r'dongle', re.IGNORECASE), None, "Dongle"),
)

_VENDOR_ID_RE = re.compile(r'\*\*Vendor\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})\s*(?:\((.+?)\))?')
_PRODUCT_ID_RE = re.compile(r'\*\*Product\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})')
_COMMERCIAL_NAME_RE = _labelled(r'Commercial\s+name|Désignation\s+USB', re.IGNORECASE)
_MANUFACTURER_RE = _labelled(r'Manufacturer\s+string')
_PRODUCT_STRING_RE = _labelled(r'Product\s+string')
_SERIAL_RE = _labelled(r'Serial\s+number')
_CATEGORY_FR_RE = _labelled(r'Catégorie')
_SUBCATEGORY_FR_RE = _labelled(r'Sous-catégorie')
_COMMON_NAME_RE = _labelled(r'Nom\s+courant')

# (pattern, key stored in "caracteristiques_specifiques") tables, one per section.
_IDENTIFICATION_FIELDS = (
    (_labelled(r'Version\s+USB'), "usb_version"),
    (_labelled(r'Vitesse\s+négociée'), "usb_speed"),
    (_labelled(r'Consommation\s+maximale'), "max_power"),
)
_USB_FIELDS = (
    (_labelled(r'USB\s+version|Version\s+USB', re.IGNORECASE), "usb_version"),
    (_labelled(r'Negotiated\s+speed|Vitesse\s+négociée', re.IGNORECASE), "usb_speed"),
    (_labelled(r'bcdUSB'), "bcdUSB"),
    (_labelled(r'Max\s+power\s+draw|Consommation\s+maximale', re.IGNORECASE), "max_power"),
)
_CLASSIFICATION_FIELDS = (
    (_labelled(r'Category'), "category"),
    (_labelled(r'Subcategory'), "subcategory"),
)
_WIFI_FIELDS = (
    (_labelled(rf'Norme\s+Wi{_HYPHEN_CLASS}Fi'), "wifi_standard"),
    (_labelled(r'Bande\s+de\s+fréquence'), "wifi_frequency"),
    (_labelled(r'Débit\s+théorique\s+maximal'), "wifi_max_speed"),
)

# English layout: "**Interface class**: 3 — HID"  -> group 1 = code, group 2 = name.
_CLASS_FIELDS_EN = (
    (re.compile(rf'\*\*Interface\s+class\*\*\s*:\s*(\d+)\s*{_CODE_NAME_SEP}\s*(.+?)$'),
     "interface_class"),
    (re.compile(rf'\*\*Subclass\*\*\s*:\s*(\d+)\s*{_CODE_NAME_SEP}\s*(.+?)$'),
     "interface_subclass"),
    (re.compile(rf'\*\*Protocol\*\*\s*:\s*(\d+|[0-9a-fA-F]{{2}})\s*{_CODE_NAME_SEP}\s*(.+?)$'),
     "interface_protocol"),
)
# French layout: "**Classe USB** : HID (3)"  -> group 1 = name, group 2 = code.
_CLASS_FIELDS_FR = (
    (re.compile(r'\*\*Classe\s+USB\*\*\s*:\s*(.+?)\s*\((\d+)\)'), "interface_class"),
    (re.compile(r'\*\*Sous-classe\*\*\s*:\s*(.+?)\s*\((\d+)\)'), "interface_subclass"),
    (re.compile(r'\*\*Protocole\*\*\s*:\s*(.+?)\s*\((\d+)\)'), "interface_protocol"),
)

_KNOWN_BRANDS = (
    "Logitech", "SanDisk", "Ralink", "Broadcom", "ASUS", "Realtek",
    "TP-Link", "Intel", "Samsung", "Kingston", "Corsair",
)


def _apply_fields(line, fields, specs):
    """Store the value of every ``(pattern, key)`` in *fields* that matches *line*."""
    for pattern, key in fields:
        found = pattern.search(line)
        if found:
            specs[key] = found.group(1).strip()


def _detect_device_type(line, result):
    """Set ``sous_type`` (and possibly ``type_principal``) from keywords in *line*."""
    for pattern, main_type, sub_type in _DEVICE_TYPE_RULES:
        if pattern.search(line):
            if main_type is not None:
                result["type_principal"] = main_type
            result["sous_type"] = sub_type
            return


def _parse_identification(line, result):
    """Extract IDs, names, serial number and categories from an Identification line."""
    specs = result["caracteristiques_specifiques"]

    found = _VENDOR_ID_RE.search(line)
    if found:
        # Lowercase so the value agrees with IDs taken from the title.
        specs["vendor_id"] = f"0x{found.group(1).lower()}"
        if found.group(2):
            result["marque"] = found.group(2).strip()

    found = _PRODUCT_ID_RE.search(line)
    if found:
        specs["product_id"] = f"0x{found.group(1).lower()}"

    # An explicit commercial name always wins over the product string.
    found = _COMMERCIAL_NAME_RE.search(line)
    if found:
        result["nom"] = found.group(1).strip()

    found = _MANUFACTURER_RE.search(line)
    if found and not result["marque"]:
        result["marque"] = found.group(1).strip()

    found = _PRODUCT_STRING_RE.search(line)
    if found and not result["nom"]:
        result["nom"] = found.group(1).strip()

    found = _SERIAL_RE.search(line)
    if found:
        result["numero_serie"] = found.group(1).strip()

    found = _CATEGORY_FR_RE.search(line)
    if found and 'réseau' in found.group(1).strip().lower():
        result["type_principal"] = "Réseau"

    found = _SUBCATEGORY_FR_RE.search(line)
    if found:
        result["sous_type"] = found.group(1).strip()

    found = _COMMON_NAME_RE.search(line)
    if found and not result.get("modele"):
        result["modele"] = found.group(1).strip()

    _apply_fields(line, _IDENTIFICATION_FIELDS, specs)


def _parse_device_class(line, specs):
    """Extract class/subclass/protocol code and name (English or French layout)."""
    for pattern, key in _CLASS_FIELDS_EN:
        found = pattern.search(line)
        if found:
            specs[key] = found.group(1)
            specs[f"{key}_name"] = found.group(2).strip()
    for pattern, key in _CLASS_FIELDS_FR:
        found = pattern.search(line)
        if found:
            specs[key] = found.group(2)
            specs[f"{key}_name"] = found.group(1).strip()


def _collect_note(section, line, notes):
    """Append *line* to *notes* when it is a bullet, bold line, quote or summary text."""
    if not line or line.startswith('#'):
        return
    if line.startswith('- '):
        notes.append(f"{section}: {line[2:]}")
    elif line.startswith('**'):
        notes.append(f"{section}: {line}")
    elif line.startswith('>'):
        notes.append(f"{section}: {line[1:].strip()}")
    elif section == "Résumé synthétique":
        # Free text of the summary section is kept verbatim, without a prefix.
        notes.append(line)


def _guess_brand(description):
    """Return the first known brand mentioned as a whole word in *description*, or None."""
    for brand in _KNOWN_BRANDS:
        if re.search(rf'\b{re.escape(brand)}\b', description, re.IGNORECASE):
            return brand
    return None


def parse_md_specification(md_content: str) -> Dict[str, Any]:
    """
    Parse a markdown specification file and extract peripheral information.

    Supports two formats:
    1. Simple format: Title + Description
    2. Detailed format: Full USB specification with vendor/product IDs,
       characteristics, etc. (English or French field labels)

    Parsing is line based: every ``## `` heading (optionally numbered, e.g.
    ``## 2. Identification``) selects the current section, and each following
    line is interpreted according to that section. Field lines have the form
    ``**Label**: value``; whitespace around the colon is tolerated, and
    typographic hyphens/dashes are accepted wherever a hyphen is expected.

    Args:
        md_content: Raw markdown content

    Returns:
        Dictionary with peripheral data ready for database insertion. Keys
        whose value is ``None`` are omitted, as is
        ``caracteristiques_specifiques`` when it is empty. ``nom`` is always
        present: it falls back to the first description line (truncated to
        100 characters), then to the H1 title, then to a default label.
        Vendor and product IDs are always stored as lowercase ``0x....``.
    """
    result = {
        "nom": None,
        "type_principal": "USB",
        "sous_type": None,
        "marque": None,
        "modele": None,
        "numero_serie": None,
        "description": None,
        "synthese": md_content,  # Store complete markdown content
        "caracteristiques_specifiques": {},
        "notes": None,
    }
    specs = result["caracteristiques_specifiques"]

    # Extract title (first H1) and any USB IDs embedded in it.
    title = None
    title_match = _TITLE_RE.search(md_content)
    if title_match:
        title = title_match.group(1).strip()
        id_match = _TITLE_IDS_RE.search(title)
        if id_match:
            specs["vendor_id"] = f"0x{id_match.group(1).lower()}"
            specs["product_id"] = f"0x{id_match.group(2).lower()}"

    current_section = None  # heading text as written (used in note prefixes)
    section_key = None      # same, with typographic hyphens normalised to "-"
    description_lines = []
    notes_lines = []

    for raw_line in md_content.strip().split('\n'):
        line = raw_line.strip()

        # Section headers (H2); strip numbering such as "1. " or "10. ".
        if line.startswith('## '):
            current_section = _SECTION_NUMBER_RE.sub('', line[3:].strip())
            section_key = current_section.translate(_HYPHEN_TO_ASCII)
            continue

        if section_key == "Description":
            if line and not line.startswith('#'):
                description_lines.append(line)
                # Only guess the device type until one has been found.
                if not result["sous_type"]:
                    _detect_device_type(line, result)
        elif section_key in _IDENTIFICATION_SECTIONS:
            _parse_identification(line, result)
        elif section_key == "USB Characteristics":
            _apply_fields(line, _USB_FIELDS, specs)
        elif section_key in _DEVICE_CLASS_SECTIONS:
            _parse_device_class(line, specs)
        elif section_key == "Functional Role":
            if line.startswith('- '):
                notes_lines.append(line[2:])
        elif section_key == "Classification Summary":
            _apply_fields(line, _CLASSIFICATION_FIELDS, specs)
        elif section_key == "Caractéristiques Wi-Fi":
            _apply_fields(line, _WIFI_FIELDS, specs)
        elif section_key in _NOTE_SECTIONS:
            _collect_note(current_section, line, notes_lines)

    if description_lines:
        result["description"] = " ".join(description_lines)
    if notes_lines:
        result["notes"] = "\n".join(notes_lines)

    # Fallback for nom if no explicit name field was found.
    if not result["nom"]:
        if description_lines:
            # The joined description contains no newlines, so take the first
            # collected line directly instead of splitting the joined text.
            result["nom"] = description_lines[0][:100]
        elif title:
            result["nom"] = title
        else:
            result["nom"] = "Périphérique importé"

    # Extract brand from description if not found.
    if not result["marque"] and result["description"]:
        result["marque"] = _guess_brand(result["description"])

    # Clean up None values and empty dicts.
    result = {key: value for key, value in result.items() if value is not None}
    if not result.get("caracteristiques_specifiques"):
        result.pop("caracteristiques_specifiques", None)

    return result
|
||
|
||
|
||
def extract_usb_ids_from_filename(filename: str) -> Optional[Dict[str, str]]:
    """
    Pull the USB vendor and product identifiers out of a file name.

    The name must contain ``ID`` or ``id`` followed by underscores/whitespace,
    then two 4-digit hexadecimal groups separated by ``_`` or ``:``.

    Examples:
        ID_0781_55ab.md -> {"vendor_id": "0x0781", "product_id": "0x55ab"}
        id_0b05_17cb.md -> {"vendor_id": "0x0b05", "product_id": "0x17cb"}

    Args:
        filename: Name of the file

    Returns:
        Dict with lowercase, ``0x``-prefixed vendor_id and product_id,
        or None when the file name carries no such identifier pair
    """
    found = re.search(r'(?:ID|id)[_\s]+([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})', filename)
    if found is None:
        return None

    vendor_hex, product_hex = (part.lower() for part in found.groups())
    return {
        "vendor_id": f"0x{vendor_hex}",
        "product_id": f"0x{product_hex}",
    }
|