addon
This commit is contained in:
322
backend/app/utils/md_parser.py
Executable file
322
backend/app/utils/md_parser.py
Executable file
@@ -0,0 +1,322 @@
|
||||
"""
|
||||
Markdown specification file parser for peripherals.
|
||||
Parses .md files containing USB device specifications.
|
||||
"""
|
||||
import re
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
# Section titles (after any leading "N. " numbering has been removed).
_IDENTIFICATION_SECTIONS = ("Identification", "Identification USB", "Identification générale")
_DEVICE_CLASS_SECTIONS = ("Device Class", "Classe et interface USB")
# Accept both the non-breaking hyphen (U+2011) and the plain ASCII hyphen.
_WIFI_SECTIONS = ("Caractéristiques Wi\u2011Fi", "Caractéristiques Wi-Fi")
_NOTES_SECTIONS = (
    "Performance Notes", "Power & Stability Considerations",
    "Recommended USB Port Placement", "Typical Use Cases",
    "Operating System Support", "Pilotes et compatibilité système",
    "Contraintes et limitations", "Placement USB recommandé",
    "Cas d'usage typiques", "Fonction réseau", "Résumé synthétique",
)

# (pattern, type_principal override or None, sous_type); first match wins,
# so "wireless mouse" is classified as a mouse, not a Wi-Fi adapter.
_DEVICE_TYPE_RULES = (
    (r'souris|mouse', None, "Souris"),
    (r'clavier|keyboard', None, "Clavier"),
    (r'wi[-\u2011]?fi|wireless', "WiFi", "Adaptateur WiFi"),
    (r'bluetooth', "Bluetooth", "Adaptateur Bluetooth"),
    (r'usb\s+flash|clé\s+usb|flash\s+drive', None, "Clé USB"),
    (r'dongle', None, "Dongle"),
)

# (result key, English label, English code pattern, French label)
_INTERFACE_FIELDS = (
    ("interface_class", r'Interface\s+class', r'\d+', r'Classe\s+USB'),
    ("interface_subclass", r'Subclass', r'\d+', r'Sous-classe'),
    ("interface_protocol", r'Protocol', r'\d+|[0-9a-fA-F]{2}', r'Protocole'),
)

_KNOWN_BRANDS = ("Logitech", "SanDisk", "Ralink", "Broadcom", "ASUS", "Realtek",
                 "TP-Link", "Intel", "Samsung", "Kingston", "Corsair")


def _bold_field(label: str, line: str, flags: int = 0) -> Optional[str]:
    """Return the value of a ``**label**: value`` line, or None if absent.

    ``label`` is a regex fragment. Whitespace is tolerated on both sides of
    the colon so that English (``**X**: v``) and French (``**X** : v``)
    punctuation are both accepted.
    """
    match = re.search(rf'\*\*{label}\*\*\s*:\s*(.+?)$', line, flags)
    return match.group(1).strip() if match else None


def _detect_device_type(line: str, result: Dict[str, Any]) -> None:
    """Set ``sous_type`` (and possibly ``type_principal``) from keywords in ``line``."""
    for pattern, main_type, sub_type in _DEVICE_TYPE_RULES:
        if re.search(pattern, line, re.IGNORECASE):
            if main_type:
                result["type_principal"] = main_type
            result["sous_type"] = sub_type
            return


def parse_md_specification(md_content: str) -> Dict[str, Any]:
    """
    Parse a markdown specification file and extract peripheral information.

    Supports two formats:
    1. Simple format: Title + Description
    2. Detailed format: Full USB specification with vendor/product IDs,
       characteristics, etc. (English or French field labels)

    The text is scanned line by line; ``## `` headings (optionally numbered,
    e.g. ``## 2. Identification USB``) select which fields are looked for.
    Vendor and product IDs are always stored as lowercase ``0x....`` strings.

    Args:
        md_content: Raw markdown content

    Returns:
        Dictionary with peripheral data. Keys whose value is None are
        omitted, and ``caracteristiques_specifiques`` is omitted when empty.
        ``nom``, ``type_principal`` and ``synthese`` are always present.
    """
    result: Dict[str, Any] = {
        "nom": None,
        "type_principal": "USB",
        "sous_type": None,
        "marque": None,
        "modele": None,
        "numero_serie": None,
        "description": None,
        "synthese": md_content,  # Store complete markdown content
        "caracteristiques_specifiques": {},
        "notes": None,
    }
    specs = result["caracteristiques_specifiques"]

    # Extract title (first H1) and any "vvvv:pppp" / "vvvv_pppp" USB IDs in it
    title = None
    title_match = re.search(r'^#\s+(.+?)$', md_content, re.MULTILINE)
    if title_match:
        title = title_match.group(1).strip()
        id_match = re.search(r'(?:ID\s+)?([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})', title)
        if id_match:
            specs["vendor_id"] = f"0x{id_match.group(1).lower()}"
            specs["product_id"] = f"0x{id_match.group(2).lower()}"

    current_section = None
    description_lines = []
    notes_lines = []

    for raw_line in md_content.strip().split('\n'):
        line = raw_line.strip()

        # Section headers (H2)
        if line.startswith('## '):
            # Remove numbering (e.g., "1. ", "2. ", "10. ")
            current_section = re.sub(r'^\d+\.\s*', '', line[3:].strip())
            continue

        if current_section == "Description":
            if line and not line.startswith('#'):
                description_lines.append(line)
                # Only the first recognised keyword decides the device type
                if not result["sous_type"]:
                    _detect_device_type(line, result)

        elif current_section in _IDENTIFICATION_SECTIONS:
            # Vendor ID, optionally followed by "(Brand name)"
            vendor_match = re.search(
                r'\*\*Vendor\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})\s*(?:\((.+?)\))?', line)
            if vendor_match:
                # Lowercased to agree with the IDs taken from the title
                specs["vendor_id"] = f"0x{vendor_match.group(1).lower()}"
                if vendor_match.group(2):
                    result["marque"] = vendor_match.group(2).strip()

            product_match = re.search(
                r'\*\*Product\s+ID\*\*\s*:\s*0x([0-9a-fA-F]{4})', line)
            if product_match:
                specs["product_id"] = f"0x{product_match.group(1).lower()}"

            # Commercial name / Désignation USB always wins for the name
            value = _bold_field(r'(?:Commercial\s+name|Désignation\s+USB)',
                                line, re.IGNORECASE)
            if value:
                result["nom"] = value

            # Manufacturer string is only a fallback for the brand
            value = _bold_field(r'Manufacturer\s+string', line)
            if value and not result["marque"]:
                result["marque"] = value

            # Product string is only a fallback for the name
            value = _bold_field(r'Product\s+string', line)
            if value and not result["nom"]:
                result["nom"] = value

            value = _bold_field(r'Serial\s+number', line)
            if value:
                result["numero_serie"] = value

            # Catégorie (FR format): only "réseau" changes the main type
            value = _bold_field(r'Catégorie', line)
            if value and 'réseau' in value.lower():
                result["type_principal"] = "Réseau"

            # Sous-catégorie (FR format) overrides any keyword-based guess
            value = _bold_field(r'Sous-catégorie', line)
            if value:
                result["sous_type"] = value

            # Nom courant (FR format): first occurrence is kept as the model
            value = _bold_field(r'Nom\s+courant', line)
            if value and not result.get("modele"):
                result["modele"] = value

            # USB figures that the FR format lists under identification
            for key, label in (("usb_version", r'Version\s+USB'),
                               ("usb_speed", r'Vitesse\s+négociée'),
                               ("max_power", r'Consommation\s+maximale')):
                value = _bold_field(label, line)
                if value:
                    specs[key] = value

        elif current_section == "USB Characteristics":
            value = _bold_field(r'(?:USB\s+version|Version\s+USB)', line, re.IGNORECASE)
            if value:
                specs["usb_version"] = value

            value = _bold_field(r'(?:Negotiated\s+speed|Vitesse\s+négociée)',
                                line, re.IGNORECASE)
            if value:
                specs["usb_speed"] = value

            value = _bold_field(r'bcdUSB', line)
            if value:
                specs["bcdUSB"] = value

            value = _bold_field(r'(?:Max\s+power\s+draw|Consommation\s+maximale)',
                                line, re.IGNORECASE)
            if value:
                specs["max_power"] = value

        elif current_section in _DEVICE_CLASS_SECTIONS:
            for key, en_label, en_code, fr_label in _INTERFACE_FIELDS:
                # EN format: "**Label**: <code> — <name>"
                en_match = re.search(
                    rf'\*\*{en_label}\*\*\s*:\s*({en_code})\s*—\s*(.+?)$', line)
                if en_match:
                    specs[key] = en_match.group(1)
                    specs[f"{key}_name"] = en_match.group(2).strip()

                # FR format: "**Label** : <name> (<code>)"
                fr_match = re.search(
                    rf'\*\*{fr_label}\*\*\s*:\s*(.+?)\s*\((\d+)\)', line)
                if fr_match:
                    specs[key] = fr_match.group(2)
                    specs[f"{key}_name"] = fr_match.group(1).strip()

        elif current_section == "Functional Role":
            if line.startswith('- '):
                notes_lines.append(line[2:])

        elif current_section == "Classification Summary":
            for key, label in (("category", r'Category'),
                               ("subcategory", r'Subcategory')):
                value = _bold_field(label, line)
                if value:
                    specs[key] = value

        elif current_section in _WIFI_SECTIONS:
            for key, label in (("wifi_standard", r'Norme\s+Wi[-\u2011]Fi'),
                               ("wifi_frequency", r'Bande\s+de\s+fréquence'),
                               ("wifi_max_speed", r'Débit\s+théorique\s+maximal')):
                value = _bold_field(label, line)
                if value:
                    specs[key] = value

        elif current_section in _NOTES_SECTIONS:
            # Collect bullets, bold lines and block quotes as prefixed notes
            if line and not line.startswith('#'):
                if line.startswith('- '):
                    notes_lines.append(f"{current_section}: {line[2:]}")
                elif line.startswith('**'):
                    notes_lines.append(f"{current_section}: {line}")
                elif line.startswith('>'):
                    notes_lines.append(f"{current_section}: {line[1:].strip()}")
                elif current_section == "Résumé synthétique":
                    # Plain paragraphs are kept only for the summary section
                    notes_lines.append(line)

    if description_lines:
        result["description"] = " ".join(description_lines)

    if notes_lines:
        result["notes"] = "\n".join(notes_lines)

    # Fallback for nom if not found
    if not result["nom"]:
        if description_lines:
            # First description line, capped at 100 characters. The joined
            # description contains no newlines, so it cannot be split back.
            result["nom"] = description_lines[0][:100]
        elif title:
            result["nom"] = title
        else:
            result["nom"] = "Périphérique importé"

    # Extract brand from description if not found
    if not result["marque"] and result["description"]:
        for brand in _KNOWN_BRANDS:
            if re.search(rf'\b{re.escape(brand)}\b', result["description"], re.IGNORECASE):
                result["marque"] = brand
                break

    # Clean up None values and empty dicts
    result = {k: v for k, v in result.items() if v is not None}
    if not result.get("caracteristiques_specifiques"):
        result.pop("caracteristiques_specifiques", None)

    return result
|
||||
|
||||
|
||||
def extract_usb_ids_from_filename(filename: str) -> Optional[Dict[str, str]]:
    """
    Extract vendor_id and product_id from filename.

    The name must contain ``ID`` (any letter case), followed by underscores
    or whitespace, then two 4-digit hexadecimal groups separated by ``_`` or
    ``:``. Both IDs are returned lowercase with a ``0x`` prefix.

    Examples:
        ID_0781_55ab.md -> {"vendor_id": "0x0781", "product_id": "0x55ab"}
        id_0b05_17cb.md -> {"vendor_id": "0x0b05", "product_id": "0x17cb"}
        Id_0B05_17CB.md -> {"vendor_id": "0x0b05", "product_id": "0x17cb"}

    Args:
        filename: Name of the file

    Returns:
        Dict with vendor_id and product_id, or None if not found
    """
    # IGNORECASE covers mixed-case prefixes ("Id_", "iD_") as well as
    # upper- or lowercase hex digits.
    match = re.search(
        r'id[_\s]+([0-9a-f]{4})[_:]([0-9a-f]{4})', filename, re.IGNORECASE)
    if match is None:
        return None

    vendor, product = match.groups()
    return {
        "vendor_id": f"0x{vendor.lower()}",
        "product_id": f"0x{product.lower()}",
    }
|
||||
Reference in New Issue
Block a user