"""
Markdown specification file parser for peripherals.
Parses .md files containing USB device specifications.
"""
import re
from typing import Dict, Any, Optional


def _label_pattern(label: str, value: str = r'(.+?)$', flags: int = 0) -> "re.Pattern[str]":
    """Compile a pattern for a ``**Label**: value`` markdown field.

    Whitespace is accepted on both sides of the colon so that French-style
    spacing (``**Label** : value``) is recognised as well as ``**Label**: value``.

    Args:
        label: Regex fragment matching the text between the bold markers.
        value: Regex fragment matching what follows the colon.
        flags: Optional ``re`` flags.
    """
    return re.compile(rf'\*\*{label}\*\*\s*:\s*{value}', flags)


# --- Title / section header patterns ---------------------------------------
_TITLE_RE = re.compile(r'^#\s+(.+?)$', re.MULTILINE)
_TITLE_IDS_RE = re.compile(r'(?:ID\s+)?([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})')
_SECTION_NUMBER_RE = re.compile(r'^\d+\.\s*')
_FILENAME_IDS_RE = re.compile(r'(?:ID|id)[_\s]+([0-9a-fA-F]{4})[_:]([0-9a-fA-F]{4})')

# --- Section names -----------------------------------------------------------
_IDENTIFICATION_SECTIONS = frozenset({
    "Identification",
    "Identification USB",
    "Identification générale",
})
_DEVICE_CLASS_SECTIONS = frozenset({"Device Class", "Classe et interface USB"})
# The heading is accepted with either a non-breaking hyphen (U+2011) or a
# plain ASCII hyphen; both spellings occur in hand-written markdown.
_WIFI_SECTIONS = frozenset({
    "Caractéristiques Wi\u2011Fi",
    "Caractéristiques Wi-Fi",
})
_NOTES_SECTIONS = frozenset({
    "Performance Notes",
    "Power & Stability Considerations",
    "Recommended USB Port Placement",
    "Typical Use Cases",
    "Operating System Support",
    "Pilotes et compatibilité système",
    "Contraintes et limitations",
    "Placement USB recommandé",
    "Cas d'usage typiques",
    "Fonction réseau",
    "Résumé synthétique",
})

# --- Description heuristics --------------------------------------------------
# (pattern, type_principal override or None, sous_type); first match wins.
_DEVICE_TYPE_HINTS = (
    (re.compile(r'souris|mouse', re.IGNORECASE), None, "Souris"),
    (re.compile(r'clavier|keyboard', re.IGNORECASE), None, "Clavier"),
    (re.compile(r'wi[-\u2011]?fi|wireless', re.IGNORECASE), "WiFi", "Adaptateur WiFi"),
    (re.compile(r'bluetooth', re.IGNORECASE), "Bluetooth", "Adaptateur Bluetooth"),
    (re.compile(r'usb\s+flash|clé\s+usb|flash\s+drive', re.IGNORECASE), None, "Clé USB"),
    (re.compile(r'dongle', re.IGNORECASE), None, "Dongle"),
)

# Brand names are escaped so that punctuation in a name (e.g. "TP-Link")
# can never be interpreted as regex syntax.
_KNOWN_BRANDS = tuple(
    (brand, re.compile(rf'\b{re.escape(brand)}\b', re.IGNORECASE))
    for brand in (
        "Logitech", "SanDisk", "Ralink", "Broadcom", "ASUS", "Realtek",
        "TP-Link", "Intel", "Samsung", "Kingston", "Corsair",
    )
)

# --- Identification fields ---------------------------------------------------
_VENDOR_ID_RE = _label_pattern(r'Vendor\s+ID', r'0x([0-9a-fA-F]{4})\s*(?:\((.+?)\))?')
_PRODUCT_ID_RE = _label_pattern(r'Product\s+ID', r'0x([0-9a-fA-F]{4})')
_COMMERCIAL_NAME_RE = _label_pattern(
    r'(?:Commercial\s+name|Désignation\s+USB)', flags=re.IGNORECASE)
_MANUFACTURER_RE = _label_pattern(r'Manufacturer\s+string')
_PRODUCT_STRING_RE = _label_pattern(r'Product\s+string')
_SERIAL_RE = _label_pattern(r'Serial\s+number')
_CATEGORIE_RE = _label_pattern(r'Catégorie')
_SOUS_CATEGORIE_RE = _label_pattern(r'Sous-catégorie')
_NOM_COURANT_RE = _label_pattern(r'Nom\s+courant')

# --- (key, pattern) tables stored under "caracteristiques_specifiques" -------
_IDENTIFICATION_SPEC_FIELDS = (
    ("usb_version", _label_pattern(r'Version\s+USB')),
    ("usb_speed", _label_pattern(r'Vitesse\s+négociée')),
    ("max_power", _label_pattern(r'Consommation\s+maximale')),
)
_USB_CHARACTERISTIC_FIELDS = (
    ("usb_version",
     _label_pattern(r'(?:USB\s+version|Version\s+USB)', flags=re.IGNORECASE)),
    ("usb_speed",
     _label_pattern(r'(?:Negotiated\s+speed|Vitesse\s+négociée)', flags=re.IGNORECASE)),
    ("bcdUSB", _label_pattern(r'bcdUSB')),
    ("max_power",
     _label_pattern(r'(?:Max\s+power\s+draw|Consommation\s+maximale)', flags=re.IGNORECASE)),
)
_CLASSIFICATION_FIELDS = (
    ("category", _label_pattern(r'Category')),
    ("subcategory", _label_pattern(r'Subcategory')),
)
_WIFI_FIELDS = (
    ("wifi_standard", _label_pattern(r'Norme\s+Wi[-\u2011]Fi')),
    ("wifi_frequency", _label_pattern(r'Bande\s+de\s+fréquence')),
    ("wifi_max_speed", _label_pattern(r'Débit\s+théorique\s+maximal')),
)

# --- Device class fields -----------------------------------------------------
# English lines read "<code> — <name>", French lines read "<name> (<code>)".
_NAME_THEN_CODE = r'(.+?)\s*\((\d+)\)'
# (key prefix, English pattern, French pattern)
_DEVICE_CLASS_FIELDS = (
    ("interface_class",
     _label_pattern(r'Interface\s+class', r'(\d+)\s*—\s*(.+?)$'),
     _label_pattern(r'Classe\s+USB', _NAME_THEN_CODE)),
    ("interface_subclass",
     _label_pattern(r'Subclass', r'(\d+)\s*—\s*(.+?)$'),
     _label_pattern(r'Sous-classe', _NAME_THEN_CODE)),
    ("interface_protocol",
     _label_pattern(r'Protocol', r'(\d+|[0-9a-fA-F]{2})\s*—\s*(.+?)$'),
     _label_pattern(r'Protocole', _NAME_THEN_CODE)),
)


def _field_value(pattern: "re.Pattern[str]", line: str) -> Optional[str]:
    """Return the stripped first capture group of *pattern* in *line*, or None."""
    match = pattern.search(line)
    return match.group(1).strip() if match else None


def _apply_fields(line: str, specs: Dict[str, Any], fields: tuple) -> None:
    """Store every ``(key, pattern)`` field found in *line* into *specs*."""
    for key, pattern in fields:
        value = _field_value(pattern, line)
        if value is not None:
            specs[key] = value


def _parse_description_line(line: str, result: Dict[str, Any], description_lines: list) -> None:
    """Collect a description line and guess the device type from its wording."""
    if not line or line.startswith('#'):
        return
    description_lines.append(line)

    # Only guess while no sub-type is known; an explicit value always wins.
    if result["sous_type"]:
        return
    for pattern, main_type, sub_type in _DEVICE_TYPE_HINTS:
        if pattern.search(line):
            if main_type is not None:
                result["type_principal"] = main_type
            result["sous_type"] = sub_type
            return


def _parse_identification_line(line: str, result: Dict[str, Any]) -> None:
    """Extract IDs, names, brand, serial and category fields from one line."""
    specs = result["caracteristiques_specifiques"]

    vendor = _VENDOR_ID_RE.search(line)
    if vendor:
        # Lowercased so the value agrees with IDs taken from the title and
        # from file names.
        specs["vendor_id"] = f"0x{vendor.group(1).lower()}"
        if vendor.group(2):
            result["marque"] = vendor.group(2).strip()

    product = _PRODUCT_ID_RE.search(line)
    if product:
        specs["product_id"] = f"0x{product.group(1).lower()}"

    commercial_name = _field_value(_COMMERCIAL_NAME_RE, line)
    if commercial_name is not None:
        result["nom"] = commercial_name

    # The descriptor strings are only fallbacks for brand and name.
    manufacturer = _field_value(_MANUFACTURER_RE, line)
    if manufacturer is not None and not result["marque"]:
        result["marque"] = manufacturer

    product_string = _field_value(_PRODUCT_STRING_RE, line)
    if product_string is not None and not result["nom"]:
        result["nom"] = product_string

    serial = _field_value(_SERIAL_RE, line)
    if serial is not None:
        result["numero_serie"] = serial

    categorie = _field_value(_CATEGORIE_RE, line)
    if categorie is not None and 'réseau' in categorie.lower():
        result["type_principal"] = "Réseau"

    sous_categorie = _field_value(_SOUS_CATEGORIE_RE, line)
    if sous_categorie is not None:
        result["sous_type"] = sous_categorie

    nom_courant = _field_value(_NOM_COURANT_RE, line)
    if nom_courant is not None and not result.get("modele"):
        result["modele"] = nom_courant

    _apply_fields(line, specs, _IDENTIFICATION_SPEC_FIELDS)


def _parse_device_class_line(line: str, specs: Dict[str, Any]) -> None:
    """Extract interface class / subclass / protocol codes and names."""
    for key, english, french in _DEVICE_CLASS_FIELDS:
        match = english.search(line)
        if match:
            specs[key] = match.group(1)
            specs[f"{key}_name"] = match.group(2).strip()
        match = french.search(line)
        if match:
            specs[key] = match.group(2)
            specs[f"{key}_name"] = match.group(1).strip()


def _collect_note(section: str, line: str, notes_lines: list) -> None:
    """Append a bullet, bold or quoted line of a free-text section to the notes."""
    if not line or line.startswith('#'):
        return
    if line.startswith('- '):
        notes_lines.append(f"{section}: {line[2:]}")
    elif line.startswith('**'):
        notes_lines.append(f"{section}: {line}")
    elif line.startswith('>'):
        notes_lines.append(f"{section}: {line[1:].strip()}")
    elif section == "Résumé synthétique":
        # Plain paragraphs are kept only for the summary section.
        notes_lines.append(line)


def parse_md_specification(md_content: str) -> Dict[str, Any]:
    """
    Parse a markdown specification file and extract peripheral information.

    Supports two formats:
    1. Simple format: Title + Description
    2. Detailed format: Full USB specification with vendor/product IDs,
       characteristics, etc. (English or French section and field names)

    Field lines are recognised as ``**Label**: value``, with optional
    whitespace around the colon. Vendor and product IDs are always returned
    lowercased with a ``0x`` prefix, whether they come from the H1 title or
    from the Identification section.

    Args:
        md_content: Raw markdown content

    Returns:
        Dictionary with peripheral data ready for database insertion.
        Keys whose value is None are omitted, as is
        "caracteristiques_specifiques" when nothing was extracted for it.
        "nom", "type_principal" and "synthese" are always present.
    """
    result: Dict[str, Any] = {
        "nom": None,
        "type_principal": "USB",
        "sous_type": None,
        "marque": None,
        "modele": None,
        "numero_serie": None,
        "description": None,
        "synthese": md_content,  # Store complete markdown content
        "caracteristiques_specifiques": {},
        "notes": None,
    }
    specs = result["caracteristiques_specifiques"]

    # Title (first H1), optionally carrying "ID vvvv:pppp" / "vvvv_pppp".
    title = None
    title_match = _TITLE_RE.search(md_content)
    if title_match:
        title = title_match.group(1).strip()
        ids = _TITLE_IDS_RE.search(title)
        if ids:
            specs["vendor_id"] = f"0x{ids.group(1).lower()}"
            specs["product_id"] = f"0x{ids.group(2).lower()}"

    current_section = None
    description_lines = []
    notes_lines = []

    for raw_line in md_content.strip().split('\n'):
        line = raw_line.strip()

        # Section headers (H2); leading numbering such as "1. " is dropped.
        if line.startswith('## '):
            current_section = _SECTION_NUMBER_RE.sub('', line[3:].strip())
            continue

        if current_section == "Description":
            _parse_description_line(line, result, description_lines)
        elif current_section in _IDENTIFICATION_SECTIONS:
            _parse_identification_line(line, result)
        elif current_section == "USB Characteristics":
            _apply_fields(line, specs, _USB_CHARACTERISTIC_FIELDS)
        elif current_section in _DEVICE_CLASS_SECTIONS:
            _parse_device_class_line(line, specs)
        elif current_section == "Functional Role":
            if line.startswith('- '):
                notes_lines.append(line[2:])
        elif current_section == "Classification Summary":
            _apply_fields(line, specs, _CLASSIFICATION_FIELDS)
        elif current_section in _WIFI_SECTIONS:
            _apply_fields(line, specs, _WIFI_FIELDS)
        elif current_section in _NOTES_SECTIONS:
            _collect_note(current_section, line, notes_lines)

    if description_lines:
        result["description"] = " ".join(description_lines)
    if notes_lines:
        result["notes"] = "\n".join(notes_lines)

    # Fallback for nom: description (capped at 100 chars), then the title,
    # then a generic placeholder.
    if not result["nom"]:
        if result["description"]:
            result["nom"] = result["description"][:100]
        elif title:
            result["nom"] = title
        else:
            result["nom"] = "Périphérique importé"

    # Fallback for marque: first known brand mentioned in the description.
    if not result["marque"] and result["description"]:
        for brand, brand_re in _KNOWN_BRANDS:
            if brand_re.search(result["description"]):
                result["marque"] = brand
                break

    # Drop unset fields and an empty characteristics dict.
    result = {key: value for key, value in result.items() if value is not None}
    if not result.get("caracteristiques_specifiques"):
        result.pop("caracteristiques_specifiques", None)

    return result


def extract_usb_ids_from_filename(filename: str) -> Optional[Dict[str, str]]:
    """
    Extract vendor_id and product_id from filename.

    Examples:
        ID_0781_55ab.md -> {"vendor_id": "0x0781", "product_id": "0x55ab"}
        id_0b05_17cb.md -> {"vendor_id": "0x0b05", "product_id": "0x17cb"}

    Args:
        filename: Name of the file

    Returns:
        Dict with lowercased, 0x-prefixed vendor_id and product_id,
        or None if not found
    """
    match = _FILENAME_IDS_RE.search(filename)
    if match is None:
        return None
    return {
        "vendor_id": f"0x{match.group(1).lower()}",
        "product_id": f"0x{match.group(2).lower()}",
    }