#!/usr/bin/env python3 """ Restructure German SPL exam explanations from prose blobs to bullet-point format. Strategy: 1. Find DE explanations that are blobs (no '- **' bullets) but contain "Option X" references. 2. Split at "Option A/B/C/D" boundaries to form bullet points. 3. Write back to DE files. Run: python3 restructure_de_explanations.py [--dry-run] """ import re import os import sys BASE_DIR = "/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr" DE_DIR = os.path.join(BASE_DIR, "SPL Exam Questions DE") EN_DIR = os.path.join(BASE_DIR, "SPL Exam Questions EN") DRY_RUN = "--dry-run" in sys.argv # Pattern to detect an already-formatted explanation ALREADY_FORMATTED = re.compile(r'- \*\*') # Patterns for "Option X" references in German blobs. # Matches things like: # "Option A", "Option B (text)", "Option A)", "Option A:" # We want to split at the start of each option reference. # # The key insight: options appear as "Option A" or "Option A)" or "Option A (" at sentence boundaries. # We split the blob so each option mention starts a new bullet. OPTION_SPLIT = re.compile( r'(? str | None: """ Try to split a prose blob into structured bullet points. Returns the restructured text, or None if restructuring is not applicable (e.g., no Option references found). """ blob = blob.strip() # Check if there are Option references to split on if not re.search(r'\bOption\s+[A-D]\b', blob): return None # No option references; leave as-is # Strategy: find all positions where "Option X" starts after sentence-ending punctuation # Split blob into segments at those positions # We'll use a regex to find all "Option X" mentions that appear after # sentence-ending punctuation (possibly with closing paren/bracket before the period) # First, normalize line breaks to spaces within the blob blob_normalized = re.sub(r'\n+', ' ', blob).strip() # Find split positions: "Option [A-D]" after ". " or at start of blob # We allow "Option A (" or "Option A)" or "Option A " patterns split_pattern = re.compile( r'(?:(?<=\.\s)|(?<=\.\s{2})|^)' r'(Option\s+[A-D](?:\s*[\(\)]|\s))', re.MULTILINE ) # Use finditer to get positions # Better approach: split on ". Option X" boundaries parts = re.split(r'(?<=\.)\s+(?=Option\s+[A-D])', blob_normalized) if len(parts) <= 1: # Try splitting on ". Option X" with the dot being inside a parenthetical too # Some blobs have "text. Option A (...) text. Option B (...) text." # Let's try splitting at sentence boundaries before "Option" parts = re.split(r'(?<=[.!?])\s+(?=Option\s+[A-D])', blob_normalized) if len(parts) <= 1: # Blob references options but all in one run - try splitting at "Option X" directly # e.g., "main text Option A is wrong Option B is also wrong" parts = re.split(r'\s+(?=Option\s+[A-D])', blob_normalized) if len(parts) <= 1: return None # Cannot restructure # First part is the main paragraph, rest become bullet points main_para = parts[0].strip() bullets = [p.strip() for p in parts[1:] if p.strip()] if not bullets: return None # Build the restructured explanation lines = [] if main_para: lines.append(main_para) lines.append("") # blank line before bullets for bullet in bullets: # Clean up bullet text: remove trailing period if needed (keep it) # Bold the "Option X" at start # "Option A is wrong..." -> "- **Option A** is wrong..." bullet = re.sub( r'^(Option\s+[A-D])\b', lambda m: f'- **{m.group(1)}**', bullet ) if not bullet.startswith('- '): bullet = '- ' + bullet lines.append(bullet) return '\n'.join(lines) def parse_explanation_blocks(content: str): """ Parse a markdown file and return list of (start, end, tag, expl_text) tuples for each Erklärung section. """ blocks = [] # Find each question tag and its explanation # Pattern: section starts at "#### Erklärung\n\n" and ends at next "####" or "### Q" or EOF # First, find all question tags tag_pattern = re.compile(r'\^(t\d+q\d+)') expl_pattern = re.compile(r'#### Erklärung\n\n(.*?)(?=\n#### |\n### Q|\Z)', re.DOTALL) # Find explanation positions for expl_match in expl_pattern.finditer(content): expl_text = expl_match.group(1).rstrip() expl_start = expl_match.start(1) expl_end = expl_match.start(1) + len(expl_match.group(1).rstrip()) # Find the question tag that precedes this explanation # Look backwards from expl_start preceding = content[:expl_start] tag_match = None for m in tag_pattern.finditer(preceding): tag_match = m # Keep last (closest) match tag = tag_match.group(1) if tag_match else None blocks.append((expl_start, expl_end, tag, expl_text)) return blocks def restructure_file(fpath: str, verbose: bool = True) -> int: """ Restructure all blob explanations in a DE file. Returns count of explanations restructured. """ content = open(fpath, encoding='utf-8').read() blocks = parse_explanation_blocks(content) restructured = 0 # Process in reverse order so offsets remain valid for expl_start, expl_end, tag, expl_text in reversed(blocks): # Skip if already formatted if ALREADY_FORMATTED.search(expl_text): continue # Skip if no option references if not re.search(r'\bOption\s+[A-D]\b', expl_text): continue # Try to restructure new_text = split_blob_into_bullets(expl_text) if new_text is None or new_text.strip() == expl_text.strip(): continue if verbose and restructured == 0: fname = os.path.basename(fpath) print(f"\n=== {fname} ===") if verbose and restructured < 2: print(f"\n Tag: {tag}") print(f" BEFORE: {repr(expl_text[:200])}") print(f" AFTER: {repr(new_text[:200])}") # Replace in content content = content[:expl_start] + new_text + content[expl_end:] restructured += 1 if restructured > 0 and not DRY_RUN: open(fpath, 'w', encoding='utf-8').write(content) return restructured def main(): print(f"Mode: {'DRY RUN' if DRY_RUN else 'LIVE'}") print(f"Processing DE files in: {DE_DIR}\n") total_restructured = 0 files_modified = 0 for fname in sorted(os.listdir(DE_DIR)): if not fname.endswith('.md'): continue fpath = os.path.join(DE_DIR, fname) count = restructure_file(fpath, verbose=True) if count > 0: files_modified += 1 total_restructured += count print(f" -> {count} explanation(s) restructured in {fname}") print(f"\n{'='*60}") print(f"Total restructured: {total_restructured} explanations in {files_modified} files") if DRY_RUN: print("(DRY RUN - no files were modified)") if __name__ == '__main__': main()