APPS/glidr-content.git - git.mnsoft.org

Matthias Nott
5 days ago e07a553414967d3a090c9b2feea2d1fdfab082a7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env python3
"""
Restructure German SPL exam explanations from prose blobs to bullet-point format.

Strategy:
1. Find DE explanations that are blobs (no '- **' bullets) but contain "Option X" references.
2. Split at "Option A/B/C/D" boundaries to form bullet points.
3. Write back to DE files.

Run: python3 restructure_de_explanations.py [--dry-run]
"""

import re
import os
import sys

# Absolute root of the question bank on the author's machine; the German and
# English question folders are resolved relative to it.
# NOTE(review): EN_DIR is not referenced by any function visible in this file.
BASE_DIR = "/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr"
DE_DIR = os.path.join(BASE_DIR, "SPL Exam Questions DE")
EN_DIR = os.path.join(BASE_DIR, "SPL Exam Questions EN")

# True when "--dry-run" appears anywhere on the command line; restructure_file()
# then reports what it would change but does not write the file back.
DRY_RUN = "--dry-run" in sys.argv

# Detects an explanation that already contains a bold bullet ("- **"), which
# restructure_file() treats as already formatted and skips.
ALREADY_FORMATTED = re.compile(r'- \*\*')

# Patterns for "Option X" references in German blobs.
# Matches things like:
#   "Option A", "Option B (text)", "Option A)", "Option A:"
# The goal is to split at the start of each option reference so that each
# option mention starts a new bullet.
#
# OPTION_SPLIT only accepts an "Option X" that sits at a sentence boundary
# (after ". ", after a newline, or at the start of a line) and that is not
# directly preceded by "**", "(" or "- ".
# NOTE(review): OPTION_SPLIT and OPTION_BOUNDARY are not referenced by any
# function visible in this file; split_blob_into_bullets() uses its own
# patterns. Confirm whether they can be removed.
OPTION_SPLIT = re.compile(
    r'(?<!\*\*)(?<!\()(?<!\- )'   # not inside existing bullet or parens
    r'(?:(?<=\. )|(?<=\.\s)|(?<=\n)|(?:^))'  # after sentence end, newline, or start
    r'(Option\s+[A-D])',           # "Option A/B/C/D"
    re.MULTILINE
)

# Looser variant: an "Option [A-D]" that follows sentence-ending punctuation
# (".", "!" or "?") plus whitespace, follows a newline, or starts a line.
# Each alternative has its own capture group, so only one group is set per match.
OPTION_BOUNDARY = re.compile(
    r'(?<=[.!?])\s+(Option\s+[A-D])'
    r'|(?<=\n)(Option\s+[A-D])'
    r'|^(Option\s+[A-D])',
    re.MULTILINE
)


def split_blob_into_bullets(blob: str) -> str | None:
    """
    Try to split a prose blob into structured bullet points.

    Returns the restructured text, or None if restructuring is not applicable
    (e.g., no Option references found).
    """
    blob = blob.strip()

    # Check if there are Option references to split on
    if not re.search(r'\bOption\s+[A-D]\b', blob):
        return None  # No option references; leave as-is

    # Strategy: find all positions where "Option X" starts after sentence-ending punctuation
    # Split blob into segments at those positions

    # We'll use a regex to find all "Option X" mentions that appear after
    # sentence-ending punctuation (possibly with closing paren/bracket before the period)

    # First, normalize line breaks to spaces within the blob
    blob_normalized = re.sub(r'\n+', ' ', blob).strip()

    # Find split positions: "Option [A-D]" after ". " or at start of blob
    # We allow "Option A (" or "Option A)" or "Option A " patterns
    split_pattern = re.compile(
        r'(?:(?<=\.\s)|(?<=\.\s{2})|^)'
        r'(Option\s+[A-D](?:\s*[\(\)]|\s))',
        re.MULTILINE
    )

    # Use finditer to get positions
    # Better approach: split on ". Option X" boundaries
    parts = re.split(r'(?<=\.)\s+(?=Option\s+[A-D])', blob_normalized)

    if len(parts) <= 1:
        # Try splitting on ". Option X" with the dot being inside a parenthetical too
        # Some blobs have "text. Option A (...) text. Option B (...) text."
        # Let's try splitting at sentence boundaries before "Option"
        parts = re.split(r'(?<=[.!?])\s+(?=Option\s+[A-D])', blob_normalized)

    if len(parts) <= 1:
        # Blob references options but all in one run - try splitting at "Option X" directly
        # e.g., "main text Option A is wrong Option B is also wrong"
        parts = re.split(r'\s+(?=Option\s+[A-D])', blob_normalized)

    if len(parts) <= 1:
        return None  # Cannot restructure

    # First part is the main paragraph, rest become bullet points
    main_para = parts[0].strip()
    bullets = [p.strip() for p in parts[1:] if p.strip()]

    if not bullets:
        return None

    # Build the restructured explanation
    lines = []
    if main_para:
        lines.append(main_para)
        lines.append("")  # blank line before bullets

    for bullet in bullets:
        # Clean up bullet text: remove trailing period if needed (keep it)
        # Bold the "Option X" at start
        # "Option A is wrong..." -> "- **Option A** is wrong..."
        bullet = re.sub(
            r'^(Option\s+[A-D])\b',
            lambda m: f'- **{m.group(1)}**',
            bullet
        )
        if not bullet.startswith('- '):
            bullet = '- ' + bullet
        lines.append(bullet)

    return '\n'.join(lines)


def parse_explanation_blocks(content: str):
    """
    Parse a markdown file and return list of (start, end, tag, expl_text) tuples
    for each Erklärung section.
    """
    blocks = []
    # Find each question tag and its explanation
    # Pattern: section starts at "#### Erklärung\n\n" and ends at next "####" or "### Q" or EOF

    # First, find all question tags
    tag_pattern = re.compile(r'\^(t\d+q\d+)')
    expl_pattern = re.compile(r'#### Erklärung\n\n(.*?)(?=\n#### |\n### Q|\Z)', re.DOTALL)

    # Find explanation positions
    for expl_match in expl_pattern.finditer(content):
        expl_text = expl_match.group(1).rstrip()
        expl_start = expl_match.start(1)
        expl_end = expl_match.start(1) + len(expl_match.group(1).rstrip())

        # Find the question tag that precedes this explanation
        # Look backwards from expl_start
        preceding = content[:expl_start]
        tag_match = None
        for m in tag_pattern.finditer(preceding):
            tag_match = m  # Keep last (closest) match

        tag = tag_match.group(1) if tag_match else None
        blocks.append((expl_start, expl_end, tag, expl_text))

    return blocks


def restructure_file(fpath: str, verbose: bool = True) -> int:
    """
    Restructure all blob explanations in a DE file.

    Each explanation found by parse_explanation_blocks() is skipped when it
    already contains a bold bullet, when it has no "Option A-D" reference, or
    when split_blob_into_bullets() cannot change it. Otherwise its text is
    replaced in place. The file is rewritten only if at least one explanation
    changed and DRY_RUN is not set.

    Args:
        fpath: Path of the markdown file to process (read and written as UTF-8).
        verbose: When true, print the file name once and a before/after
            preview for the first two explanations that get restructured.

    Returns:
        Count of explanations restructured (also counted in dry-run mode).
    """
    # Context managers so the handles are closed even if reading/writing fails.
    with open(fpath, encoding='utf-8') as src:
        content = src.read()
    blocks = parse_explanation_blocks(content)

    restructured = 0
    # Process in reverse order so the offsets of earlier blocks remain valid
    # after a later block has been replaced by text of a different length.
    for expl_start, expl_end, tag, expl_text in reversed(blocks):
        # Skip if already formatted
        if ALREADY_FORMATTED.search(expl_text):
            continue

        # Skip if no option references
        if not re.search(r'\bOption\s+[A-D]\b', expl_text):
            continue

        # Try to restructure
        new_text = split_blob_into_bullets(expl_text)
        if new_text is None or new_text.strip() == expl_text.strip():
            continue

        if verbose and restructured == 0:
            fname = os.path.basename(fpath)
            print(f"\n=== {fname} ===")

        # Only preview the first two changes per file to keep the log short.
        if verbose and restructured < 2:
            print(f"\n  Tag: {tag}")
            print(f"  BEFORE: {repr(expl_text[:200])}")
            print(f"  AFTER:  {repr(new_text[:200])}")

        # Replace in content
        content = content[:expl_start] + new_text + content[expl_end:]
        restructured += 1

    if restructured > 0 and not DRY_RUN:
        with open(fpath, 'w', encoding='utf-8') as dst:
            dst.write(content)

    return restructured


def main():
    """
    Run the restructuring over every markdown file in DE_DIR.

    Files are visited in sorted name order. Prints the mode, a line for each
    file in which something was restructured, and a final summary.
    """
    print(f"Mode: {'DRY RUN' if DRY_RUN else 'LIVE'}")
    print(f"Processing DE files in: {DE_DIR}\n")

    markdown_names = [
        name for name in sorted(os.listdir(DE_DIR)) if name.endswith('.md')
    ]

    # One entry per file that had at least one explanation restructured.
    per_file_counts = []
    for fname in markdown_names:
        count = restructure_file(os.path.join(DE_DIR, fname), verbose=True)
        if count <= 0:
            continue
        per_file_counts.append(count)
        print(f"  -> {count} explanation(s) restructured in {fname}")

    total_restructured = sum(per_file_counts)
    files_modified = len(per_file_counts)

    print(f"\n{'='*60}")
    print(f"Total restructured: {total_restructured} explanations in {files_modified} files")
    if DRY_RUN:
        print("(DRY RUN - no files were modified)")


if __name__ == '__main__':
    main()