#!/usr/bin/env python3
"""
Restructure German SPL exam explanations from prose blobs to bullet-point format.

Strategy:
1. Find DE explanations that are blobs (no '- **' bullets) but contain "Option X" references.
2. Split at "Option A/B/C/D" boundaries to form bullet points.
3. Write back to DE files.

Run: python3 restructure_de_explanations.py [--dry-run]
"""

import re
import os
import sys

# Root folder that holds the exam-question directories.
# NOTE(review): this is an absolute, user-specific path; the script only works
# on a machine where this exact directory exists.
BASE_DIR = "/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr"
# Directory whose ``.md`` files are scanned and rewritten by main().
DE_DIR = os.path.join(BASE_DIR, "SPL Exam Questions DE")
# English counterpart directory. Not referenced by any function visible in this file.
EN_DIR = os.path.join(BASE_DIR, "SPL Exam Questions EN")

# True when "--dry-run" appears anywhere on the command line; restructure_file()
# then skips writing and main() prints a dry-run notice.
DRY_RUN = "--dry-run" in sys.argv

# Marker for an explanation that already contains a bold bullet ("- **").
# restructure_file() skips any explanation in which this pattern is found.
ALREADY_FORMATTED = re.compile(r'- \*\*')

# Pattern for "Option X" references in German prose blobs, e.g.
#   "Option A", "Option B (text)", "Option A)", "Option A:"
#
# It captures "Option A".."Option D" when the reference
#   * is not directly preceded by "**", "(" or "- ", and
#   * sits right after ". " / "." + whitespace, after a newline, or at the
#     start of a line (re.MULTILINE).
#
# NOTE: not referenced by any function visible in this file.
OPTION_SPLIT = re.compile(
    r'(?<!\*\*)(?<!\()(?<!\- )'   # not inside existing bullet or parens
    r'(?:(?<=\. )|(?<=\.\s)|(?<=\n)|(?:^))'  # after sentence end, newline, or start
    r'(Option\s+[A-D])',           # "Option A/B/C/D"
    re.MULTILINE
)

# Alternative boundary pattern with three alternatives, each with its own
# capture group: whitespace after ".", "!" or "?" followed by "Option X";
# "Option X" directly after a newline; or "Option X" at the start of a line.
#
# NOTE: not referenced by any function visible in this file.
OPTION_BOUNDARY = re.compile(
    r'(?<=[.!?])\s+(Option\s+[A-D])'
    r'|(?<=\n)(Option\s+[A-D])'
    r'|^(Option\s+[A-D])',
    re.MULTILINE
)


def split_blob_into_bullets(blob: str) -> str | None:
    """
    Try to split a prose blob into structured bullet points.

    Returns the restructured text, or None if restructuring is not applicable
    (e.g., no Option references found).
    """
    blob = blob.strip()

    # Check if there are Option references to split on
    if not re.search(r'\bOption\s+[A-D]\b', blob):
        return None  # No option references; leave as-is

    # Strategy: find all positions where "Option X" starts after sentence-ending punctuation
    # Split blob into segments at those positions

    # We'll use a regex to find all "Option X" mentions that appear after
    # sentence-ending punctuation (possibly with closing paren/bracket before the period)

    # First, normalize line breaks to spaces within the blob
    blob_normalized = re.sub(r'\n+', ' ', blob).strip()

    # Find split positions: "Option [A-D]" after ". " or at start of blob
    # We allow "Option A (" or "Option A)" or "Option A " patterns
    split_pattern = re.compile(
        r'(?:(?<=\.\s)|(?<=\.\s{2})|^)'
        r'(Option\s+[A-D](?:\s*[\(\)]|\s))',
        re.MULTILINE
    )

    # Use finditer to get positions
    # Better approach: split on ". Option X" boundaries
    parts = re.split(r'(?<=\.)\s+(?=Option\s+[A-D])', blob_normalized)

    if len(parts) <= 1:
        # Try splitting on ". Option X" with the dot being inside a parenthetical too
        # Some blobs have "text. Option A (...) text. Option B (...) text."
        # Let's try splitting at sentence boundaries before "Option"
        parts = re.split(r'(?<=[.!?])\s+(?=Option\s+[A-D])', blob_normalized)

    if len(parts) <= 1:
        # Blob references options but all in one run - try splitting at "Option X" directly
        # e.g., "main text Option A is wrong Option B is also wrong"
        parts = re.split(r'\s+(?=Option\s+[A-D])', blob_normalized)

    if len(parts) <= 1:
        return None  # Cannot restructure

    # First part is the main paragraph, rest become bullet points
    main_para = parts[0].strip()
    bullets = [p.strip() for p in parts[1:] if p.strip()]

    if not bullets:
        return None

    # Build the restructured explanation
    lines = []
    if main_para:
        lines.append(main_para)
        lines.append("")  # blank line before bullets

    for bullet in bullets:
        # Clean up bullet text: remove trailing period if needed (keep it)
        # Bold the "Option X" at start
        # "Option A is wrong..." -> "- **Option A** is wrong..."
        bullet = re.sub(
            r'^(Option\s+[A-D])\b',
            lambda m: f'- **{m.group(1)}**',
            bullet
        )
        if not bullet.startswith('- '):
            bullet = '- ' + bullet
        lines.append(bullet)

    return '\n'.join(lines)


def parse_explanation_blocks(content: str):
    """
    Parse a markdown file and return list of (start, end, tag, expl_text) tuples
    for each Erklärung section.
    """
    blocks = []
    # Find each question tag and its explanation
    # Pattern: section starts at "#### Erklärung\n\n" and ends at next "####" or "### Q" or EOF

    # First, find all question tags
    tag_pattern = re.compile(r'\^(t\d+q\d+)')
    expl_pattern = re.compile(r'#### Erklärung\n\n(.*?)(?=\n#### |\n### Q|\Z)', re.DOTALL)

    # Find explanation positions
    for expl_match in expl_pattern.finditer(content):
        expl_text = expl_match.group(1).rstrip()
        expl_start = expl_match.start(1)
        expl_end = expl_match.start(1) + len(expl_match.group(1).rstrip())

        # Find the question tag that precedes this explanation
        # Look backwards from expl_start
        preceding = content[:expl_start]
        tag_match = None
        for m in tag_pattern.finditer(preceding):
            tag_match = m  # Keep last (closest) match

        tag = tag_match.group(1) if tag_match else None
        blocks.append((expl_start, expl_end, tag, expl_text))

    return blocks


def restructure_file(fpath: str, verbose: bool = True) -> int:
    """
    Restructure all prose-blob explanations in one DE markdown file.

    Explanations that already contain a "- **" bullet, or that
    split_blob_into_bullets() cannot restructure, are left untouched. The file
    is rewritten only if at least one explanation changed and DRY_RUN is false.

    Args:
        fpath: Path of the markdown file, read and written as UTF-8.
        verbose: When true, print the file name once and a before/after preview
            (first 200 characters each) for the first two changed explanations.

    Returns:
        The number of explanations that were restructured. In dry-run mode this
        counts what would have been changed.
    """
    # Context managers make sure the handle is closed even if reading fails.
    with open(fpath, encoding='utf-8') as src:
        content = src.read()
    blocks = parse_explanation_blocks(content)

    restructured = 0
    # Process in reverse order so the offsets of earlier blocks remain valid
    # after a later block has been replaced by text of a different length.
    for expl_start, expl_end, tag, expl_text in reversed(blocks):
        # Skip if already formatted
        if ALREADY_FORMATTED.search(expl_text):
            continue

        # split_blob_into_bullets() returns None for text without any
        # "Option X" reference, so no separate pre-check is needed here.
        new_text = split_blob_into_bullets(expl_text)
        if new_text is None or new_text.strip() == expl_text.strip():
            continue

        if verbose and restructured == 0:
            fname = os.path.basename(fpath)
            print(f"\n=== {fname} ===")

        if verbose and restructured < 2:
            print(f"\n  Tag: {tag}")
            print(f"  BEFORE: {repr(expl_text[:200])}")
            print(f"  AFTER:  {repr(new_text[:200])}")

        # Replace in content
        content = content[:expl_start] + new_text + content[expl_end:]
        restructured += 1

    if restructured > 0 and not DRY_RUN:
        with open(fpath, 'w', encoding='utf-8') as dst:
            dst.write(content)

    return restructured


def main():
    """
    Run the restructuring over every ``.md`` file in DE_DIR and print a summary.

    Files are handled in sorted name order. For each file with at least one
    restructured explanation a per-file line is printed; a total follows at
    the end, plus a reminder when running in dry-run mode.
    """
    mode_label = 'DRY RUN' if DRY_RUN else 'LIVE'
    print(f"Mode: {mode_label}")
    print(f"Processing DE files in: {DE_DIR}\n")

    # One entry per file in which something was restructured.
    changed_counts = []

    markdown_names = (name for name in sorted(os.listdir(DE_DIR)) if name.endswith('.md'))
    for md_name in markdown_names:
        changed = restructure_file(os.path.join(DE_DIR, md_name), verbose=True)
        if changed <= 0:
            continue
        changed_counts.append(changed)
        print(f"  -> {changed} explanation(s) restructured in {md_name}")

    grand_total = sum(changed_counts)
    touched_files = len(changed_counts)

    print(f"\n{'='*60}")
    print(f"Total restructured: {grand_total} explanations in {touched_files} files")
    if DRY_RUN:
        print("(DRY RUN - no files were modified)")


if __name__ == '__main__':
    main()