#!/usr/bin/env python3
"""
Restructure explanation text in SPL exam question files to use markdown
bullets for option analysis sentences.

For each "#### Explanation" (or language equivalent) section, sentences that
analyse answer options ("Option A is wrong because ...") are moved into a
markdown bullet list with the option reference bolded, while the general
explanation remains as a leading paragraph.
"""
import re
import os  # NOTE: unused here, retained to avoid breaking external expectations
from pathlib import Path

# Root folder holding the per-language question directories.
BASE_DIR = Path("/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr")

SUBJECT_DIRS = {
    "EN": BASE_DIR / "SPL Exam Questions EN",
    "DE": BASE_DIR / "SPL Exam Questions DE",
    "FR": BASE_DIR / "SPL Exam Questions FR",
}

EXPLANATION_HEADERS = {
    "EN": "#### Explanation",
    "DE": "#### Erklärung",
    "FR": "#### Explication",
}

KEY_TERMS_HEADERS = {
    "EN": "#### Key Terms",
    "DE": "#### Begriffe",
    "FR": "#### Termes clés",
}

# Patterns that identify option-analysis sentences.
# Each pattern must match the START of a sentence (after splitting on sentence
# boundaries).  Strategy: a sentence is an option sentence if it STARTS with an
# option reference.  This catches all forms: "Option A is...",
# "Option A (label) contains...", "Options A and B are...", etc.
OPTION_PATTERNS = [
    # EN/DE: sentence starts with "Option" or "Options" followed by one or more letters A-D
    r"^Options?\s+[A-D]",
    # EN/DE: "Die Option A" or "Nur Option A"
    r"^(?:Die|Nur|Only)\s+Options?\s+[A-D]",
    # EN bare letter: "A is wrong", "B is incorrect" (only when followed by a form of "to be")
    r"^[A-D]\s+(?:is|are|was|were|would|can|cannot|does|did|has|have)\b",
    # DE bare letter: "A ist falsch"
    r"^[A-D]\s+(?:ist|sind|war|w\u00e4re)\b",
    # FR: "L'option A" or "L’option A"
    r"^L['\u2019]?options?\s+[A-D]",
    # FR bare letter: "A est incorrecte"
    r"^[A-D]\s+est\b",
    # Generic: "Option A:" or "Option A —" or "Option A–"
    r"^Options?\s+[A-D](?:\s*(?:,|and|und|et)\s*(?:and\s+|und\s+|et\s+)?[A-D])*\s*(?::|—|–)",
    # "Seule l'option C" (FR), "Nur Option C" (DE), "Only Option C" (EN) - correct answer callouts
    r"^(?:Seule\s+l['\u2019]?option|Nur\s+Option|Only\s+Option)\s+[A-D]",
]

# Patterns compiled once at import time (they were previously recompiled per
# sentence).  Matching is case-INSENSITIVE, mirroring the original behaviour.
_OPTION_RES = [re.compile(p, re.IGNORECASE) for p in OPTION_PATTERNS]

# Sentence splitter: break after a period — optionally followed by a closing
# quote — when the next non-space character looks like a sentence start.
# The closing quote is kept with the preceding sentence via two fixed-width
# lookbehind alternatives (a variable-width lookbehind is not allowed in `re`).
_SENTENCE_SPLIT_RE = re.compile(
    r'(?:(?<=\.)|(?<=\.["\'\u201d\u2019»]))\s+(?=[A-Z\xdc\xc4\xd6L\'"«\u201c\u2018])'
)

# (pattern, replacement) rules for bolding the leading option reference.
# Order matters: multi-option forms must precede their single-option forms so
# that "Options A and B" is bolded as a whole.
_BOLD_RULES = [
    (re.compile(pat, re.IGNORECASE), repl)
    for pat, repl in [
        # EN/DE multi: "Options A, B, and C" / "Options A und B" (must come before single)
        (r'^(Options?\s+[A-D](?:\s*(?:,|and|und)\s*(?:and\s+|und\s+)?[A-D])+)', r'**\1**'),
        # EN/DE single: "Option A"
        (r'^(Options?\s+[A-D])\b', r'**\1**'),
        # FR multi: "L'option A et B"
        (r"^(L['\u2019]?options?\s+[A-D](?:\s*(?:,|et)\s*(?:et\s+)?[A-D])+)", r'**\1**'),
        # FR single: "L'option A"
        (r"^(L['\u2019]?options?\s+[A-D])\b", r'**\1**'),
        # EN "Only Option C" -> "Only **Option C**"
        (r'^(Only\s+)(Options?\s+[A-D])\b', r'\1**\2**'),
        # DE "Nur Option C"
        (r'^(Nur\s+)(Options?\s+[A-D])\b', r'\1**\2**'),
        # FR "Seule l'option C"
        (r"^(Seule\s+)(l['\u2019]?options?\s+[A-D])\b", r'\1**\2**'),
        # DE "Die Option A"
        (r'^(Die\s+)(Options?\s+[A-D])\b', r'\1**\2**'),
        # Bare letter: "A is", "B est"
        (r'^([A-D])(\s+(?:is|are|ist|sind|est|sont|was|w\u00e4re|serait)\b)', r'**\1**\2'),
    ]
]

# Punctuation that already terminates a bullet sentence; no period is added.
_TERMINAL_PUNCT = (".", "!", "?", "…")


def split_into_sentences(text):
    """
    Split text into sentences.

    Splits on a period, optionally followed by a closing quote, followed by
    whitespace and a capital (or opening-quote) character:
      - ". Capital"
      - '." Capital' (period inside quotes — the quote is preserved)
      - ".'" / ".»" etc.
    Question and exclamation marks are NOT treated as boundaries (rare in
    this corpus, and they then simply end the final sentence).
    """
    return _SENTENCE_SPLIT_RE.split(text.strip())


def is_option_sentence(sentence, lang):
    """
    Return True if the sentence is an option-analysis sentence.

    `lang` is accepted for interface symmetry but currently unused: the
    combined pattern list covers EN/DE/FR simultaneously.
    """
    s = sentence.strip()
    return any(rx.match(s) for rx in _OPTION_RES)


def bold_option_reference(sentence):
    """
    Bold the initial option reference in a sentence.

    e.g. "Option A is wrong..."          -> "**Option A** is wrong..."
         "Option A (label) contains..."  -> "**Option A** (label) contains..."
         "Options A and B are..."        -> "**Options A and B** are..."
         "L'option A est..."             -> "**L'option A** est..."
         "Only Option C correctly..."    -> "Only **Option C** correctly..."
         "A is wrong"                    -> "**A** is wrong"

    The first rule that changes the sentence wins; if no rule applies the
    sentence is returned unchanged.
    """
    s = sentence.strip()
    for rx, repl in _BOLD_RULES:
        new_s = rx.sub(repl, s, count=1)
        if new_s != s:
            return new_s
    return s


def restructure_explanation(explanation_text, lang):
    """
    Given the raw explanation text (without the header line), restructure it:
      - sentences before the first option-analysis sentence -> main paragraph
      - the first option-analysis sentence and everything after it -> bullets
        (this matches the described format where option analysis is at the end)

    Returns (new_text, was_changed).  Text with no option sentences (or no
    text at all) is returned unchanged with was_changed == False.
    """
    text = explanation_text.strip()
    if not text:
        return text, False

    sentences = split_into_sentences(text)
    if not sentences:
        return text, False

    # Index of the first option-analysis sentence, or None if there is none.
    first_option_idx = next(
        (i for i, s in enumerate(sentences) if is_option_sentence(s, lang)),
        None,
    )
    if first_option_idx is None:
        return text, False

    main_paragraph = " ".join(
        s.strip() for s in sentences[:first_option_idx] if s.strip()
    )

    bullets = []
    for raw in sentences[first_option_idx:]:
        s = raw.strip()
        if not s:
            continue
        # Remove exactly one trailing period (keep ellipses "..." intact) so
        # the bolding rules see a clean sentence, then restore terminal
        # punctuation WITHOUT doubling it ("wrong!" must not become "wrong!.").
        core = s[:-1] if s.endswith(".") and not s.endswith("..") else s
        bolded = bold_option_reference(core)
        if bolded.endswith(_TERMINAL_PUNCT):
            bullets.append(f"- {bolded}")
        else:
            bullets.append(f"- {bolded}.")

    if not bullets:
        return text, False

    parts = []
    if main_paragraph:
        parts.append(main_paragraph)
        parts.append("")  # blank line separating paragraph from bullet list
    parts.extend(bullets)
    new_text = "\n".join(parts)

    # Only report a change if something actually changed.
    return new_text, new_text.strip() != text.strip()


def process_file(filepath, lang, explanation_header, key_terms_header):
    """
    Process a single markdown file.

    Scans for `explanation_header` lines, collects each explanation body up to
    the next section marker, and restructures it in memory.

    Returns (content, count_restructured); nothing is written to disk here.
    """
    content = filepath.read_text(encoding="utf-8")
    lines = content.split("\n")
    count = 0
    result_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.strip() != explanation_header:
            result_lines.append(line)
            i += 1
            continue

        # Keep the header itself and any blank lines directly after it.
        result_lines.append(line)
        i += 1
        while i < len(lines) and lines[i].strip() == "":
            result_lines.append(lines[i])
            i += 1

        # Collect the explanation body until we hit:
        #   - the key terms header
        #   - the next question ("### Q")
        #   - any other "#### " header
        #   - end of file
        # (The key-terms check is technically subsumed by the "#### " check;
        # it is kept explicit for readability.)
        explanation_body_lines = []
        while i < len(lines):
            stripped = lines[i].strip()
            if (stripped == key_terms_header
                    or stripped.startswith("### Q")
                    or stripped.startswith("#### ")):
                break
            explanation_body_lines.append(lines[i])
            i += 1

        raw_body = "\n".join(explanation_body_lines).rstrip()
        new_body, changed = restructure_explanation(raw_body, lang)
        if changed:
            count += 1
            result_lines.append(new_body)
            result_lines.append("")  # trailing blank line before next section
        else:
            # Restore the original lines verbatim.
            result_lines.extend(explanation_body_lines)

    return "\n".join(result_lines), count


def main():
    """Restructure explanations in every per-question file of each language."""
    total_restructured = 0
    total_files = 0
    for lang, dir_path in SUBJECT_DIRS.items():
        explanation_header = EXPLANATION_HEADERS[lang]
        key_terms_header = KEY_TERMS_HEADERS[lang]
        for filepath in sorted(dir_path.glob("*.md")):
            # Skip the combined file.
            if "SPL Exam Questions" in filepath.name:
                continue
            new_content, count = process_file(
                filepath, lang, explanation_header, key_terms_header
            )
            if count > 0:
                filepath.write_text(new_content, encoding="utf-8")
                print(f" [{lang}] {filepath.name}: {count} explanation(s) restructured")
                total_restructured += count
            else:
                print(f" [{lang}] {filepath.name}: no changes")
            total_files += 1

    print(f"\nDone. {total_restructured} explanations restructured across {total_files} files.")


if __name__ == "__main__":
    main()