#!/usr/bin/env python3
"""
build_provenance.py — Glidr SPL Exam Question Source Provenance Database

Traces each app question back to its source PDF exam paper.
Outputs:
  - Source_Provenance.md   (human-readable)
  - Source_Provenance.json (machine-readable)
"""

import re
import os
import sys
import json
import unicodedata
from pathlib import Path

try:
    import PyPDF2
except ImportError:
    print("ERROR: PyPDF2 not installed. Run: pip install PyPDF2")
    sys.exit(1)

# ─────────────────────────────────────────────────────────────────────────────
# Path constants
# ─────────────────────────────────────────────────────────────────────────────

# Root of the Glidr working folder; every other path below is derived from it.
# NOTE(review): absolute, user-specific path — the script can only find its
# inputs on a machine that has exactly this directory layout.
ABLAGE_BASE = Path("/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr")
SOURCES_DIR = ABLAGE_BASE / "SOURCES"
QUIZ_VDS_DIR = SOURCES_DIR / "QuizVDS"
FR_DIR = ABLAGE_BASE / "SPL Exam Questions FR"
# PDF paths in PDF_FILES are resolved against this directory (same as SOURCES_DIR).
PDF_DIR = SOURCES_DIR

# Both reports are written next to the source material.
OUTPUT_MD   = SOURCES_DIR / "Source_Provenance.md"
OUTPUT_JSON = SOURCES_DIR / "Source_Provenance.json"

# Subject number → QuizVDS Markdown filename (inside QUIZ_VDS_DIR)
QUIZ_VDS_FILES = {
    10: "10 - Air Law.md",
    20: "20 - Aircraft General Knowledge.md",
    30: "30 - Flight Performance and Planning.md",
    40: "40 - Human Performance and Limitations.md",
    50: "50 - Meteorology.md",
    60: "60 - Navigation.md",
    70: "70 - Operational Procedures.md",
    80: "80 - Principles of Flight.md",
    90: "90 - Communication.md",
}

# Subject number → French app Markdown filename (inside FR_DIR)
FR_FILES = {
    10: "10 - Droit aérien.md",
    20: "20 - Connaissances générales de l'aéronef.md",
    30: "30 - Performances et planification du vol.md",
    40: "40 - Performances humaines.md",
    50: "50 - Météorologie.md",
    60: "60 - Navigation.md",
    70: "70 - Procédures opérationnelles.md",
    80: "80 - Principes du vol.md",
    90: "90 - Radiotéléphonie.md",
}

# Short PDF code → source PDF path (relative to PDF_DIR).
# The codes are what ends up in the "pdf_source" field of each record.
PDF_FILES = {
    "S1C": "Examen Blanc/Exa Blanc Série_1_Communes.pdf",
    "S1S": "Examen Blanc/Exa Blanc Série_1_Specifiques.pdf",
    "S2":  "Examen Blanc/Exa Blanc Série_2.pdf",
    "S3":  "Examen Blanc/Exa Blanc Série_3.pdf",
    "VV":  "VV/Questionnaire toutes branches VV.pdf",
}

# Subject number → branch header label as it appears in the S1C/S1S solution
# tables. The labels are located with an exact substring search, so the
# spacing inside each string (including the double spaces) is significant
# and must not be normalised.
S1C_BRANCH_MAP = {10: "BRANCHE  10", 40: "BRANCHE 40", 50: "BRANCHE 50", 90: "BRANCHE 90"}
S1S_BRANCH_MAP = {20: "BRANCHE  20", 30: "BRANCHE  30", 60: "BRANCHE  60", 70: "BRANCHE  70", 80: "BRANCHE  80"}

# Subject number → display name used for the section headings of the Markdown report.
SUBJECT_NAMES = {
    10: "Air Law / Droit aérien",
    20: "Aircraft Knowledge / Connaissances aéronef",
    30: "Flight Performance / Performances vol",
    40: "Human Performance / Performances humaines",
    50: "Meteorology / Météorologie",
    60: "Navigation",
    70: "Operational Procedures / Procédures opérationnelles",
    80: "Principles of Flight / Principes du vol",
    90: "Communications / Radiotéléphonie",
}


# ─────────────────────────────────────────────────────────────────────────────
# Utility: accent folding + normalisation
# ─────────────────────────────────────────────────────────────────────────────

def normalize(text: str) -> str:
    """Fold *text* to lowercase ASCII letters, digits and spaces.

    Accents are removed by NFKD-decomposing the text and dropping the
    combining marks. Every remaining character outside ``[a-z0-9 ]`` is
    then replaced, one for one, by a space, so the result has the same
    length as the accent-stripped text and may contain runs of spaces.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # combining() returns 0 for base characters and non-zero for accents.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    lowered = "".join(base_chars).lower()
    return re.sub(r"[^a-z0-9 ]", " ", lowered)


# Stop words (French and English, already accent-folded) ignored when
# comparing questions. Built once at import time instead of on every call;
# the duplicate entries of the original literal have been removed.
# One- and two-letter entries are kept for documentation value only: the
# length filter in word_set() already discards such tokens.
_STOP_WORDS = frozenset({
    "a", "b", "c", "d", "la", "le", "les", "de", "du", "des", "un", "une",
    "et", "ou", "en", "au", "aux", "est", "il", "elle", "on", "que", "qui",
    "se", "sa", "son", "ce", "par", "sur", "pour", "avec", "dans", "si",
    "ne", "pas", "plus", "the", "of", "to", "is", "in", "an", "are", "at",
    "be", "by", "do", "for", "has", "have", "he", "it", "its", "no", "not",
    "or", "that", "this", "was", "we", "which", "you", "your", "l",
    "j", "s", "n", "m", "y", "qu", "lorsque", "comme", "car", "mais",
    "donc", "lors", "quel", "quelle", "quels", "quelles", "comment",
    "peut", "doit", "sont", "ont", "ces", "aussi", "entre",
    "selon", "apres", "avant", "vers", "sous", "jusqu",
})


def word_set(text: str) -> set:
    """Return the significant words of *text* as a set.

    The text is first passed through ``normalize`` (lowercased,
    accent-folded, punctuation turned into spaces) and split on whitespace.
    A token is kept when it is longer than two characters and is not a
    stop word.
    """
    return {
        token
        for token in normalize(text).split()
        if len(token) > 2 and token not in _STOP_WORDS
    }


def jaccard(set_a: set, set_b: set) -> float:
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two sets.

    The result is ``0.0`` when either set is empty.
    """
    if not (set_a and set_b):
        return 0.0
    shared = set_a & set_b
    combined = set_a | set_b
    if not combined:
        return 0.0
    return len(shared) / len(combined)


# ─────────────────────────────────────────────────────────────────────────────
# Step 1: Parse QuizVDS files → {tag: {question_en, options_en, quiz_correct}}
# ─────────────────────────────────────────────────────────────────────────────

def parse_quiz_vds() -> dict:
    """Parse every QuizVDS Markdown file listed in ``QUIZ_VDS_FILES``.

    Each file is split on ``### Q{N}:`` headers. For every question the
    header text, the ``- A) ...`` style options and the
    ``**Correct: X)**`` marker are extracted.

    Returns:
        A dict keyed by tag (``t{subject}q{N}``, e.g. ``t10q1``) whose
        values hold ``question_en``, ``options_en`` (letter → text) and
        ``quiz_correct`` (letter, or ``None`` when no marker was found).
        Files that do not exist are reported and skipped. A tag that
        occurs twice keeps the data of its last occurrence.
    """
    print("\n[1/5] Parsing QuizVDS files...")
    questions = {}

    for subject_num, filename in QUIZ_VDS_FILES.items():
        md_path = QUIZ_VDS_DIR / filename
        if not md_path.exists():
            print(f"  WARNING: {md_path} not found")
            continue

        content = md_path.read_text(encoding="utf-8")

        parsed = 0
        # The lookahead keeps each "### Q{N}:" header at the start of its block.
        for block in re.split(r"\n(?=### Q\d+:)", content):
            head = re.match(r"### Q(\d+):\s*(.+?)(?:\n|$)(.*?)(?=\n---|\Z)", block, re.DOTALL)
            if head is None:
                # Preamble or anything else that is not a question block.
                continue
            number, title, body = head.groups()

            answer = re.search(r"\*\*Correct:\s*([A-D])\)\*\*", body)
            choices = {
                hit.group(1): hit.group(2).strip()
                for hit in re.finditer(r"^- ([A-D])\)\s*(.+)$", body, re.MULTILINE)
            }

            questions[f"t{subject_num}q{int(number)}"] = {
                "question_en": title.strip(),
                "options_en": choices,
                "quiz_correct": answer.group(1) if answer else None,
            }
            parsed += 1

        print(f"  {filename}: {parsed} questions parsed")

    print(f"  Total QuizVDS questions: {len(questions)}")
    return questions


# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Parse FR app MD files → {tag: {question_fr, options_fr, app_correct}}
# ─────────────────────────────────────────────────────────────────────────────

def parse_fr_questions() -> dict:
    """Parse the French app Markdown files listed in ``FR_FILES``.

    A question block starts with a header of the form
    ``### Q1: text ^t10q1`` (a space before the colon is tolerated); the
    trailing ``^t{NN}q{N}`` is the tag used as the dictionary key.

    Options are read in one of two layouts:
      * checkbox form ``- [x] A) text`` / ``- [ ] A) text`` — a checked box
        marks the correct answer;
      * plain or bold form ``- A) text`` / ``- **A)** text`` — only tried
        when no checkbox option was found.
    A ``#### Réponse`` section followed by ``X)`` overrides any answer
    taken from a checkbox.

    Returns:
        A dict keyed by tag whose values hold ``question_fr``,
        ``options_fr`` (letter → text), ``app_correct`` (letter or
        ``None``) and ``subject_num``. Missing files are reported and
        skipped.
    """
    print("\n[2/5] Parsing FR app question files...")
    questions = {}

    for subject_num, filename in FR_FILES.items():
        md_path = FR_DIR / filename
        if not md_path.exists():
            print(f"  WARNING: {md_path} not found")
            continue

        content = md_path.read_text(encoding="utf-8")

        parsed = 0
        # Split in front of every tagged question header, keeping the header.
        for block in re.split(r"\n(?=### Q\d+\s*:.*\^t\d+q\d+)", content):
            header = re.match(r"### Q\d+\s*:\s*(.+?)\s*\^(t\d+q\d+)", block)
            if header is None:
                continue

            options = {}
            app_correct = None

            # Layout 1: checkbox options; the last checked box wins.
            for box in re.finditer(r"^- \[( |x)\] ([A-D])\)\s*(.+)$", block, re.MULTILINE):
                mark, letter, body = box.groups()
                options[letter] = body.strip()
                if mark == "x":
                    app_correct = letter

            # Layout 2: plain / bold options, used only as a fallback.
            if not options:
                options = {
                    plain.group(1): plain.group(2).strip()
                    for plain in re.finditer(r"^- \**([A-D])\)\**\s*(.+)$", block, re.MULTILINE)
                }

            # An explicit answer section takes precedence over checkboxes.
            explicit = re.search(r"#### Réponse\s*\n+([A-D])\)", block)
            if explicit:
                app_correct = explicit.group(1)

            questions[header.group(2)] = {
                "question_fr": header.group(1).strip(),
                "options_fr": options,
                "app_correct": app_correct,
                "subject_num": subject_num,
            }
            parsed += 1

        print(f"  {filename}: {parsed} questions parsed")

    print(f"  Total FR app questions: {len(questions)}")
    return questions


# ─────────────────────────────────────────────────────────────────────────────
# Step 3: Extract text from all PDFs, page by page
# ─────────────────────────────────────────────────────────────────────────────

def extract_pdf_pages() -> dict:
    """Extract the text of every PDF in ``PDF_FILES``, page by page.

    Returns:
        ``{pdf_code: [(page_num, text), ...]}`` with 1-based page numbers.
        A page for which no text is extracted contributes an empty
        string. A PDF that is missing maps to an empty list. If reading
        a PDF raises, the error is printed and whatever pages were
        extracted before the failure are kept.
    """
    print("\n[3/5] Extracting PDF text...")
    extracted = {}

    for code, filename in PDF_FILES.items():
        pdf_path = PDF_DIR / filename
        if not pdf_path.exists():
            print(f"  WARNING: {pdf_path} not found")
            extracted[code] = []
            continue

        page_texts = []
        try:
            with open(pdf_path, "rb") as handle:
                reader = PyPDF2.PdfReader(handle)
                n = len(reader.pages)
                for page_no in range(1, n + 1):
                    content = reader.pages[page_no - 1].extract_text() or ""
                    page_texts.append((page_no, content))
            print(f"  {code} ({filename}): {n} pages")
        except Exception as e:
            # Best effort: report the problem and keep the pages read so far.
            print(f"  ERROR reading {filename}: {e}")

        extracted[code] = page_texts

    return extracted


# ─────────────────────────────────────────────────────────────────────────────
# Step 3b: Extract PDF answer keys
# ─────────────────────────────────────────────────────────────────────────────

def parse_s1_solution_table(text: str, branch_map: dict) -> dict:
    """Parse the columnar solution table of an S1C / S1S PDF.

    The table is expected to look like::

        BRANCHE  10  BRANCHE 40  BRANCHE 50  BRANCHE 90
        1. A  1. C  1. A  1. A
        2. C  2. A  2. C  2. B
        ...

    Args:
        text: Full text of the PDF.
        branch_map: ``{subject_num: header_label}``; each label is located
            with an exact substring search (first occurrence).

    Returns:
        ``{subject_num: {q_num: answer_letter}}`` for the subjects whose
        label was found, or ``{}`` when no label is present or no newline
        follows the last header.

    NOTE(review): answers are dealt out round-robin over
    ``len(branch_map)`` columns, which assumes every label was found and
    every column has the same number of rows. Answers that fall on a
    column without a located header are dropped.
    """
    # (offset, subject, label) for each header present, in branch_map order.
    located = []
    for subj, label in branch_map.items():
        offset = text.find(label)
        if offset >= 0:
            located.append((offset, subj, label))
    if not located:
        return {}

    # Left-to-right column order as the headers appear in the text.
    column_order = [subj for _, subj in sorted((off, subj) for off, subj, _ in located)]

    # Right-most header: the answers start on the line after it.
    last_offset, _, last_label = max(located, key=lambda item: item[0])
    header_end = text.find("\n", last_offset + len(last_label))
    if header_end < 0:
        return {}

    pairs = re.findall(r"\b(\d+)\.\s+([A-D])\b", text[header_end + 1:])

    stride = len(branch_map)
    table = {subj: {} for subj in column_order}
    for position, (number, letter) in enumerate(pairs):
        column = position % stride
        if column >= len(column_order):
            continue
        table[column_order[column]][int(number)] = letter

    return table


def parse_vv_solution_answers(pages: list) -> dict:
    """Collect the ``Solution question N : X`` lines of the VV PDF.

    Args:
        pages: ``[(page_num, text), ...]`` as produced per PDF by
            ``extract_pdf_pages``.

    Returns:
        ``{q_num: answer_letter}`` with upper-case letters. Matching is
        case-insensitive; when a question number occurs more than once
        the last occurrence wins.
    """
    marker = re.compile(r"Solution question\s+(\d+)\s*:\s*([A-D])", re.IGNORECASE)
    return {
        int(hit.group(1)): hit.group(2).upper()
        for _, page_text in pages
        for hit in marker.finditer(page_text)
    }


def extract_pdf_answer_keys(pdf_pages: dict) -> dict:
    """Build the answer-key lookup for every PDF.

    Args:
        pdf_pages: ``{pdf_code: [(page_num, text), ...]}``.

    Returns:
        A dict keyed by PDF code:
          * ``S1C`` / ``S1S`` → ``{subject_num: {q_num: letter}}`` parsed
            from the solution table (only when the code is in *pdf_pages*);
          * ``S2`` / ``S3`` → always ``{}`` (no key is extracted for them);
          * ``VV`` → ``{None: {q_num: letter}}`` (only when the code is in
            *pdf_pages*).
    """
    print("\n[3b] Extracting PDF answer keys...")
    keys = {}

    # The two series-1 papers share the same tabular solution layout.
    for code, branch_map in (("S1C", S1C_BRANCH_MAP), ("S1S", S1S_BRANCH_MAP)):
        if code not in pdf_pages:
            continue
        full_text = "\n".join(page_text for _, page_text in pdf_pages[code])
        table = parse_s1_solution_table(full_text, branch_map)
        keys[code] = table
        for subj, ans in table.items():
            print(f"  {code} branch {subj}: {len(ans)} answers parsed")

    keys["S2"] = {}
    keys["S3"] = {}

    if "VV" in pdf_pages:
        vv_answers = parse_vv_solution_answers(pdf_pages["VV"])
        # VV numbering is not split by subject, hence the single None bucket.
        keys["VV"] = {None: vv_answers}
        print(f"  VV: {len(vv_answers)} answers parsed")

    return keys


# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Build question-level chunks from PDFs
# ─────────────────────────────────────────────────────────────────────────────

def _page_for_offset(page_starts: list, page_nums: list, offset: int) -> int:
    """Return the page number whose text contains character *offset*.

    *page_starts* holds the (strictly increasing) start offset of each page
    in the concatenated text and *page_nums* the matching page numbers.
    Falls back to page 1 when there are no pages or the offset precedes
    the first page.
    """
    from bisect import bisect_right

    idx = bisect_right(page_starts, offset) - 1
    return page_nums[idx] if idx >= 0 else 1


def build_pdf_question_chunks(pdf_pages: dict) -> list:
    """Split the PDF text into one chunk per question for matching.

    For each PDF the page texts are concatenated (one ``"\\n"`` after each
    page) and cut into question-sized chunks:

      * ``S1C``, ``S1S``, ``S2``, ``S3``: a chunk starts at a line that
        begins with ``N.`` (one or two digits) followed by whitespace and
        an upper-case letter, and runs to the next such line. Chunks of
        30 characters or fewer are dropped.
      * ``VV``: a chunk is the text between two consecutive
        ``Solution question N : X`` markers and is numbered after the
        marker that follows it. Header-like lines (two digits plus a
        capitalised title, ``Page N``, ``Edition N``) are removed.
        Chunks of 20 characters or fewer are dropped.

    Chunks with fewer than four significant words are discarded.

    Args:
        pdf_pages: ``{pdf_code: [(page_num, text), ...]}``.

    Returns:
        A list of ``(pdf_code, page_num, chunk_q_num, chunk_text,
        chunk_word_set)`` tuples. For the S-series the page is the one
        holding the question number; for VV it is the page holding the
        solution marker that closes the chunk.
    """
    chunks = []

    for code, pages in pdf_pages.items():
        # Concatenate the pages while remembering where each one starts.
        page_starts = []
        page_nums = []
        parts = []
        cursor = 0
        for page_num, text in pages:
            page_starts.append(cursor)
            page_nums.append(page_num)
            parts.append(text + "\n")
            cursor += len(text) + 1
        all_text = "".join(parts)

        if code in ("S1C", "S1S", "S2", "S3"):
            # Numbered questions: "N." at the start of a line, then a
            # capital letter (start of the French question text).
            splits = list(re.finditer(
                r"(?:^|\n)\s*(\d{1,2})\.\s+([A-ZÀÂÄÉÈÊËÎÏÔÙÛÜÇ])",
                all_text
            ))
            for i, m in enumerate(splits):
                q_num = int(m.group(1))
                start = m.start()
                end = splits[i + 1].start() if i + 1 < len(splits) else len(all_text)
                chunk = all_text[start:end].strip()
                if len(chunk) > 30:
                    # The match begins at the newline / whitespace *before*
                    # the number, which for the first question of a page
                    # still belongs to the previous page. Look the page up
                    # at the question number itself.
                    pg = _page_for_offset(page_starts, page_nums, m.start(1))
                    ws = word_set(chunk)
                    if len(ws) >= 4:
                        chunks.append((code, pg, q_num, chunk, ws))

        elif code == "VV":
            # Each question's text sits between the previous solution
            # marker and its own "Solution question N : X" marker.
            sol_markers = list(re.finditer(
                r"Solution question\s+(\d+)\s*:\s*[A-D]",
                all_text,
                re.IGNORECASE
            ))

            for i, m in enumerate(sol_markers):
                q_num = int(m.group(1))
                prev_end = sol_markers[i - 1].end() if i > 0 else 0
                chunk = all_text[prev_end:m.start()].strip()
                # Remove page headers ("10 Droit aérien", "Page N", etc.)
                chunk = re.sub(r"^\s*\d{2}\s+[A-ZÀ-Ü].{0,40}\n", "", chunk, flags=re.MULTILINE)
                chunk = re.sub(r"^\s*(?:Page|Edition)\s+\d+.*\n", "", chunk, flags=re.MULTILINE)
                if len(chunk) > 20:
                    pg = _page_for_offset(page_starts, page_nums, m.start())
                    ws = word_set(chunk)
                    if len(ws) >= 4:
                        chunks.append((code, pg, q_num, chunk, ws))

    print(f"  PDF question chunks built: {len(chunks)} total")
    return chunks


# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Match FR questions against PDF chunks
# ─────────────────────────────────────────────────────────────────────────────

def find_best_pdf_match(question_text: str, options: dict, page_index: list,
                        threshold: float = 0.15):
    """Find the PDF chunk whose words overlap most with a question.

    The question text and all option texts are joined and reduced to a
    word set, which is compared by Jaccard similarity against the word
    set of every chunk. The first chunk reaching the highest non-zero
    score wins.

    Args:
        question_text: The question wording.
        options: ``{letter: option_text}``.
        page_index: Chunk tuples ``(pdf_code, page_num, chunk_q_num,
            chunk_text, chunk_word_set)``.
        threshold: Minimum score for a chunk to count as a match.

    Returns:
        ``(pdf_code, page_num, pdf_q_num, score)`` for a match, otherwise
        ``(None, None, None, best_score)`` where ``best_score`` is the
        highest score seen (``0.0`` when the question has no significant
        words).
    """
    q_words = word_set(" ".join([question_text, *options.values()]))
    if not q_words:
        return None, None, None, 0.0

    top_entry = None
    top_score = 0.0
    for entry in page_index:
        candidate = jaccard(q_words, entry[4])
        # Strictly greater: ties keep the earlier chunk, zero never wins.
        if candidate > top_score:
            top_entry, top_score = entry, candidate

    if top_entry is None or top_score < threshold:
        return None, None, None, top_score

    code, page_num, chunk_q_num = top_entry[0], top_entry[1], top_entry[2]
    return code, page_num, chunk_q_num, top_score


# ─────────────────────────────────────────────────────────────────────────────
# Step 5b: Get PDF answer key answer for matched question
# ─────────────────────────────────────────────────────────────────────────────

def get_pdf_answer(subject_num: int, pdf_code: str, pdf_q_num: int,
                   pdf_keys: dict) -> str | None:
    """Look up the answer key answer for a matched PDF question."""
    if pdf_code == "VV":
        return pdf_keys.get("VV", {}).get(None, {}).get(pdf_q_num)
    elif pdf_code in ("S1C", "S1S"):
        subj_answers = pdf_keys.get(pdf_code, {}).get(subject_num, {})
        return subj_answers.get(pdf_q_num)
    # S2, S3: no keys
    return None


# ─────────────────────────────────────────────────────────────────────────────
# Step 6: Build provenance database
# ─────────────────────────────────────────────────────────────────────────────

def build_provenance(quiz_db: dict, fr_db: dict, pdf_pages: dict, pdf_keys: dict) -> tuple:
    """Match every French app question to a PDF chunk and compare answers.

    Args:
        quiz_db: QuizVDS data keyed by tag (source of ``quiz_correct``).
        fr_db: French app data keyed by tag; drives the iteration.
        pdf_pages: ``{pdf_code: [(page_num, text), ...]}``.
        pdf_keys: Answer keys as returned by ``extract_pdf_answer_keys``.

    Returns:
        ``(records, stats)``. ``records`` holds one dict per tag, ordered
        by subject number then question number. ``stats`` holds the
        ``total``, ``matched``, ``unmatched`` and ``answer_mismatches``
        counters.

    A letter mismatch is informational only: option order (A/B/C/D) is
    frequently shuffled between the exam papers and the app's FR version,
    so the same answer content may sit at a different letter. Only a
    PDF-key-vs-app difference is counted in ``answer_mismatches``, and
    those still need manual review.
    """
    print("\n[4/5] Building PDF question chunk index and matching...")
    chunk_index = build_pdf_question_chunks(pdf_pages)

    def tag_order(tag):
        # Numeric (subject, question) order, e.g. t10q2 before t10q10.
        return (
            int(re.search(r't(\d+)', tag).group(1)),
            int(re.search(r'q(\d+)', tag).group(1)),
        )

    ordered_tags = sorted(fr_db, key=tag_order)
    total = len(ordered_tags)
    records = []
    matched = 0
    answer_mismatches = 0

    for position, tag in enumerate(ordered_tags):
        if position % 200 == 0:
            print(f"  Progress: {position}/{total} tags processed...")

        fr_entry = fr_db[tag]
        subject_num = fr_entry["subject_num"]
        question_fr = fr_entry.get("question_fr", "")
        options_fr = fr_entry.get("options_fr", {})
        app_correct = fr_entry.get("app_correct")
        quiz_correct = quiz_db.get(tag, {}).get("quiz_correct")

        pdf_code, page_num, pdf_q_num, score = find_best_pdf_match(
            question_fr, options_fr, chunk_index
        )

        pdf_answer = None
        if pdf_code is not None:
            matched += 1
            if pdf_code:
                pdf_answer = get_pdf_answer(subject_num, pdf_code, pdf_q_num, pdf_keys)

        # One flag per pair of sources that both have an answer and disagree.
        comparisons = (
            ("QUIZ_VS_APP", quiz_correct, app_correct),
            ("PDF_VS_APP", pdf_answer, app_correct),
            ("PDF_VS_QUIZ", pdf_answer, quiz_correct),
        )
        mismatch_flags = [
            f"{label}:{left}!={right}"
            for label, left, right in comparisons
            if left and right and left != right
        ]

        # Only a PDF key that disagrees with the app answer is counted.
        has_real_mismatch = bool(pdf_answer and app_correct and pdf_answer != app_correct)
        if has_real_mismatch:
            answer_mismatches += 1

        records.append({
            "tag": tag,
            "subject_num": subject_num,
            "question_fr": question_fr,
            "options_fr": options_fr,
            "app_correct": app_correct,
            "quiz_correct": quiz_correct,
            "pdf_source": pdf_code,
            "pdf_page": page_num,
            "pdf_q_num": pdf_q_num,
            "match_score": round(score, 4),
            "pdf_answer": pdf_answer,
            "mismatch_flags": mismatch_flags,
            "has_mismatch": has_real_mismatch,
        })

    unmatched = total - matched

    print(f"\n  Matched: {matched}/{total} ({100*matched//total if total else 0}%)")
    print(f"  Unmatched: {unmatched}/{total}")
    print(f"  Answer mismatches found: {answer_mismatches}")

    stats = {
        "total": total,
        "matched": matched,
        "unmatched": unmatched,
        "answer_mismatches": answer_mismatches
    }
    return records, stats


# ─────────────────────────────────────────────────────────────────────────────
# Step 7: Write outputs
# ─────────────────────────────────────────────────────────────────────────────

def write_outputs(records: list, stats: dict):
    """Write the provenance database as JSON and as a Markdown report.

    Args:
        records: Per-question record dicts as built by ``build_provenance``.
        stats: Counter dict with ``total``, ``matched``, ``unmatched`` and
            ``answer_mismatches``.

    Side effects:
        Overwrites ``OUTPUT_JSON`` (``{"stats": ..., "records": ...}``) and
        ``OUTPUT_MD``. The Markdown report contains a legend, the list of
        PDF sources, one table per subject, and — when non-empty — a table
        of PDF-key-vs-app letter differences (highest match score first)
        and a table of unmatched questions.
    """
    # Function-scope imports, following the style this function already used.
    from datetime import date
    from itertools import groupby

    def subject_then_number(record):
        # Numeric order so that q2 sorts before q10.
        return (
            record["subject_num"],
            int(re.search(r'q(\d+)', record["tag"]).group(1)),
        )

    print("\n[5/5] Writing output files...")

    # ── JSON ──────────────────────────────────────────────────────────────────
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"stats": stats, "records": records}, f, ensure_ascii=False, indent=2)
    print(f"  JSON written: {OUTPUT_JSON}")

    # ── Markdown ──────────────────────────────────────────────────────────────
    lines = []
    lines.append("# Source Provenance Database — Glidr SPL Exam Questions")
    lines.append("")
    # Stamp the report with the actual run date rather than a fixed one.
    lines.append(f"Generated: {date.today().isoformat()}  |  Total questions: {stats['total']}  |  "
                 f"Matched: {stats['matched']}  |  Unmatched: {stats['unmatched']}  |  "
                 f"Answer mismatches: {stats['answer_mismatches']}")
    lines.append("")
    lines.append("## Legend")
    lines.append("")
    lines.append("| Column | Description |")
    lines.append("|--------|-------------|")
    lines.append("| Tag | Question tag (e.g. t10q1) |")
    lines.append("| PDF Source | Which exam paper (S1C/S1S/S2/S3/VV) |")
    lines.append("| Page | PDF page number |")
    lines.append("| PDF Q# | Question number within that PDF |")
    lines.append("| Score | Jaccard word-overlap similarity (≥0.15 = match) |")
    lines.append("| App | Current app correct answer |")
    lines.append("| Quiz | QuizVDS original answer (EN import) |")
    lines.append("| PDF Key | Answer from PDF solution page |")
    lines.append("| Flags | Mismatch warnings |")
    lines.append("")
    lines.append("## PDF Sources")
    lines.append("")
    lines.append("| Code | File |")
    lines.append("|------|------|")
    for code, fname in PDF_FILES.items():
        lines.append(f"| {code} | {fname} |")
    lines.append("")

    # One section per subject; groupby needs the records pre-sorted by subject.
    records_sorted = sorted(records, key=subject_then_number)

    for subject_num, group in groupby(records_sorted, key=lambda r: r["subject_num"]):
        group_list = list(group)
        subject_name = SUBJECT_NAMES.get(subject_num, f"Subject {subject_num}")
        matched_in_group = sum(1 for r in group_list if r["pdf_source"])
        lines.append(f"## Subject {subject_num}: {subject_name}")
        lines.append("")
        lines.append(f"Total: {len(group_list)} questions | Matched: {matched_in_group}")
        lines.append("")
        lines.append("| Tag | PDF | Page | PDF Q# | Score | App | Quiz | PDF Key | Flags |")
        lines.append("|-----|-----|------|--------|-------|-----|------|---------|-------|")

        for r in group_list:
            tag = r["tag"]
            pdf_src = r["pdf_source"] or "—"
            page = str(r["pdf_page"]) if r["pdf_page"] else "—"
            pdf_qn = str(r["pdf_q_num"]) if r["pdf_q_num"] else "—"
            score = f"{r['match_score']:.3f}"
            app = r["app_correct"] or "?"
            quiz = r["quiz_correct"] or "—"
            pdf_key = r["pdf_answer"] or "—"
            flags = " ".join(r["mismatch_flags"]) if r["mismatch_flags"] else ""
            row = f"| {tag} | {pdf_src} | {page} | {pdf_qn} | {score} | {app} | {quiz} | {pdf_key} | {flags} |"
            lines.append(row)

        lines.append("")

    # Flagged mismatches: PDF key letter differs from app answer letter
    mismatches = [r for r in records if r["has_mismatch"]]
    if mismatches:
        lines.append("## Flagged Answer Letter Differences (PDF Key vs App Answer)")
        lines.append("")
        lines.append("> **IMPORTANT**: Answer option order (A/B/C/D) is frequently shuffled between")
        lines.append("> exam papers and the app's FR version. A letter difference does NOT necessarily")
        lines.append("> indicate a wrong answer — the same correct content may appear at a different")
        lines.append("> letter. Manual review is required to confirm genuine errors.")
        lines.append("")
        lines.append(f"Found {len(mismatches)} questions with letter disagreements:")
        lines.append("")
        lines.append("| Tag | Score | Question (FR, truncated) | App | PDF Key | PDF Source |")
        lines.append("|-----|-------|--------------------------|-----|---------|------------|")
        for r in sorted(mismatches, key=lambda r: -r["match_score"]):
            # "|" would break the table cell, so it is swapped for "/".
            q_short = r["question_fr"][:55].replace("|", "/")
            app = r["app_correct"] or "?"
            pdf_key = r["pdf_answer"] or "—"
            src = f"{r['pdf_source']} p{r['pdf_page']}" if r['pdf_source'] else "—"
            lines.append(f"| {r['tag']} | {r['match_score']:.3f} | {q_short}… | {app} | {pdf_key} | {src} |")
        lines.append("")

    # Unmatched summary, in the same numeric order as the subject tables
    # (a plain string sort would put t10q10 before t10q2).
    unmatched_list = [r for r in records if not r["pdf_source"]]
    if unmatched_list:
        lines.append("## Unmatched Questions (score < 0.15)")
        lines.append("")
        lines.append(f"Found {len(unmatched_list)} questions with no strong PDF match:")
        lines.append("")
        lines.append("| Tag | Best Score | Question (FR, truncated) |")
        lines.append("|-----|------------|--------------------------|")
        for r in sorted(unmatched_list, key=subject_then_number):
            q_short = r["question_fr"][:70].replace("|", "/")
            lines.append(f"| {r['tag']} | {r['match_score']:.3f} | {q_short} |")
        lines.append("")

    with open(OUTPUT_MD, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"  Markdown written: {OUTPUT_MD}")


# ─────────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """Run the full pipeline: parse inputs, match, write reports, summarise."""
    rule = "=" * 70

    print(rule)
    print("Glidr Source Provenance Builder")
    print(rule)

    # Each stage prints its own progress, so the call order is also the
    # order of the console output.
    quiz_db = parse_quiz_vds()
    fr_db = parse_fr_questions()
    pdf_pages = extract_pdf_pages()
    pdf_keys = extract_pdf_answer_keys(pdf_pages)
    records, stats = build_provenance(quiz_db, fr_db, pdf_pages, pdf_keys)
    write_outputs(records, stats)

    total = stats['total']
    matched = stats['matched']
    # Integer percentage; guarded against an empty question set.
    matched_pct = 100 * matched // total if total else 0

    print("\n" + rule)
    print("PUBLISH COMPLETE")
    print(f"  Total questions: {total}")
    print(f"  Matched to PDF:  {matched} ({matched_pct}%)")
    print(f"  Unmatched:       {stats['unmatched']}")
    print(f"  Answer mismatches: {stats['answer_mismatches']}")
    print(f"\n  Output MD:   {OUTPUT_MD}")
    print(f"  Output JSON: {OUTPUT_JSON}")
    print(rule)


if __name__ == "__main__":
    main()