APPS/glidr-content.git - git.mnsoft.org

Add ICAO light signals reference doc and update figures
Matthias Nott
yesterday 21f29d4c1e6a6d6dcd01ba0f57f8dc4c2a16be9f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
#!/usr/bin/env python3
"""
build_provenance.py — Glidr SPL Exam Question Source Provenance Database

Traces each app question back to its source PDF exam paper.
Outputs:
  - Source_Provenance.md   (human-readable)
  - Source_Provenance.json (machine-readable)
"""

import re
import os
import sys
import json
import unicodedata
from pathlib import Path

try:
    import PyPDF2
except ImportError:
    print("ERROR: PyPDF2 not installed. Run: pip install PyPDF2")
    sys.exit(1)

# ─────────────────────────────────────────────────────────────────────────────
# Path constants
# ─────────────────────────────────────────────────────────────────────────────

# Root folder of the Glidr material.
# NOTE(review): this is an absolute path inside one user's home directory, so
# the script only finds its inputs on a machine where this folder exists.
ABLAGE_BASE = Path("/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr")
SOURCES_DIR = ABLAGE_BASE / "SOURCES"                # PDFs, QuizVDS folder and generated outputs
QUIZ_VDS_DIR = SOURCES_DIR / "QuizVDS"               # QuizVDS markdown files (see QUIZ_VDS_FILES)
FR_DIR = ABLAGE_BASE / "SPL Exam Questions FR"       # French app question files (see FR_FILES)
PDF_DIR = SOURCES_DIR                                # PDF_FILES entries are resolved against this

# Both generated reports are written next to the sources.
OUTPUT_MD   = SOURCES_DIR / "Source_Provenance.md"
OUTPUT_JSON = SOURCES_DIR / "Source_Provenance.json"

# Subject number → QuizVDS filename (relative to QUIZ_VDS_DIR)
QUIZ_VDS_FILES = {
    10: "10 - Air Law.md",
    20: "20 - Aircraft General Knowledge.md",
    30: "30 - Flight Performance and Planning.md",
    40: "40 - Human Performance and Limitations.md",
    50: "50 - Meteorology.md",
    60: "60 - Navigation.md",
    70: "70 - Operational Procedures.md",
    80: "80 - Principles of Flight.md",
    90: "90 - Communication.md",
}

# Subject number → FR app filename (relative to FR_DIR)
FR_FILES = {
    10: "10 - Droit aérien.md",
    20: "20 - Connaissances générales de l'aéronef.md",
    30: "30 - Performances et planification du vol.md",
    40: "40 - Performances humaines.md",
    50: "50 - Météorologie.md",
    60: "60 - Navigation.md",
    70: "70 - Procédures opérationnelles.md",
    80: "80 - Principes du vol.md",
    90: "90 - Radiotéléphonie.md",
}

# Short source code → PDF file (relative to SOURCES_DIR / PDF_DIR).
# The codes are what ends up in the "pdf_source" field of every record.
PDF_FILES = {
    "S1C": "Examen Blanc/Exa Blanc Série_1_Communes.pdf",
    "S1S": "Examen Blanc/Exa Blanc Série_1_Specifiques.pdf",
    "S2":  "Examen Blanc/Exa Blanc Série_2.pdf",
    "S3":  "Examen Blanc/Exa Blanc Série_3.pdf",
    "VV":  "VV/Questionnaire toutes branches VV.pdf",
}

# Branch labels as they appear in the S1C/S1S solution tables.
# The spacing inside each label is significant: parse_s1_solution_table locates
# the headers with an exact str.find(), and the labels deliberately mix one and
# two spaces after "BRANCHE".
S1C_BRANCH_MAP = {10: "BRANCHE  10", 40: "BRANCHE 40", 50: "BRANCHE 50", 90: "BRANCHE 90"}
S1S_BRANCH_MAP = {20: "BRANCHE  20", 30: "BRANCHE  30", 60: "BRANCHE  60", 70: "BRANCHE  70", 80: "BRANCHE  80"}

# Subject number → display name used for the per-subject headings of the
# Markdown report.
SUBJECT_NAMES = {
    10: "Air Law / Droit aérien",
    20: "Aircraft Knowledge / Connaissances aéronef",
    30: "Flight Performance / Performances vol",
    40: "Human Performance / Performances humaines",
    50: "Meteorology / Météorologie",
    60: "Navigation",
    70: "Operational Procedures / Procédures opérationnelles",
    80: "Principles of Flight / Principes du vol",
    90: "Communications / Radiotéléphonie",
}


# ─────────────────────────────────────────────────────────────────────────────
# Utility: accent folding + normalisation
# ─────────────────────────────────────────────────────────────────────────────

def normalize(text: str) -> str:
    """Fold *text* to lowercase ASCII letters, digits and spaces.

    The text is NFKD-decomposed and every combining mark is dropped (which
    removes accents), the remainder is lowercased, and each character that is
    not ``a-z``, ``0-9`` or a space is replaced by a single space.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    lowered = "".join(base_chars).lower()
    return re.sub(r"[^a-z0-9 ]", " ", lowered)


def word_set(text: str) -> set:
    """Return the significant words of *text* as a set.

    The text is first passed through ``normalize`` (accent-folded, lowercased,
    punctuation turned into spaces) and split on whitespace.  Words of one or
    two characters and French/English stop-words are then discarded.
    """
    stop_words = frozenset(
        "a b c d la le les de du des un une "
        "et ou en au aux est il elle on que qui "
        "se sa son ce par sur pour avec dans si "
        "ne pas plus the of to is in an are at "
        "be by do for has have he it its no not "
        "or that this was we which you your l "
        "j s n m y qu lorsque comme car mais "
        "donc lors quel quelle quels quelles comment "
        "peut doit sont ont ces aussi entre "
        "selon apres avant vers sous jusqu".split()
    )
    long_enough = {token for token in normalize(text).split() if len(token) > 2}
    return long_enough - stop_words


def jaccard(set_a: set, set_b: set) -> float:
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two sets.

    The result is ``0.0`` whenever either set is empty.
    """
    if not (set_a and set_b):
        return 0.0
    shared = set_a & set_b
    combined = set_a | set_b
    if not combined:
        return 0.0
    return len(shared) / len(combined)


# ─────────────────────────────────────────────────────────────────────────────
# Step 1: Parse QuizVDS files → {tag: {question, options, correct}}
# ─────────────────────────────────────────────────────────────────────────────

def parse_quiz_vds() -> dict:
    """Parse every QuizVDS markdown file listed in ``QUIZ_VDS_FILES``.

    Returns a dict keyed by tag (``t{subject}q{number}``, e.g. ``'t10q1'``).
    Each value holds the English question text (``question_en``), the options
    found as ``- A) ...`` lines (``options_en``) and the letter taken from the
    ``**Correct: X)**`` marker (``quiz_correct``, ``None`` when absent).
    Missing files are reported with a warning and skipped.
    """
    print("\n[1/5] Parsing QuizVDS files...")
    parsed = {}

    for subject_num, filename in QUIZ_VDS_FILES.items():
        source = QUIZ_VDS_DIR / filename
        if not source.exists():
            print(f"  WARNING: {source} not found")
            continue

        markdown = source.read_text(encoding="utf-8")

        parsed_here = 0
        # Each block starts at a "### Q{N}:" header; the lookahead keeps the
        # header inside the block it introduces.
        for block in re.split(r"\n(?=### Q\d+:)", markdown):
            found = re.match(
                r"### Q(\d+):\s*(.+?)(?:\n|$)(.*?)(?=\n---|\Z)", block, re.DOTALL
            )
            if found is None:
                continue
            number, title, body = found.groups()

            # Options A-D, one per "- X) text" line.
            choices = {
                letter: label.strip()
                for letter, label in re.findall(
                    r"^- ([A-D])\)\s*(.+)$", body, re.MULTILINE
                )
            }

            answer = re.search(r"\*\*Correct:\s*([A-D])\)\*\*", body)

            parsed[f"t{subject_num}q{int(number)}"] = {
                "question_en": title.strip(),
                "options_en": choices,
                "quiz_correct": answer.group(1) if answer else None,
            }
            parsed_here += 1

        print(f"  {filename}: {parsed_here} questions parsed")

    print(f"  Total QuizVDS questions: {len(parsed)}")
    return parsed


# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Parse FR app MD files → {tag: {question_fr, options_fr, app_correct}}
# ─────────────────────────────────────────────────────────────────────────────

def parse_fr_questions() -> dict:
    """Parse every French app question file listed in ``FR_FILES``.

    Returns a dict keyed by the tag found at the end of each question header
    (``^t{NN}q{N}``).  Each value holds ``question_fr``, ``options_fr``,
    ``app_correct`` (``None`` when no answer could be determined) and the
    ``subject_num`` of the file the question came from.  Missing files are
    reported with a warning and skipped.
    """
    print("\n[2/5] Parsing FR app question files...")
    fr_db = {}

    for subject_num, filename in FR_FILES.items():
        md_path = FR_DIR / filename
        if not md_path.exists():
            print(f"  WARNING: {md_path} not found")
            continue

        markdown = md_path.read_text(encoding="utf-8")

        parsed_here = 0
        # Headers look like "### Q1:" or "### Q1 :" and end with the tag.
        for block in re.split(r"\n(?=### Q\d+\s*:.*\^t\d+q\d+)", markdown):
            header = re.match(r"### Q\d+\s*:\s*(.+?)\s*\^(t\d+q\d+)", block)
            if header is None:
                continue
            question = header.group(1).strip()
            tag = header.group(2)

            choices = {}
            answer = None

            # Checkbox layout: "- [x] A) ..." marks the correct option,
            # "- [ ] A) ..." a wrong one.
            for mark, letter, label in re.findall(
                r"^- \[( |x)\] ([A-D])\)\s*(.+)$", block, re.MULTILINE
            ):
                choices[letter] = label.strip()
                if mark == "x":
                    answer = letter

            # Plain layout, only tried when no checkbox option was found:
            # "- A) ..." or the bold variant "- **A)** ...".
            if not choices:
                choices = {
                    letter: label.strip()
                    for letter, label in re.findall(
                        r"^- \**([A-D])\)\**\s*(.+)$", block, re.MULTILINE
                    )
                }

            # An explicit "#### Réponse" section overrides any checkbox mark.
            explicit = re.search(r"#### Réponse\s*\n+([A-D])\)", block)
            if explicit is not None:
                answer = explicit.group(1)

            fr_db[tag] = {
                "question_fr": question,
                "options_fr": choices,
                "app_correct": answer,
                "subject_num": subject_num,
            }
            parsed_here += 1

        print(f"  {filename}: {parsed_here} questions parsed")

    print(f"  Total FR app questions: {len(fr_db)}")
    return fr_db


# ─────────────────────────────────────────────────────────────────────────────
# Step 3: Extract text from all PDFs, page by page
# ─────────────────────────────────────────────────────────────────────────────

def extract_pdf_pages() -> dict:
    """Extract the text of every PDF in ``PDF_FILES``, page by page.

    Returns ``{pdf_code: [(page_num, text), ...]}`` with 1-based page numbers.
    A missing file yields an empty list.  If reading fails part-way, the error
    is printed and the pages extracted so far are kept.
    """
    print("\n[3/5] Extracting PDF text...")
    extracted = {}

    for code, filename in PDF_FILES.items():
        pdf_path = PDF_DIR / filename
        if not pdf_path.exists():
            print(f"  WARNING: {pdf_path} not found")
            extracted[code] = []
            continue

        page_texts = []
        try:
            with open(pdf_path, "rb") as stream:
                reader = PyPDF2.PdfReader(stream)
                n = len(reader.pages)
                for page_no in range(1, n + 1):
                    # extract_text() may return nothing; store "" in that case.
                    content = reader.pages[page_no - 1].extract_text() or ""
                    page_texts.append((page_no, content))
            print(f"  {code} ({filename}): {n} pages")
        except Exception as e:
            # Best effort: report the problem and continue with other PDFs.
            print(f"  ERROR reading {filename}: {e}")

        extracted[code] = page_texts

    return extracted


# ─────────────────────────────────────────────────────────────────────────────
# Step 3b: Extract PDF answer keys
# ─────────────────────────────────────────────────────────────────────────────

def parse_s1_solution_table(text: str, branch_map: dict) -> dict:
    """
    Parse the columnar solution table found in the S1C and S1S PDF text.

    ``branch_map`` maps subject numbers to the header label of their column.
    Returns {subject_num: {q_num: answer_letter}} for the labels that occur in
    ``text``, or an empty dict when no label occurs or no line follows the
    header row.

    Expected layout:
      BRANCHE  10  BRANCHE 40  BRANCHE 50  BRANCHE 90
      1. A  1. C  1. A  1. A
      2. C  2. A  2. C  2. B
      ...
    """
    # First occurrence of every branch label that is present in the text.
    found = {}
    for subj, label in branch_map.items():
        position = text.find(label)
        if position >= 0:
            found[subj] = (position, label)
    if not found:
        return {}

    # Columns are ordered by where their header appears in the text.
    column_order = sorted(found, key=lambda subj: (found[subj][0], subj))

    # The answers begin on the line after the right-most header label.
    last_pos = max(position for position, _ in found.values())
    last_label = next(label for position, label in found.values() if position == last_pos)
    header_end = text.find("\n", last_pos + len(last_label))
    if header_end < 0:
        return {}
    answer_section = text[header_end + 1:]

    # Every "N. X" pair in reading order.
    pairs = re.findall(r"\b(\d+)\.\s+([A-D])\b", answer_section)

    # Pairs interleave across columns (q1 of column 1, q1 of column 2, ...).
    # The stride is the number of configured branches, so pairs that land on
    # a column whose label was not found are dropped.
    stride = len(branch_map)
    per_branch = {subj: {} for subj in column_order}
    for index, (q_str, letter) in enumerate(pairs):
        column = index % stride
        if column >= len(column_order):
            continue
        per_branch[column_order[column]][int(q_str)] = letter

    return per_branch


def parse_vv_solution_answers(pages: list) -> dict:
    """
    Collect the 'Solution question N : X' lines of the VV PDF.

    ``pages`` is a list of (page_num, text) tuples.  Matching is
    case-insensitive and the letter is stored upper-cased.  When a question
    number occurs more than once, the last occurrence wins.
    Returns {q_num: answer_letter}.
    """
    marker = re.compile(r"Solution question\s+(\d+)\s*:\s*([A-D])", re.IGNORECASE)
    return {
        int(number): letter.upper()
        for _page_num, page_text in pages
        for number, letter in marker.findall(page_text)
    }


def extract_pdf_answer_keys(pdf_pages: dict) -> dict:
    """
    Build the answer keys for every PDF source.

    Returns a dict keyed by PDF code:
      S1C/S1S: {subject_num: {q_num: letter}} from the solution table
      S2/S3:   always an empty dict (no key is extracted for them)
      VV:      {None: {q_num: letter}} from the 'Solution question' lines
    S1C, S1S and VV are only present when the code is a key of ``pdf_pages``.
    """
    print("\n[3b] Extracting PDF answer keys...")
    keys = {}

    # S1C and S1S share the same table layout and differ only in their labels.
    for code, branch_map in (("S1C", S1C_BRANCH_MAP), ("S1S", S1S_BRANCH_MAP)):
        if code not in pdf_pages:
            continue
        joined = "\n".join(page_text for _, page_text in pdf_pages[code])
        table = parse_s1_solution_table(joined, branch_map)
        keys[code] = table
        for subj, ans in table.items():
            print(f"  {code} branch {subj}: {len(ans)} answers parsed")

    keys["S2"] = {}
    keys["S3"] = {}

    if "VV" in pdf_pages:
        vv_answers = parse_vv_solution_answers(pdf_pages["VV"])
        keys["VV"] = {None: vv_answers}
        print(f"  VV: {len(vv_answers)} answers parsed")

    return keys


# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Build question-level chunks from PDFs
# ─────────────────────────────────────────────────────────────────────────────

def build_pdf_question_chunks(pdf_pages: dict) -> list:
    """
    Split the extracted PDF text into one chunk per question for matching.

    ``pdf_pages`` maps a PDF code to its list of (page_num, text) tuples.
    The pages of each PDF are concatenated and then split on:
      - "N." at the start of a line followed by a capital letter
        (S1C, S1S, S2, S3: numbered questions)
      - "Solution question N : X" markers (VV)
    PDFs with any other code produce no chunks.

    Chunks that are too short or contain fewer than four significant words
    (see ``word_set``) are discarded.

    Returns list of:
      (pdf_code, page_num, chunk_q_num, chunk_text, chunk_word_set)
    """
    chunks = []

    for code, pages in pdf_pages.items():
        # Combine all text per PDF but remember at which character offset
        # every page starts, so a chunk can be mapped back to its page.
        all_text = ""
        page_breaks = []  # [(char_offset, page_num)]
        for page_num, text in pages:
            page_breaks.append((len(all_text), page_num))
            all_text += text + "\n"

        def char_to_page(offset):
            """Return the page_num of the last page starting at or before offset (1 if none)."""
            # Scan backwards so the first hit is the page containing the offset.
            for i in range(len(page_breaks) - 1, -1, -1):
                if offset >= page_breaks[i][0]:
                    return page_breaks[i][1]
            return 1

        if code in ("S1C", "S1S", "S2", "S3"):
            # Split on numbered questions: line starting with "N." where N is 1-99
            # followed by whitespace and a capital letter (French question text).
            # No MULTILINE flag is used: "^" only covers the very start of the
            # text, every later line start is matched through the "\n" branch.
            splits = list(re.finditer(
                r"(?:^|\n)\s*(\d{1,2})\.\s+([A-ZÀÂÄÉÈÊËÎÏÔÙÛÜÇ])",
                all_text
            ))
            for i, m in enumerate(splits):
                q_num = int(m.group(1))
                start = m.start()
                # A chunk runs up to the next question header (or the end of the text).
                end = splits[i + 1].start() if i + 1 < len(splits) else len(all_text)
                chunk = all_text[start:end].strip()
                if len(chunk) > 30:
                    # The page is the one on which the question header starts.
                    pg = char_to_page(start)
                    ws = word_set(chunk)
                    if len(ws) >= 4:
                        chunks.append((code, pg, q_num, chunk, ws))

        elif code == "VV":
            # VV: the question text sits between two consecutive
            # "Solution question N : X" markers, i.e. the text of question N
            # is everything after the previous marker and before marker N.

            # Find all solution markers
            sol_markers = list(re.finditer(
                r"Solution question\s+(\d+)\s*:\s*[A-D]",
                all_text,
                re.IGNORECASE
            ))

            for i, m in enumerate(sol_markers):
                q_num = int(m.group(1))
                # The question text appears BEFORE this solution marker
                # (between previous solution marker end and this one)
                prev_end = sol_markers[i - 1].end() if i > 0 else 0
                chunk = all_text[prev_end:m.start()].strip()
                # Remove page headers ("10 Droit aérien", "Page N", etc.)
                chunk = re.sub(r"^\s*\d{2}\s+[A-ZÀ-Ü].{0,40}\n", "", chunk, flags=re.MULTILINE)
                chunk = re.sub(r"^\s*(?:Page|Edition)\s+\d+.*\n", "", chunk, flags=re.MULTILINE)
                if len(chunk) > 20:
                    # NOTE: the page recorded is the one holding the solution
                    # marker, which is where the question text ends.
                    pg = char_to_page(m.start())
                    ws = word_set(chunk)
                    if len(ws) >= 4:
                        chunks.append((code, pg, q_num, chunk, ws))

    print(f"  PDF question chunks built: {len(chunks)} total")
    return chunks


# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Match FR questions against PDF chunks
# ─────────────────────────────────────────────────────────────────────────────

def find_best_pdf_match(question_text: str, options: dict, page_index: list,
                        threshold: float = 0.15):
    """
    Find the PDF chunk whose word set is most similar to a question.

    The question text and all option texts are joined and reduced to a word
    set, which is compared against every chunk of ``page_index`` with the
    Jaccard similarity.  On ties the earliest chunk wins.

    Returns (pdf_code, page_num, pdf_q_num, score).  The first three items are
    None when the question has no significant words (score 0.0) or when the
    best score is below ``threshold`` (the best score is still reported).
    """
    query_words = word_set(" ".join([question_text, *options.values()]))
    if not query_words:
        return None, None, None, 0.0

    top_location = (None, None, None)
    top_score = 0.0
    for code, page_num, chunk_q_num, _chunk_text, chunk_words in page_index:
        similarity = jaccard(query_words, chunk_words)
        # Strictly greater: a later chunk never replaces an equal earlier one.
        if similarity > top_score:
            top_score = similarity
            top_location = (code, page_num, chunk_q_num)

    if top_score < threshold:
        return None, None, None, top_score
    return (*top_location, top_score)


# ─────────────────────────────────────────────────────────────────────────────
# Step 5b: Get PDF answer key answer for matched question
# ─────────────────────────────────────────────────────────────────────────────

def get_pdf_answer(subject_num: int, pdf_code: str, pdf_q_num: int,
                   pdf_keys: dict) -> str | None:
    """Look up the answer key answer for a matched PDF question."""
    if pdf_code == "VV":
        return pdf_keys.get("VV", {}).get(None, {}).get(pdf_q_num)
    elif pdf_code in ("S1C", "S1S"):
        subj_answers = pdf_keys.get(pdf_code, {}).get(subject_num, {})
        return subj_answers.get(pdf_q_num)
    # S2, S3: no keys
    return None


# ─────────────────────────────────────────────────────────────────────────────
# Step 6: Build provenance database
# ─────────────────────────────────────────────────────────────────────────────

def build_provenance(quiz_db: dict, fr_db: dict, pdf_pages: dict, pdf_keys: dict) -> tuple:
    """Match every FR app question against the PDF chunks.

    Tags are processed in numeric (subject, question) order.  For each tag the
    best PDF chunk is looked up, the PDF answer key is consulted, and the
    app / QuizVDS / PDF answer letters are compared.

    Returns (records, stats): one record dict per tag, and a stats dict with
    the keys ``total``, ``matched``, ``unmatched`` and ``answer_mismatches``.
    """
    print("\n[4/5] Building PDF question chunk index and matching...")
    chunk_index = build_pdf_question_chunks(pdf_pages)

    def tag_order(tag):
        subject = re.search(r't(\d+)', tag).group(1)
        question = re.search(r'q(\d+)', tag).group(1)
        return int(subject), int(question)

    ordered_tags = sorted(fr_db, key=tag_order)
    total = len(ordered_tags)

    records = []
    matched = unmatched = answer_mismatches = 0

    for position, tag in enumerate(ordered_tags):
        if position % 200 == 0:
            print(f"  Progress: {position}/{total} tags processed...")

        fr_entry = fr_db[tag]
        subject_num = fr_entry["subject_num"]
        question_fr = fr_entry.get("question_fr", "")
        options_fr = fr_entry.get("options_fr", {})
        app_correct = fr_entry.get("app_correct")
        quiz_correct = quiz_db.get(tag, {}).get("quiz_correct")

        pdf_code, page_num, pdf_q_num, score = find_best_pdf_match(
            question_fr, options_fr, chunk_index
        )
        if pdf_code is None:
            unmatched += 1
        else:
            matched += 1

        # The answer key is only consulted for questions that found a PDF source.
        pdf_answer = (
            get_pdf_answer(subject_num, pdf_code, pdf_q_num, pdf_keys)
            if pdf_code else None
        )

        # IMPORTANT CAVEAT: the option order (A/B/C/D) is frequently shuffled
        # between the exam papers and the app's FR version, so differing
        # letters do NOT necessarily mean a wrong answer.  The flags are
        # informational; manual review is needed to confirm a real error.
        comparisons = (
            ("QUIZ_VS_APP", quiz_correct, app_correct),
            ("PDF_VS_APP", pdf_answer, app_correct),
            ("PDF_VS_QUIZ", pdf_answer, quiz_correct),
        )
        mismatch_flags = [
            f"{label}:{left}!={right}"
            for label, left, right in comparisons
            if left and right and left != right
        ]

        # Only a PDF-key vs app disagreement is counted as a flagged mismatch.
        has_real_mismatch = bool(pdf_answer and app_correct and pdf_answer != app_correct)
        if has_real_mismatch:
            answer_mismatches += 1

        records.append({
            "tag": tag,
            "subject_num": subject_num,
            "question_fr": question_fr,
            "options_fr": options_fr,
            "app_correct": app_correct,
            "quiz_correct": quiz_correct,
            "pdf_source": pdf_code,
            "pdf_page": page_num,
            "pdf_q_num": pdf_q_num,
            "match_score": round(score, 4),
            "pdf_answer": pdf_answer,
            "mismatch_flags": mismatch_flags,
            "has_mismatch": has_real_mismatch,
        })

    matched_pct = 100 * matched // total if total else 0
    print(f"\n  Matched: {matched}/{total} ({matched_pct}%)")
    print(f"  Unmatched: {unmatched}/{total}")
    print(f"  Answer mismatches found: {answer_mismatches}")

    stats = {
        "total": total,
        "matched": matched,
        "unmatched": unmatched,
        "answer_mismatches": answer_mismatches
    }
    return records, stats


# ─────────────────────────────────────────────────────────────────────────────
# Step 7: Write outputs
# ─────────────────────────────────────────────────────────────────────────────

def write_outputs(records: list, stats: dict):
    """Write the provenance database to ``OUTPUT_JSON`` and ``OUTPUT_MD``.

    ``records`` is the list of record dicts and ``stats`` the summary dict, as
    returned by ``build_provenance``.

    The JSON file contains ``{"stats": ..., "records": ...}``.  The Markdown
    report contains a legend, the list of PDF sources, one table per subject,
    a table of flagged answer-letter differences (PDF key vs app answer) and a
    table of questions without a PDF match.  The "Generated" date in the
    report header is the date of the run.
    """
    from datetime import date
    from itertools import groupby

    def question_number(tag):
        return int(re.search(r'q(\d+)', tag).group(1))

    def tag_order(tag):
        return int(re.search(r't(\d+)', tag).group(1)), question_number(tag)

    print("\n[5/5] Writing output files...")

    # ── JSON ──────────────────────────────────────────────────────────────────
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"stats": stats, "records": records}, f, ensure_ascii=False, indent=2)
    print(f"  JSON written: {OUTPUT_JSON}")

    # ── Markdown ──────────────────────────────────────────────────────────────
    lines = []
    lines.append("# Source Provenance Database — Glidr SPL Exam Questions")
    lines.append("")
    # Stamp the report with the actual run date instead of a fixed one.
    lines.append(f"Generated: {date.today().isoformat()}  |  Total questions: {stats['total']}  |  "
                 f"Matched: {stats['matched']}  |  Unmatched: {stats['unmatched']}  |  "
                 f"Answer mismatches: {stats['answer_mismatches']}")
    lines.append("")
    lines.append("## Legend")
    lines.append("")
    lines.append("| Column | Description |")
    lines.append("|--------|-------------|")
    lines.append("| Tag | Question tag (e.g. t10q1) |")
    lines.append("| PDF Source | Which exam paper (S1C/S1S/S2/S3/VV) |")
    lines.append("| Page | PDF page number |")
    lines.append("| PDF Q# | Question number within that PDF |")
    lines.append("| Score | Jaccard word-overlap similarity (≥0.15 = match) |")
    lines.append("| App | Current app correct answer |")
    lines.append("| Quiz | QuizVDS original answer (EN import) |")
    lines.append("| PDF Key | Answer from PDF solution page |")
    lines.append("| Flags | Mismatch warnings |")
    lines.append("")
    lines.append("## PDF Sources")
    lines.append("")
    lines.append("| Code | File |")
    lines.append("|------|------|")
    for code, fname in PDF_FILES.items():
        lines.append(f"| {code} | {fname} |")
    lines.append("")

    # Group by subject; groupby needs its input sorted by the grouping key.
    records_sorted = sorted(records, key=lambda r: (
        r["subject_num"],
        question_number(r["tag"])
    ))

    for subject_num, group in groupby(records_sorted, key=lambda r: r["subject_num"]):
        group_list = list(group)
        subject_name = SUBJECT_NAMES.get(subject_num, f"Subject {subject_num}")
        matched_in_group = sum(1 for r in group_list if r["pdf_source"])
        lines.append(f"## Subject {subject_num}: {subject_name}")
        lines.append("")
        lines.append(f"Total: {len(group_list)} questions | Matched: {matched_in_group}")
        lines.append("")
        lines.append("| Tag | PDF | Page | PDF Q# | Score | App | Quiz | PDF Key | Flags |")
        lines.append("|-----|-----|------|--------|-------|-----|------|---------|-------|")

        for r in group_list:
            tag = r["tag"]
            pdf_src = r["pdf_source"] or "—"
            page = str(r["pdf_page"]) if r["pdf_page"] else "—"
            pdf_qn = str(r["pdf_q_num"]) if r["pdf_q_num"] else "—"
            score = f"{r['match_score']:.3f}"
            app = r["app_correct"] or "?"
            quiz = r["quiz_correct"] or "—"
            pdf_key = r["pdf_answer"] or "—"
            flags = " ".join(r["mismatch_flags"]) if r["mismatch_flags"] else ""
            row = f"| {tag} | {pdf_src} | {page} | {pdf_qn} | {score} | {app} | {quiz} | {pdf_key} | {flags} |"
            lines.append(row)

        lines.append("")

    # Flagged mismatches: PDF key letter differs from app answer letter
    mismatches = [r for r in records if r["has_mismatch"]]
    if mismatches:
        lines.append("## Flagged Answer Letter Differences (PDF Key vs App Answer)")
        lines.append("")
        lines.append("> **IMPORTANT**: Answer option order (A/B/C/D) is frequently shuffled between")
        lines.append("> exam papers and the app's FR version. A letter difference does NOT necessarily")
        lines.append("> indicate a wrong answer — the same correct content may appear at a different")
        lines.append("> letter. Manual review is required to confirm genuine errors.")
        lines.append("")
        lines.append(f"Found {len(mismatches)} questions with letter disagreements:")
        lines.append("")
        lines.append("| Tag | Score | Question (FR, truncated) | App | PDF Key | PDF Source |")
        lines.append("|-----|-------|--------------------------|-----|---------|------------|")
        # Highest-confidence matches first: those are the most likely real errors.
        for r in sorted(mismatches, key=lambda r: -r["match_score"]):
            # "|" would break the table row, so it is replaced in the excerpt.
            q_short = r["question_fr"][:55].replace("|", "/")
            app = r["app_correct"] or "?"
            pdf_key = r["pdf_answer"] or "—"
            src = f"{r['pdf_source']} p{r['pdf_page']}" if r['pdf_source'] else "—"
            lines.append(f"| {r['tag']} | {r['match_score']:.3f} | {q_short}… | {app} | {pdf_key} | {src} |")
        lines.append("")

    # Unmatched summary
    unmatched_list = [r for r in records if not r["pdf_source"]]
    if unmatched_list:
        lines.append("## Unmatched Questions (score < 0.15)")
        lines.append("")
        lines.append(f"Found {len(unmatched_list)} questions with no strong PDF match:")
        lines.append("")
        lines.append("| Tag | Best Score | Question (FR, truncated) |")
        lines.append("|-----|------------|--------------------------|")
        # Numeric (subject, question) order, so t10q2 is listed before t10q10.
        for r in sorted(unmatched_list, key=lambda r: tag_order(r["tag"])):
            q_short = r["question_fr"][:70].replace("|", "/")
            lines.append(f"| {r['tag']} | {r['match_score']:.3f} | {q_short} |")
        lines.append("")

    with open(OUTPUT_MD, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"  Markdown written: {OUTPUT_MD}")


# ─────────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """Run the whole pipeline: parse inputs, match, write reports, print a summary."""
    banner = "=" * 70
    print(banner)
    print("Glidr Source Provenance Builder")
    print(banner)

    # Inputs: QuizVDS (EN), app questions (FR) and the source PDFs.
    quiz_db = parse_quiz_vds()
    fr_db = parse_fr_questions()
    pdf_pages = extract_pdf_pages()
    pdf_keys = extract_pdf_answer_keys(pdf_pages)

    records, stats = build_provenance(quiz_db, fr_db, pdf_pages, pdf_keys)
    write_outputs(records, stats)

    total = stats['total']
    matched_pct = 100 * stats['matched'] // total if total else 0

    print("\n" + banner)
    print("PUBLISH COMPLETE")
    print(f"  Total questions: {total}")
    print(f"  Matched to PDF:  {stats['matched']} ({matched_pct}%)")
    print(f"  Unmatched:       {stats['unmatched']}")
    print(f"  Answer mismatches: {stats['answer_mismatches']}")
    print(f"\n  Output MD:   {OUTPUT_MD}")
    print(f"  Output JSON: {OUTPUT_JSON}")
    print(banner)


if __name__ == "__main__":
    main()