#!/usr/bin/env python3 """ build_provenance.py — Glidr SPL Exam Question Source Provenance Database Traces each app question back to its source PDF exam paper. Outputs: - Source_Provenance.md (human-readable) - Source_Provenance.json (machine-readable) """ import re import os import sys import json import unicodedata from pathlib import Path try: import PyPDF2 except ImportError: print("ERROR: PyPDF2 not installed. Run: pip install PyPDF2") sys.exit(1) # ───────────────────────────────────────────────────────────────────────────── # Path constants # ───────────────────────────────────────────────────────────────────────────── ABLAGE_BASE = Path("/Users/i052341/Daten/Cloud/04 - Ablage/Ablage 2020 - 2029/Ablage 2025/Hobbies 2025/Segelflug/Theorie/Glidr") SOURCES_DIR = ABLAGE_BASE / "SOURCES" QUIZ_VDS_DIR = SOURCES_DIR / "QuizVDS" FR_DIR = ABLAGE_BASE / "SPL Exam Questions FR" PDF_DIR = SOURCES_DIR OUTPUT_MD = SOURCES_DIR / "Source_Provenance.md" OUTPUT_JSON = SOURCES_DIR / "Source_Provenance.json" # Subject number → QuizVDS filename QUIZ_VDS_FILES = { 10: "10 - Air Law.md", 20: "20 - Aircraft General Knowledge.md", 30: "30 - Flight Performance and Planning.md", 40: "40 - Human Performance and Limitations.md", 50: "50 - Meteorology.md", 60: "60 - Navigation.md", 70: "70 - Operational Procedures.md", 80: "80 - Principles of Flight.md", 90: "90 - Communication.md", } # Subject number → FR app filename FR_FILES = { 10: "10 - Droit aérien.md", 20: "20 - Connaissances générales de l'aéronef.md", 30: "30 - Performances et planification du vol.md", 40: "40 - Performances humaines.md", 50: "50 - Météorologie.md", 60: "60 - Navigation.md", 70: "70 - Procédures opérationnelles.md", 80: "80 - Principes du vol.md", 90: "90 - Radiotéléphonie.md", } # PDF source files (relative to SOURCES_DIR) PDF_FILES = { "S1C": "Examen Blanc/Exa Blanc Série_1_Communes.pdf", "S1S": "Examen Blanc/Exa Blanc Série_1_Specifiques.pdf", "S2": "Examen Blanc/Exa Blanc Série_2.pdf", "S3": "Examen Blanc/Exa Blanc Série_3.pdf", "VV": "VV/Questionnaire toutes branches VV.pdf", } # Branch labels as they appear in the S1C/S1S solution tables S1C_BRANCH_MAP = {10: "BRANCHE 10", 40: "BRANCHE 40", 50: "BRANCHE 50", 90: "BRANCHE 90"} S1S_BRANCH_MAP = {20: "BRANCHE 20", 30: "BRANCHE 30", 60: "BRANCHE 60", 70: "BRANCHE 70", 80: "BRANCHE 80"} SUBJECT_NAMES = { 10: "Air Law / Droit aérien", 20: "Aircraft Knowledge / Connaissances aéronef", 30: "Flight Performance / Performances vol", 40: "Human Performance / Performances humaines", 50: "Meteorology / Météorologie", 60: "Navigation", 70: "Operational Procedures / Procédures opérationnelles", 80: "Principles of Flight / Principes du vol", 90: "Communications / Radiotéléphonie", } # ───────────────────────────────────────────────────────────────────────────── # Utility: accent folding + normalisation # ───────────────────────────────────────────────────────────────────────────── def normalize(text: str) -> str: """Lowercase, strip accents, keep alphanumerics and spaces only.""" nfkd = unicodedata.normalize("NFKD", text) ascii_text = "".join(c for c in nfkd if not unicodedata.combining(c)) return re.sub(r"[^a-z0-9 ]", " ", ascii_text.lower()) def word_set(text: str) -> set: """Return significant words as a set (accent-folded, stop-words removed).""" stop = { "a", "b", "c", "d", "la", "le", "les", "de", "du", "des", "un", "une", "et", "ou", "en", "au", "aux", "est", "il", "elle", "on", "que", "qui", "se", "sa", "son", "ce", "par", "sur", "pour", "avec", "dans", "si", "ne", "pas", "plus", "the", "of", "to", "is", "in", "an", "are", "at", "be", "by", "do", "for", "has", "have", "he", "it", "its", "no", "not", "or", "that", "this", "was", "we", "which", "you", "your", "l", "d", "j", "s", "n", "m", "y", "qu", "lorsque", "comme", "car", "mais", "donc", "lors", "quel", "quelle", "quels", "quelles", "comment", "quel", "peut", "doit", "doit", "sont", "ont", "ces", "lors", "aussi", "entre", "selon", "lors", "apres", "avant", "dans", "vers", "sous", "jusqu" } words = normalize(text).split() return {w for w in words if len(w) > 2 and w not in stop} def jaccard(set_a: set, set_b: set) -> float: if not set_a or not set_b: return 0.0 intersection = len(set_a & set_b) union = len(set_a | set_b) return intersection / union if union else 0.0 # ───────────────────────────────────────────────────────────────────────────── # Step 1: Parse QuizVDS files → {tag: {question, options, correct}} # ───────────────────────────────────────────────────────────────────────────── def parse_quiz_vds() -> dict: """Returns dict keyed by tag (e.g. 't10q1') with QuizVDS question data.""" print("\n[1/5] Parsing QuizVDS files...") quiz_db = {} for subject_num, filename in QUIZ_VDS_FILES.items(): path = QUIZ_VDS_DIR / filename if not path.exists(): print(f" WARNING: {path} not found") continue with open(path, encoding="utf-8") as f: content = f.read() # Split on question headers: ### Q{N}: ... blocks = re.split(r"\n(?=### Q\d+:)", content) count = 0 for block in blocks: m = re.match(r"### Q(\d+):\s*(.+?)(?:\n|$)(.*?)(?=\n---|\Z)", block, re.DOTALL) if not m: continue q_num = int(m.group(1)) q_text = m.group(2).strip() rest = m.group(3) # Extract options A-D options = {} for opt in re.finditer(r"^- ([A-D])\)\s*(.+)$", rest, re.MULTILINE): options[opt.group(1)] = opt.group(2).strip() # Extract correct answer correct_m = re.search(r"\*\*Correct:\s*([A-D])\)\*\*", rest) correct = correct_m.group(1) if correct_m else None tag = f"t{subject_num}q{q_num}" quiz_db[tag] = { "question_en": q_text, "options_en": options, "quiz_correct": correct, } count += 1 print(f" {filename}: {count} questions parsed") print(f" Total QuizVDS questions: {len(quiz_db)}") return quiz_db # ───────────────────────────────────────────────────────────────────────────── # Step 2: Parse FR app MD files → {tag: {question_fr, options_fr, app_correct}} # ───────────────────────────────────────────────────────────────────────────── def parse_fr_questions() -> dict: """Returns dict keyed by tag with French question data from app MD files.""" print("\n[2/5] Parsing FR app question files...") fr_db = {} for subject_num, filename in FR_FILES.items(): path = FR_DIR / filename if not path.exists(): print(f" WARNING: {path} not found") continue with open(path, encoding="utf-8") as f: content = f.read() # The header can be "### Q1:" OR "### Q1 :" (space before colon) # Tag is always "^t{NN}q{N}" at end of header line pattern = r"\n(?=### Q\d+\s*:.*\^t\d+q\d+)" blocks = re.split(pattern, content) count = 0 for block in blocks: # Match header with optional space before colon, and tag header_m = re.match(r"### Q\d+\s*:\s*(.+?)\s*\^(t\d+q\d+)", block) if not header_m: continue q_text = header_m.group(1).strip() tag = header_m.group(2) # Options: various formats: # "- A) text" # "- [x] A) text" (correct) # "- [ ] A) text" (wrong) # "- **A)** text" (bold format) options = {} app_correct = None # Format 1: "- [x] A) ..." or "- [ ] A) ..." for opt_m in re.finditer(r"^- \[( |x)\] ([A-D])\)\s*(.+)$", block, re.MULTILINE): checked = opt_m.group(1) letter = opt_m.group(2) text = opt_m.group(3).strip() options[letter] = text if checked == "x": app_correct = letter # Format 2: "- A) ..." (no checkbox) if not options: for opt_m in re.finditer(r"^- \**([A-D])\)\**\s*(.+)$", block, re.MULTILINE): letter = opt_m.group(1) text = opt_m.group(2).strip() options[letter] = text # Answer from "#### Réponse\n\nX)" pattern reponse_m = re.search(r"#### Réponse\s*\n+([A-D])\)", block) if reponse_m: app_correct = reponse_m.group(1) fr_db[tag] = { "question_fr": q_text, "options_fr": options, "app_correct": app_correct, "subject_num": subject_num, } count += 1 print(f" {filename}: {count} questions parsed") total = sum(1 for _ in fr_db) print(f" Total FR app questions: {total}") return fr_db # ───────────────────────────────────────────────────────────────────────────── # Step 3: Extract text from all PDFs, page by page # ───────────────────────────────────────────────────────────────────────────── def extract_pdf_pages() -> dict: """Returns {pdf_code: [(page_num, text), ...]}""" print("\n[3/5] Extracting PDF text...") pdf_pages = {} for code, filename in PDF_FILES.items(): path = PDF_DIR / filename if not path.exists(): print(f" WARNING: {path} not found") pdf_pages[code] = [] continue pages = [] try: with open(path, "rb") as f: reader = PyPDF2.PdfReader(f) n = len(reader.pages) for i in range(n): text = reader.pages[i].extract_text() or "" pages.append((i + 1, text)) print(f" {code} ({filename}): {n} pages") except Exception as e: print(f" ERROR reading {filename}: {e}") pdf_pages[code] = pages return pdf_pages # ───────────────────────────────────────────────────────────────────────────── # Step 3b: Extract PDF answer keys # ───────────────────────────────────────────────────────────────────────────── def parse_s1_solution_table(text: str, branch_map: dict) -> dict: """ Parse the columnar solution table at the end of S1C and S1S PDFs. Returns {subject_num: {q_num: answer_letter}} The table looks like: BRANCHE 10 BRANCHE 40 BRANCHE 50 BRANCHE 90 1. A 1. C 1. A 1. A 2. C 2. A 2. C 2. B ... """ result = {} # Find positions of all branch headers branch_positions = [] for subj, label in branch_map.items(): idx = text.find(label) if idx >= 0: branch_positions.append((idx, subj)) if not branch_positions: return result n_branches = len(branch_map) sorted_branches = [subj for _, subj in sorted(branch_positions)] # The answer section starts right after the line containing the branch headers. # Find the end of that header line (next \n after the last branch label). last_header_pos = max(p[0] for p in branch_positions) # Find the end of last label last_label = "" for subj, label in branch_map.items(): if text.find(label) == last_header_pos: last_label = label break # Walk forward to end of header line scan_pos = last_header_pos + len(last_label) newline_pos = text.find("\n", scan_pos) if newline_pos < 0: return result answer_section = text[newline_pos + 1:] # Extract all "N. X" patterns all_answers = re.findall(r"\b(\d+)\.\s+([A-D])\b", answer_section) # Answers interleave: q1_b1, q1_b2, ..., q2_b1, q2_b2, ... branch_answers = {subj: {} for subj in sorted_branches} for i, (q_str, letter) in enumerate(all_answers): q_num = int(q_str) branch_idx = i % n_branches if branch_idx < len(sorted_branches): subj = sorted_branches[branch_idx] branch_answers[subj][q_num] = letter return branch_answers def parse_vv_solution_answers(pages: list) -> dict: """ Parse 'Solution question N : X' lines from VV PDF. Returns {q_num: answer_letter} """ answers = {} for page_num, text in pages: for m in re.finditer(r"Solution question\s+(\d+)\s*:\s*([A-D])", text, re.IGNORECASE): q_num = int(m.group(1)) letter = m.group(2).upper() answers[q_num] = letter return answers def extract_pdf_answer_keys(pdf_pages: dict) -> dict: """ Returns: S1C/S1S: {pdf_code: {subject_num: {q_num: letter}}} VV: {pdf_code: {None: {q_num: letter}}} """ print("\n[3b] Extracting PDF answer keys...") keys = {} # S1C if "S1C" in pdf_pages: full_text = "\n".join(text for _, text in pdf_pages["S1C"]) answers = parse_s1_solution_table(full_text, S1C_BRANCH_MAP) keys["S1C"] = answers for subj, ans in answers.items(): print(f" S1C branch {subj}: {len(ans)} answers parsed") # S1S if "S1S" in pdf_pages: full_text = "\n".join(text for _, text in pdf_pages["S1S"]) answers = parse_s1_solution_table(full_text, S1S_BRANCH_MAP) keys["S1S"] = answers for subj, ans in answers.items(): print(f" S1S branch {subj}: {len(ans)} answers parsed") keys["S2"] = {} keys["S3"] = {} # VV if "VV" in pdf_pages: vv_answers = parse_vv_solution_answers(pdf_pages["VV"]) keys["VV"] = {None: vv_answers} print(f" VV: {len(vv_answers)} answers parsed") return keys # ───────────────────────────────────────────────────────────────────────────── # Step 4: Build question-level chunks from PDFs # ───────────────────────────────────────────────────────────────────────────── def build_pdf_question_chunks(pdf_pages: dict) -> list: """ Split PDF pages into individual question chunks for better matching. For each PDF, we extract chunks of text corresponding to individual questions. We split on patterns like: - "1." / "2." at start of line (S1C, S1S, S2, S3: numbered questions) - "Solution question N :" boundaries (VV) Returns list of: (pdf_code, page_num, chunk_q_num, chunk_text, chunk_word_set) """ chunks = [] for code, pages in pdf_pages.items(): # Combine all text per PDF but track page boundaries all_text = "" page_breaks = [] # [(char_offset, page_num)] for page_num, text in pages: page_breaks.append((len(all_text), page_num)) all_text += text + "\n" def char_to_page(offset): """Return page_num for a character offset.""" for i in range(len(page_breaks) - 1, -1, -1): if offset >= page_breaks[i][0]: return page_breaks[i][1] return 1 if code in ("S1C", "S1S", "S2", "S3"): # Split on numbered questions: line starting with "N." where N is 1-99 # followed by a space and capital letter (French question text) splits = list(re.finditer( r"(?:^|\n)\s*(\d{1,2})\.\s+([A-ZÀÂÄÉÈÊËÎÏÔÙÛÜÇ])", all_text )) for i, m in enumerate(splits): q_num = int(m.group(1)) start = m.start() end = splits[i + 1].start() if i + 1 < len(splits) else len(all_text) chunk = all_text[start:end].strip() if len(chunk) > 30: pg = char_to_page(start) ws = word_set(chunk) if len(ws) >= 4: chunks.append((code, pg, q_num, chunk, ws)) elif code == "VV": # VV: each question is preceded by "Solution question N-1 : X\n" # and has its own question block before the next solution marker # Split on subject headers ("10 Droit aérien", "20 ...", etc.) or # on "Solution question N :" markers combined with question text # Strategy: split on "Solution question N :" boundaries # Find all solution markers sol_markers = list(re.finditer( r"Solution question\s+(\d+)\s*:\s*[A-D]", all_text, re.IGNORECASE )) for i, m in enumerate(sol_markers): q_num = int(m.group(1)) # The question text appears BEFORE this solution marker # (between previous solution marker end and this one) prev_end = sol_markers[i - 1].end() if i > 0 else 0 chunk = all_text[prev_end:m.start()].strip() # Remove page headers ("10 Droit aérien", "Page N", etc.) chunk = re.sub(r"^\s*\d{2}\s+[A-ZÀ-Ü].{0,40}\n", "", chunk, flags=re.MULTILINE) chunk = re.sub(r"^\s*(?:Page|Edition)\s+\d+.*\n", "", chunk, flags=re.MULTILINE) if len(chunk) > 20: pg = char_to_page(m.start()) ws = word_set(chunk) if len(ws) >= 4: chunks.append((code, pg, q_num, chunk, ws)) print(f" PDF question chunks built: {len(chunks)} total") return chunks # ───────────────────────────────────────────────────────────────────────────── # Step 5: Match FR questions against PDF chunks # ───────────────────────────────────────────────────────────────────────────── def find_best_pdf_match(question_text: str, options: dict, page_index: list, threshold: float = 0.15): """ Find the best matching PDF chunk for a question. Returns (pdf_code, page_num, pdf_q_num, score) or (None, None, None, 0.0) """ combined = question_text for opt_text in options.values(): combined += " " + opt_text q_words = word_set(combined) if not q_words: return None, None, None, 0.0 best_score = 0.0 best_code = None best_page = None best_qnum = None for code, page_num, chunk_q_num, _, chunk_words in page_index: score = jaccard(q_words, chunk_words) if score > best_score: best_score = score best_code = code best_page = page_num best_qnum = chunk_q_num if best_score < threshold: return None, None, None, best_score return best_code, best_page, best_qnum, best_score # ───────────────────────────────────────────────────────────────────────────── # Step 5b: Get PDF answer key answer for matched question # ───────────────────────────────────────────────────────────────────────────── def get_pdf_answer(subject_num: int, pdf_code: str, pdf_q_num: int, pdf_keys: dict) -> str | None: """Look up the answer key answer for a matched PDF question.""" if pdf_code == "VV": return pdf_keys.get("VV", {}).get(None, {}).get(pdf_q_num) elif pdf_code in ("S1C", "S1S"): subj_answers = pdf_keys.get(pdf_code, {}).get(subject_num, {}) return subj_answers.get(pdf_q_num) # S2, S3: no keys return None # ───────────────────────────────────────────────────────────────────────────── # Step 6: Build provenance database # ───────────────────────────────────────────────────────────────────────────── def build_provenance(quiz_db: dict, fr_db: dict, pdf_pages: dict, pdf_keys: dict) -> tuple: """Main matching loop. Returns (records, stats).""" print("\n[4/5] Building PDF question chunk index and matching...") chunk_index = build_pdf_question_chunks(pdf_pages) records = [] tags = sorted(fr_db.keys(), key=lambda t: ( int(re.search(r't(\d+)', t).group(1)), int(re.search(r'q(\d+)', t).group(1)) )) total = len(tags) matched = 0 unmatched = 0 answer_mismatches = 0 for i, tag in enumerate(tags): if i % 200 == 0: print(f" Progress: {i}/{total} tags processed...") fr_data = fr_db[tag] quiz_data = quiz_db.get(tag, {}) subject_num = fr_data["subject_num"] question_fr = fr_data.get("question_fr", "") options_fr = fr_data.get("options_fr", {}) app_correct = fr_data.get("app_correct") quiz_correct = quiz_data.get("quiz_correct") # Find best PDF chunk match pdf_code, page_num, pdf_q_num, score = find_best_pdf_match( question_fr, options_fr, chunk_index ) is_matched = pdf_code is not None if is_matched: matched += 1 else: unmatched += 1 # Get PDF answer key answer pdf_answer = None if is_matched and pdf_code: pdf_answer = get_pdf_answer(subject_num, pdf_code, pdf_q_num, pdf_keys) # Detect answer mismatches. # # IMPORTANT CAVEAT: Answer option order (A/B/C/D) is frequently shuffled # between exam papers and the app's FR version. A letter mismatch does NOT # necessarily mean the wrong answer — the same content answer may appear # at a different letter. We flag mismatches as informational; manual # review is required to confirm genuine wrong answers. mismatch_flags = [] quiz_shuffled = (quiz_correct and app_correct and quiz_correct != app_correct) if quiz_shuffled: mismatch_flags.append(f"QUIZ_VS_APP:{quiz_correct}!={app_correct}") if pdf_answer and app_correct and pdf_answer != app_correct: mismatch_flags.append(f"PDF_VS_APP:{pdf_answer}!={app_correct}") if pdf_answer and quiz_correct and pdf_answer != quiz_correct: mismatch_flags.append(f"PDF_VS_QUIZ:{pdf_answer}!={quiz_correct}") # Count as flagged mismatch when PDF key disagrees with app answer # (these require manual verification — may be option-shuffle or real error) has_real_mismatch = bool(pdf_answer and app_correct and pdf_answer != app_correct) if has_real_mismatch: answer_mismatches += 1 record = { "tag": tag, "subject_num": subject_num, "question_fr": question_fr, "options_fr": options_fr, "app_correct": app_correct, "quiz_correct": quiz_correct, "pdf_source": pdf_code, "pdf_page": page_num, "pdf_q_num": pdf_q_num, "match_score": round(score, 4), "pdf_answer": pdf_answer, "mismatch_flags": mismatch_flags, "has_mismatch": has_real_mismatch, } records.append(record) print(f"\n Matched: {matched}/{total} ({100*matched//total if total else 0}%)") print(f" Unmatched: {unmatched}/{total}") print(f" Answer mismatches found: {answer_mismatches}") stats = { "total": total, "matched": matched, "unmatched": unmatched, "answer_mismatches": answer_mismatches } return records, stats # ───────────────────────────────────────────────────────────────────────────── # Step 7: Write outputs # ───────────────────────────────────────────────────────────────────────────── def write_outputs(records: list, stats: dict): print("\n[5/5] Writing output files...") # ── JSON ────────────────────────────────────────────────────────────────── with open(OUTPUT_JSON, "w", encoding="utf-8") as f: json.dump({"stats": stats, "records": records}, f, ensure_ascii=False, indent=2) print(f" JSON written: {OUTPUT_JSON}") # ── Markdown ────────────────────────────────────────────────────────────── lines = [] lines.append("# Source Provenance Database — Glidr SPL Exam Questions") lines.append("") lines.append(f"Generated: 2026-04-12 | Total questions: {stats['total']} | " f"Matched: {stats['matched']} | Unmatched: {stats['unmatched']} | " f"Answer mismatches: {stats['answer_mismatches']}") lines.append("") lines.append("## Legend") lines.append("") lines.append("| Column | Description |") lines.append("|--------|-------------|") lines.append("| Tag | Question tag (e.g. t10q1) |") lines.append("| PDF Source | Which exam paper (S1C/S1S/S2/S3/VV) |") lines.append("| Page | PDF page number |") lines.append("| PDF Q# | Question number within that PDF |") lines.append("| Score | Jaccard word-overlap similarity (≥0.15 = match) |") lines.append("| App | Current app correct answer |") lines.append("| Quiz | QuizVDS original answer (EN import) |") lines.append("| PDF Key | Answer from PDF solution page |") lines.append("| Flags | Mismatch warnings |") lines.append("") lines.append("## PDF Sources") lines.append("") lines.append("| Code | File |") lines.append("|------|------|") for code, fname in PDF_FILES.items(): lines.append(f"| {code} | {fname} |") lines.append("") # Group by subject from itertools import groupby records_sorted = sorted(records, key=lambda r: ( r["subject_num"], int(re.search(r'q(\d+)', r["tag"]).group(1)) )) for subject_num, group in groupby(records_sorted, key=lambda r: r["subject_num"]): group_list = list(group) subject_name = SUBJECT_NAMES.get(subject_num, f"Subject {subject_num}") matched_in_group = sum(1 for r in group_list if r["pdf_source"]) lines.append(f"## Subject {subject_num}: {subject_name}") lines.append("") lines.append(f"Total: {len(group_list)} questions | Matched: {matched_in_group}") lines.append("") lines.append("| Tag | PDF | Page | PDF Q# | Score | App | Quiz | PDF Key | Flags |") lines.append("|-----|-----|------|--------|-------|-----|------|---------|-------|") for r in group_list: tag = r["tag"] pdf_src = r["pdf_source"] or "—" page = str(r["pdf_page"]) if r["pdf_page"] else "—" pdf_qn = str(r["pdf_q_num"]) if r["pdf_q_num"] else "—" score = f"{r['match_score']:.3f}" app = r["app_correct"] or "?" quiz = r["quiz_correct"] or "—" pdf_key = r["pdf_answer"] or "—" flags = " ".join(r["mismatch_flags"]) if r["mismatch_flags"] else "" row = f"| {tag} | {pdf_src} | {page} | {pdf_qn} | {score} | {app} | {quiz} | {pdf_key} | {flags} |" lines.append(row) lines.append("") # Flagged mismatches: PDF key letter differs from app answer letter mismatches = [r for r in records if r["has_mismatch"]] if mismatches: lines.append("## Flagged Answer Letter Differences (PDF Key vs App Answer)") lines.append("") lines.append("> **IMPORTANT**: Answer option order (A/B/C/D) is frequently shuffled between") lines.append("> exam papers and the app's FR version. A letter difference does NOT necessarily") lines.append("> indicate a wrong answer — the same correct content may appear at a different") lines.append("> letter. Manual review is required to confirm genuine errors.") lines.append("") lines.append(f"Found {len(mismatches)} questions with letter disagreements:") lines.append("") lines.append("| Tag | Score | Question (FR, truncated) | App | PDF Key | PDF Source |") lines.append("|-----|-------|--------------------------|-----|---------|------------|") for r in sorted(mismatches, key=lambda r: -r["match_score"]): q_short = r["question_fr"][:55].replace("|", "/") app = r["app_correct"] or "?" pdf_key = r["pdf_answer"] or "—" src = f"{r['pdf_source']} p{r['pdf_page']}" if r['pdf_source'] else "—" lines.append(f"| {r['tag']} | {r['match_score']:.3f} | {q_short}… | {app} | {pdf_key} | {src} |") lines.append("") # Unmatched summary unmatched_list = [r for r in records if not r["pdf_source"]] if unmatched_list: lines.append("## Unmatched Questions (score < 0.15)") lines.append("") lines.append(f"Found {len(unmatched_list)} questions with no strong PDF match:") lines.append("") lines.append("| Tag | Best Score | Question (FR, truncated) |") lines.append("|-----|------------|--------------------------|") for r in sorted(unmatched_list, key=lambda r: r["tag"]): q_short = r["question_fr"][:70].replace("|", "/") lines.append(f"| {r['tag']} | {r['match_score']:.3f} | {q_short} |") lines.append("") with open(OUTPUT_MD, "w", encoding="utf-8") as f: f.write("\n".join(lines)) print(f" Markdown written: {OUTPUT_MD}") # ───────────────────────────────────────────────────────────────────────────── # Main # ───────────────────────────────────────────────────────────────────────────── def main(): print("=" * 70) print("Glidr Source Provenance Builder") print("=" * 70) quiz_db = parse_quiz_vds() fr_db = parse_fr_questions() pdf_pages = extract_pdf_pages() pdf_keys = extract_pdf_answer_keys(pdf_pages) records, stats = build_provenance(quiz_db, fr_db, pdf_pages, pdf_keys) write_outputs(records, stats) print("\n" + "=" * 70) print("PUBLISH COMPLETE") print(f" Total questions: {stats['total']}") print(f" Matched to PDF: {stats['matched']} ({100*stats['matched']//stats['total'] if stats['total'] else 0}%)") print(f" Unmatched: {stats['unmatched']}") print(f" Answer mismatches: {stats['answer_mismatches']}") print(f"\n Output MD: {OUTPUT_MD}") print(f" Output JSON: {OUTPUT_JSON}") print("=" * 70) if __name__ == "__main__": main()