#!/usr/bin/env python3
"""Search all exam PDFs for a string and output Obsidian links to matching pages.

Usage:
    python3 search_pdfs.py "visibilité minimale"
    python3 search_pdfs.py "centre de gravité" --pdf VV
    python3 search_pdfs.py "transponder" --no-fold
    python3 search_pdfs.py "Solution question 45" --no-open
"""

import os
import re
import subprocess
import sys
import tempfile
import unicodedata

# All third-party imports live inside the ``try`` so that a missing PyPDF2
# triggers the same auto-install path as a missing typer or rich.
try:
    import typer
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel  # noqa: F401  (kept: imported by the original file)
    from rich.markdown import Markdown  # noqa: F401  (kept: imported by the original file)
    from rich.markup import escape
    from PyPDF2 import PdfReader
except ImportError:
    print("Installing dependencies...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "typer", "rich", "PyPDF2", "-q"]
    )
    import typer
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel  # noqa: F401
    from rich.markdown import Markdown  # noqa: F401
    from rich.markup import escape
    from PyPDF2 import PdfReader

app = typer.Typer(help="Search exam PDFs for a string and get Obsidian links.")
console = Console()
# ``stderr`` is a Console constructor option, not a ``print()`` keyword, so
# warnings get their own console instance.
_err_console = Console(stderr=True)

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Short code -> PDF path relative to SCRIPT_DIR.
PDFS = {
    "S1C": "Examen Blanc/Exa Blanc Série_1_Communes.pdf",
    "S1S": "Examen Blanc/Exa Blanc Série_1_Specifiques.pdf",
    "S2": "Examen Blanc/Exa Blanc Série_2.pdf",
    "S3": "Examen Blanc/Exa Blanc Série_3.pdf",
    "VV": "VV/Questionnaire toutes branches VV.pdf",
}

# Markers used to delimit a question inside the extracted page text.
_SOLUTION_PREFIX = "Solution question"
# A line holding only a 2-3 digit number (presumably a question number).
_BARE_NUMBER_RE = re.compile(r"^\d{2,3}\s*$")

# Characters of original text shown on each side of a match in the table.
_CONTEXT_MARGIN = 40


def strip_accents(s: str) -> str:
    """Return *s* with combining marks (Unicode category ``Mn``) removed.

    The string is NFD-normalized first, so precomposed accented letters
    are split into base letter + mark before the marks are dropped.
    """
    return "".join(
        c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn"
    )


def _fold(text: str, fold_accents: bool) -> str:
    """Lowercase *text* and, when *fold_accents* is true, strip its accents."""
    lowered = text.lower()
    return strip_accents(lowered) if fold_accents else lowered


def _folded_offsets(text: str, fold_accents: bool) -> list[int]:
    """Map every index of the folded form of *text* back to an index in *text*.

    Lowercasing and accent stripping can change the string length, so a
    position found in the folded text cannot be used directly to slice the
    original text.
    """
    offsets: list[int] = []
    for idx, ch in enumerate(text):
        offsets.extend([idx] * len(_fold(ch, fold_accents)))
    return offsets


def _question_block(lines: list[str], match_line: int) -> str:
    """Return the non-blank lines of the question surrounding *match_line*.

    The start is the line after the closest preceding "Solution question"
    line, bare-number line, or blank line more than two lines above the
    match. The end is the first "Solution question" line at or after the
    match (inclusive), or the end of the page.
    """
    # NOTE(review): when no boundary is found above the match, the block
    # starts at the matched line itself rather than at the top of the page.
    q_start = match_line
    for j in range(match_line - 1, -1, -1):
        stripped = lines[j].strip()
        if stripped.startswith(_SOLUTION_PREFIX) or _BARE_NUMBER_RE.match(stripped):
            q_start = j + 1
            break
        # A blank line only counts as a boundary when it is not directly
        # adjacent to the match.
        if not stripped and j < match_line - 2:
            q_start = j + 1
            break

    # Start at the matched line: if the hit *is* the solution line, the
    # question ends right there instead of swallowing the next question.
    q_end = len(lines)
    for j in range(match_line, len(lines)):
        if lines[j].strip().startswith(_SOLUTION_PREFIX):
            q_end = j + 1  # include the Solution line
            break

    return "\n".join(line for line in lines[q_start:q_end] if line.strip()).strip()


def search(
    query: str, pdf_filter: str | None = None, fold_accents: bool = True
) -> list[dict]:
    """Search the known PDFs for *query* and return one result per matching page.

    Matching is case-insensitive and, when *fold_accents* is true, ignores
    combining accents. Only the first occurrence on each page is reported.

    Args:
        query: Text to look for. An empty query yields no results.
        pdf_filter: Key of ``PDFS`` to restrict the search to. ``None`` or
            an unknown key searches every PDF.
        fold_accents: Strip accents from both query and page text.

    Returns:
        A list of dicts with keys ``pdf`` (code), ``fname`` (relative path),
        ``page`` (1-based), ``context`` (short excerpt around the match) and
        ``question`` (the surrounding question block).
    """
    needle = _fold(query, fold_accents)
    if not needle:
        # An empty needle would "match" every page at offset 0.
        return []

    if pdf_filter and pdf_filter in PDFS:
        targets = {pdf_filter: PDFS[pdf_filter]}
    else:
        targets = PDFS

    results = []
    for code, fname in targets.items():
        path = os.path.join(SCRIPT_DIR, fname)
        if not os.path.exists(path):
            _err_console.print(f"[yellow]⚠ {fname} not found[/yellow]")
            continue

        reader = PdfReader(path)
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text() or ""
            haystack = _fold(text, fold_accents)
            pos = haystack.find(needle)
            if pos == -1:
                continue

            # Translate the hit from folded coordinates to original ones.
            offsets = _folded_offsets(text, fold_accents)
            if offsets:
                last = len(offsets) - 1
                start = offsets[min(pos, last)]
                end = offsets[min(pos + len(needle) - 1, last)] + 1
            else:
                start, end = pos, pos + len(needle)

            lines = text.split("\n")
            # Number of newlines before the hit == index of its line.
            match_line = min(text.count("\n", 0, start), len(lines) - 1)

            context = text[max(0, start - _CONTEXT_MARGIN):end + _CONTEXT_MARGIN]
            context = context.replace("\n", " ").strip()

            results.append({
                "pdf": code,
                "fname": fname,
                "page": page_num,
                "context": context,
                "question": _question_block(lines, match_line),
            })
    return results


def _open_in_viewer(path: str) -> bool:
    """Open *path* with the platform's default application.

    Returns ``False`` when no launcher could be started, instead of raising.
    """
    try:
        if sys.platform == "darwin":
            subprocess.run(["open", path], check=False)
        elif os.name == "nt":
            os.startfile(path)  # type: ignore[attr-defined]
        else:
            subprocess.run(["xdg-open", path], check=False)
    except OSError:
        return False
    return True


@app.command()
def main(
    query: str = typer.Argument(..., help="Search string"),
    pdf: str = typer.Option(None, "--pdf", "-p", help="Limit to one PDF: S1C, S1S, S2, S3, VV"),
    no_fold: bool = typer.Option(False, "--no-fold", help="Don't fold accents (é≠e)"),
    no_open: bool = typer.Option(False, "--no-open", help="Don't open results in viewer"),
):
    """Search all exam PDFs for a string and output Obsidian links."""
    if pdf and pdf not in PDFS:
        console.print(
            f"[red]Unknown PDF code: {escape(pdf)}. Use: {', '.join(PDFS)}[/red]"
        )
        raise typer.Exit(1)

    # User- and PDF-supplied text is escaped so that square brackets in it
    # are not interpreted as rich markup tags.
    shown_query = escape(query)

    with console.status(f"[bold blue]Searching for \"{shown_query}\"...[/bold blue]"):
        results = search(query, pdf_filter=pdf, fold_accents=not no_fold)

    if not results:
        console.print(f"[yellow]No matches for \"{shown_query}\"[/yellow]")
        raise typer.Exit(0)

    # Rich table output
    console.print()
    table = Table(
        title=f"[bold]{len(results)} match(es) for \"{shown_query}\"[/bold]",
        show_lines=True,
    )
    table.add_column("Source", style="cyan", width=8)
    table.add_column("Page", style="green", justify="right", width=5)
    table.add_column("Context", style="white")
    table.add_column("Obsidian Link", style="blue")

    md_lines = [f"# Search: \"{query}\"\n", f"{len(results)} match(es)\n"]

    for r in results:
        link = f"[{r['pdf']} p.{r['page']}]({r['fname'].replace(' ', '%20')}#page={r['page']})"
        table.add_row(r["pdf"], str(r["page"]), f"…{escape(r['context'])}…", escape(link))
        md_lines.append(f"### {link}\n")
        # Prefix every line of the question so it renders as one blockquote.
        md_lines.append(f"> {r['question'].replace(chr(10), chr(10) + '> ')}\n")

    console.print(table)

    # Write MD and open
    if not no_open:
        md_content = "\n".join(md_lines)
        # Explicit UTF-8: the content holds accented text and "…", which the
        # locale's default encoding may not be able to represent.
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=".md",
            prefix="search_results_",
            dir=SCRIPT_DIR,
            delete=False,
            encoding="utf-8",
        ) as tmp:
            tmp.write(md_content)
        console.print(f"\n[dim]Results saved to {os.path.basename(tmp.name)}[/dim]")
        if not _open_in_viewer(tmp.name):
            console.print("[dim]Could not launch a viewer for the results file[/dim]")


if __name__ == "__main__":
    app()