# APPS/glidr-content.git - git.mnsoft.org

#!/usr/bin/env python3
"""Search all exam PDFs for a string and output Obsidian links to matching pages.

Usage:
    python3 search_pdfs.py "visibilité minimale"
    python3 search_pdfs.py "centre de gravité" --pdf VV
    python3 search_pdfs.py "transponder" --no-fold
    python3 search_pdfs.py "Solution question 45" --no-open
"""

import os
import re
import subprocess
import sys
import tempfile
import unicodedata

# Third-party dependencies. All three packages are imported inside the same
# ``try`` so that a missing PyPDF2 triggers the auto-install too (it is part of
# the pip command below); the imports are then retried once.
try:
    import typer
    from PyPDF2 import PdfReader
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich.markdown import Markdown
except ImportError:
    print("Installing dependencies...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "typer", "rich", "PyPDF2", "-q"])
    import typer
    from PyPDF2 import PdfReader
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich.markdown import Markdown

# Typer application object; commands are registered on it with @app.command().
app = typer.Typer(help="Search exam PDFs for a string and get Obsidian links.")
# Shared rich console used for all terminal output of this script.
console = Console()

# Directory containing this script; PDF paths below are joined onto it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Short code (accepted by --pdf) -> PDF path relative to SCRIPT_DIR.
PDFS = {
    "S1C": "Examen Blanc/Exa Blanc Série_1_Communes.pdf",
    "S1S": "Examen Blanc/Exa Blanc Série_1_Specifiques.pdf",
    "S2":  "Examen Blanc/Exa Blanc Série_2.pdf",
    "S3":  "Examen Blanc/Exa Blanc Série_3.pdf",
    "VV":  "VV/Questionnaire toutes branches VV.pdf",
}


def strip_accents(s: str) -> str:
    """Return *s* in NFD form with every non-spacing mark (category Mn) removed.

    Decomposing first splits an accented letter into its base letter plus
    combining marks, so dropping the marks leaves the bare base letter.
    """
    decomposed = unicodedata.normalize("NFD", s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == "Mn":
            continue
        kept.append(ch)
    return "".join(kept)


# A line consisting only of a 2-3 digit number is treated as a question boundary.
_QUESTION_NUMBER_RE = re.compile(r"^\d{2,3}\s*$")
# Prefix of the line that closes a question block in the exam PDFs.
_SOLUTION_PREFIX = "Solution question"


def _fold_with_offsets(text: str, fold_accents: bool) -> tuple[str, list[int]]:
    """Return the searchable form of *text* and a map back to original indices.

    The searchable form is case-folded and, when *fold_accents* is true, has
    its non-spacing marks removed. ``offsets[i]`` is the index in *text* of the
    character that produced character ``i`` of the folded string, so a match
    position found in the folded string can be translated back even when
    folding changes the length (e.g. pre-decomposed accents).
    """
    pieces = []
    offsets = []
    for idx, ch in enumerate(text):
        piece = ch.casefold()
        # ASCII characters carry no combining marks; skip the normalisation.
        if fold_accents and not piece.isascii():
            piece = "".join(
                c for c in unicodedata.normalize("NFD", piece)
                if unicodedata.category(c) != "Mn"
            )
        pieces.append(piece)
        offsets.extend([idx] * len(piece))
    return "".join(pieces), offsets


def _question_block(lines: list[str], match_line: int) -> str:
    """Return the non-blank lines of the question surrounding *match_line*.

    The block starts after the closest preceding boundary (a solution line, a
    bare question number, or a blank line more than two lines above the match)
    and ends with the first solution line at or after the match. If no
    preceding boundary exists the block starts at the match line; if no
    solution line follows it runs to the end of the page.
    """
    start = match_line
    for idx in range(match_line - 1, -1, -1):
        stripped = lines[idx].strip()
        if stripped.startswith(_SOLUTION_PREFIX) or _QUESTION_NUMBER_RE.match(stripped):
            start = idx + 1
            break
        # Blank lines right above the match are tolerated; farther ones end the scan.
        if not stripped and idx < match_line - 2:
            start = idx + 1
            break

    end = len(lines)
    # Start at the match line itself: when the hit is on the solution line,
    # the block must stop there instead of running into the next question.
    for idx in range(match_line, len(lines)):
        if lines[idx].strip().startswith(_SOLUTION_PREFIX):
            end = idx + 1  # include the solution line
            break

    return "\n".join(line for line in lines[start:end] if line.strip()).strip()


def search(query: str, pdf_filter: str | None = None, fold_accents: bool = True):
    """Search the configured PDFs page by page for *query*.

    Matching is case-insensitive and, when *fold_accents* is true, also
    ignores accents. At most one result (the first occurrence) is reported
    per page.

    Args:
        query: Text to look for. An empty query yields no results.
        pdf_filter: Key of ``PDFS`` to restrict the search to; any other value
            (including ``None``) searches every configured PDF.
        fold_accents: Whether accents are ignored when comparing.

    Returns:
        A list of dicts with the keys ``pdf`` (code), ``fname`` (relative
        path), ``page`` (1-based), ``context`` (short excerpt around the
        match) and ``question`` (the surrounding question block).
    """
    needle, _ = _fold_with_offsets(query, fold_accents)
    if not needle:
        # An empty needle would "match" every page.
        return []

    targets = {pdf_filter: PDFS[pdf_filter]} if pdf_filter and pdf_filter in PDFS else PDFS
    # ``stderr`` is a Console constructor option, not a print() keyword.
    err_console = Console(stderr=True)

    results = []
    for code, fname in targets.items():
        path = os.path.join(SCRIPT_DIR, fname)
        if not os.path.exists(path):
            err_console.print(f"[yellow]⚠  {fname} not found[/yellow]")
            continue

        reader = PdfReader(path)
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text() or ""
            haystack, offsets = _fold_with_offsets(text, fold_accents)
            pos = haystack.find(needle)
            if pos < 0:
                continue

            # Translate the folded match span back to indices in the raw text.
            start = offsets[pos]
            end = offsets[pos + len(needle) - 1] + 1

            # Number of newlines before the match == index of its line.
            match_line = text.count("\n", 0, start)
            question_block = _question_block(text.split("\n"), match_line)

            # Short context for the table
            context = text[max(0, start - 40):end + 40].replace("\n", " ").strip()

            results.append({
                "pdf": code,
                "fname": fname,
                "page": page_num,
                "context": context,
                "question": question_block,
            })

    return results


def _open_in_viewer(path: str) -> None:
    """Open *path* with the platform's default application.

    A failure to launch the viewer is reported as a warning instead of being
    raised, because the results file has already been written by then.
    """
    from rich.markup import escape

    try:
        if sys.platform == "darwin":
            subprocess.run(["open", path])
        elif sys.platform.startswith("win"):
            os.startfile(path)  # attribute only exists on Windows
        else:
            subprocess.run(["xdg-open", path])
    except OSError as exc:
        console.print(f"[yellow]Could not open viewer: {escape(str(exc))}[/yellow]")


@app.command()
def main(
    query: str = typer.Argument(..., help="Search string"),
    pdf: str = typer.Option(None, "--pdf", "-p", help="Limit to one PDF: S1C, S1S, S2, S3, VV"),
    no_fold: bool = typer.Option(False, "--no-fold", help="Don't fold accents (é≠e)"),
    no_open: bool = typer.Option(False, "--no-open", help="Don't open results in viewer"),
):
    """Search all exam PDFs for a string and output Obsidian links."""
    # The docstring above is kept as a one-liner on purpose; further notes
    # live in comments.
    from rich.markup import escape

    if pdf and pdf not in PDFS:
        console.print(f"[red]Unknown PDF code: {escape(pdf)}. Use: {', '.join(PDFS.keys())}[/red]")
        raise typer.Exit(1)

    # The query and the PDF excerpts are arbitrary text: escape them so that
    # square brackets are shown literally instead of being parsed as markup.
    shown_query = escape(query)

    with console.status(f"[bold blue]Searching for \"{shown_query}\"...[/bold blue]"):
        results = search(query, pdf_filter=pdf, fold_accents=not no_fold)

    if not results:
        console.print(f"[yellow]No matches for \"{shown_query}\"[/yellow]")
        raise typer.Exit(0)

    # Rich table output
    console.print()
    table = Table(title=f"[bold]{len(results)} match(es) for \"{shown_query}\"[/bold]", show_lines=True)
    table.add_column("Source", style="cyan", width=8)
    table.add_column("Page", style="green", justify="right", width=5)
    table.add_column("Context", style="white")
    table.add_column("Obsidian Link", style="blue")

    # The Markdown report is plain text, so the raw (unescaped) query is used.
    md_lines = [f"# Search: \"{query}\"\n", f"{len(results)} match(es)\n"]

    for r in results:
        link = f"[{r['pdf']} p.{r['page']}]({r['fname'].replace(' ', '%20')}#page={r['page']})"
        table.add_row(r["pdf"], str(r["page"]), escape(f"…{r['context']}…"), escape(link))
        md_lines.append(f"### {link}\n")
        # Prefix every line of the question so it renders as one blockquote.
        md_lines.append(f"> {r['question'].replace(chr(10), chr(10) + '> ')}\n")

    console.print(table)

    # Write MD and open
    if not no_open:
        md_content = "\n".join(md_lines)
        # Explicit UTF-8: the content holds accented text regardless of locale.
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", suffix=".md", prefix="search_results_",
            dir=SCRIPT_DIR, delete=False
        ) as tmp:
            tmp.write(md_content)
        console.print(f"\n[dim]Results saved to {os.path.basename(tmp.name)}[/dim]")
        _open_in_viewer(tmp.name)


# Run the Typer CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    app()