# Source code for scitex_scholar.pdf_highlight.highlighter

"""Thin orchestrator — wires extraction, classification, and annotation.

Public entry points:
- :func:`highlight_pdf` — end-to-end: extract → classify → write annotated PDF.
- :func:`save_with_highlights` — apply pre-classified labels to a document.
- :func:`apply_classifications` — merge offline JSON labels into blocks.

The source PDF bytes are not modified; highlights are PDF annotation
objects compatible with any PDF viewer.
"""

from __future__ import annotations

import datetime as _dt
import importlib.metadata as _md
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional

import pymupdf

from ._annotator import add_legend_page, apply_highlights
from ._blocks import Block, extract_blocks
from ._classifier import API_KEY_ENV, classify_llm, classify_stub
from ._colors import CATEGORIES, CATEGORY_LABELS, COLOR_RGB

__all__ = [
    "CATEGORIES",
    "COLOR_RGB",
    "CATEGORY_LABELS",
    "Block",
    "HighlightResult",
    "apply_classifications",
    "extract_blocks",
    "highlight_pdf",
    "save_with_highlights",
]


def _package_version() -> str:
    """Return the installed ``scitex-scholar`` version string.

    Falls back to the literal ``"unknown"`` when the distribution metadata
    cannot be read for any reason, so callers always get a printable value.
    """
    try:
        installed = _md.version("scitex-scholar")
    except Exception:
        # Best-effort lookup: a missing or unreadable distribution must never
        # break signature generation.
        return "unknown"
    else:
        return installed


def _default_signature(model: Optional[str]) -> str:
    """Build the default signature line for the legend page.

    The text names the package, its installed version, and the current local
    time to minute precision. ``model`` is accepted for call-site symmetry but
    is not included in the generated text.
    """
    stamp = format(_dt.datetime.now(), "%Y-%m-%d %H:%M")
    version = _package_version()
    return f"Highlighted by scitex-scholar v{version} (pdf_highlight) — {stamp}"



[docs]
def apply_classifications(
    blocks: list[Block],
    classifications: list[dict[str, Any]],
) -> int:
    """Assign offline-produced labels to already-extracted blocks.

    Each entry must contain at least ``id`` and ``category``; ``confidence``
    is optional and defaults to 0.0 when absent or ``None`` (JSON ``null``).
    Entries whose ``id`` matches no block, and entries whose category is
    outside :data:`CATEGORIES`, are silently dropped. When several entries
    target the same block, the last valid one wins.

    Args:
        blocks: Blocks to label; matching blocks are mutated in place
            (``category`` and ``confidence`` are overwritten).
        classifications: Label records, typically loaded from JSON.

    Returns:
        The number of distinct blocks that received a label.

    Raises:
        KeyError: If an entry has no ``id`` key.
        ValueError: If an ``id`` or ``confidence`` value is not numeric.
    """
    by_id = {b.id: b for b in blocks}
    labelled: set[int] = set()
    for item in classifications:
        block_id = int(item["id"])
        target = by_id.get(block_id)
        if target is None:
            continue
        category = item.get("category", "none")
        if category not in CATEGORIES:
            continue
        # Convert the confidence BEFORE touching the block so that a bad
        # value cannot leave the block with a new category but a stale
        # confidence. ``None`` is treated the same as a missing key.
        raw_confidence = item.get("confidence")
        confidence = 0.0 if raw_confidence is None else float(raw_confidence)
        target.category = category
        target.confidence = confidence
        # Track ids rather than incrementing a counter so that duplicate
        # entries for one block are not counted twice.
        labelled.add(block_id)
    return len(labelled)




[docs]
def save_with_highlights(
    doc: pymupdf.Document,
    blocks: list[Block],
    output_path: str | os.PathLike,
    *,
    add_legend: bool = True,
    signature: Optional[str] = None,
    model_label: Optional[str] = None,
    source_name: Optional[str] = None,
    min_confidence: float = 0.0,
    on_info: Optional[Any] = None,
) -> int:
    """Annotate ``doc`` from the labelled blocks and save it to ``output_path``.

    Steps, in order: add highlight annotations for ``blocks`` (those below
    ``min_confidence`` are suppressed), optionally add the colour legend +
    signature page (``add_legend=True``, the default), then save.

    When no ``signature`` is given, a default one is generated; when no
    ``source_name`` is given, ``"(unknown)"`` is shown on the legend page.
    ``on_info`` (optional callable) receives progress messages.

    Why ``garbage=0, deflate=False``: an earlier ``garbage=3, deflate=True``
    save recompressed every stream of the source PDF. On a large (20 MB+)
    image-heavy paper that ran for minutes entirely inside pymupdf's C code,
    and since CPython only delivers ``KeyboardInterrupt`` between bytecode
    ops the run was both slow and uninterruptible (Ctrl-C queued but never
    fired). Writing the annotation objects without recompression is
    near-instant and keeps the C calls short enough to stay responsive to
    signals.

    Returns:
        The number of highlight annotations added (the legend page is not
        counted).
    """
    notify = on_info if on_info else (lambda _msg: None)

    # Note: the raw ``on_info`` (possibly None) is forwarded here, not the
    # no-op fallback used locally.
    annotation_count = apply_highlights(
        doc, blocks, min_confidence=min_confidence, on_info=on_info
    )

    if add_legend:
        legend_signature = signature or _default_signature(model_label)
        legend_source = source_name or "(unknown)"
        add_legend_page(
            doc,
            signature=legend_signature,
            model_label=model_label,
            source_name=legend_source,
        )

    target = Path(output_path)
    notify(f"      writing {doc.page_count} pages → {target}")
    doc.save(target, garbage=0, deflate=False)

    written_mb = target.stat().st_size / 1e6
    notify(f"      wrote {written_mb:.1f} MB")
    return annotation_count




[docs]
@dataclass
class HighlightResult:
    """Outcome of a :func:`highlight_pdf` run.

    Attributes:
        input_path: Path of the source PDF.
        output_path: Path of the annotated PDF, or ``None`` when nothing was
            written (dry run).
        blocks: The extracted units, carrying whatever ``category`` labels
            classification assigned to them.
        pages: Page count reported for the processed document.
        annotations_added: Number of highlight annotations written (``0``
            for a dry run).
    """

    input_path: Path
    output_path: Optional[Path]
    blocks: list[Block]
    pages: int
    annotations_added: int

    def counts(self) -> dict[str, int]:
        """Return how many blocks fall into each category.

        Blocks whose ``category`` is falsy (``None`` or an empty string) are
        tallied under the key ``"none"``. Keys appear in first-seen order.
        """
        tally: dict[str, int] = {}
        for b in self.blocks:
            key = b.category or "none"
            tally[key] = tally.get(key, 0) + 1
        return tally





[docs]
def highlight_pdf(
    pdf_path: str | os.PathLike,
    output_path: Optional[str | os.PathLike] = None,
    *,
    model: str = "claude-haiku-4-5-20251001",
    use_stub: bool = False,
    dry_run: bool = False,
    max_blocks: int = 0,
    batch_size: int = 25,
    min_chars: int = 40,
    sentence_level: bool = True,
    add_legend: bool = True,
    min_confidence: float = 0.0,
    concurrency: int = 4,
    on_info: Optional[Any] = None,
    on_warning: Optional[Any] = None,
) -> HighlightResult:
    """Annotate a PDF with rhetorical-role highlights.

    Runs three stages: extract text units, classify them (LLM or stub), and
    — unless ``dry_run`` — write an annotated copy of the PDF.

    Args:
        pdf_path: Input PDF path.
        output_path: Output PDF. Defaults to ``<stem>.highlighted.pdf`` next
            to the input. Ignored when ``dry_run`` is True.
        model: Anthropic model ID used by the LLM classifier.
        use_stub: If True, classify with a keyword heuristic (no API call).
        dry_run: If True, classify but do not write the output PDF.
        max_blocks: If >0, truncate to the first N extracted units.
        batch_size: Classifier batch size (units per API call).
        min_chars: Minimum text length for an extracted unit.
        sentence_level: If True (default), classify and highlight at
            sentence granularity. If False, use paragraph-level (less
            precise but ~5× cheaper on long papers).
        add_legend: If True, prepend a colour legend + signature page.
        min_confidence: Highlights below this confidence are suppressed
            when writing the PDF.
        concurrency: Forwarded to the LLM classifier (presumably the number
            of concurrent API calls — confirm against ``classify_llm``).
        on_info: Optional callable receiving progress message strings.
        on_warning: Optional callable forwarded to the LLM classifier for
            warning messages.

    Returns:
        ``HighlightResult`` with the classified units and annotation count.
        ``pages`` is the source document's page count, taken before any
        legend page is added, so it is the same for dry and real runs.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        RuntimeError: If LLM classification is requested but the
            environment variable named by ``API_KEY_ENV`` is not set.
    """
    info = on_info or (lambda _msg: None)
    warn = on_warning or (lambda _msg: None)

    pdf = Path(pdf_path)
    if not pdf.exists():
        raise FileNotFoundError(pdf)

    # Fail fast: verify the credentials before the (potentially slow)
    # extraction step rather than after it.
    if not use_stub and not os.environ.get(API_KEY_ENV):
        raise RuntimeError(
            f"{API_KEY_ENV} is not set (or call with use_stub=True). "
            "Scholar uses a namespaced key and does not read the ambient "
            "ANTHROPIC_API_KEY."
        )

    # ``out`` stays None for a dry run; that is what the write stage keys on.
    out: Optional[Path] = None
    if not dry_run:
        out = (
            Path(output_path)
            if output_path
            else pdf.with_name(pdf.stem + ".highlighted.pdf")
        )

    info(
        f"[1/3] Extracting {'sentences' if sentence_level else 'paragraphs'} from {pdf}"
    )
    doc, blocks = extract_blocks(
        pdf, min_chars=min_chars, sentence_level=sentence_level
    )
    if max_blocks > 0:
        blocks = blocks[:max_blocks]
    # Capture the page count now: the write stage may add a legend page to
    # ``doc``, which would otherwise inflate the reported count by one.
    source_pages = doc.page_count
    info(f"      {len(blocks)} candidate units across {source_pages} pages")

    info(f"[2/3] Classifying ({'stub' if use_stub else model})")
    if use_stub:
        classify_stub(blocks)
    else:
        classify_llm(
            blocks,
            model=model,
            batch_size=batch_size,
            concurrency=concurrency,
            on_warning=warn,
            on_info=info,
        )

    added = 0
    if out is None:
        info("[3/3] dry-run — not writing PDF")
    else:
        info(f"[3/3] Writing highlights to {out}")
        added = save_with_highlights(
            doc,
            blocks,
            out,
            add_legend=add_legend,
            # The stub classifier has no model to credit on the legend page.
            model_label=(None if use_stub else model),
            source_name=pdf.name,
            min_confidence=min_confidence,
            on_info=info,
        )
        info(f"      added {added} highlight annotations")

    return HighlightResult(
        input_path=pdf,
        output_path=out,
        blocks=blocks,
        pages=source_pages,
        annotations_added=added,
    )