Source code for scitex_scholar.pdf_highlight._blocks

"""Block extraction — paragraph-level layout + sentence-level splitting."""

from __future__ import annotations

import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional

import pymupdf



[docs]
@dataclass
class Block:
    """A unit of classification — either a paragraph or a sentence.

    ``bbox`` is always the paragraph-level clip rectangle. For sentence
    units this is used only as the search region when locating the
    sentence's glyphs on the page at annotation time.

    Instances are produced by ``_iter_text_blocks``, which fills ``id``,
    ``page``, ``bbox`` and ``text`` and leaves ``category`` and
    ``confidence`` at their defaults.
    """

    # Sequential number assigned by ``_iter_text_blocks``; starts at 0 and
    # keeps counting across all pages of one document.
    id: int
    # Zero-based index of the page the unit was read from.
    page: int
    # (x0, y0, x1, y1), taken from the first four items of the PyMuPDF
    # block tuple of the enclosing paragraph.
    bbox: tuple[float, float, float, float]
    # Unit text with every run of whitespace collapsed to a single space.
    text: str
    # Not assigned in the visible part of this module; presumably the label
    # chosen by a downstream classifier — confirm against the caller.
    category: Optional[str] = None
    # Not assigned in the visible part of this module; presumably the
    # classifier's score for ``category`` — confirm against the caller.
    confidence: float = 0.0



# Lower-case tokens that normally carry a period without ending a sentence.
# ``_split_sentences`` looks up the last word of a candidate sentence here
# after lower-casing it and stripping its trailing "." / ",".
_ABBREV = {
    # pointers to figures, equations, references and document parts
    "fig", "figs", "eq", "eqs", "ref", "refs",
    "no", "nos", "vol", "pp", "chap", "sec",
    # Latin shorthands, with and without the inner period
    "eg", "e.g", "ie", "i.e", "cf", "vs", "al", "etc",
    # company suffixes, personal titles and "St."
    "inc", "ltd", "dr", "prof", "mr", "mrs", "st",
}

# Candidate sentence boundary: whitespace that follows ".", "!" or "?" and
# precedes an upper-case ASCII letter, a digit, "(" or a quote character.
_SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9(\"'])")



[docs]
def _split_sentences(text: str) -> list[str]:
    """Naive academic-aware sentence splitter.

    Splits ``text`` wherever ``_SENT_RE`` matches (sentence-ending
    punctuation followed by whitespace and a capital, digit, opening
    parenthesis or quote), then glues a piece back onto the previous one
    when the previous piece ends in a common abbreviation
    (Fig., e.g., et al., single-initial J.).

    Opening brackets and quotes in front of that final word are ignored, so
    parenthesised abbreviations such as ``(Fig. 2)`` or ``[Ref. 3]`` are
    re-joined exactly like their bare forms.

    Args:
        text: Text to split.

    Returns:
        The non-empty, stripped sentences in their original order. Pieces
        that were re-joined are separated by a single space. Empty or
        whitespace-only input gives an empty list.
    """
    sentences: list[str] = []
    for piece in _SENT_RE.split(text):
        piece = piece.strip()
        if not piece:
            continue
        if sentences:
            previous = sentences[-1]
            last_word = previous.rsplit(None, 1)[-1].lower()
            # Drop the trailing period/comma and any leading opening
            # bracket or quote so "(Fig." and "Fig." compare the same.
            tail = last_word.rstrip(".,").lstrip("([{\"'")
            # Any purely alphabetic word of one or two letters counts as an
            # abbreviation/initial here, which also re-joins after ordinary
            # short words such as "it."; kept unchanged for compatibility.
            if tail in _ABBREV or (len(tail) <= 2 and tail.isalpha()):
                sentences[-1] = f"{previous} {piece}"
                continue
        sentences.append(piece)
    return sentences



def _iter_text_blocks(
    doc: pymupdf.Document, min_chars: int, sentence_level: bool
) -> Iterable[Block]:
    """Yield the classification units found in the text blocks of ``doc``.

    Pages are visited in order and every ``page.get_text("blocks")`` entry
    whose type field is 0 is considered. Its text is whitespace-normalised
    and the entry is skipped when the result is shorter than ``min_chars``.
    With ``sentence_level`` false the whole paragraph becomes one unit;
    otherwise it is split with ``_split_sentences`` and each sentence of at
    least ``min_chars`` characters becomes a unit that shares the
    paragraph's bounding box. Unit ids count up from 0 across the document.
    """
    unit_id = 0
    for page_idx in range(doc.page_count):
        for raw in doc[page_idx].get_text("blocks"):
            # Item 6 of the tuple is the block type; only type 0 is kept
            # (text blocks in PyMuPDF's "blocks" layout).
            if raw[6] != 0:
                continue
            # Collapse newlines and repeated spaces into single spaces.
            paragraph = " ".join(str(raw[4]).split())
            if len(paragraph) < min_chars:
                continue
            x0, y0, x1, y1 = raw[:4]
            clip = (float(x0), float(y0), float(x1), float(y1))
            units = _split_sentences(paragraph) if sentence_level else [paragraph]
            for unit in units:
                # In paragraph mode this check repeats the one above and
                # never fires; in sentence mode it drops short fragments.
                if len(unit) < min_chars:
                    continue
                yield Block(id=unit_id, page=page_idx, bbox=clip, text=unit)
                unit_id += 1



[docs]
def extract_blocks(
    pdf_path: str | os.PathLike,
    min_chars: int = 40,
    *,
    sentence_level: bool = True,
) -> tuple[pymupdf.Document, list[Block]]:
    """Open a PDF and return (document, units-of-classification).

    ``sentence_level=True`` (default) yields one unit per sentence,
    which gives much tighter highlights — avoids painting a whole
    paragraph green when only its last two sentences state the claim.
    ``sentence_level=False`` yields one unit per paragraph.

    Units shorter than ``min_chars`` are dropped (filters page numbers,
    running headers, short captions, and sentence fragments).

    Args:
        pdf_path: Location of the PDF file to open.
        min_chars: Minimum length, in characters, of a unit to keep.
        sentence_level: Split paragraphs into sentences when true.

    Returns:
        The opened document together with the extracted units. The
        document is returned still open; closing it is left to the caller.

    Raises:
        Whatever ``pymupdf.open`` or block extraction raises. If extraction
        fails, the document is closed before the exception propagates.
    """
    doc = pymupdf.open(Path(pdf_path))
    try:
        blocks = list(
            _iter_text_blocks(doc, min_chars=min_chars, sentence_level=sentence_level)
        )
    except BaseException:
        # The caller never receives the handle on failure, so release it
        # here instead of leaking the open file.
        doc.close()
        raise
    return doc, blocks