Source code for scitex_scholar.core.Paper

#!/usr/bin/env python3
# Timestamp: "2025-10-07 10:47:02 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/core/Paper.py
# ----------------------------------------
from __future__ import annotations

import os

__FILE__ = "./src/scitex/scholar/core/Paper.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

"""
Type-safe metadata structures for Scholar papers with runtime validation.

This module uses Pydantic for:
- Runtime type validation
- Automatic type coercion
- JSON key aliasing (e.g., "2025" -> y2025)
- Clean serialization/deserialization
"""

from typing import TYPE_CHECKING, Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

if TYPE_CHECKING:
    from scitex_scholar.core.open_access import OAResult


class IDMetadata(BaseModel):
    """Identification metadata with source tracking."""

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)

    doi: Optional[str] = None
    doi_engines: List[str] = Field(default_factory=list)

    arxiv_id: Optional[str] = None
    arxiv_id_engines: List[str] = Field(default_factory=list)

    pmid: Optional[str] = None
    pmid_engines: List[str] = Field(default_factory=list)

    corpus_id: Optional[str] = None
    corpus_id_engines: List[str] = Field(default_factory=list)

    semantic_id: Optional[str] = None
    semantic_id_engines: List[str] = Field(default_factory=list)

    ieee_id: Optional[str] = None
    ieee_id_engines: List[str] = Field(default_factory=list)

    scholar_id: Optional[str] = None
    scholar_id_engines: List[str] = Field(default_factory=list)


class BasicMetadata(BaseModel):
    """Basic bibliographic metadata with source tracking."""

    title: Optional[str] = None
    title_engines: List[str] = Field(default_factory=list)

    authors: Optional[List[str]] = None
    authors_engines: List[str] = Field(default_factory=list)

    year: Optional[int] = None
    year_engines: List[str] = Field(default_factory=list)

    abstract: Optional[str] = None
    abstract_engines: List[str] = Field(default_factory=list)

    keywords: Optional[List[str]] = None
    keywords_engines: List[str] = Field(default_factory=list)

    type: Optional[str] = None  # article, conference, preprint, etc.
    type_engines: List[str] = Field(default_factory=list)

    @field_validator("year")
    @classmethod
    def validate_year(cls, v):
        """Validate year is reasonable."""
        if v is not None and (v < 1900 or v > 2100):
            raise ValueError(f"Year {v} is outside reasonable range (1900-2100)")
        return v

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


class CitationCountMetadata(BaseModel):
    """Citation count metadata with yearly breakdown and source tracking."""

    total: Optional[int] = None
    total_engines: List[str] = Field(default_factory=list)

    # Yearly counts - use Field(alias=...) to map JSON "2025" to Python y2025
    y2025: Optional[int] = Field(None, alias="2025")
    y2025_engines: List[str] = Field(default_factory=list, alias="2025_engines")

    y2024: Optional[int] = Field(None, alias="2024")
    y2024_engines: List[str] = Field(default_factory=list, alias="2024_engines")

    y2023: Optional[int] = Field(None, alias="2023")
    y2023_engines: List[str] = Field(default_factory=list, alias="2023_engines")

    y2022: Optional[int] = Field(None, alias="2022")
    y2022_engines: List[str] = Field(default_factory=list, alias="2022_engines")

    y2021: Optional[int] = Field(None, alias="2021")
    y2021_engines: List[str] = Field(default_factory=list, alias="2021_engines")

    y2020: Optional[int] = Field(None, alias="2020")
    y2020_engines: List[str] = Field(default_factory=list, alias="2020_engines")

    y2019: Optional[int] = Field(None, alias="2019")
    y2019_engines: List[str] = Field(default_factory=list, alias="2019_engines")

    y2018: Optional[int] = Field(None, alias="2018")
    y2018_engines: List[str] = Field(default_factory=list, alias="2018_engines")

    y2017: Optional[int] = Field(None, alias="2017")
    y2017_engines: List[str] = Field(default_factory=list, alias="2017_engines")

    y2016: Optional[int] = Field(None, alias="2016")
    y2016_engines: List[str] = Field(default_factory=list, alias="2016_engines")

    y2015: Optional[int] = Field(None, alias="2015")
    y2015_engines: List[str] = Field(default_factory=list, alias="2015_engines")

    @field_validator(
        "total",
        "y2025",
        "y2024",
        "y2023",
        "y2022",
        "y2021",
        "y2020",
        "y2019",
        "y2018",
        "y2017",
        "y2016",
        "y2015",
    )
    @classmethod
    def validate_citation_counts(cls, v):
        """Validate citation counts are non-negative."""
        if v is not None and v < 0:
            raise ValueError(f"Citation count cannot be negative: {v}")
        return v

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        """Custom serialization to use aliases in output."""
        # Remove by_alias from kwargs if present to avoid duplicate
        kwargs.pop("by_alias", None)
        data = super().model_dump(by_alias=True, **kwargs)
        return data


class PublicationMetadata(BaseModel):
    """Publication venue metadata with source tracking."""

    journal: Optional[str] = None
    journal_engines: List[str] = Field(default_factory=list)

    short_journal: Optional[str] = None
    short_journal_engines: List[str] = Field(default_factory=list)

    impact_factor: Optional[float] = None
    impact_factor_engines: List[str] = Field(default_factory=list)

    issn: Optional[str] = None
    issn_engines: List[str] = Field(default_factory=list)

    volume: Optional[str] = None
    volume_engines: List[str] = Field(default_factory=list)

    issue: Optional[str] = None
    issue_engines: List[str] = Field(default_factory=list)

    first_page: Optional[str] = None
    first_page_engines: List[str] = Field(default_factory=list)

    last_page: Optional[str] = None
    last_page_engines: List[str] = Field(default_factory=list)

    pages: Optional[str] = None
    pages_engines: List[str] = Field(default_factory=list)

    publisher: Optional[str] = None
    publisher_engines: List[str] = Field(default_factory=list)

    @field_validator("impact_factor")
    @classmethod
    def validate_impact_factor(cls, v):
        """Validate impact factor is non-negative."""
        if v is not None and v < 0:
            raise ValueError(f"Impact factor cannot be negative: {v}")
        return v

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


class URLMetadata(BaseModel):
    """URL metadata with source tracking."""

    doi: Optional[str] = None
    doi_engines: List[str] = Field(default_factory=list)

    publisher: Optional[str] = None
    publisher_engines: List[str] = Field(default_factory=list)

    arxiv: Optional[str] = None
    arxiv_engines: List[str] = Field(default_factory=list)

    corpus_id: Optional[str] = None
    corpus_id_engines: List[str] = Field(default_factory=list)

    openurl_query: Optional[str] = None
    openurl_engines: List[str] = Field(default_factory=list)

    openurl_resolved: List[str] = Field(default_factory=list)
    openurl_resolved_engines: List[str] = Field(default_factory=list)

    pdfs: List[Dict[str, str]] = Field(default_factory=list)
    pdfs_engines: List[str] = Field(default_factory=list)

    supplementary_files: List[str] = Field(default_factory=list)
    supplementary_files_engines: List[str] = Field(default_factory=list)

    additional_files: List[str] = Field(default_factory=list)
    additional_files_engines: List[str] = Field(default_factory=list)

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


class PathMetadata(BaseModel):
    """Local file path metadata with source tracking."""

    pdfs: List[str] = Field(default_factory=list)
    pdfs_engines: List[str] = Field(default_factory=list)

    supplementary_files: List[str] = Field(default_factory=list)
    supplementary_files_engines: List[str] = Field(default_factory=list)

    additional_files: List[str] = Field(default_factory=list)
    additional_files_engines: List[str] = Field(default_factory=list)

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


class AccessMetadata(BaseModel):
    """Open access and licensing metadata with source tracking.

    Tracks whether a paper is open access and provides URLs for OA versions.
    Also includes license information when available.
    """

    is_open_access: Optional[bool] = None
    is_open_access_engines: List[str] = Field(default_factory=list)

    oa_status: Optional[str] = None  # gold, green, bronze, hybrid, closed
    oa_status_engines: List[str] = Field(default_factory=list)

    oa_url: Optional[str] = None  # URL to open access version
    oa_url_engines: List[str] = Field(default_factory=list)

    license: Optional[str] = None  # CC-BY, CC-BY-NC, etc.
    license_engines: List[str] = Field(default_factory=list)

    license_url: Optional[str] = None
    license_url_engines: List[str] = Field(default_factory=list)

    # For paywalled journals - opt-in for local/personal users
    paywall_bypass_attempted: Optional[bool] = None
    paywall_bypass_success: Optional[bool] = None

    # Last-attempt PDF download outcome
    pdf_download_attempted_at: Optional[str] = None
    pdf_download_status: Optional[str] = None
    pdf_download_error: Optional[str] = None

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


class SystemMetadata(BaseModel):
    """System tracking metadata (which engines were used to search)."""

    searched_by_arXiv: Optional[bool] = None
    searched_by_CrossRef: Optional[bool] = None
    searched_by_CrossRefLocal: Optional[bool] = None
    searched_by_OpenAlex: Optional[bool] = None
    searched_by_PubMed: Optional[bool] = None
    searched_by_Semantic_Scholar: Optional[bool] = None
    searched_by_URL: Optional[bool] = None

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


class PaperMetadataStructure(BaseModel):
    """Complete paper metadata structure with nested typed sections."""

    id: IDMetadata = Field(default_factory=IDMetadata)
    basic: BasicMetadata = Field(default_factory=BasicMetadata)
    citation_count: CitationCountMetadata = Field(default_factory=CitationCountMetadata)
    publication: PublicationMetadata = Field(default_factory=PublicationMetadata)
    url: URLMetadata = Field(default_factory=URLMetadata)
    path: PathMetadata = Field(default_factory=PathMetadata)
    access: AccessMetadata = Field(default_factory=AccessMetadata)
    system: SystemMetadata = Field(default_factory=SystemMetadata)

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)

    @model_validator(mode="after")
    def sync_ids_and_urls(self):
        """Automatically sync ID and URL fields with source tracking.

        Generates URLs from IDs and vice versa for:
        - DOI ↔ url.doi
        - arXiv ID ↔ url.arxiv
        - Corpus ID ↔ url.corpus_id
        """
        # DOI sync
        if self.id.doi and not self.url.doi:
            self.url.doi = f"https://doi.org/{self.id.doi}"
            if (
                self.id.doi_engines
                and "PaperMetadataStructure" not in self.url.doi_engines
            ):
                self.url.doi_engines = (
                    self.id.doi_engines.copy() if self.id.doi_engines else []
                )
                if "PaperMetadataStructure" not in self.url.doi_engines:
                    self.url.doi_engines.append("PaperMetadataStructure")
        elif self.url.doi and not self.id.doi:
            url = self.url.doi
            if "doi.org/" in url:
                self.id.doi = url.split("doi.org/")[-1]
                if (
                    self.url.doi_engines
                    and "PaperMetadataStructure" not in self.id.doi_engines
                ):
                    self.id.doi_engines = (
                        self.url.doi_engines.copy() if self.url.doi_engines else []
                    )
                    if "PaperMetadataStructure" not in self.id.doi_engines:
                        self.id.doi_engines.append("PaperMetadataStructure")
        elif self.id.doi and self.url.doi:
            if not self.url.doi.startswith("https://"):
                if self.url.doi.startswith("http://"):
                    self.url.doi = "https://" + self.url.doi[7:]
                else:
                    self.url.doi = f"https://doi.org/{self.id.doi}"

        # arXiv sync
        if self.id.arxiv_id and not self.url.arxiv:
            self.url.arxiv = f"https://arxiv.org/abs/{self.id.arxiv_id}"
            if (
                self.id.arxiv_id_engines
                and "PaperMetadataStructure" not in self.url.arxiv_engines
            ):
                self.url.arxiv_engines = (
                    self.id.arxiv_id_engines.copy() if self.id.arxiv_id_engines else []
                )
                if "PaperMetadataStructure" not in self.url.arxiv_engines:
                    self.url.arxiv_engines.append("PaperMetadataStructure")
        elif self.url.arxiv and not self.id.arxiv_id:
            url = self.url.arxiv
            if "arxiv.org/abs/" in url:
                self.id.arxiv_id = (
                    url.split("arxiv.org/abs/")[-1].split("?")[0].split("#")[0]
                )
                if (
                    self.url.arxiv_engines
                    and "PaperMetadataStructure" not in self.id.arxiv_id_engines
                ):
                    self.id.arxiv_id_engines = (
                        self.url.arxiv_engines.copy() if self.url.arxiv_engines else []
                    )
                    if "PaperMetadataStructure" not in self.id.arxiv_id_engines:
                        self.id.arxiv_id_engines.append("PaperMetadataStructure")

        # Corpus ID sync
        if self.id.corpus_id and not self.url.corpus_id:
            corpus_id_clean = str(self.id.corpus_id).replace("CorpusId:", "")
            self.url.corpus_id = (
                f"https://www.semanticscholar.org/paper/{corpus_id_clean}"
            )
            if (
                self.id.corpus_id_engines
                and "PaperMetadataStructure" not in self.url.corpus_id_engines
            ):
                self.url.corpus_id_engines = (
                    self.id.corpus_id_engines.copy()
                    if self.id.corpus_id_engines
                    else []
                )
                if "PaperMetadataStructure" not in self.url.corpus_id_engines:
                    self.url.corpus_id_engines.append("PaperMetadataStructure")
        elif self.url.corpus_id and not self.id.corpus_id:
            url = self.url.corpus_id
            if "semanticscholar.org/paper/" in url:
                self.id.corpus_id = (
                    url.split("semanticscholar.org/paper/")[-1]
                    .split("?")[0]
                    .split("#")[0]
                )
                if (
                    self.url.corpus_id_engines
                    and "PaperMetadataStructure" not in self.id.corpus_id_engines
                ):
                    self.id.corpus_id_engines = (
                        self.url.corpus_id_engines.copy()
                        if self.url.corpus_id_engines
                        else []
                    )
                    if "PaperMetadataStructure" not in self.id.corpus_id_engines:
                        self.id.corpus_id_engines.append("PaperMetadataStructure")

        return self

    def set_doi(self, doi: str):
        """Set DOI and automatically sync URL.

        Use this method instead of direct assignment for automatic sync.
        """
        self.id.doi = doi
        if doi:
            self.url.doi = f"https://doi.org/{doi}"

    def set_doi_url(self, url: str):
        """Set DOI URL and automatically extract/sync DOI.

        Use this method instead of direct assignment for automatic sync.
        """
        self.url.doi = url
        if url and "doi.org/" in url:
            self.id.doi = url.split("doi.org/")[-1]

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        """Custom serialization to ensure nested models use aliases."""
        # Remove by_alias from kwargs if present to avoid duplicate
        kwargs.pop("by_alias", None)
        return {
            "id": self.id.model_dump(by_alias=True, **kwargs),
            "basic": self.basic.model_dump(by_alias=True, **kwargs),
            "citation_count": self.citation_count.model_dump(by_alias=True, **kwargs),
            "publication": self.publication.model_dump(by_alias=True, **kwargs),
            "url": self.url.model_dump(by_alias=True, **kwargs),
            "path": self.path.model_dump(by_alias=True, **kwargs),
            "access": self.access.model_dump(by_alias=True, **kwargs),
            "system": self.system.model_dump(by_alias=True, **kwargs),
        }


class ContainerMetadata(BaseModel):
    """Container metadata for system tracking."""

    scitex_id: Optional[str] = None
    library_id: Optional[str] = None
    created_at: Optional[str] = None
    created_by: Optional[str] = None
    updated_at: Optional[str] = None
    projects: List[str] = Field(default_factory=list)
    master_storage_path: Optional[str] = None
    readable_name: Optional[str] = None
    metadata_file: Optional[str] = None
    pdf_downloaded_at: Optional[str] = None
    pdf_size_bytes: Optional[int] = None
    pdf_sha256: Optional[str] = None  # optional scitex-clew provenance hash

    @field_validator("pdf_size_bytes")
    @classmethod
    def validate_pdf_size(cls, v):
        """Validate PDF size is non-negative."""
        if v is not None and v < 0:
            raise ValueError(f"PDF size cannot be negative: {v}")
        return v

    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)


[docs] class Paper(BaseModel): """Complete paper with metadata and container.""" metadata: PaperMetadataStructure = Field(default_factory=PaperMetadataStructure) container: ContainerMetadata = Field(default_factory=ContainerMetadata) model_config = ConfigDict(populate_by_name=True, validate_assignment=True)
[docs] def model_dump(self, **kwargs) -> Dict[str, Any]: """Custom serialization to ensure all nested models use aliases.""" # Remove by_alias from kwargs if present to avoid duplicate kwargs.pop("by_alias", None) return { "metadata": self.metadata.model_dump(by_alias=True, **kwargs), "container": self.container.model_dump(by_alias=True, **kwargs), }
[docs] @classmethod def from_dict(cls, data: Dict[str, Any]) -> Paper: """Create from dictionary (for loading from JSON). Uses Pydantic's model_validate which handles: - Type validation - Type coercion (e.g., "2024" -> 2024) - Field aliases (e.g., "2025" -> y2025) """ return cls.model_validate(data)
[docs] def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for JSON serialization. Alias for model_dump() for backward compatibility. """ return self.model_dump()
[docs] def detect_open_access( self, use_unpaywall: bool = False, update_metadata: bool = True, ) -> OAResult: """ Detect open access status for this paper. Uses identifiers (DOI, arXiv ID, PMCID) and known OA sources to determine if the paper is freely available. Args: use_unpaywall: If True, query Unpaywall API for uncertain cases update_metadata: If True, update self.metadata.access with results Returns ------- OAResult with detection results """ from .open_access import check_oa_status result = check_oa_status( doi=self.metadata.id.doi, arxiv_id=self.metadata.id.arxiv_id, pmcid=None, # Not currently in IDMetadata source=None, # Source tracking not in Paper journal=self.metadata.publication.journal, is_open_access_flag=self.metadata.access.is_open_access, use_unpaywall=use_unpaywall, ) if update_metadata: self.metadata.access.is_open_access = result.is_open_access self.metadata.access.is_open_access_engines.append( f"detect_oa:{result.source}" ) if result.status: self.metadata.access.oa_status = result.status.value self.metadata.access.oa_status_engines.append( f"detect_oa:{result.source}" ) if result.oa_url: self.metadata.access.oa_url = result.oa_url self.metadata.access.oa_url_engines.append(f"detect_oa:{result.source}") if result.license: self.metadata.access.license = result.license self.metadata.access.license_engines.append( f"detect_oa:{result.source}" ) return result
@property def is_open_access(self) -> bool: """Check if paper is open access (quick check without API calls).""" if self.metadata.access.is_open_access is not None: return self.metadata.access.is_open_access # Quick detection from identifiers from .open_access import detect_oa_from_identifiers result = detect_oa_from_identifiers( doi=self.metadata.id.doi, arxiv_id=self.metadata.id.arxiv_id, journal=self.metadata.publication.journal, ) return result.is_open_access
if __name__ == "__main__": import json print("=" * 80) print("Paper Class - Pydantic Type-Safe Metadata with Runtime Validation") print("=" * 80) # 1. Create empty paper print("\n1. Create empty Paper:") paper = Paper() print(f" Empty paper created: {type(paper).__name__}") # 2. Set basic metadata print("\n2. Set basic metadata:") paper.metadata.basic.title = "Attention Is All You Need" paper.metadata.basic.authors = [ "Vaswani, Ashish", "Shazeer, Noam", "Parmar, Niki", ] paper.metadata.basic.year = 2017 paper.metadata.basic.abstract = "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks." paper.metadata.basic.keywords = [ "transformer", "attention", "neural networks", ] print(f" Title: {paper.metadata.basic.title}") print(f" Authors: {paper.metadata.basic.authors[:2]}...") print(f" Year: {paper.metadata.basic.year}") # 3. Set DOI (auto-syncs URL) print("\n3. Set DOI (auto-syncs DOI URL):") paper.metadata.set_doi("10.48550/arXiv.1706.03762") print(f" DOI: {paper.metadata.id.doi}") print(f" DOI URL (auto-synced): {paper.metadata.url.doi}") # 4. Set publication details print("\n4. Set publication details:") paper.metadata.publication.journal = "NeurIPS" paper.metadata.publication.volume = "30" paper.metadata.publication.impact_factor = 12.345 print(f" Journal: {paper.metadata.publication.journal}") print(f" Volume: {paper.metadata.publication.volume}") print(f" Impact Factor: {paper.metadata.publication.impact_factor}") # 5. Set citation counts with year breakdown print("\n5. Set citation counts:") paper.metadata.citation_count.total = 85432 paper.metadata.citation_count.y2024 = 15234 paper.metadata.citation_count.y2023 = 18765 print(f" Total citations: {paper.metadata.citation_count.total}") print(f" 2024 citations: {paper.metadata.citation_count.y2024}") print(f" 2023 citations: {paper.metadata.citation_count.y2023}") # 6. Set container metadata print("\n6. Set container metadata:") paper.container.projects = ["transformers_research", "nlp_2024"] paper.container.library_id = "ABC12345" paper.container.readable_name = "Vaswani-2017-NeurIPS" print(f" Projects: {paper.container.projects}") print(f" Library ID: {paper.container.library_id}") print(f" Readable name: {paper.container.readable_name}") # 7. Demonstrate type validation print("\n7. Type validation (validate_assignment=True):") print(" ✓ Automatic type coercion: year='2017' -> 2017 (int)") paper.metadata.basic.year = "2017" # String coerced to int print( f" Result: {paper.metadata.basic.year} (type: {type(paper.metadata.basic.year).__name__})" ) print(" ✓ Range validation: year must be 1900-2100") try: paper.metadata.basic.year = 1800 # Too old print(" ERROR: Should have raised ValidationError") except Exception as e: print(f" Correctly rejected: {type(e).__name__}") print(" ✓ Non-negative validation: citations cannot be negative") try: paper.metadata.citation_count.total = -100 print(" ERROR: Should have raised ValidationError") except Exception as e: print(f" Correctly rejected: {type(e).__name__}") # Reset to valid value paper.metadata.basic.year = 2017 paper.metadata.citation_count.total = 85432 # 8. Serialize to JSON (with aliases) print("\n8. Serialize to JSON with field aliases:") paper_dict = paper.to_dict() print(" Year fields use numeric keys in JSON:") print(f" '2024': {paper_dict['metadata']['citation_count'].get('2024')}") print(f" '2023': {paper_dict['metadata']['citation_count'].get('2023')}") # 9. Create from dictionary print("\n9. Load from dictionary (from_dict):") sample_data = { "metadata": { "basic": { "title": "BERT: Pre-training of Deep Bidirectional Transformers", "year": 2019, }, "id": {"doi": "10.18653/v1/N19-1423"}, "citation_count": { "2024": 5678, # Numeric key maps to y2024 "total": 45000, }, } } paper2 = Paper.from_dict(sample_data) print(f" Title: {paper2.metadata.basic.title}") print(f" Year: {paper2.metadata.basic.year}") print(f" DOI: {paper2.metadata.id.doi}") print(f" DOI URL (auto-synced): {paper2.metadata.url.doi}") print(f" 2024 citations: {paper2.metadata.citation_count.y2024}") # 10. Show JSON structure print("\n10. Full JSON structure (first 500 chars):") json_str = json.dumps(paper.to_dict(), indent=2) print(f" {json_str[:500]}...") print("\n" + "=" * 80) print("✅ Paper class demonstration complete!") print("=" * 80) # EOF