Source code for scitex_scholar.pdf_download.ScholarPDFDownloader

#!/usr/bin/env python3
# Timestamp: "2026-01-22 (ywatanabe)"
# File: src/scitex/scholar/pdf_download/ScholarPDFDownloader.py
"""PDF downloader with multiple fallback strategies."""

from __future__ import annotations

import asyncio
import os
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, List, Optional, Union

import scitex_logging as logging

if TYPE_CHECKING:
    from playwright.async_api import BrowserContext

from scitex_scholar.config import ScholarConfig
from scitex_scholar.pdf_download.strategies import (
    FlexibleFilenameGenerator,
    handle_manual_download_on_page_async,
    try_download_chrome_pdf_viewer_async,
    try_download_direct_async,
    try_download_open_access_async,
    try_download_response_body_async,
)

logger = logging.getLogger(__name__)



[docs]
class ScholarPDFDownloader:
    """Download PDFs from URLs with multiple fallback strategies.

    Strategies tried in order by ``download_from_url``:
    - Chrome PDF Viewer
    - Direct Download (ERR_ABORTED)
    - Response Body Extraction
    - Manual Download Fallback

    ``download_from_urls`` runs ``download_from_url`` for several URLs
    concurrently, ``download_open_access`` fetches a known Open Access URL,
    and ``download_smart`` picks between these based on paper metadata.

    URL resolution (DOI -> URL) should be handled by the caller.
    """


[docs]
    def __init__(self, context: BrowserContext, config: ScholarConfig = None):
        self.name = self.__class__.__name__
        self.config = config if config else ScholarConfig()
        self.context = context
        self.output_dir = self.config.get_library_downloads_dir()

        self.prefer_open_access = self.config.resolve(
            "prefer_open_access", default=True, type=bool
        )
        self.enable_paywall_access = self.config.resolve(
            "enable_paywall_access", default=False, type=bool
        )
        self.track_paywall_attempts = self.config.resolve(
            "track_paywall_attempts", default=True, type=bool
        )


    async def __aexit__(self, exc_type, exc_val, exc_tb):
        pass


[docs]
    async def download_from_urls(
        self,
        pdf_urls: List[str],
        output_dir: Union[str, Path] = None,
        max_concurrent: int = 3,
    ) -> List[Path]:
        """Download multiple PDFs with parallel processing."""
        output_dir = Path(output_dir or self.output_dir)
        if not pdf_urls:
            return []

        output_paths = [
            output_dir / f"{ii_pdf:03d}_{os.path.basename(pdf_url)}"
            for ii_pdf, pdf_url in enumerate(pdf_urls)
        ]

        semaphore = asyncio.Semaphore(max_concurrent)

        async def download_with_semaphore(url: str, path: Path, index: int):
            async with semaphore:
                logger.info(
                    f"{self.name}: Downloading PDF {index}/{len(pdf_urls)}: {url}"
                )
                result = await self.download_from_url(url, path)
                if result:
                    logger.info(f"{self.name}: Downloaded to {result}")
                return result

        tasks = [
            download_with_semaphore(url, path, idx + 1)
            for idx, (url, path) in enumerate(zip(pdf_urls, output_paths))
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        saved_paths = []
        for result in results:
            if isinstance(result, Exception):
                logger.debug(f"{self.name}: Download error: {result}")
            elif result:
                saved_paths.append(result)

        logger.info(f"{self.name}: Downloaded {len(saved_paths)}/{len(pdf_urls)} PDFs")
        return saved_paths



[docs]
    async def download_open_access(
        self,
        oa_url: str,
        output_path: Union[str, Path],
        metadata: Optional[dict] = None,
    ) -> Optional[Path]:
        """Fetch a PDF from an Open Access URL.

        Args:
            oa_url: Open Access URL to download; a falsy value returns
                ``None`` immediately.
            output_path: Target file. ``.pdf`` is appended when the path does
                not already end with it, and the parent directory is created.
            metadata: Optional metadata forwarded unchanged to
                ``try_download_open_access_async``.

        Returns:
            Whatever ``try_download_open_access_async`` returns: a truthy
            value on success, otherwise a falsy one.
        """
        if not oa_url:
            logger.debug(f"{self.name}: No OA URL provided")
            return None

        destination = Path(output_path) if isinstance(output_path, str) else output_path
        if not str(destination).endswith(".pdf"):
            destination = Path(f"{destination}.pdf")
        destination.parent.mkdir(parents=True, exist_ok=True)

        logger.info(f"{self.name}: Attempting OA download from {oa_url[:60]}...")

        saved = await try_download_open_access_async(
            oa_url=oa_url,
            output_path=destination,
            metadata=metadata,
            func_name=self.name,
        )

        if not saved:
            logger.debug(f"{self.name}: OA download failed")
            return saved

        logger.info(f"{self.name}: Successfully downloaded OA PDF to {saved}")
        return saved



[docs]
    async def download_smart(
        self, paper, output_path: Union[str, Path]
    ) -> Optional[Path]:
        """Pick a download strategy from the paper's metadata and run it.

        Order of attempts:
        1. The Open Access URL, when ``self.prefer_open_access`` is set and
           the metadata carries one.
        2. Every entry of the metadata's PDF URL list, via
           ``download_from_url``.
        3. ``https://doi.org/<doi>``, only when ``self.enable_paywall_access``
           is set, the paper is not flagged Open Access, and a DOI exists.

        When ``self.track_paywall_attempts`` is set and the metadata has an
        ``access`` object, ``paywall_bypass_attempted`` and
        ``paywall_bypass_success`` are written onto it.

        Args:
            paper: Object with a ``metadata`` attribute, or the metadata
                object itself. Missing ``access`` / ``url`` / ``id``
                sub-objects are tolerated.
            output_path: Target file; ``.pdf`` is appended when missing.

        Returns:
            The result of the first strategy that succeeds, or ``None`` when
            every strategy fails.
        """
        if isinstance(output_path, str):
            output_path = Path(output_path)
        if not str(output_path).endswith(".pdf"):
            output_path = Path(str(output_path) + ".pdf")

        meta = paper.metadata if hasattr(paper, "metadata") else paper
        access = getattr(meta, "access", None)
        url_meta = getattr(meta, "url", None)
        id_meta = getattr(meta, "id", None)

        is_open_access = getattr(access, "is_open_access", False) if access else False
        oa_url = getattr(access, "oa_url", None) if access else None
        # "pdfs" may be absent or explicitly None; both must behave like an
        # empty list, otherwise the loop below raises TypeError.
        pdf_urls = (getattr(url_meta, "pdfs", None) if url_meta else None) or []
        doi = getattr(id_meta, "doi", None) if id_meta else None

        track_access = bool(access) and self.track_paywall_attempts

        logger.info(f"{self.name}: Smart download for DOI={doi}, OA={is_open_access}")

        # Strategy 1: Try Open Access if available
        if self.prefer_open_access and oa_url:
            logger.info(f"{self.name}: Trying Open Access URL first")
            result = await self.download_open_access(oa_url, output_path)
            if result:
                if track_access:
                    access.paywall_bypass_attempted = False
                return result

        # Strategy 2: Try available PDF URLs
        for pdf_entry in pdf_urls:
            # Entries are either plain URL strings or dicts with a "url" key.
            pdf_url = pdf_entry.get("url") if isinstance(pdf_entry, dict) else pdf_entry
            if not pdf_url:
                continue
            logger.info(f"{self.name}: Trying PDF URL: {pdf_url[:60]}...")
            result = await self.download_from_url(pdf_url, output_path, doi=doi)
            if result:
                return result

        # Strategy 3: Try paywall access if enabled
        if self.enable_paywall_access and not is_open_access:
            logger.info(f"{self.name}: Attempting paywall access (opt-in enabled)")
            if track_access:
                access.paywall_bypass_attempted = True

            if doi:
                doi_url = f"https://doi.org/{doi}"
                result = await self.download_from_url(doi_url, output_path, doi=doi)
                if track_access:
                    access.paywall_bypass_success = bool(result)
                if result:
                    return result

        logger.warning(f"{self.name}: All download strategies exhausted for DOI={doi}")
        return None



[docs]
    async def download_from_url(
        self,
        pdf_url: str,
        output_path: Union[str, Path],
        doi: Optional[str] = None,
    ) -> Optional[Path]:
        """Download one PDF, trying each strategy in turn, with manual override.

        Steps:
        1. Reject a falsy URL, normalise ``output_path`` (append ``.pdf`` when
           missing) and create its parent directory.
        2. Store a fresh ``asyncio.Event`` on the browser context as
           ``_scitex_manual_mode_event`` and add the manual-mode button init
           script to the context.
        3. Try Chrome PDF viewer, direct download, and response-body
           extraction in that order; the first truthy result is returned.
           A strategy that raises is logged and skipped.
        4. If the event is set before or after any strategy, automation stops
           and ``handle_manual_download_on_page_async`` runs on a new page.

        Args:
            pdf_url: URL of the PDF.
            output_path: Target file path.
            doi: Optional DOI, used for the generated target filename and
                forwarded to the manual download handler.

        Returns:
            The truthy result of the strategy that succeeded, the result of
            the manual handler when manual mode was chosen, or ``None`` when
            everything failed.
        """
        if not pdf_url:
            logger.warning(f"{self.name}: PDF URL passed but not valid: {pdf_url}")
            return None

        if isinstance(output_path, str):
            output_path = Path(output_path)
        if not str(output_path).endswith(".pdf"):
            output_path = Path(str(output_path) + ".pdf")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        target_filename = FlexibleFilenameGenerator.generate_filename(
            doi=doi, url=pdf_url, content_type="main"
        )

        # The event is only read here; presumably it is set elsewhere when the
        # user presses the injected button (not visible in this method).
        stop_event = asyncio.Event()
        self.context._scitex_is_manual_mode = False
        self.context._scitex_manual_mode_event = stop_event

        from scitex_scholar.pdf_download.strategies.manual_download_utils import (
            get_manual_button_init_script,
        )

        # NOTE(review): an init script is added on every call; confirm whether
        # repeated downloads on one context accumulate scripts.
        button_script = get_manual_button_init_script(target_filename)
        await self.context.add_init_script(button_script)
        logger.info(f"{self.name}: Manual mode button injected into browser context")

        async def chrome_pdf_wrapper(url, path):
            return await try_download_chrome_pdf_viewer_async(
                self.context, url, path, self.name
            )

        async def direct_download_wrapper(url, path):
            return await try_download_direct_async(self.context, url, path, self.name)

        async def response_body_wrapper(url, path):
            return await try_download_response_body_async(
                self.context, url, path, self.name
            )

        async def manual_fallback_wrapper(url, path):
            # Placeholder step: never downloads anything itself. The real
            # manual flow runs after the loop, when stop_event is set.
            return None

        try_download_methods = [
            ("Chrome PDF", chrome_pdf_wrapper),
            ("Direct Download", direct_download_wrapper),
            ("From Response Body", response_body_wrapper),
            ("Manual Download", manual_fallback_wrapper),
        ]

        for method_name, method_func in try_download_methods:
            if stop_event.is_set():
                logger.info(f"{self.name}: Manual mode - stopping automation")
                break

            logger.info(f"{self.name}: Trying method: {method_name}")

            try:
                is_downloaded = await method_func(pdf_url, output_path)
            except Exception as e:
                # A failing strategy must not abort the remaining ones.
                logger.warning(f"{self.name}: {method_name} raised exception: {e}")
                logger.debug(f"{self.name}: Traceback: {traceback.format_exc()}")
                continue

            # Manual mode requested while the strategy was running: its
            # result is discarded and the manual flow takes over.
            if stop_event.is_set():
                logger.info(f"{self.name}: Manual mode during {method_name}")
                break

            if is_downloaded:
                logger.info(f"{self.name}: Downloaded via {method_name}")
                return is_downloaded
            logger.debug(f"{self.name}: {method_name} returned None")

        # Handle manual download if user chose it
        if stop_event.is_set():
            self.context._scitex_is_manual_mode = True
            logger.info(f"{self.name}: User chose manual download - starting")

            pdf_page = await self.context.new_page()
            try:
                await pdf_page.goto(
                    pdf_url, timeout=30000, wait_until="domcontentloaded"
                )
                return await handle_manual_download_on_page_async(
                    pdf_page,
                    pdf_url,
                    output_path,
                    func_name=self.name,
                    config=self.config,
                    doi=doi,
                )
            finally:
                # Close the page even when navigation or the manual handler
                # raises, so the page is not leaked.
                await pdf_page.close()

        # All methods failed
        logger.fail(f"{self.name}: All download methods failed for {pdf_url}")
        return None




# CLI entry point moved to _cli.py
if __name__ == "__main__":
    # Running this file as a script simply delegates to the CLI module;
    # the import stays local so importing this module never pulls in the CLI.
    from scitex_scholar.pdf_download._cli import run_main as _run_cli

    _run_cli()

# EOF