Source code for scitex_scholar.auth.core.AuthenticationGateway

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-10 03:24:07 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/auth/AuthenticationGateway.py
# ----------------------------------------
from __future__ import annotations

import os

# Hard-coded, repository-relative path of this module; only used on the next
# line to derive __DIR__.
__FILE__ = "./src/scitex/scholar/auth/core/AuthenticationGateway.py"
# Directory part of the hard-coded path above (not of the runtime __file__).
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

# Rebind __FILE__ to the interpreter-provided path of this module.
# NOTE(review): __DIR__ is not recomputed after this rebinding, so it keeps
# pointing at the relative "./src/..." directory - confirm this is intended.
__FILE__ = __file__

"""
Authentication Gateway Pattern for Scholar Module

Provides transparent authentication layer that:
- Determines if URL requires authentication (config-based)
- Prepares authenticated browser context before URL finding
- Visits authentication gateways (OpenURL) to establish sessions
- Caches authentication state to avoid redundant operations

This keeps URL finders and PDF downloaders free of authentication logic.
"""

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List, Optional

import scitex_logging as logging

if TYPE_CHECKING:
    from playwright.async_api import BrowserContext

from scitex_scholar.config import ScholarConfig

logger = logging.getLogger(__name__)



[docs]
@dataclass
class URLContext:
    """
    Context for URL operations with authentication information.

    This dataclass carries all information needed for URL resolution
    and PDF download, including authentication state.

    Attributes:
        doi: Paper DOI (the only required field).
        title: Optional paper title.
        url: Publisher landing page URL; None until it has been resolved.
        pdf_urls: PDF URLs for the paper; starts as a fresh empty list per
            instance and is not filled in by AuthenticationGateway itself.
        requires_auth: None until an authentication check has run, then
            True or False.
        auth_provider: Provider name such as "openathens", "ezproxy" or
            "shibboleth"; None until a paywalled publisher has matched.
        auth_gateway_url: OpenURL used to establish the publisher session.
    """

    doi: str
    title: Optional[str] = None
    url: Optional[str] = None  # Publisher landing page URL
    pdf_urls: List[str] = field(default_factory=list)
    requires_auth: Optional[bool] = None
    auth_provider: Optional[str] = None  # openathens, ezproxy, shibboleth
    auth_gateway_url: Optional[str] = None  # OpenURL for establishing session




[docs]
class AuthenticationGateway:
    """
    Transparent authentication layer for Scholar operations.

    Responsibilities:
    - Determine if URL requires authentication (config-based, no hardcoding)
    - Prepare authenticated browser context
    - Visit authentication gateways (OpenURL) to establish publisher sessions
    - Cache authentication state for performance

    This gateway sits between Scholar and URL/Download operations,
    preparing authentication transparently before content access.
    """

    @property
    def name(self):
        return self.__class__.__name__


[docs]
    def __init__(
        self,
        auth_manager,  # ScholarAuthManager
        browser_manager,  # ScholarBrowserManager
        config: ScholarConfig = None,
    ):
        """
        Initialize authentication gateway.

        Args:
            auth_manager: ScholarAuthManager instance
            browser_manager: ScholarBrowserManager instance
            config: ScholarConfig instance
        """
        self.auth_manager = auth_manager
        self.browser_manager = browser_manager
        self.config = config or ScholarConfig()
        self._auth_cache: Dict[str, "bool | str"] = {}  # Cache visited gateways / URLs



[docs]
    async def prepare_context_async(
        self, doi: str, context: BrowserContext, title: Optional[str] = None
    ) -> URLContext:
        """
        Prepare URL context with authentication if needed.

        This is the main entry point - called BEFORE URL finding.

        Flow:
        1. Build OpenURL (authentication gateway)
        2. Check if DOI needs authentication (config-driven DOI prefixes)
        3. If auth needed: visit OpenURL to establish publisher cookies; the
           publisher URL is obtained as a by-product of that visit
        4. Otherwise: navigate to the DOI page directly, falling back to the
           OpenURL resolver when that raises or yields no URL
        5. Return the context; ``url`` falls back to the OpenURL itself when
           no publisher URL could be resolved by any method

        Args:
            doi: Paper DOI
            context: Browser context (will be updated with auth cookies)
            title: Optional paper title

        Returns:
            URLContext with ``auth_gateway_url``, ``requires_auth`` and
            ``url`` populated (``url`` is never left as None)
        """
        url_context = URLContext(doi=doi, title=title)

        # Step 1: Build OpenURL
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        # _build_query is private, but it is what the rest of this class uses
        # to obtain the gateway URL.
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Step 2: Try to determine if auth needed from DOI patterns
        # (IEEE DOIs start with 10.1109, Springer with 10.1007, etc.)
        url_context = self._check_auth_requirements_from_doi(url_context)

        # Step 3: If authentication needed, visit OpenURL and establish cookies
        # This also resolves to the publisher URL as a side effect
        if url_context.requires_auth:
            publisher_url = await self._establish_authentication_async(
                url_context, context
            )
            url_context.url = publisher_url or openurl
            return url_context

        # Step 4: For open access, use direct DOI navigation (faster than OpenURL)
        from scitex_scholar.auth.gateway._resolve_functions import (
            resolve_publisher_url_by_navigating_to_doi_page,
        )

        page = await context.new_page()
        try:
            publisher_url = None
            try:
                # Try direct DOI navigation first (fast for open access)
                publisher_url = await resolve_publisher_url_by_navigating_to_doi_page(
                    url_context.doi, page
                )
            except Exception as e:
                logger.debug(
                    f"{self.name}: Direct navigation failed, trying OpenURL: {e}"
                )
            else:
                if publisher_url:
                    logger.debug(
                        f"{self.name}: Resolved {url_context.doi} → {publisher_url}"
                    )
                else:
                    # A falsy result is a failure too; it must not end up as
                    # the final URL.
                    logger.debug(
                        f"{self.name}: Direct navigation returned no URL, "
                        f"trying OpenURL"
                    )

            if not publisher_url:
                # Fallback to OpenURL resolver if direct navigation fails
                try:
                    publisher_url = await resolver.resolve_doi(url_context.doi, page)
                    if not publisher_url:
                        logger.warning(
                            f"{self.name}: Both methods failed for "
                            f"{url_context.doi}: OpenURL returned no URL"
                        )
                except Exception as openurl_error:
                    logger.warning(
                        f"{self.name}: Both methods failed for {url_context.doi}: {openurl_error}"
                    )

            # Last resort fallback: hand back the gateway URL itself, mirroring
            # the authenticated branch above.
            url_context.url = publisher_url or openurl
        finally:
            await page.close()

        return url_context



[docs]
    async def _resolve_publisher_url_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> URLContext:
        """
        Resolve DOI to publisher landing page URL.

        Uses OpenURLResolver which already exists and works.
        The OpenURL is the authentication gateway for paywalled content.

        Args:
            url_context: URLContext with DOI
            context: Browser context; a temporary page is opened in it and
                closed again before returning

        Returns:
            The same URLContext with ``auth_gateway_url`` set to the OpenURL
            and ``url`` set to the resolved publisher URL, or to the OpenURL
            when resolution raises or yields no URL
        """
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)

        # Build OpenURL (this is the authentication gateway)
        # Use the private _build_query method since no public method exists
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Resolve to publisher URL (may redirect through OpenAthens)
        page = await context.new_page()
        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)
            if publisher_url:
                url_context.url = publisher_url
                logger.debug(
                    f"{self.name}: Resolved {url_context.doi} → {publisher_url}"
                )
            else:
                # resolve_doi can come back empty without raising; do not
                # leave url unset in that case.
                logger.warning(
                    f"{self.name}: No publisher URL resolved for DOI {url_context.doi}"
                )
                url_context.url = openurl  # Fallback to OpenURL
        except Exception as e:
            logger.warning(f"{self.name}: Failed to resolve DOI {url_context.doi}: {e}")
            url_context.url = openurl  # Fallback to OpenURL
        finally:
            await page.close()

        return url_context



[docs]
    def _check_auth_requirements_from_doi(self, url_context: URLContext) -> URLContext:
        """
        Determine if DOI requires authentication based on DOI prefix patterns.

        This allows early detection before resolving URL.
        IEEE DOIs start with 10.1109, Springer with 10.1007, etc.

        Args:
            url_context: URLContext with doi populated

        Returns:
            URLContext with requires_auth and auth_provider populated
        """
        # Get authenticated publishers from config
        # auth_config = self.config.get("authentication") or {}
        # paywalled_publishers = auth_config.get("paywalled_publishers") or []
        paywalled_publishers = self.config.resolve(
            "paywalled_publishers", None, default=[]
        )
        if not isinstance(paywalled_publishers, list):
            paywalled_publishers = []

        doi = url_context.doi or ""

        for publisher_config in paywalled_publishers:
            doi_prefixes = publisher_config.get("doi_prefixes", [])
            for prefix in doi_prefixes:
                if doi.startswith(prefix):
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(DOI prefix: {prefix}, provider: {url_context.auth_provider})"
                    )
                    return url_context

        # Fallback: check by URL if DOI detection didn't match
        # (for cases where DOI prefix is not in config)
        url_context.requires_auth = False
        return url_context



[docs]
    def _check_auth_requirements(self, url_context: URLContext) -> URLContext:
        """
        Determine if URL requires authentication based on config.

        This is config-based (no hardcoded domain lists).
        Checks URL against paywalled_publishers in config.

        Args:
            url_context: URLContext with url populated

        Returns:
            URLContext with requires_auth and auth_provider populated
        """
        # Get authenticated publishers from config
        # auth_config = self.config.get("authentication") or {}
        # paywalled_publishers = auth_config.get("paywalled_publishers") or []
        paywalled_publishers = self.config.resolve(
            "paywalled_publishers", None, default=[]
        )
        if not isinstance(paywalled_publishers, list):
            paywalled_publishers = []

        # Check if URL matches any paywalled publisher
        url_lower = (url_context.url or "").lower()

        for publisher_config in paywalled_publishers:
            domain_patterns = publisher_config.get("domain_patterns", [])
            for pattern in domain_patterns:
                if pattern.lower() in url_lower:
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No authentication required
        url_context.requires_auth = False
        return url_context



[docs]
    async def _establish_authentication_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> Optional[str]:
        """
        Establish authentication by visiting gateway URL and clicking through to publisher.

        This is the KEY OPERATION that solves the IEEE issue:
        1. Visit OpenURL (library resolver)
        2. Find publisher link on resolver page
        3. Click link → redirects through OpenAthens → lands at publisher
        4. Publisher session cookies established in browser context

        Without this step:
        - OpenAthens cookies exist at openathens.net
        - NO cookies exist at ieee.org
        - Chrome PDF viewer opens but download fails

        With this step:
        - Visit OpenURL
        - Click IEEE link → redirect through OpenAthens
        - Land at ieee.org → IEEE session cookies established
        - Now ieee.org has cookies, Chrome PDF viewer works

        Args:
            url_context: URLContext with auth_gateway_url and doi
            context: Browser context (will receive publisher cookies)

        Returns:
            Publisher URL if successful, None otherwise
        """
        gateway_url = url_context.auth_gateway_url

        if not gateway_url:
            logger.warning(f"{self.name}: No gateway URL available for authentication")
            return None

        # Check cache - avoid redundant visits
        cache_key = f"{url_context.doi}"
        if cache_key in self._auth_cache:
            logger.debug(
                f"{self.name}: Authentication already established for {url_context.doi}"
            )
            # Return cached URL if available
            cached = self._auth_cache.get(f"{cache_key}_url")
            return cached if isinstance(cached, str) else None

        logger.info(
            f"{self.name}: Establishing auth via OpenURL",
        )

        # Visit OpenURL and click through to publisher
        # This uses the existing OpenURLResolver flow
        from scitex_browser import browser_logger

        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        page = await context.new_page()

        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)

            if publisher_url:
                logger.info(f"{self.name}: Auth established")
                await browser_logger.info(
                    page,
                    f"{self.name}: ✓ Session established at {publisher_url[:60]}",
                )
                await page.wait_for_timeout(2000)
                # Cache successful authentication
                self._auth_cache[cache_key] = True
                self._auth_cache[f"{cache_key}_url"] = publisher_url
                return publisher_url
            else:
                logger.warning(f"{self.name}: OpenURL resolution failed")
                await browser_logger.info(
                    page, f"{self.name}: ✗ Could not resolve to publisher URL"
                )
                await page.wait_for_timeout(2000)
                return None

        except Exception as e:
            logger.warning(f"{self.name}: Auth setup failed: {e}")
            try:
                await browser_logger.info(
                    page, f"{self.name}: ✗ EXCEPTION: {str(e)[:80]}"
                )
                await page.wait_for_timeout(2000)
            except Exception as ui_exc:
                logger.debug(
                    f"{self.name}: in-page error banner failed "
                    f"({type(ui_exc).__name__}: {ui_exc})"
                )
            # Don't raise - allow downstream to try anyway
            return None
        finally:
            await page.close()




async def main_async():
    """
    Run a small end-to-end demo of AuthenticationGateway.

    Steps:
    1. Build the config, auth manager, browser manager and gateway
    2. Obtain an authenticated browser and browser context
    3. Call ``prepare_context_async`` for each sample DOI and log the
       resulting URLContext fields
    4. Close the context and the browser, even if a DOI fails
    """
    from scitex_scholar.auth.ScholarAuthManager import ScholarAuthManager
    from scitex_scholar.browser.ScholarBrowserManager import ScholarBrowserManager
    from scitex_scholar.config import ScholarConfig

    # Sample DOIs covering paywalled and open-access publishers
    sample_dois = (
        "10.1109/JBHI.2024.1234567",  # IEEE (paywalled)
        "10.1088/1741-2552/aaf92e",  # IOP Publishing (paywalled)
        "10.1038/s41467-020-12345-6",  # Nature Communications (open access)
    )
    rule = "=" * 60

    # Collaborators share one config instance
    config = ScholarConfig()
    auth_manager = ScholarAuthManager(config=config)
    browser_manager = ScholarBrowserManager(auth_manager=auth_manager, config=config)
    gateway = AuthenticationGateway(
        auth_manager=auth_manager,
        browser_manager=browser_manager,
        config=config,
    )

    # Authenticated browser + context that all DOIs below reuse
    browser, context = (
        await browser_manager.get_authenticated_browser_and_context_async()
    )

    try:
        for doi in sample_dois:
            logger.info(f"\n{rule}")
            logger.info(f"Testing DOI: {doi}")
            logger.info(rule)

            # The key operation: after this call the context is prepared
            # for URL finding or PDF download.
            prepared = await gateway.prepare_context_async(doi=doi, context=context)

            report = (
                ("Publisher URL", prepared.url),
                ("Requires auth", prepared.requires_auth),
                ("Auth provider", prepared.auth_provider),
                ("Gateway URL", prepared.auth_gateway_url),
            )
            for label, value in report:
                logger.info(f"{label}: {value}")
    finally:
        await context.close()
        await browser.close()


# Script entry point: run the async demo only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main_async())

# EOF