Source code for scitex_scholar.auth.core.AuthenticationGateway

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-10 03:24:07 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/auth/AuthenticationGateway.py
# ----------------------------------------
from __future__ import annotations

import os

# Path boilerplate.
# NOTE: __DIR__ is computed from the hard-coded relative path on the next
# line, not from the runtime location of this module.
__FILE__ = "./src/scitex/scholar/auth/core/AuthenticationGateway.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

# Rebind __FILE__ to the interpreter-provided module path.
# __DIR__ above is NOT recomputed after this rebinding.
__FILE__ = __file__

"""
Authentication Gateway Pattern for Scholar Module

Provides transparent authentication layer that:
- Determines if URL requires authentication (config-based)
- Prepares authenticated browser context before URL finding
- Visits authentication gateways (OpenURL) to establish sessions
- Caches authentication state to avoid redundant operations

This keeps URL finders and PDF downloaders free of authentication logic.
"""

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List, Optional

import scitex_logging as logging

if TYPE_CHECKING:
    from playwright.async_api import BrowserContext

from scitex_scholar.config import ScholarConfig

logger = logging.getLogger(__name__)


@dataclass
class URLContext:
    """
    Per-paper state shared by URL resolution and PDF download.

    An instance is created from a DOI (and optionally a title) and is then
    filled in step by step: the gateway URL, the authentication decision and
    finally the resolved publisher URL.

    Attributes:
        doi: Paper DOI (the only required field).
        title: Optional paper title.
        url: Publisher landing page URL.
        pdf_urls: Candidate PDF URLs; every instance gets its own list.
        requires_auth: Whether authentication is needed. Defaults to None
            until one of the gateway's checks sets it to True or False.
        auth_provider: Provider name, e.g. openathens, ezproxy, shibboleth.
        auth_gateway_url: OpenURL used for establishing a session.
    """

    # --- Identification ---
    doi: str
    title: Optional[str] = None

    # --- Resolution results ---
    url: Optional[str] = None  # Publisher landing page URL
    pdf_urls: List[str] = field(default_factory=list)

    # --- Authentication state ---
    requires_auth: Optional[bool] = None
    auth_provider: Optional[str] = None  # openathens, ezproxy, shibboleth
    auth_gateway_url: Optional[str] = None  # OpenURL for establishing session
class AuthenticationGateway:
    """
    Transparent authentication layer for Scholar operations.

    Responsibilities:
    - Determine if URL requires authentication (config-based, no hardcoding)
    - Prepare authenticated browser context
    - Visit authentication gateways (OpenURL) to establish publisher sessions
    - Cache authentication state for performance

    This gateway sits between Scholar and URL/Download operations,
    preparing authentication transparently before content access.
    """

    @property
    def name(self):
        """Class name, used as the prefix of every log message."""
        return self.__class__.__name__

    def __init__(
        self,
        auth_manager,  # ScholarAuthManager
        browser_manager,  # ScholarBrowserManager
        config: Optional[ScholarConfig] = None,
    ):
        """
        Initialize authentication gateway.

        Args:
            auth_manager: ScholarAuthManager instance
            browser_manager: ScholarBrowserManager instance
            config: ScholarConfig instance; a default ScholarConfig() is
                created when omitted
        """
        self.auth_manager = auth_manager
        self.browser_manager = browser_manager
        self.config = config or ScholarConfig()
        # Maps "<doi>" -> True and "<doi>_url" -> resolved publisher URL
        self._auth_cache: Dict[str, "bool | str"] = {}

    async def prepare_context_async(
        self, doi: str, context: BrowserContext, title: Optional[str] = None
    ) -> URLContext:
        """
        Prepare URL context with authentication if needed.

        This is the main entry point - called BEFORE URL finding.

        Flow:
        1. Build OpenURL (authentication gateway)
        2. Check if DOI needs authentication (based on known publishers)
        3. If auth needed: Visit OpenURL to establish publisher cookies
        4. Otherwise: resolve the publisher URL by direct DOI navigation,
           falling back to the OpenURL resolver
        5. Return prepared context; ``url`` falls back to the OpenURL
           whenever no publisher URL could be resolved

        Args:
            doi: Paper DOI
            context: Browser context (will be updated with auth cookies)
            title: Optional paper title

        Returns:
            URLContext with authentication prepared and ready
        """
        url_context = URLContext(doi=doi, title=title)

        # Step 1: Build OpenURL
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Step 2: Try to determine if auth needed from DOI patterns
        # (IEEE DOIs start with 10.1109, Springer with 10.1007, etc.)
        url_context = self._check_auth_requirements_from_doi(url_context)

        # Step 3: If authentication needed, visit OpenURL and establish cookies
        # This also resolves to the publisher URL as a side effect
        if url_context.requires_auth:
            publisher_url = await self._establish_authentication_async(
                url_context, context
            )
            url_context.url = publisher_url or openurl
            return url_context

        # Step 4: For open access, use direct DOI navigation (faster than OpenURL)
        from scitex_scholar.auth.gateway._resolve_functions import (
            resolve_publisher_url_by_navigating_to_doi_page,
        )

        publisher_url = None
        page = await context.new_page()
        try:
            # Try direct DOI navigation first (fast for open access)
            publisher_url = await resolve_publisher_url_by_navigating_to_doi_page(
                url_context.doi, page
            )
            logger.debug(
                f"{self.name}: Resolved {url_context.doi} -> {publisher_url}"
            )
        except Exception as e:
            # Fallback to OpenURL resolver if direct navigation fails
            logger.debug(
                f"{self.name}: Direct navigation failed, trying OpenURL: {e}"
            )
            try:
                publisher_url = await resolver.resolve_doi(url_context.doi, page)
            except Exception as openurl_error:
                logger.warning(
                    f"{self.name}: Both methods failed for {url_context.doi}: {openurl_error}"
                )
                publisher_url = None
        finally:
            await page.close()

        # Last resort fallback: a resolver may raise OR return a falsy value;
        # in both cases hand back the OpenURL rather than leaving url unset.
        url_context.url = publisher_url or openurl
        return url_context

    async def _resolve_publisher_url_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> URLContext:
        """
        Resolve DOI to publisher landing page URL.

        Uses OpenURLResolver. The OpenURL is the authentication gateway
        for paywalled content.

        Args:
            url_context: URLContext with DOI
            context: Browser context

        Returns:
            URLContext with url and auth_gateway_url populated; url is the
            OpenURL itself when resolution raises or yields nothing
        """
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)

        # Build OpenURL (this is the authentication gateway)
        # Use the private _build_query method since no public method exists
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Resolve to publisher URL (may redirect through OpenAthens)
        page = await context.new_page()
        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)
            # resolve_doi may return a falsy value; fall back to OpenURL then
            url_context.url = publisher_url or openurl
            logger.debug(
                f"{self.name}: Resolved {url_context.doi} -> {publisher_url}"
            )
        except Exception as e:
            logger.warning(f"{self.name}: Failed to resolve DOI {url_context.doi}: {e}")
            url_context.url = openurl  # Fallback to OpenURL
        finally:
            await page.close()

        return url_context

    @staticmethod
    def _as_str_list(value) -> List[str]:
        """Normalize a config value to a list of non-empty strings.

        ``None`` becomes ``[]``, a bare string becomes a one-element list, and
        non-string or empty items are dropped. Empty strings must be dropped
        because ``"x".startswith("")`` and ``"" in "x"`` are always true, which
        would flag every DOI/URL as paywalled.
        """
        if value is None:
            return []
        if isinstance(value, str):
            value = [value]
        try:
            items = list(value)
        except TypeError:
            return []
        return [item for item in items if isinstance(item, str) and item]

    def _get_paywalled_publishers(self) -> List[dict]:
        """Return the ``paywalled_publishers`` config entries that are dicts.

        Returns an empty list when the config value is not a list.
        """
        publishers = self.config.resolve("paywalled_publishers", None, default=[])
        if not isinstance(publishers, list):
            return []
        return [entry for entry in publishers if isinstance(entry, dict)]

    def _check_auth_requirements_from_doi(self, url_context: URLContext) -> URLContext:
        """
        Determine if DOI requires authentication based on DOI prefix patterns.

        This allows early detection before resolving URL.
        IEEE DOIs start with 10.1109, Springer with 10.1007, etc.

        Args:
            url_context: URLContext with doi populated

        Returns:
            The same URLContext with requires_auth set (True on the first
            matching prefix, False otherwise) and auth_provider set on a match
        """
        doi = url_context.doi or ""

        for publisher_config in self._get_paywalled_publishers():
            for prefix in self._as_str_list(publisher_config.get("doi_prefixes")):
                if doi.startswith(prefix):
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(DOI prefix: {prefix}, provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No configured DOI prefix matched: treat as not requiring authentication
        url_context.requires_auth = False
        return url_context

    def _check_auth_requirements(self, url_context: URLContext) -> URLContext:
        """
        Determine if URL requires authentication based on config.

        This is config-based (no hardcoded domain lists).
        Checks URL against paywalled_publishers in config using a
        case-insensitive substring match on each domain pattern.

        Args:
            url_context: URLContext with url populated

        Returns:
            The same URLContext with requires_auth set (True on the first
            matching pattern, False otherwise) and auth_provider set on a match
        """
        # Check if URL matches any paywalled publisher
        url_lower = (url_context.url or "").lower()

        for publisher_config in self._get_paywalled_publishers():
            patterns = self._as_str_list(publisher_config.get("domain_patterns"))
            for pattern in patterns:
                if pattern.lower() in url_lower:
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No authentication required
        url_context.requires_auth = False
        return url_context

    async def _establish_authentication_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> Optional[str]:
        """
        Establish authentication by visiting gateway URL and clicking through to publisher.

        This is the KEY OPERATION that solves the IEEE issue:
        1. Visit OpenURL (library resolver)
        2. Find publisher link on resolver page
        3. Click link → redirects through OpenAthens → lands at publisher
        4. Publisher session cookies established in browser context

        Without this step:
        - OpenAthens cookies exist at openathens.net
        - NO cookies exist at ieee.org
        - Chrome PDF viewer opens but download fails

        With this step:
        - Visit OpenURL
        - Click IEEE link → redirect through OpenAthens
        - Land at ieee.org → IEEE session cookies established
        - Now ieee.org has cookies, Chrome PDF viewer works

        Args:
            url_context: URLContext with auth_gateway_url and doi
            context: Browser context (will receive publisher cookies)

        Returns:
            Publisher URL if successful, None otherwise. Errors are logged
            and swallowed so that downstream steps can still be attempted.
        """
        gateway_url = url_context.auth_gateway_url
        if not gateway_url:
            logger.warning(f"{self.name}: No gateway URL available for authentication")
            return None

        # Check cache - avoid redundant visits
        # NOTE(review): the cache is keyed by DOI only, not by browser context;
        # a cache hit does not visit the gateway again for a different context.
        cache_key = f"{url_context.doi}"
        if cache_key in self._auth_cache:
            logger.debug(
                f"{self.name}: Authentication already established for {url_context.doi}"
            )
            # Return cached URL if available
            cached = self._auth_cache.get(f"{cache_key}_url")
            return cached if isinstance(cached, str) else None

        logger.info(
            f"{self.name}: Establishing auth via OpenURL",
        )

        # Visit OpenURL and click through to publisher
        # This uses the existing OpenURLResolver flow
        from scitex_browser import browser_logger
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        page = await context.new_page()

        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)

            if publisher_url:
                logger.info(f"{self.name}: Auth established")
                await browser_logger.info(
                    page,
                    f"{self.name}: ✓ Session established at {publisher_url[:60]}",
                )
                await page.wait_for_timeout(2000)

                # Cache successful authentication
                self._auth_cache[cache_key] = True
                self._auth_cache[f"{cache_key}_url"] = publisher_url
                return publisher_url
            else:
                logger.warning(f"{self.name}: OpenURL resolution failed")
                await browser_logger.info(
                    page, f"{self.name}: ✗ Could not resolve to publisher URL"
                )
                await page.wait_for_timeout(2000)
                return None

        except Exception as e:
            logger.warning(f"{self.name}: Auth setup failed: {e}")
            # Best effort: show the error in the page; never let this mask `e`
            try:
                await browser_logger.info(
                    page, f"{self.name}: ✗ EXCEPTION: {str(e)[:80]}"
                )
                await page.wait_for_timeout(2000)
            except Exception as ui_exc:
                logger.debug(
                    f"{self.name}: in-page error banner failed "
                    f"({type(ui_exc).__name__}: {ui_exc})"
                )
            # Don't raise - allow downstream to try anyway
            return None

        finally:
            await page.close()
async def main_async():
    """
    Demonstration of AuthenticationGateway usage.

    Shows how to:
    1. Initialize authentication components
    2. Prepare authenticated browser context
    3. Use the context for subsequent operations

    The browser context and the browser are always closed at the end, even
    when preparing a DOI or closing the context raises.
    """
    from scitex_scholar.auth.ScholarAuthManager import ScholarAuthManager
    from scitex_scholar.browser.ScholarBrowserManager import ScholarBrowserManager
    from scitex_scholar.config import ScholarConfig

    # Initialize components
    config = ScholarConfig()
    auth_manager = ScholarAuthManager(config=config)
    browser_manager = ScholarBrowserManager(auth_manager=auth_manager, config=config)

    # Initialize gateway
    gateway = AuthenticationGateway(
        auth_manager=auth_manager,
        browser_manager=browser_manager,
        config=config,
    )

    # Example DOIs - two paywalled, one open access
    test_dois = [
        "10.1109/JBHI.2024.1234567",  # IEEE (paywalled)
        "10.1088/1741-2552/aaf92e",  # IOP Publishing (paywalled)
        "10.1038/s41467-020-12345-6",  # Nature Communications (open access)
    ]

    # Get authenticated browser context
    (
        browser,
        context,
    ) = await browser_manager.get_authenticated_browser_and_context_async()

    try:
        for doi in test_dois:
            logger.info(f"\n{'=' * 60}")
            logger.info(f"Testing DOI: {doi}")
            logger.info(f"{'=' * 60}")

            # Prepare authentication (this is the key operation)
            url_context = await gateway.prepare_context_async(doi=doi, context=context)

            # Show results
            logger.info(f"Publisher URL: {url_context.url}")
            logger.info(f"Requires auth: {url_context.requires_auth}")
            logger.info(f"Auth provider: {url_context.auth_provider}")
            logger.info(f"Gateway URL: {url_context.auth_gateway_url}")

            # At this point, the browser context has publisher cookies
            # You can now use it for URL finding or PDF download

    finally:
        # Nested so the browser is closed even if closing the context fails
        try:
            await context.close()
        finally:
            await browser.close()


if __name__ == "__main__":
    import asyncio

    asyncio.run(main_async())

# EOF