#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-10 03:24:07 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/auth/AuthenticationGateway.py
# ----------------------------------------
from __future__ import annotations
import os
__FILE__ = "./src/scitex/scholar/auth/core/AuthenticationGateway.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------
__FILE__ = __file__
"""
Authentication Gateway Pattern for Scholar Module
Provides a transparent authentication layer that:
- Determines if URL requires authentication (config-based)
- Prepares authenticated browser context before URL finding
- Visits authentication gateways (OpenURL) to establish sessions
- Caches authentication state to avoid redundant operations
This keeps URL finders and PDF downloaders free of authentication logic.
"""
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List, Optional
import scitex_logging as logging
if TYPE_CHECKING:
from playwright.async_api import BrowserContext
from scitex_scholar.config import ScholarConfig
logger = logging.getLogger(__name__)
[docs]
@dataclass
class URLContext:
    """
    Per-paper record that carries URLs together with authentication state.

    A single instance is passed through URL resolution and PDF download so
    that every step sees the same DOI, the resolved URLs, and whatever has
    been decided about authentication so far.

    Attributes:
        doi: Paper DOI. The only required field.
        title: Paper title, if known.
        url: Publisher landing page URL.
        pdf_urls: List of PDF URLs. Each instance gets its own fresh list.
        requires_auth: True/False once an authentication check has been
            made; None while no check has been made.
        auth_provider: Authentication provider name, e.g. openathens,
            ezproxy, shibboleth.
        auth_gateway_url: OpenURL used for establishing a session.
    """

    # Required identifier.
    doi: str

    # Descriptive / resolved values (all optional).
    title: Optional[str] = None
    url: Optional[str] = None
    pdf_urls: List[str] = field(default_factory=list)

    # Authentication state.
    requires_auth: Optional[bool] = None
    auth_provider: Optional[str] = None
    auth_gateway_url: Optional[str] = None
[docs]
class AuthenticationGateway:
    """
    Transparent authentication layer for Scholar operations.

    Responsibilities:
    - Determine if URL requires authentication (config-based, no hardcoding)
    - Prepare authenticated browser context
    - Visit authentication gateways (OpenURL) to establish publisher sessions
    - Cache authentication state for performance

    This gateway sits between Scholar and URL/Download operations,
    preparing authentication transparently before content access.
    """

    # Pause (milliseconds) after showing an in-page status message;
    # presumably so the message stays visible for a moment.
    _BANNER_PAUSE_MS = 2000

    @property
    def name(self):
        """Class name, used as the prefix of every log message."""
        return self.__class__.__name__

    def __init__(
        self,
        auth_manager,  # ScholarAuthManager
        browser_manager,  # ScholarBrowserManager
        config: Optional[ScholarConfig] = None,
    ):
        """
        Initialize authentication gateway.

        Args:
            auth_manager: ScholarAuthManager instance
            browser_manager: ScholarBrowserManager instance
            config: ScholarConfig instance; a default ScholarConfig is
                created when omitted (or falsy).
        """
        self.auth_manager = auth_manager
        self.browser_manager = browser_manager
        if not config:
            # ScholarConfig is imported at module level only under
            # TYPE_CHECKING, so it must be imported here to exist at runtime.
            from scitex_scholar.config import ScholarConfig

            config = ScholarConfig()
        self.config = config
        # Maps "<doi>" -> True and "<doi>_url" -> resolved publisher URL.
        self._auth_cache: Dict[str, "bool | str"] = {}

    async def prepare_context_async(
        self, doi: str, context: BrowserContext, title: Optional[str] = None
    ) -> URLContext:
        """
        Prepare URL context with authentication if needed.

        This is the main entry point - called BEFORE URL finding.

        Flow:
            1. Build OpenURL (authentication gateway)
            2. Check if DOI needs authentication (config DOI prefixes)
            3. If auth needed: resolve through OpenURL in ``context``
            4. Otherwise: resolve by direct DOI navigation, falling back to
               the OpenURL resolver if that raises
            5. Return the context; ``url`` falls back to the OpenURL when no
               publisher URL could be obtained

        Args:
            doi: Paper DOI
            context: Browser context in which pages are opened
            title: Optional paper title

        Returns:
            URLContext with ``auth_gateway_url``, ``requires_auth`` and
            ``url`` populated.
        """
        url_context = URLContext(doi=doi, title=title)

        # Step 1: Build OpenURL
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Step 2: Decide from DOI prefix whether authentication is needed
        # (IEEE DOIs start with 10.1109, Springer with 10.1007, etc.)
        url_context = self._check_auth_requirements_from_doi(url_context)

        # Step 3: Authenticated path - resolving through OpenURL also yields
        # the publisher URL as a side effect.
        if url_context.requires_auth:
            publisher_url = await self._establish_authentication_async(
                url_context, context
            )
            url_context.url = publisher_url or openurl
            return url_context

        # Step 4: For open access, use direct DOI navigation (faster than OpenURL)
        from scitex_scholar.auth.gateway._resolve_functions import (
            resolve_publisher_url_by_navigating_to_doi_page,
        )

        page = await context.new_page()
        try:
            # Try direct DOI navigation first (fast for open access)
            publisher_url = await resolve_publisher_url_by_navigating_to_doi_page(
                url_context.doi, page
            )
            url_context.url = publisher_url
            logger.debug(
                f"{self.name}: Resolved {url_context.doi} → {publisher_url}"
            )
        except Exception as e:
            # Fallback to OpenURL resolver if direct navigation fails
            logger.debug(
                f"{self.name}: Direct navigation failed, trying OpenURL: {e}"
            )
            try:
                publisher_url = await resolver.resolve_doi(url_context.doi, page)
                url_context.url = publisher_url
            except Exception as openurl_error:
                logger.warning(
                    f"{self.name}: Both methods failed for {url_context.doi}: {openurl_error}"
                )
                url_context.url = openurl  # Last resort fallback
        finally:
            await page.close()

        # A resolver may return a falsy value without raising; keep the
        # result consistent with the authenticated path, which always falls
        # back to the OpenURL.
        if not url_context.url:
            url_context.url = openurl
        return url_context

    async def _resolve_publisher_url_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> URLContext:
        """
        Resolve DOI to publisher landing page URL via OpenURLResolver.

        The OpenURL is the authentication gateway for paywalled content.

        Args:
            url_context: URLContext with DOI
            context: Browser context

        Returns:
            URLContext with url and auth_gateway_url populated. ``url`` is
            the OpenURL itself when resolution fails or yields nothing.
        """
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)

        # Build OpenURL (this is the authentication gateway)
        # Use the private _build_query method since no public method exists
        openurl = resolver._build_query(url_context.doi)
        url_context.auth_gateway_url = openurl

        # Resolve to publisher URL (may redirect through OpenAthens)
        page = await context.new_page()
        try:
            publisher_url = await resolver.resolve_doi(url_context.doi, page)
            # resolve_doi can return a falsy value; never leave url empty.
            url_context.url = publisher_url or openurl
            logger.debug(f"{self.name}: Resolved {url_context.doi} → {publisher_url}")
        except Exception as e:
            logger.warning(f"{self.name}: Failed to resolve DOI {url_context.doi}: {e}")
            url_context.url = openurl  # Fallback to OpenURL
        finally:
            await page.close()
        return url_context

    def _get_paywalled_publishers(self) -> List[dict]:
        """
        Read the ``paywalled_publishers`` list from config.

        Returns:
            The configured publisher entries that are dicts. An empty list
            is returned when the config value is not a list.
        """
        publishers = self.config.resolve("paywalled_publishers", None, default=[])
        if not isinstance(publishers, list):
            return []
        # Skip malformed (non-mapping) entries instead of crashing on .get().
        return [entry for entry in publishers if isinstance(entry, dict)]

    def _check_auth_requirements_from_doi(self, url_context: URLContext) -> URLContext:
        """
        Determine if DOI requires authentication based on DOI prefix patterns.

        This allows early detection before resolving URL.
        IEEE DOIs start with 10.1109, Springer with 10.1007, etc.

        Args:
            url_context: URLContext with doi populated

        Returns:
            The same URLContext. On the first matching prefix,
            ``requires_auth`` is True and ``auth_provider`` is the
            publisher's ``preferred_provider`` (default "openathens");
            otherwise ``requires_auth`` is False.
        """
        doi = url_context.doi or ""
        for publisher_config in self._get_paywalled_publishers():
            # `or []` tolerates an explicit null in the config.
            for prefix in publisher_config.get("doi_prefixes") or []:
                if doi.startswith(prefix):
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(DOI prefix: {prefix}, provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No configured DOI prefix matched. No URL-based check is done here;
        # _check_auth_requirements provides that once a URL is known.
        url_context.requires_auth = False
        return url_context

    def _check_auth_requirements(self, url_context: URLContext) -> URLContext:
        """
        Determine if URL requires authentication based on config.

        This is config-based (no hardcoded domain lists): the URL is matched
        case-insensitively, by substring, against each publisher's
        ``domain_patterns`` in ``paywalled_publishers``.

        Args:
            url_context: URLContext with url populated

        Returns:
            The same URLContext with requires_auth (and, on a match,
            auth_provider) populated.
        """
        url_lower = (url_context.url or "").lower()
        for publisher_config in self._get_paywalled_publishers():
            # `or []` tolerates an explicit null in the config.
            for pattern in publisher_config.get("domain_patterns") or []:
                if pattern.lower() in url_lower:
                    url_context.requires_auth = True
                    url_context.auth_provider = publisher_config.get(
                        "preferred_provider", "openathens"
                    )
                    logger.info(
                        f"{self.name}: Authentication required for {publisher_config.get('name')} "
                        f"(provider: {url_context.auth_provider})"
                    )
                    return url_context

        # No authentication required
        url_context.requires_auth = False
        return url_context

    async def _show_banner_async(self, browser_logger, page, message: str) -> None:
        """
        Show an in-page status message and pause briefly (best effort).

        Any failure here is logged at debug level and swallowed, so a purely
        informational message can never change the authentication result.

        Args:
            browser_logger: Object exposing ``async info(page, message)``
            page: Page on which the message is shown
            message: Text to display
        """
        try:
            await browser_logger.info(page, message)
            await page.wait_for_timeout(self._BANNER_PAUSE_MS)
        except Exception as ui_exc:
            logger.debug(
                f"{self.name}: in-page banner failed "
                f"({type(ui_exc).__name__}: {ui_exc})"
            )

    async def _establish_authentication_async(
        self, url_context: URLContext, context: BrowserContext
    ) -> Optional[str]:
        """
        Establish authentication by resolving the DOI through the OpenURL gateway.

        This is the KEY OPERATION that solves the IEEE issue:
            1. Visit OpenURL (library resolver)
            2. Find publisher link on resolver page
            3. Click link → redirects through OpenAthens → lands at publisher
            4. Publisher session cookies established in browser context
        (Steps 1-4 are carried out by ``OpenURLResolver.resolve_doi``.)

        Without this step:
            - OpenAthens cookies exist at openathens.net
            - NO cookies exist at ieee.org
            - Chrome PDF viewer opens but download fails

        With this step:
            - Land at ieee.org → IEEE session cookies established
            - Now ieee.org has cookies, Chrome PDF viewer works

        Args:
            url_context: URLContext with auth_gateway_url and doi
            context: Browser context (will receive publisher cookies)

        Returns:
            Publisher URL if successful, None otherwise. Never raises for
            resolver failures - downstream is allowed to try anyway.
        """
        gateway_url = url_context.auth_gateway_url
        if not gateway_url:
            logger.warning(f"{self.name}: No gateway URL available for authentication")
            return None

        # Check cache - avoid redundant visits.
        # NOTE(review): the cache is keyed by DOI only, not by browser
        # context; if this gateway is reused with a fresh context the cached
        # hit skips the visit for that context - confirm this is intended.
        cache_key = f"{url_context.doi}"
        if cache_key in self._auth_cache:
            logger.debug(
                f"{self.name}: Authentication already established for {url_context.doi}"
            )
            # Return cached URL if available
            cached = self._auth_cache.get(f"{cache_key}_url")
            return cached if isinstance(cached, str) else None

        logger.info(
            f"{self.name}: Establishing auth via OpenURL",
        )

        # Visit OpenURL and click through to publisher
        # This uses the existing OpenURLResolver flow
        from scitex_browser import browser_logger
        from scitex_scholar.auth.gateway._OpenURLResolver import OpenURLResolver

        resolver = OpenURLResolver(config=self.config)
        page = await context.new_page()
        try:
            try:
                publisher_url = await resolver.resolve_doi(url_context.doi, page)
            except Exception as e:
                # Don't raise - allow downstream to try anyway
                logger.warning(f"{self.name}: Auth setup failed: {e}")
                await self._show_banner_async(
                    browser_logger, page, f"{self.name}: ✗ EXCEPTION: {str(e)[:80]}"
                )
                return None

            if not publisher_url:
                logger.warning(f"{self.name}: OpenURL resolution failed")
                await self._show_banner_async(
                    browser_logger,
                    page,
                    f"{self.name}: ✗ Could not resolve to publisher URL",
                )
                return None

            logger.info(f"{self.name}: Auth established")
            # The banner is informational only: if it fails, the resolved
            # URL is still returned and cached.
            await self._show_banner_async(
                browser_logger,
                page,
                f"{self.name}: ✓ Session established at {str(publisher_url)[:60]}",
            )
            # Cache successful authentication
            self._auth_cache[cache_key] = True
            self._auth_cache[f"{cache_key}_url"] = publisher_url
            return publisher_url
        finally:
            await page.close()
async def main_async():
    """
    Demonstration of AuthenticationGateway usage.

    Shows how to:
        1. Initialize authentication components
        2. Prepare authenticated browser context
        3. Use the context for subsequent operations

    The browser context and the browser are both closed on exit, even when
    preparing a DOI or closing the context raises.
    """
    from scitex_scholar.auth.ScholarAuthManager import ScholarAuthManager
    from scitex_scholar.browser.ScholarBrowserManager import ScholarBrowserManager
    from scitex_scholar.config import ScholarConfig

    # Initialize components
    config = ScholarConfig()
    auth_manager = ScholarAuthManager(config=config)
    browser_manager = ScholarBrowserManager(auth_manager=auth_manager, config=config)

    # Initialize gateway
    gateway = AuthenticationGateway(
        auth_manager=auth_manager,
        browser_manager=browser_manager,
        config=config,
    )

    # Example DOIs - two paywalled (IEEE, IOP), one open access
    test_dois = [
        "10.1109/JBHI.2024.1234567",  # IEEE (paywalled)
        "10.1088/1741-2552/aaf92e",  # IOP Publishing (paywalled)
        "10.1038/s41467-020-12345-6",  # Nature Communications (open access)
    ]

    # Get authenticated browser context
    (
        browser,
        context,
    ) = await browser_manager.get_authenticated_browser_and_context_async()

    try:
        for doi in test_dois:
            logger.info(f"\n{'=' * 60}")
            logger.info(f"Testing DOI: {doi}")
            logger.info(f"{'=' * 60}")

            # Prepare authentication (this is the key operation)
            url_context = await gateway.prepare_context_async(doi=doi, context=context)

            # Show results
            logger.info(f"Publisher URL: {url_context.url}")
            logger.info(f"Requires auth: {url_context.requires_auth}")
            logger.info(f"Auth provider: {url_context.auth_provider}")
            logger.info(f"Gateway URL: {url_context.auth_gateway_url}")
            # At this point, the browser context has publisher cookies
            # You can now use it for URL finding or PDF download
    finally:
        # Nested so the browser is still closed if closing the context fails.
        try:
            await context.close()
        finally:
            await browser.close()
if __name__ == "__main__":
    # Script entry point: run the demonstration coroutine to completion
    # on a new event loop.
    import asyncio

    demo_coroutine = main_async()
    asyncio.run(demo_coroutine)
# EOF