Source code for scitex_scholar.browser.ScholarBrowserManager

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-11 07:53:46 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/browser/ScholarBrowserManager.py
# ----------------------------------------
from __future__ import annotations

import os

__FILE__ = "./src/scitex/scholar/browser/ScholarBrowserManager.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

__FILE__ = __file__

import asyncio
import subprocess
import time
from datetime import datetime
from pathlib import Path
from typing import Union

import scitex_logging as logging
from playwright.async_api import Browser, BrowserContext, async_playwright
from scitex_browser.automation import CookieAutoAcceptor
from scitex_browser.core import BrowserMixin, ChromeProfileManager
from scitex_browser.stealth import StealthManager

from scitex_scholar.browser.utils.close_unwanted_pages import close_unwanted_pages
from scitex_scholar.config import ScholarConfig

logger = logging.getLogger(__name__)

"""
Browser Manager with persistent context support.

_persistent_context is a **persistent browser context** that stays alive across multiple operations.

## Regular vs Persistent Context

**Regular context** (new each time):
```python
browser = await playwright.chromium.launch()
context = await browser.new_context()  # New context each time
page = await context.new_page()
```

**Persistent context** (reused):
```python
# Created once in _launch_persistent_context_async()
self._persistent_context = await self._persistent_playwright.chromium.launch_persistent_context(
    user_data_dir=str(profile_dir),  # Persistent profile
    headless=False,
    args=[...extensions...]
)

# Reused multiple times
if hasattr(self, "_persistent_context") and self._persistent_context:
    context = self._persistent_context  # Same context
```

## Benefits of Persistent Context

1. **Extensions persist** - Extensions loaded once, available for all pages
2. **Authentication cookies persist** - No need to re-login
3. **Profile data persists** - Bookmarks, history, and settings are maintained
4. **Performance** - Faster page creation (no browser restart)
5. **Session continuity** - Maintains login state across operations

## In Your Code

`_persistent_context` is set in `_launch_persistent_context_async()` and reused in `get_authenticated_browser_and_context_async()`. This allows multiple pages to share the same authenticated, extension-enabled browser session.
"""


[docs] class ScholarBrowserManager(BrowserMixin):
    """Manages a local browser instance with stealth enhancements and invisible mode.

    The manager launches a Playwright persistent Chromium context from the
    Chrome profile directory, registers the stealth, dimension-spoofing and
    cookie-acceptor init scripts on it, and loads authentication cookies
    when an ``auth_manager`` was supplied. Two browser modes are accepted:
    ``"interactive"`` and ``"stealth"``.
    """
def __init__(
    self,
    browser_mode=None,
    auth_manager=None,
    chrome_profile_name=None,
    config: ScholarConfig = None,
):
    """Initialize ScholarBrowserManager with invisible browser capabilities.

    Args:
        browser_mode: ``"interactive"`` or ``"stealth"``. When omitted the
            value is resolved through ``config.resolve`` with
            ``"interactive"`` as the default.
        auth_manager: Authentication manager instance. When ``None`` a
            failure is logged and university authentication is not enabled.
        chrome_profile_name: Profile name handed to ``ChromeProfileManager``
            and to ``config.get_cache_chrome_dir``.
        config: Scholar configuration instance; a new ``ScholarConfig`` is
            created when omitted.

    Raises:
        ValueError: If the resolved browser mode is neither
            ``"interactive"`` nor ``"stealth"``.
    """
    self.name = self.__class__.__name__
    # Kept on the instance for use by components like ChromeProfileManager.
    self.config = config or ScholarConfig()

    # Browser
    self.browser_mode = self.config.resolve(
        "browser_mode", browser_mode, default="interactive"
    )
    super().__init__(mode=self.browser_mode)
    # Configure from the *resolved* mode. Passing the raw argument made the
    # default ``browser_mode=None`` raise ValueError even though the
    # resolution above had already produced a valid mode.
    self._set_interactive_or_stealth(self.browser_mode)

    # Library Authentication
    self.auth_manager = auth_manager
    if auth_manager is None:
        logger.fail(
            f"{self.name}: auth_manager not passed. University Authentication will not be enabled."
        )

    # Chrome Extension
    self.chrome_profile_manager = ChromeProfileManager(
        chrome_profile_name,
        chrome_cache_dir=self.config.get_cache_chrome_dir(
            chrome_profile_name
        ).parent,
    )

    # Stealth (viewport_size / spoof_dimension were set by the mode setup above)
    self.stealth_manager = StealthManager(self.viewport_size, self.spoof_dimension)

    # Cookie
    self.cookie_acceptor = CookieAutoAcceptor()

    # Persistent browser attributes are populated lazily on first launch.
    self._persistent_browser = None
    self._persistent_context = None
    self._persistent_playwright = None
def _set_interactive_or_stealth(self, browser_mode):
    """Derive the display-related settings from the browser mode.

    Sets ``headless``, ``spoof_dimension``, ``viewport_size`` and
    ``display`` on the instance and logs the chosen values.

    Args:
        browser_mode: Either ``"interactive"`` or ``"stealth"``.

    Raises:
        ValueError: For any other ``browser_mode``.
    """
    if browser_mode not in ("interactive", "stealth"):
        raise ValueError(
            "browser_mode must be eighther of 'interactive' or 'stealth'"
        )

    stealth_requested = browser_mode == "stealth"

    # Headless stays off in both modes; for stealth it must be False for
    # dimension spoofing to work.
    self.headless = False
    self.spoof_dimension = stealth_requested
    # The viewport only affects the internal page size, not the window size.
    self.viewport_size = (1920, 1080)
    # Interactive mode targets display :0, stealth mode display :99.
    self.display = 99 if stealth_requested else 0

    logger.debug(f"{self.name}: Browser initialized:")
    logger.debug(f"{self.name}: headless: {self.headless}")
    logger.debug(f"{self.name}: spoof_dimension: {self.spoof_dimension}")
    logger.debug(f"{self.name}: viewport_size: {self.viewport_size}")
async def get_authenticated_browser_and_context_async(
    self, **context_options
) -> tuple[Browser, BrowserContext]:
    """Return the browser together with an authenticated context.

    The persistent context is preferred when one exists; otherwise a
    regular context is created from ``context_options`` merged with the
    options supplied by the auth manager.

    Args:
        **context_options: Extra options for the fallback context creation.

    Returns:
        A ``(browser, context)`` pair.

    Raises:
        ValueError: If no ``auth_manager`` was provided at construction.
    """
    if self.auth_manager is None:
        raise ValueError(
            f"{self.name}: "
            "Authentication manager is not set. "
            "To use this method, please initialize ScholarBrowserManager with an auth_manager."
        )

    await self.auth_manager.ensure_authenticate_async()
    browser = (
        await self._get_persistent_browser_with_profile_but_not_with_auth_async()
    )

    # Preferred path: reuse the long-lived persistent context.
    persistent_context = getattr(self, "_persistent_context", None)
    if persistent_context:
        logger.info(
            f"{self.name}: Using persistent context with profile and extensions"
        )
        return browser, persistent_context

    # Fallback path: build a fresh context carrying the auth options.
    logger.warning(f"{self.name}: Falling back to regular context creation")
    auth_options = await self.auth_manager.get_auth_options()
    context_options.update(auth_options)
    fresh_context = await self._new_context_async(browser, **context_options)
    return browser, fresh_context
async def _new_context_async(
    self, browser: Browser, **context_options
) -> BrowserContext:
    """Create a new browser context with stealth options and init scripts.

    Args:
        browser: Browser on which the context is created.
        **context_options: Options that override the stealth defaults on
            key collisions.

    Returns:
        The new context with the stealth, dimension-spoofing and
        cookie-acceptor init scripts registered.
    """
    stealth_options = self.stealth_manager.get_stealth_options()
    # Caller-supplied options win over the stealth defaults.
    merged_options = {**stealth_options, **context_options}
    # ``Browser.new_context`` only accepts keyword arguments; handing it the
    # dict positionally raises TypeError, so the options are unpacked.
    context = await browser.new_context(**merged_options)

    # Apply stealth scripts
    await context.add_init_script(self.stealth_manager.get_init_script())
    await context.add_init_script(
        self.stealth_manager.get_dimension_spoofing_script()
    )
    await context.add_init_script(self.cookie_acceptor.get_auto_acceptor_script())
    return context
# ########################################
# Persistent Context
# ########################################
async def _get_persistent_browser_with_profile_but_not_with_auth_async(
    self,
) -> Browser:
    """Return the persistent browser, launching it first when necessary.

    A launch happens when no browser is stored yet or the stored one
    reports that it is no longer connected.

    Returns:
        The persistent browser instance.
    """
    if (
        self._persistent_browser is None
        or self._persistent_browser.is_connected() is False
    ):
        # ``auth_manager`` is optional in ``__init__``; skip authentication
        # instead of failing with AttributeError when it is absent.
        if self.auth_manager is not None:
            await self.auth_manager.ensure_authenticate_async()
        await self._ensure_playwright_started_async()
        await self._ensure_extensions_installed_async()
        self._verify_xvfb_running()
        await self._launch_persistent_context_async()
    assert self._persistent_browser is not None
    return self._persistent_browser


async def _ensure_playwright_started_async(self):
    """Start the Playwright driver once and keep it on the instance."""
    if self._persistent_playwright is None:
        self._persistent_playwright = await async_playwright().start()


async def _ensure_extensions_installed_async(self):
    """Try to install the Chrome extensions when they are not verified.

    Installation errors are logged and not re-raised.
    """
    if not self.chrome_profile_manager.check_extensions_installed():
        logger.error(f"{self.name}: Chrome extensions not verified")
        try:
            logger.warning(f"{self.name}: Trying install extensions")
            await self.chrome_profile_manager.install_extensions_manually_if_not_installed_async()
        except Exception as e:
            logger.error(f"{self.name}: Installation failed: {str(e)}")


async def _launch_persistent_context_async(self):
    """Launch the persistent Chromium context and prepare it for use.

    Steps: remove stale Chrome lock files from the profile directory, kill
    Chrome processes still using the profile, launch the persistent
    context, close unwanted pages (once now, then continuously in a
    background task), register the stealth scripts and load the auth
    cookies. If a step after the launch fails, debug artifacts of the
    first open page are captured and the exception is re-raised.
    """
    persistent_context_launch_options = (
        self._build_persistent_context_launch_options()
    )

    # Clean up any existing singleton lock files that might prevent browser launch
    profile_dir = self.chrome_profile_manager.profile_dir

    # Multiple possible lock file locations
    lock_files = [
        profile_dir / "SingletonLock",
        profile_dir / "SingletonSocket",
        profile_dir / "SingletonCookie",
        profile_dir / "lockfile",
    ]

    removed_locks = 0
    for lock_file in lock_files:
        if lock_file.exists():
            try:
                lock_file.unlink()
                logger.debug(
                    f"{self.name}: Removed Chrome lock file: {lock_file.name}"
                )
                removed_locks += 1
            except Exception as e:
                logger.warning(
                    f"{self.name}: Could not remove {lock_file.name}: {e}"
                )

    if removed_locks > 0:
        logger.debug(f"{self.name}: Cleaned up {removed_locks} Chrome lock files")
        # Wait a moment for the system to release file handles. A
        # non-blocking sleep keeps the event loop responsive.
        await asyncio.sleep(1)

    # Kill any lingering Chrome processes using this profile
    try:
        profile_path_str = str(profile_dir)
        result = subprocess.run(
            ["pkill", "-f", f"user-data-dir={profile_path_str}"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            logger.debug(
                f"{self.name}: Killed lingering Chrome processes for this profile"
            )
            # Give processes time to fully terminate
            await asyncio.sleep(2)
    except Exception as e:
        logger.debug(f"{self.name}: Chrome process cleanup attempt: {e}")

    # This shows a small screen with the 4 extensions shown
    persistent_context_launch_options["headless"] = False
    self._persistent_context = (
        await self._persistent_playwright.chromium.launch_persistent_context(
            **persistent_context_launch_options
        )
    )

    try:
        # First cleanup run (immediate, non-continuous)
        await close_unwanted_pages(
            self._persistent_context, delay_sec=1, continuous=False
        )

        # Background continuous monitoring task. The event loop only holds
        # a weak reference to tasks, so keep a strong one on the instance to
        # prevent the task from being garbage-collected mid-execution.
        self._close_unwanted_pages_task = asyncio.create_task(
            close_unwanted_pages(
                self._persistent_context, delay_sec=5, continuous=True
            )
        )

        await self._apply_stealth_scripts_to_persistent_context_async()
        await self._load_auth_cookies_to_persistent_context_async()
        self._persistent_browser = self._persistent_context.browser
    except Exception:
        # Post-mortem of persistent-context setup failure: capture the
        # first open page's screenshot + HTML if one exists.
        from scitex_browser.debugging import capture_debug_artifacts_async

        pages = self._persistent_context.pages
        if pages:
            await capture_debug_artifacts_async(
                pages[0], label="persistent_context_launch_error"
            )
        raise
def _verify_xvfb_running(self, _recursed=False):
    """Check that the Xvfb display is up, starting it once if it is not.

    Args:
        _recursed: Internal flag set on the re-check after a start attempt
            so that Xvfb is started at most once.

    Returns:
        True when ``xdpyinfo`` succeeds for the display, False when the
        display is still unavailable after the start attempt.
    """
    try:
        probe = subprocess.run(
            ["xdpyinfo", "-display", f":{self.display}"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception as e:
        logger.debug(f"{self.name}: xdpyinfo failed ({e}); assuming display absent")
        display_up = False
    else:
        display_up = probe.returncode == 0

    if display_up:
        logger.debug(f"{self.name}: Xvfb display :{self.display} is running")
        return True

    if _recursed:
        # Already tried to start it once; give up.
        logger.error(f"{self.name}: Xvfb :{self.display} failed to start")
        return False

    logger.debug(f"{self.name}: Starting Xvfb display :{self.display}")

    # Remove any Xvfb process still bound to this display before restarting.
    subprocess.run(["pkill", "-f", f"Xvfb.*:{self.display}"], capture_output=True)
    time.sleep(0.5)

    display_spec = f":{self.display}"
    xvfb_command = [
        "Xvfb",
        display_spec,
        "-screen",
        "0",
        "1920x1080x24",
        "-ac",
        "+extension",
        "GLX",
        "+extension",
        "RANDR",
        "+render",
        "-noreset",
        "-dpi",
        "96",
    ]
    xvfb_env = dict(os.environ, DISPLAY=display_spec)
    subprocess.Popen(xvfb_command, env=xvfb_env)

    # Allow the server some time to come up, then verify exactly once more.
    time.sleep(3)
    return self._verify_xvfb_running(_recursed=True)
def _build_persistent_context_launch_options(self):
    """Assemble the keyword options for ``launch_persistent_context``.

    Returns:
        A dict with the profile directory, headless flag, Chromium
        command-line arguments, download settings, proxy, and the viewport
        and screen sizes.
    """
    stealth_args = self.stealth_manager.get_stealth_options_additional()
    extension_args = self.chrome_profile_manager.get_extension_args()

    # Target display and window size are appended to the stealth arguments.
    stealth_args.extend(
        [
            f"--display=:{self.display}",
            "--window-size=1920,1080",
        ]
    )

    launch_args = (
        extension_args
        + stealth_args
        # Extension-related switches
        + [
            "--disable-extensions-file-access-check",
            "--disable-extensions-http-throttling",
            "--disable-component-extensions-with-background-pages",
        ]
        # Disable "Restore pages?" popup and session restore dialogs
        + [
            "--disable-session-crashed-bubble",
            "--disable-infobars",
            "--no-first-run",
            "--no-default-browser-check",
        ]
        # PDF download switches
        + [
            "--always-open-pdf-externally",
            "--disable-plugins-discovery",
            "--plugin-policy=block",
        ]
        # Screenshot-related switches
        + [
            "--no-sandbox",
            "--disable-blink-features=AutomationControlled",
            "--disable-features=VizDisplayCompositor",
            "--disable-web-security",
            "--disable-features=TranslateUI",
            "--disable-ipc-flooding-protection",
            "--font-render-hinting=none",
            "--disable-font-subpixel-positioning",
            "--disable-remote-fonts",
            "--disable-background-timer-throttling",
            "--disable-backgrounding-occluded-windows",
            "--disable-renderer-backgrounding",
            "--disable-font-loading-api",
        ]
    )

    # Debug: Show window args for stealth mode
    if self.spoof_dimension:
        window_args = list(filter(lambda arg: "window-" in arg, launch_args))
        logger.debug(f"{self.name}: Stealth window args: {window_args}")

    # Downloads go to the scholar library downloads folder.
    downloads_path = self.config.get_library_downloads_dir()

    options = {
        "user_data_dir": str(self.chrome_profile_manager.profile_dir),
        "headless": self.headless,
        "args": launch_args,
        "accept_downloads": True,  # Enable download handling
        "downloads_path": str(downloads_path),  # Custom download directory
        "proxy": None,
    }
    # Viewport and screen share the same dimensions but get separate dicts.
    for size_key in ("viewport", "screen"):
        options[size_key] = {
            "width": self.viewport_size[0],
            "height": self.viewport_size[1],
        }
    return options


async def _apply_stealth_scripts_to_persistent_context_async(self):
    """Register the stealth, dimension-spoofing and cookie-acceptor init
    scripts on the persistent context, in that order."""
    script_builders = (
        self.stealth_manager.get_init_script,
        self.stealth_manager.get_dimension_spoofing_script,
        self.cookie_acceptor.get_auto_acceptor_script,
    )
    for build_script in script_builders:
        await self._persistent_context.add_init_script(build_script())
async def _load_auth_cookies_to_persistent_context_async(self):
    """Copy the auth manager's cookies into the persistent context.

    Does nothing when no auth manager is set, when it reports that it is
    not authenticated, or when it returns no cookies. Any exception raised
    along the way is logged as a warning and not re-raised.
    """
    if not self.auth_manager:
        logger.debug(
            f"{self.name}: No auth_manager available, skipping cookie loading"
        )
        return

    try:
        # Cached authentication state only; no live verification.
        authenticated = await self.auth_manager.is_authenticate_async(
            verify_live=False
        )
        if not authenticated:
            logger.debug(f"{self.name}: Not authenticated, skipping cookie loading")
            return

        cookies = await self.auth_manager.get_auth_cookies_async()
        if not cookies:
            logger.debug(f"{self.name}: No cookies available from auth manager")
            return

        await self._persistent_context.add_cookies(cookies)
        logger.info(
            f"{self.name}: Loaded {len(cookies)} authentication cookies into persistent browser context"
        )
    except Exception as e:
        logger.warning(f"{self.name}: Failed to load authentication cookies: {e}")
async def take_screenshot_async(
    self,
    page,
    path: Union[str, Path],
    timeout_sec: float = 30.0,
    timeout_after_sec: float = 30.0,
    full_page: bool = False,
):
    """Save a screenshot of ``page`` without changing its viewport.

    Failures are logged and not re-raised.

    Args:
        page: Page to capture.
        path: Destination file for the image.
        timeout_sec: Screenshot timeout in seconds (converted to
            milliseconds for the ``page.screenshot`` call).
        timeout_after_sec: Accepted for interface compatibility; not used
            by this method.
        full_page: Whether to capture the full scrollable page.
    """
    try:
        screenshot_kwargs = {
            "path": path,
            "timeout": timeout_sec * 1000,
            "full_page": full_page,
        }
        await page.screenshot(**screenshot_kwargs)
        logger.info(f"{self.name}: Saved: {path}")
    except Exception as e:
        logger.fail(f"{self.name}: Screenshot failed for {path}: {e}")
async def start_periodic_screenshots_async(
    self,
    page,
    output_dir: Union[str, Path],
    prefix: str = "periodic",
    interval_seconds: int = 1,
    duration_seconds: int = 10,
    verbose: bool = False,
):
    """Start taking periodic screenshots in the background.

    Args:
        page: The page to screenshot.
        output_dir: Directory the screenshot files are written into.
        prefix: Prefix for screenshot filenames.
        interval_seconds: Seconds to sleep between screenshots.
        duration_seconds: Total duration to take screenshots (0 = infinite).
        verbose: Whether to log each screenshot and each failure.

    Returns:
        asyncio.Task that can be cancelled to stop the screenshots.
    """

    async def take_periodic_screenshots():
        waited_seconds = 0
        step = 0
        while True:
            # Stop once a finite duration has been used up; a duration of 0
            # keeps the loop running until the task is cancelled.
            if duration_seconds != 0 and not waited_seconds < duration_seconds:
                break

            step += 1
            # Trim microseconds down to milliseconds.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
            filename = f"{prefix}_step{step:03d}_{timestamp}-{self.browser_mode}.png"
            path = os.path.join(str(output_dir), filename)

            try:
                await page.screenshot(path=path)
                if verbose:
                    logger.debug(f"{self.name}: Screenshot {step}: {path}")
                elif step == 1:
                    logger.debug(
                        f"{self.name}: Started periodic screenshots: {prefix}_*"
                    )
            except Exception as e:
                # Failures are only reported in verbose mode.
                if verbose:
                    logger.debug(f"{self.name}: Screenshot {step} failed: {e}")

            await asyncio.sleep(interval_seconds)
            waited_seconds += interval_seconds

        logger.debug(
            f"{self.name}: Completed {step} periodic screenshots for {prefix}"
        )

    # Run the loop as a background task and hand it back to the caller.
    return asyncio.create_task(take_periodic_screenshots())
async def stop_periodic_screenshots_async(self, task: asyncio.Task):
    """Cancel a periodic-screenshot task and wait for it to finish.

    Args:
        task: Task returned by ``start_periodic_screenshots_async``. A
            falsy or already finished task is left untouched.
    """
    if not task or task.done():
        return

    task.cancel()
    try:
        # Awaiting lets the cancellation propagate into the task.
        await task
    except asyncio.CancelledError:
        logger.debug(f"{self.name}: Periodic screenshots stopped")
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Exit the async context by delegating to the parent class.

    The parent's result is awaited but not returned, so this method always
    returns ``None`` and does not itself suppress an in-flight exception.
    """
    await super().__aexit__(exc_type, exc_val, exc_tb)
async def close(self):
    """Close browser while preserving authentication and extension data.

    Closes the persistent context and browser (unless the browser is
    already disconnected), then stops the Playwright instance. Errors are
    logged as warnings and not re-raised. The persistent references are
    always reset afterwards; ``auth_manager`` and ``chrome_profile_manager``
    are kept.
    """
    context = self._persistent_context
    browser = self._persistent_browser
    playwright = self._persistent_playwright

    try:
        try:
            # ``context.browser`` may be None, so guard before asking
            # whether it is still connected.
            context_browser = context.browser if context else None
            already_closed = (
                context_browser is not None
                and not context_browser.is_connected()
            )
            if already_closed:
                logger.debug(f"{self.name}: Browser already closed")
            else:
                if context:
                    await context.close()
                    logger.debug(f"{self.name}: Closed persistent browser context")

                if browser and browser.is_connected():
                    await browser.close()
                    logger.debug(f"{self.name}: Closed persistent browser")
        except Exception as e:
            logger.warning(f"{self.name}: Error during browser cleanup: {e}")

        # Always stop Playwright, even when the browser was already closed
        # or closing it failed; otherwise the reference is dropped below
        # while the driver keeps running.
        try:
            if playwright:
                await playwright.stop()
                logger.debug(f"{self.name}: Stopped Playwright instance")
        except Exception as e:
            logger.warning(f"{self.name}: Error during browser cleanup: {e}")
    finally:
        # Reset references but keep auth_manager and chrome_profile_manager
        self._persistent_context = None
        self._persistent_browser = None
        self._persistent_playwright = None
if __name__ == "__main__":
    import argparse

    async def main(browser_mode="interactive"):
        """Example usage of ScholarBrowserManager with stealth features.

        Opens each configured test site, saves a screenshot of it, and
        closes the browser manager afterwards.

        Args:
            browser_mode: ``"interactive"`` or ``"stealth"``.
        """
        from scitex_scholar import ScholarAuthManager, ScholarBrowserManager

        browser_manager = ScholarBrowserManager(
            chrome_profile_name="system",
            browser_mode=browser_mode,
            auth_manager=ScholarAuthManager(),
        )

        try:
            (
                browser,
                context,
            ) = await browser_manager.get_authenticated_browser_and_context_async()

            page = await context.new_page()

            # Test sites configuration
            test_sites = [
                # {
                #     "name": "Extensions Test",
                #     "url": "",
                #     "screenshot_spath": "/tmp/openathens_test.png",
                # },
                # {
                #     "name": "SSO Test",
                #     "url": "https://sso.unimelb.edu.au/",
                #     "screenshot_spath": "/tmp/unimelb_sso_test.png",
                # },
                # {
                #     "name": "OpenAthens",
                #     "url": "https://my.openathens.net/account",
                #     "screenshot_spath": "/tmp/openathens_test.png",
                # },
                # {
                #     "name": "CAPTCHA Test",
                #     "url": "https://www.google.com/recaptcha/api2/demo",
                #     "screenshot_spath": "/tmp/captcha_test.png",
                # },
                {
                    "name": "Nature Test",
                    "url": "https://www.nature.com/articles/s41593-025-01990-7",
                    "screenshot_spath": "/tmp/nature_test.png",
                },
                # {
                #     "name": "Google Test",
                #     "url": "https://www.google.com",
                #     "screenshot_spath": "/tmp/google_test.png",
                # },
            ]

            # Run tests for each site; one failing site does not stop the rest.
            for site in test_sites:
                try:
                    await page.goto(
                        site["url"], wait_until="domcontentloaded", timeout=30000
                    )
                    await browser_manager.take_screenshot_async(
                        page, site["screenshot_spath"]
                    )
                except Exception as e:
                    logger.fail(f"Failed to process {site['name']}: {e}")
                    continue
        finally:
            # Release the context, browser and Playwright driver even when
            # the run fails part-way.
            await browser_manager.close()

    parser = argparse.ArgumentParser(description="ScholarBrowserManager testing")
    parser.add_argument(
        "--stealth",
        action="store_true",
        help="Use stealth mode (default: interactive)",
    )
    args = parser.parse_args()

    browser_mode = "stealth" if args.stealth else "interactive"
    asyncio.run(main(browser_mode=browser_mode))

# python -m scitex_scholar.browser.ScholarBrowserManager --stealth
# python -m scitex_scholar.browser.ScholarBrowserManager
# EOF