#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-11 07:53:46 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/browser/ScholarBrowserManager.py
# ----------------------------------------
from __future__ import annotations
import os
__FILE__ = "./src/scitex/scholar/browser/ScholarBrowserManager.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------
__FILE__ = __file__
import asyncio
import subprocess
import time
from datetime import datetime
from pathlib import Path
from typing import Union
import scitex_logging as logging
from playwright.async_api import Browser, BrowserContext, async_playwright
from scitex_browser.automation import CookieAutoAcceptor
from scitex_browser.core import BrowserMixin, ChromeProfileManager
from scitex_browser.stealth import StealthManager
from scitex_scholar.browser.utils.close_unwanted_pages import close_unwanted_pages
from scitex_scholar.config import ScholarConfig
logger = logging.getLogger(__name__)
"""
Browser Manager with persistent context support.
_persistent_context is a **persistent browser context** that stays alive across multiple operations.
## Regular vs Persistent Context
**Regular context** (new each time):
```python
browser = await playwright.chromium.launch()
context = await browser.new_context() # New context each time
page = await context.new_page()
```
**Persistent context** (reused):
```python
# Created once in _launch_persistent_context_async()
self._persistent_context = await self._persistent_playwright.chromium.launch_persistent_context(
user_data_dir=str(profile_dir), # Persistent profile
headless=False,
args=[...extensions...]
)
# Reused multiple times
if hasattr(self, "_persistent_context") and self._persistent_context:
context = self._persistent_context # Same context
```
## Benefits of Persistent Context
1. **Extensions persist** - Extensions loaded once, available for all pages
2. **Authentication cookies persist** - No need to re-login
3. **Profile data persistent** - Bookmarks, history, settings maintained
4. **Performance** - Faster page creation (no browser restart)
5. **Session continuity** - Maintains login state across operations
## In Your Code
`_persistent_context` is set in `_launch_persistent_context_async()` and reused in `get_authenticated_browser_and_context_async()`. This allows multiple pages to share the same authenticated, extension-enabled browser session.
"""
[docs]
class ScholarBrowserManager(BrowserMixin):
"""Manages a local browser instance with stealth enhancements and invisible mode."""
[docs]
def __init__(
self,
browser_mode=None,
auth_manager=None,
chrome_profile_name=None,
config: ScholarConfig = None,
):
"""
Initialize ScholarBrowserManager with invisible browser capabilities.
Args:
auth_manager: Authentication manager instance
config: Scholar configuration instance
"""
# Store scholar_config for use by components like ChromeProfileManager
self.name = self.__class__.__name__
self.config = config or ScholarConfig()
# Browser
self.browser_mode = self.config.resolve(
"browser_mode", browser_mode, default="interactive"
)
super().__init__(mode=self.browser_mode)
self._set_interactive_or_stealth(browser_mode)
# Library Authentication
self.auth_manager = auth_manager
if auth_manager is None:
logger.fail(
f"{self.name}: auth_manager not passed. University Authentication will not be enabled."
)
# Chrome Extension
self.chrome_profile_manager = ChromeProfileManager(
chrome_profile_name,
chrome_cache_dir=self.config.get_cache_chrome_dir(
chrome_profile_name
).parent,
)
# Stealth
self.stealth_manager = StealthManager(self.viewport_size, self.spoof_dimension)
# Cookie
self.cookie_acceptor = CookieAutoAcceptor()
# Initialize persistent browser attributes
self._persistent_browser = None
self._persistent_context = None
self._persistent_playwright = None
def _set_interactive_or_stealth(self, browser_mode):
# Interactive or Stealth
if browser_mode == "interactive":
self.headless = False
self.spoof_dimension = False
self.viewport_size = (1920, 1080)
self.display = 0
elif browser_mode == "stealth":
# Must be False for dimension spoofing to work
self.headless = False
self.spoof_dimension = True
# This only affects internal viewport, not window size
# self.viewport_size = (1, 1)
self.viewport_size = (1920, 1080)
self.display = 99
else:
raise ValueError(
"browser_mode must be eighther of 'interactive' or 'stealth'"
)
logger.debug(f"{self.name}: Browser initialized:")
logger.debug(f"{self.name}: headless: {self.headless}")
logger.debug(f"{self.name}: spoof_dimension: {self.spoof_dimension}")
logger.debug(f"{self.name}: viewport_size: {self.viewport_size}")
[docs]
async def get_authenticated_browser_and_context_async(
self, **context_options
) -> tuple[Browser, BrowserContext]:
"""Get browser context with authentication cookies and extensions loaded."""
if self.auth_manager is None:
raise ValueError(
f"{self.name}: "
"Authentication manager is not set. "
"To use this method, please initialize ScholarBrowserManager with an auth_manager."
)
await self.auth_manager.ensure_authenticate_async()
browser = (
await self._get_persistent_browser_with_profile_but_not_with_auth_async()
)
if hasattr(self, "_persistent_context") and self._persistent_context:
context = self._persistent_context
logger.info(
f"{self.name}: Using persistent context with profile and extensions"
)
else:
logger.warning(f"{self.name}: Falling back to regular context creation")
auth_options = await self.auth_manager.get_auth_options()
context_options.update(auth_options)
context = await self._new_context_async(browser, **context_options)
return browser, context
[docs]
async def _new_context_async(
self, browser: Browser, **context_options
) -> BrowserContext:
"""Creates a new browser context with stealth options and invisible mode applied."""
stealth_options = self.stealth_manager.get_stealth_options()
context = await browser.new_context({**stealth_options, **context_options})
# Apply stealth script
await context.add_init_script(self.stealth_manager.get_init_script())
await context.add_init_script(
self.stealth_manager.get_dimension_spoofing_script()
)
await context.add_init_script(self.cookie_acceptor.get_auto_acceptor_script())
return context
# ########################################
# Persistent Context
# ########################################
async def _get_persistent_browser_with_profile_but_not_with_auth_async(
self,
) -> Browser:
if (
self._persistent_browser is None
or self._persistent_browser.is_connected() is False
):
await self.auth_manager.ensure_authenticate_async()
await self._ensure_playwright_started_async()
await self._ensure_extensions_installed_async()
self._verify_xvfb_running()
await self._launch_persistent_context_async()
assert self._persistent_browser is not None
return self._persistent_browser
async def _ensure_playwright_started_async(self):
if self._persistent_playwright is None:
self._persistent_playwright = await async_playwright().start()
async def _ensure_extensions_installed_async(self):
if not self.chrome_profile_manager.check_extensions_installed():
logger.error(f"{self.name}: Chrome extensions not verified")
try:
logger.warning(f"{self.name}: Trying install extensions")
await self.chrome_profile_manager.install_extensions_manually_if_not_installed_async()
except Exception as e:
logger.error(f"{self.name}: Installation failed: {str(e)}")
async def _launch_persistent_context_async(self):
persistent_context_launch_options = (
self._build_persistent_context_launch_options()
)
# # Create preferences to disable PDF viewer and force downloads
# self._set_pdf_download_preferences()
# Clean up any existing singleton lock files that might prevent browser launch
profile_dir = self.chrome_profile_manager.profile_dir
# Multiple possible lock file locations
lock_files = [
profile_dir / "SingletonLock",
profile_dir / "SingletonSocket",
profile_dir / "SingletonCookie",
profile_dir / "lockfile",
]
removed_locks = 0
for lock_file in lock_files:
if lock_file.exists():
try:
lock_file.unlink()
logger.debug(
f"{self.name}: Removed Chrome lock file: {lock_file.name}"
)
removed_locks += 1
except Exception as e:
logger.warning(
f"{self.name}: Could not remove {lock_file.name}: {e}"
)
if removed_locks > 0:
logger.debug(f"{self.name}: Cleaned up {removed_locks} Chrome lock files")
# Wait a moment for the system to release file handles
time.sleep(1)
# Kill any lingering Chrome processes using this profile
try:
profile_path_str = str(profile_dir)
# Find and kill Chrome processes using this profile
result = subprocess.run(
["pkill", "-f", f"user-data-dir={profile_path_str}"],
capture_output=True,
text=True,
)
if result.returncode == 0:
logger.debug(
f"{self.name}: Killed lingering Chrome processes for this profile"
)
time.sleep(2) # Give processes time to fully terminate
except Exception as e:
logger.debug(f"{self.name}: Chrome process cleanup attempt: {e}")
# This show_asyncs a small screen with 4 extensions show_asyncn
persistent_context_launch_options["headless"] = False
self._persistent_context = (
await self._persistent_playwright.chromium.launch_persistent_context(
**persistent_context_launch_options
)
)
try:
# First cleanup run (immediate, non-continuous)
await close_unwanted_pages(
self._persistent_context, delay_sec=1, continuous=False
)
# Background continuous monitoring task
asyncio.create_task(
close_unwanted_pages(
self._persistent_context, delay_sec=5, continuous=True
)
)
# await self._close_unwanted_extension_pages_async()
# asyncio.create_task(self._close_unwanted_extension_pages_async())
await self._apply_stealth_scripts_to_persistent_context_async()
await self._load_auth_cookies_to_persistent_context_async()
self._persistent_browser = self._persistent_context.browser
except Exception:
# Post-mortem of persistent-context setup failure: capture the
# first open page's screenshot + HTML if one exists.
from scitex_browser.debugging import capture_debug_artifacts_async
pages = self._persistent_context.pages
if pages:
await capture_debug_artifacts_async(
pages[0], label="persistent_context_launch_error"
)
raise
[docs]
def _verify_xvfb_running(self, _recursed=False):
"""Verify Xvfb virtual display is running; auto-start if absent."""
try:
result = subprocess.run(
["xdpyinfo", "-display", f":{self.display}"],
capture_output=True,
text=True,
timeout=5,
)
running = result.returncode == 0
except Exception as e:
logger.debug(f"{self.name}: xdpyinfo failed ({e}); assuming display absent")
running = False
if running:
logger.debug(f"{self.name}: Xvfb display :{self.display} is running")
return True
if _recursed:
logger.error(f"{self.name}: Xvfb :{self.display} failed to start")
return False
logger.debug(f"{self.name}: Starting Xvfb display :{self.display}")
subprocess.run(["pkill", "-f", f"Xvfb.*:{self.display}"], capture_output=True)
time.sleep(0.5)
subprocess.Popen(
[
"Xvfb",
f":{self.display}",
"-screen",
"0",
"1920x1080x24",
"-ac",
"+extension",
"GLX",
"+extension",
"RANDR",
"+render",
"-noreset",
"-dpi",
"96",
],
env={**os.environ, "DISPLAY": f":{self.display}"},
)
time.sleep(3)
return self._verify_xvfb_running(_recursed=True)
def _build_persistent_context_launch_options(self):
stealth_args = self.stealth_manager.get_stealth_options_additional()
extension_args = self.chrome_profile_manager.get_extension_args()
pdf_download_args = [
"--always-open-pdf-externally",
"--disable-plugins-discovery",
"--plugin-policy=block",
]
stealth_args.extend(
[
f"--display=:{self.display}",
"--window-size=1920,1080",
]
)
no_welcome_args = [
"--disable-extensions-file-access-check",
"--disable-extensions-http-throttling",
"--disable-component-extensions-with-background-pages",
]
# Disable "Restore pages?" popup and session restore dialogs
no_restore_args = [
"--disable-session-crashed-bubble",
"--disable-infobars",
"--no-first-run",
"--no-default-browser-check",
]
screenshot_args = [
"--no-sandbox",
"--disable-blink-features=AutomationControlled",
"--disable-features=VizDisplayCompositor",
"--disable-web-security",
"--disable-features=TranslateUI",
"--disable-ipc-flooding-protection",
"--font-render-hinting=none",
"--disable-font-subpixel-positioning",
"--disable-remote-fonts",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-renderer-backgrounding",
"--disable-font-loading-api",
]
launch_args = (
extension_args
+ stealth_args
+ no_welcome_args
+ no_restore_args
+ pdf_download_args
+ screenshot_args
)
# Debug: Show window args for stealth mode
if self.spoof_dimension:
window_args = [arg for arg in launch_args if "window-" in arg]
logger.debug(f"{self.name}: Stealth window args: {window_args}")
proxy_config = None
# Set download directory to scholar library downloads folder
downloads_path = self.config.get_library_downloads_dir()
return {
"user_data_dir": str(self.chrome_profile_manager.profile_dir),
"headless": self.headless,
"args": launch_args,
"accept_downloads": True, # Enable download handling
"downloads_path": str(downloads_path), # Set custom download directory
"proxy": proxy_config,
"viewport": {
"width": self.viewport_size[0],
"height": self.viewport_size[1],
},
"screen": {
"width": self.viewport_size[0],
"height": self.viewport_size[1],
},
}
async def _apply_stealth_scripts_to_persistent_context_async(self):
await self._persistent_context.add_init_script(
self.stealth_manager.get_init_script()
)
await self._persistent_context.add_init_script(
self.stealth_manager.get_dimension_spoofing_script()
)
await self._persistent_context.add_init_script(
self.cookie_acceptor.get_auto_acceptor_script()
)
[docs]
async def _load_auth_cookies_to_persistent_context_async(self):
"""Load authentication cookies into the persistent browser context."""
if not self.auth_manager:
logger.debug(
f"{self.name}: No auth_manager available, skipping cookie loading"
)
return
try:
# Check if we have authentication
if await self.auth_manager.is_authenticate_async(verify_live=False):
cookies = await self.auth_manager.get_auth_cookies_async()
if cookies:
await self._persistent_context.add_cookies(cookies)
logger.info(
f"{self.name}: Loaded {len(cookies)} authentication cookies into persistent browser context"
)
else:
logger.debug(f"{self.name}: No cookies available from auth manager")
else:
logger.debug(f"{self.name}: Not authenticated, skipping cookie loading")
except Exception as e:
logger.warning(f"{self.name}: Failed to load authentication cookies: {e}")
[docs]
async def take_screenshot_async(
self,
page,
path: Union[str, Path],
timeout_sec: float = 30.0,
timeout_after_sec: float = 30.0,
full_page: bool = False,
):
"""Take screenshot without viewport changes."""
try:
await page.screenshot(
path=path, timeout=timeout_sec * 1000, full_page=full_page
)
logger.info(f"{self.name}: Saved: {path}")
except Exception as e:
logger.fail(f"{self.name}: Screenshot failed for {path}: {e}")
[docs]
async def start_periodic_screenshots_async(
self,
page,
output_dir: Union[str, Path],
prefix: str = "periodic",
interval_seconds: int = 1,
duration_seconds: int = 10,
verbose: bool = False,
):
"""
Start taking periodic screenshots in the background.
Args:
page: The page to screenshot
prefix: Prefix for screenshot filenames
interval_seconds: Seconds between screenshots
duration_seconds: Total duration to take screenshots (0 = infinite)
verbose: Whether to log each screenshot
Returns:
asyncio.Task that can be cancelled to stop screenshots
"""
async def take_periodic_screenshots():
elapsed = 0
step = 0
while duration_seconds == 0 or elapsed < duration_seconds:
step += 1
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[
:-3
] # Include milliseconds
path = os.path.join(
str(output_dir),
f"{prefix}_step{step:03d}_{timestamp}-{self.browser_mode}.png",
)
try:
await page.screenshot(path=path)
if verbose:
logger.debug(f"{self.name}: Screenshot {step}: {path}")
elif step == 1:
logger.debug(
f"{self.name}: Started periodic screenshots: {prefix}_*"
)
except Exception as e:
if verbose:
logger.debug(f"{self.name}: Screenshot {step} failed: {e}")
await asyncio.sleep(interval_seconds)
elapsed += interval_seconds
logger.debug(
f"{self.name}: Completed {step} periodic screenshots for {prefix}"
)
# Start the task in background
task = asyncio.create_task(take_periodic_screenshots())
return task
[docs]
async def stop_periodic_screenshots_async(self, task: asyncio.Task):
"""Stop periodic screenshots task."""
if task and not task.done():
task.cancel()
try:
await task
except asyncio.CancelledError:
logger.debug(f"{self.name}: Periodic screenshots stopped")
async def __aexit__(self, exc_type, exc_val, exc_tb):
await super().__aexit__(exc_type, exc_val, exc_tb)
[docs]
async def close(self):
"""Close browser while preserving authentication and extension data."""
try:
if (
self._persistent_context
and not self._persistent_context.browser.is_connected()
):
logger.debug(f"{self.name}: Browser already closed")
return
if self._persistent_context:
await self._persistent_context.close()
logger.debug(f"{self.name}: Closed persistent browser context")
if self._persistent_browser and self._persistent_browser.is_connected():
await self._persistent_browser.close()
logger.debug(f"{self.name}: Closed persistent browser")
if self._persistent_playwright:
await self._persistent_playwright.stop()
logger.debug(f"{self.name}: Stopped Playwright instance")
except Exception as e:
logger.warning(f"{self.name}: Error during browser cleanup: {e}")
finally:
# Reset references but keep auth_manager and chrome_profile_manager
self._persistent_context = None
self._persistent_browser = None
self._persistent_playwright = None
if __name__ == "__main__":
async def main(browser_mode="interactive"):
"""Example usage of ScholarBrowserManager with stealth features."""
from scitex_scholar import ScholarAuthManager, ScholarBrowserManager
browser_manager = ScholarBrowserManager(
chrome_profile_name="system",
browser_mode=browser_mode,
auth_manager=ScholarAuthManager(),
)
(
browser,
context,
) = await browser_manager.get_authenticated_browser_and_context_async()
page = await context.new_page()
# Test sites configuration
test_sites = [
# {
# "name": "Extensions Test",
# "url": "",
# "screenshot_spath": "/tmp/openathens_test.png",
# },
# {
# "name": "SSO Test",
# "url": "https://sso.unimelb.edu.au/",
# "screenshot_spath": "/tmp/unimelb_sso_test.png",
# },
# {
# "name": "OpenAthens",
# "url": "https://my.openathens.net/account",
# "screenshot_spath": "/tmp/openathens_test.png",
# },
# {
# "name": "CAPTCHA Test",
# "url": "https://www.google.com/recaptcha/api2/demo",
# "screenshot_spath": "/tmp/captcha_test.png",
# },
{
"name": "Nature Test",
"url": "https://www.nature.com/articles/s41593-025-01990-7",
"screenshot_spath": "/tmp/nature_test.png",
},
# {
# "name": "Google Test",
# "url": "https://www.google.com",
# "screenshot_spath": "/tmp/google_test.png",
# },
]
# Run tests for each site
for site in test_sites:
try:
await page.goto(
site["url"], wait_until="domcontentloaded", timeout=30000
)
await browser_manager.take_screenshot_async(
page, site["screenshot_spath"]
)
except Exception as e:
logger.fail(f"Failed to process {site['name']}: {e}")
continue
import argparse
parser = argparse.ArgumentParser(description="ScholarBrowserManager testing")
parser.add_argument(
"--stealth",
action="store_true",
help="Use stealth mode (default: interactive)",
)
args = parser.parse_args()
browser_mode = "stealth" if args.stealth else "interactive"
asyncio.run(main(browser_mode=browser_mode))
# python -m scitex_scholar.browser.ScholarBrowserManager --stealth
# python -m scitex_scholar.browser.ScholarBrowserManager
# EOF