"""Fetch a web page with headless Chrome and save its visible text to a file.

Configuration is read from environment variables:

* ``WEBSITE`` (required): URL to load.
* ``BROWSER_PATH``: Chrome executable, default ``/usr/bin/google-chrome``.
* ``BROWSER_ARGS``: comma-separated Chrome switches *without* the leading
  ``--`` (for example ``no-sandbox,disable-gpu``).
* ``PROXY_URL``: optional proxy as ``http://user:pass@host:port``.
* ``OUTPUT_FILE``: destination file, default ``/output/output.txt``.
"""

import json
import os
import sys
import time
import zipfile
from urllib.parse import unquote, urlparse

import undetected_chromedriver as uc
from bs4 import BeautifulSoup

# Get parameters from environment variables
website = os.environ.get('WEBSITE')
browser_path = os.environ.get('BROWSER_PATH', '/usr/bin/google-chrome')
browser_args = (
    os.environ.get('BROWSER_ARGS', '').split(',')
    if os.environ.get('BROWSER_ARGS')
    else None
)
proxy_url = os.environ.get('PROXY_URL')
output_file = os.environ.get('OUTPUT_FILE', '/output/output.txt')

if not website:
    print("No website specified")
    sys.exit(1)

options = uc.ChromeOptions()
if browser_args:
    for b_arg in browser_args:
        if b_arg.strip():  # Skip empty strings
            options.add_argument(f'--{b_arg.strip()}')

if proxy_url:
    parsed = urlparse(proxy_url)
    proxy_host = parsed.hostname
    proxy_port = parsed.port
    # Credentials embedded in a URL are percent-encoded; decode them before
    # handing them to the browser. Missing parts become empty strings rather
    # than the literal text "None".
    proxy_username = unquote(parsed.username or '')
    proxy_password = unquote(parsed.password or '')

    if not proxy_host:
        # Happens e.g. when the scheme is omitted ("host:8080"); the URL itself
        # is not echoed because it may contain credentials.
        print("PROXY_URL has no host component")
        sys.exit(1)

    # Chrome cannot take proxy credentials on the command line, so a tiny
    # throw-away extension configures the proxy and answers auth challenges.
    manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {"scripts": ["background.js"], "persistent": true},
    "minimum_chrome_version": "76.0.0"
}
"""

    single_proxy = {"scheme": "http", "host": proxy_host}
    if proxy_port is not None:
        # Leave the port out when the URL has none so Chrome applies the
        # scheme's default instead of receiving an invalid value.
        single_proxy["port"] = proxy_port
    proxy_config = {
        "mode": "fixed_servers",
        "rules": {"singleProxy": single_proxy, "bypassList": ["localhost"]},
    }
    auth_credentials = {
        "username": proxy_username,
        "password": proxy_password,
    }

    # json.dumps produces correctly escaped JavaScript literals, so quotes or
    # backslashes in the host or credentials cannot break the script.
    background_js = f"""
var config = {json.dumps(proxy_config)};

chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

function callbackFn(details) {{
    return {{authCredentials: {json.dumps(auth_credentials)}}};
}}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {{urls: ["<all_urls>"]}},
    ['blocking']
);
"""

    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
        zip_file.writestr("manifest.json", manifest_json)
        zip_file.writestr("background.js", background_js)
    options.add_extension('proxy_auth.zip')

# NOTE(review): some Chrome headless modes do not load extensions - confirm
# the proxy extension actually takes effect with headless=True.
driver = uc.Chrome(
    browser_executable_path=browser_path,
    headless=True,
    use_subprocess=False,
    options=options,
)
# Timing markers: browser start-up is complete, the page request begins now.
print(f"Cold start finished: {time.time()}")
print(f"Start response time: {time.time()}")

# Load the page and pull the rendered DOM through the DevTools protocol.
# The browser is shut down in ``finally`` so a navigation or CDP failure
# does not leave a Chrome process behind.
try:
    driver.get(website)
    data = driver.execute_cdp_cmd('DOM.getDocument', {})
    if not data:
        output_text = "Didn't get any data..."
    elif 'root' not in data:
        output_text = f"Got data without a root: {data}"
    else:
        root_node_id = data['root']['nodeId']
        html = driver.execute_cdp_cmd(
            'DOM.getOuterHTML', {"nodeId": root_node_id}
        )
        print(f"Stop response time: {time.time()}")
        # Strip the markup and keep only the text content of the page.
        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
        output_text = soup.get_text()
finally:
    driver.quit()

# Create the destination directory only when the path has one: for a bare
# filename os.path.dirname() returns '' and os.makedirs('') would raise.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Explicit UTF-8 so non-ASCII page text is written regardless of the locale.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output_text)