"""Scrape a website with a stealth (undetected) Chrome browser, optionally
routing traffic through an authenticated proxy."""

import argparse
import zipfile
from urllib.parse import urlparse

import undetected_chromedriver as uc
from bs4 import BeautifulSoup

parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
parser.add_argument('website', help='Desired website you want to scrape')
parser.add_argument('-b', '--browser-path', help='Path to the browser binary', default='/usr/bin/google-chrome')
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional args to pass to the browser')
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
args = parser.parse_args()

options = uc.ChromeOptions()

if args.browser_args:
    for b_arg in args.browser_args:
        options.add_argument(f'--{b_arg}')

if args.proxy_url:
    # Chrome ignores credentials embedded in --proxy-server, so build a small
    # Manifest V2 extension that configures the proxy and answers the
    # authentication challenge with the supplied username and password.
    parsed = urlparse(args.proxy_url)
    proxy_host = parsed.hostname
    proxy_port = parsed.port
    proxy_username = parsed.username
    proxy_password = parsed.password

    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {"scripts": ["background.js"], "persistent": true},
        "minimum_chrome_version": "76.0.0"
    }
    """

    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "http",
                host: "{proxy_host}",
                port: parseInt({proxy_port})
            }},
            bypassList: ["localhost"]
        }}
    }};

    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

    function callbackFn(details) {{
        return {{
            authCredentials: {{
                username: "{proxy_username}",
                password: "{proxy_password}"
            }}
        }};
    }}

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: ["<all_urls>"]}},
        ['blocking']
    );
    """

    # Package the extension and load it into the browser session.
    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
        zip_file.writestr("manifest.json", manifest_json)
        zip_file.writestr("background.js", background_js)

    options.add_extension('proxy_auth.zip')

driver = uc.Chrome(
    browser_executable_path=args.browser_path,
    headless=True,
    use_subprocess=False,
    options=options
)

driver.get(args.website)

# Fetch the rendered DOM over the Chrome DevTools Protocol rather than
# driver.page_source, then strip the markup down to its visible text.
data = driver.execute_cdp_cmd('DOM.getDocument', {})
if data:
    if 'root' in data:
        root_node_id = data['root']['nodeId']
        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
        print(soup.get_text())
    else:
        print("Got data without a root:", data)
else:
    print("Didn't get any data...")

driver.quit()