diff --git a/driver.py b/driver.py
deleted file mode 100644
index a3efd13..0000000
--- a/driver.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import zipfile
-import undetected_chromedriver as uc
-import sys
-import argparse
-
-from urllib.parse import urlparse
-from bs4 import BeautifulSoup
-
-parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
-
-parser.add_argument('website', help='Desired website you want to scrape')
-parser.add_argument('-b', '--browser-path', help='Path to browser binary',
-                    default='/usr/bin/google-chrome')
-parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
-parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
-
-args = parser.parse_args()
-
-options = uc.ChromeOptions()
-if args.browser_args:
-    for b_arg in args.browser_args:
-        options.add_argument(f'--{b_arg}')
-
-if args.proxy_url:
-    parsed = urlparse(args.proxy_url)
-
-    proxy_host = parsed.hostname
-    proxy_port = parsed.port
-    proxy_username = parsed.username
-    proxy_password = parsed.password
-
-    manifest_json = """
-{
-    "version": "1.0.0",
-    "manifest_version": 2,
-    "name": "Chrome Proxy",
-    "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
-    "background": {"scripts": ["background.js"], "persistent": true},
-    "minimum_chrome_version": "76.0.0"
-}
-"""
-
-    background_js = f"""
-var config = {{
-    mode: "fixed_servers",
-    rules: {{
-        singleProxy: {{
-            scheme: "http",
-            host: "{proxy_host}",
-            port: parseInt({proxy_port})
-        }},
-        bypassList: ["localhost"]
-    }}
-}};
-
-chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
-
-function callbackFn(details) {{
-    return {{
-        authCredentials: {{
-            username: "{proxy_username}",
-            password: "{proxy_password}"
-        }}
-    }};
-}}
-
-chrome.webRequest.onAuthRequired.addListener(
-    callbackFn,
-    {{urls: ["<all_urls>"]}},
-    ['blocking']
-);
-"""
-
-
-    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
-        zip_file.writestr("manifest.json", manifest_json)
-        zip_file.writestr("background.js", background_js)
-
-    options.add_extension('proxy_auth.zip')
-
-driver = uc.Chrome(
-    browser_executable_path=args.browser_path,
-    headless=True,
-    use_subprocess=False,
-    options=options
-)
-driver.get(args.website)
-
-data = driver.execute_cdp_cmd('DOM.getDocument', {})
-if data:
-    if 'root' in data:
-        root_node_id = data['root']['nodeId']
-        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
-        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
-        print(soup.get_text())
-    else:
-        print("Got data without a root:", data)
-else:
-    print("Didn't get any data...")
-
diff --git a/scrape.py b/scrape.py
index 76d5f50..a3efd13 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,91 +1,100 @@
-import requests
-import websockets
-import json
-import asyncio
-from pprint import pprint
+import zipfile
+import undetected_chromedriver as uc
+import sys
+import argparse

-# TODO: Use docker thing to start a docker service
-# TODO: Accept command line args for docker image
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup

-async def scrape():
-    id_count = [0]
-    def get_id():
-        id_count[0] += 1
-        return id_count[0]
+parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')

-    response = requests.get("http://localhost:3000/json")
-    targets = response.json()
+parser.add_argument('website', help='Desired website you want to scrape')
+parser.add_argument('-b', '--browser-path', help='Path to browser binary',
+                    default='/usr/bin/google-chrome')
+parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
+parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')

-    if not targets:
-        print("No active sessions found")
-        return
+args = parser.parse_args()

-    websocket_url = targets[0]['webSocketDebuggerUrl']
-    print(f"Connecting to: {websocket_url}")
+options = uc.ChromeOptions()
+if args.browser_args:
+    for b_arg in args.browser_args:
+        options.add_argument(f'--{b_arg}')

-    async with websockets.connect(websocket_url) as ws:
-        for elem in ["DOM", "Page"]:
-            print("Enabling", elem)
-            await ws.send(json.dumps({
-                "id": get_id(),
-                "method": f"{elem}.enable"
-            }))
-            # await asyncio.sleep(1)
-            response = await ws.recv()
-            print(f"{elem} enabled:", json.loads(response))
+if args.proxy_url:
+    parsed = urlparse(args.proxy_url)

-        print("Staring up")
+    proxy_host = parsed.hostname
+    proxy_port = parsed.port
+    proxy_username = parsed.username
+    proxy_password = parsed.password

-        await ws.send(json.dumps({
-            "id": get_id(),
-            "method": "Page.navigate",
-            "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
-            # "params": {"url": "https://ferano.io"}
-        }))
+    manifest_json = """
+{
+    "version": "1.0.0",
+    "manifest_version": 2,
+    "name": "Chrome Proxy",
+    "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
+    "background": {"scripts": ["background.js"], "persistent": true},
+    "minimum_chrome_version": "76.0.0"
+}
+"""

-        print("Send navigate request")
+    background_js = f"""
+var config = {{
+    mode: "fixed_servers",
+    rules: {{
+        singleProxy: {{
+            scheme: "http",
+            host: "{proxy_host}",
+            port: parseInt({proxy_port})
+        }},
+        bypassList: ["localhost"]
+    }}
+}};

-        while True:
-            response = await ws.recv()
-            data = json.loads(response)
-            if data.get("method") == "Page.loadEventFired":
-                break
+chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

-        print("Got loadEventFired event")
-        print("Get Document...")
+function callbackFn(details) {{
+    return {{
+        authCredentials: {{
+            username: "{proxy_username}",
+            password: "{proxy_password}"
+        }}
+    }};
+}}

-        await ws.send(json.dumps({
-            "id": get_id(),
-            "method": "DOM.getDocument"
-        }))
+chrome.webRequest.onAuthRequired.addListener(
+    callbackFn,
+    {{urls: ["<all_urls>"]}},
+    ['blocking']
+);
+"""
+

-        print("Woot")
-        document_id = id_count[0]  # Store the ID we just used
-        while True:
-            response = await ws.recv()
-            data = json.loads(response)
+    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
+        zip_file.writestr("manifest.json", manifest_json)
+        zip_file.writestr("background.js", background_js)

-            # Check if this is the response to our DOM.getDocument request
-            if data.get("id") == document_id:
-                root_node_id = data['result']['root']['nodeId']
-                await ws.send(json.dumps({
-                    "id": get_id(),
-                    "method": "DOM.getOuterHTML",
-                    "params": {"nodeId": root_node_id}
-                }))
+    options.add_extension('proxy_auth.zip')

-                html_id = id_count[0]
-                while True:
-                    response = await ws.recv()
-                    data = json.loads(response)
-                    if data.get("id") == html_id and "result" in data:
-                        html_content = data['result']['outerHTML']
-                        print(html_content)
-                        break
-                    else:
-                        print("Received event:", data)
-                print("Something happened")
-                break
-
+driver = uc.Chrome(
+    browser_executable_path=args.browser_path,
+    headless=True,
+    use_subprocess=False,
+    options=options
+)
+driver.get(args.website)
+
+data = driver.execute_cdp_cmd('DOM.getDocument', {})
+if data:
+    if 'root' in data:
+        root_node_id = data['root']['nodeId']
+        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
+        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
+        print(soup.get_text())
+    else:
+        print("Got data without a root:", data)
+else:
+    print("Didn't get any data...")

-asyncio.run(scrape())
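
Usage note (not part of the diff): after this change, scrape.py carries the undetected_chromedriver flow that previously lived in driver.py, and the old CDP-over-websockets scraper is gone. A sketch of an invocation, using the flag names defined in the diff with a placeholder site and a made-up proxy endpoint, might look like:

    python scrape.py https://example.com --proxy-url http://user:pass@proxy.example:8080 --browser-args no-sandbox disable-gpu

The --browser-args values are given here without leading dashes because the script prepends "--" to each one before passing it to Chrome; --browser-path only needs to be set if Chrome is not at /usr/bin/google-chrome.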