Replace scrape.py with contents of driver.py

Joseph Ferano 2025-07-30 18:44:41 +07:00
parent 84059a208a
commit 4084ca505f
2 changed files with 83 additions and 174 deletions

driver.py

@@ -1,100 +0,0 @@
import zipfile
import undetected_chromedriver as uc
import sys
import argparse
from urllib.parse import urlparse
from bs4 import BeautifulSoup

parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
parser.add_argument('website', help='Desired website you want to scrape')
parser.add_argument('-b', '--browser-path', help='Path to browser binary',
                    default='/usr/bin/google-chrome')
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')

args = parser.parse_args()

options = uc.ChromeOptions()

if args.browser_args:
    for b_arg in args.browser_args:
        options.add_argument(f'--{b_arg}')

if args.proxy_url:
    parsed = urlparse(args.proxy_url)
    proxy_host = parsed.hostname
    proxy_port = parsed.port
    proxy_username = parsed.username
    proxy_password = parsed.password

    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
        "background": {"scripts": ["background.js"], "persistent": true},
        "minimum_chrome_version": "76.0.0"
    }
    """

    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "http",
                host: "{proxy_host}",
                port: parseInt({proxy_port})
            }},
            bypassList: ["localhost"]
        }}
    }};

    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

    function callbackFn(details) {{
        return {{
            authCredentials: {{
                username: "{proxy_username}",
                password: "{proxy_password}"
            }}
        }};
    }}

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: ["<all_urls>"]}},
        ['blocking']
    );
    """

    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
        zip_file.writestr("manifest.json", manifest_json)
        zip_file.writestr("background.js", background_js)

    options.add_extension('proxy_auth.zip')

driver = uc.Chrome(
    browser_executable_path=args.browser_path,
    headless=True,
    use_subprocess=False,
    options=options
)

driver.get(args.website)

data = driver.execute_cdp_cmd('DOM.getDocument', {})
if data:
    if 'root' in data:
        root_node_id = data['root']['nodeId']
        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
        print(soup.get_text())
    else:
        print("Got data without a root:", data)
else:
    print("Didn't get any data...")

scrape.py

@@ -1,91 +1,100 @@
Added: the new scrape.py is the contents of driver.py shown above.

Removed (the old scrape.py, a raw WebSocket client for the Chrome DevTools Protocol):

import requests
import websockets
import json
import asyncio
from pprint import pprint

# TODO: Use docker thing to start a docker service
# TODO: Accept command line args for docker image

async def scrape():
    id_count = [0]
    def get_id():
        id_count[0] += 1
        return id_count[0]

    response = requests.get("http://localhost:3000/json")
    targets = response.json()

    if not targets:
        print("No active sessions found")
        return

    websocket_url = targets[0]['webSocketDebuggerUrl']
    print(f"Connecting to: {websocket_url}")

    async with websockets.connect(websocket_url) as ws:
        for elem in ["DOM", "Page"]:
            print("Enabling", elem)
            await ws.send(json.dumps({
                "id": get_id(),
                "method": f"{elem}.enable"
            }))
            # await asyncio.sleep(1)
            response = await ws.recv()
            print(f"{elem} enabled:", json.loads(response))

        print("Staring up")

        await ws.send(json.dumps({
            "id": get_id(),
            "method": "Page.navigate",
            "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
            # "params": {"url": "https://ferano.io"}
        }))

        print("Send navigate request")

        while True:
            response = await ws.recv()
            data = json.loads(response)
            if data.get("method") == "Page.loadEventFired":
                break

        print("Got loadEventFired event")
        print("Get Document...")

        await ws.send(json.dumps({
            "id": get_id(),
            "method": "DOM.getDocument"
        }))

        print("Woot")
        document_id = id_count[0]  # Store the ID we just used
        while True:
            response = await ws.recv()
            data = json.loads(response)
            # Check if this is the response to our DOM.getDocument request
            if data.get("id") == document_id:
                root_node_id = data['result']['root']['nodeId']
                await ws.send(json.dumps({
                    "id": get_id(),
                    "method": "DOM.getOuterHTML",
                    "params": {"nodeId": root_node_id}
                }))
                html_id = id_count[0]
                while True:
                    response = await ws.recv()
                    data = json.loads(response)
                    if data.get("id") == html_id and "result" in data:
                        html_content = data['result']['outerHTML']
                        print(html_content)
                        break
                    else:
                        print("Received event:", data)
                print("Something happened")
                break

asyncio.run(scrape())
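The removed client assumed a DevTools endpoint was already listening on localhost:3000 (see the requests.get call and the TODOs about starting a Docker service); the commit doesn't record how that endpoint was provided. One illustrative way to expose an equivalent target list at http://localhost:3000/json is to start Chrome with remote debugging enabled:

    google-chrome --headless --remote-debugging-port=3000

That HTTP endpoint returns the list of debuggable targets, including the webSocketDebuggerUrl field the old script read before opening its WebSocket connection.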