Replace scrape.py with contents of driver.py
parent 84059a208a
commit 4084ca505f
driver.py (deleted, 100 lines)
@@ -1,100 +0,0 @@
-import zipfile
-import undetected_chromedriver as uc
-import sys
-import argparse
-
-from urllib.parse import urlparse
-from bs4 import BeautifulSoup
-
-parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
-
-parser.add_argument('website', help='Desired website you want to scrape')
-parser.add_argument('-b', '--browser-path', help='Path to browser binary',
-                    default='/usr/bin/google-chrome')
-parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
-parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
-
-args = parser.parse_args()
-
-options = uc.ChromeOptions()
-if args.browser_args:
-    for b_arg in args.browser_args:
-        options.add_argument(f'--{b_arg}')
-
-if args.proxy_url:
-    parsed = urlparse(args.proxy_url)
-
-    proxy_host = parsed.hostname
-    proxy_port = parsed.port
-    proxy_username = parsed.username
-    proxy_password = parsed.password
-
-    manifest_json = """
-    {
-        "version": "1.0.0",
-        "manifest_version": 2,
-        "name": "Chrome Proxy",
-        "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
-        "background": {"scripts": ["background.js"], "persistent": true},
-        "minimum_chrome_version": "76.0.0"
-    }
-    """
-
-    background_js = f"""
-    var config = {{
-        mode: "fixed_servers",
-        rules: {{
-            singleProxy: {{
-                scheme: "http",
-                host: "{proxy_host}",
-                port: parseInt({proxy_port})
-            }},
-            bypassList: ["localhost"]
-        }}
-    }};
-
-    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
-
-    function callbackFn(details) {{
-        return {{
-            authCredentials: {{
-                username: "{proxy_username}",
-                password: "{proxy_password}"
-            }}
-        }};
-    }}
-
-    chrome.webRequest.onAuthRequired.addListener(
-        callbackFn,
-        {{urls: ["<all_urls>"]}},
-        ['blocking']
-    );
-    """
-
-    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
-        zip_file.writestr("manifest.json", manifest_json)
-        zip_file.writestr("background.js", background_js)
-
-    options.add_extension('proxy_auth.zip')
-
-driver = uc.Chrome(
-    browser_executable_path=args.browser_path,
-    headless=True,
-    use_subprocess=False,
-    options=options
-)
-driver.get(args.website)
-
-data = driver.execute_cdp_cmd('DOM.getDocument', {})
-if data:
-    if 'root' in data:
-        root_node_id = data['root']['nodeId']
-        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
-        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
-        print(soup.get_text())
-    else:
-        print("Got data without a root:", data)
-else:
-    print("Didn't get any data...")
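Both the deleted driver.py above and the rewritten scrape.py below handle authenticated proxies the same way: headless Chrome has no flag for proxy credentials, so the script zips up a throwaway Manifest V2 extension whose background script pins a fixed proxy and answers onAuthRequired with the username and password parsed from the -p URL. A minimal standalone sketch of that packaging step, assuming the same http://user:pass@host:port format (the helper name and output path are illustrative, not part of this commit):

import zipfile
from urllib.parse import urlparse

def build_proxy_extension(proxy_url, out_path='proxy_auth.zip'):
    """Package a throwaway Chrome extension that hard-codes the proxy
    server and credentials parsed from an http://user:pass@host:port URL."""
    parsed = urlparse(proxy_url)
    # Manifest V2 is required for the blocking webRequest API used below.
    manifest = """{
      "name": "Chrome Proxy",
      "version": "1.0.0",
      "manifest_version": 2,
      "permissions": ["proxy", "tabs", "unlimitedStorage", "storage",
                      "<all_urls>", "webRequest", "webRequestBlocking"],
      "background": {"scripts": ["background.js"], "persistent": true}
    }"""
    # Doubled braces are literal braces in the f-string; single braces interpolate.
    background = f"""
    chrome.proxy.settings.set({{
      value: {{mode: "fixed_servers",
               rules: {{singleProxy: {{scheme: "http",
                                       host: "{parsed.hostname}",
                                       port: {parsed.port}}},
                        bypassList: ["localhost"]}}}},
      scope: "regular"
    }}, function() {{}});
    chrome.webRequest.onAuthRequired.addListener(
      function(details) {{
        return {{authCredentials: {{username: "{parsed.username}",
                                    password: "{parsed.password}"}}}};
      }},
      {{urls: ["<all_urls>"]}},
      ["blocking"]
    );
    """
    with zipfile.ZipFile(out_path, 'w') as zf:
        zf.writestr('manifest.json', manifest)
        zf.writestr('background.js', background)
    return out_path

The script itself then simply passes the resulting zip to options.add_extension() before constructing the driver, as the diff shows.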
scrape.py (155 changed lines)
@@ -1,91 +1,100 @@
-import requests
-import websockets
-import json
-import asyncio
-from pprint import pprint
-
-# TODO: Use docker thing to start a docker service
-# TODO: Accept command line args for docker image
-
-async def scrape():
-    id_count = [0]
-    def get_id():
-        id_count[0] += 1
-        return id_count[0]
-
-    response = requests.get("http://localhost:3000/json")
-    targets = response.json()
-
-    if not targets:
-        print("No active sessions found")
-        return
-
-    websocket_url = targets[0]['webSocketDebuggerUrl']
-    print(f"Connecting to: {websocket_url}")
-
-    async with websockets.connect(websocket_url) as ws:
-        for elem in ["DOM", "Page"]:
-            print("Enabling", elem)
-            await ws.send(json.dumps({
-                "id": get_id(),
-                "method": f"{elem}.enable"
-            }))
-            # await asyncio.sleep(1)
-            response = await ws.recv()
-            print(f"{elem} enabled:", json.loads(response))
-
-        print("Staring up")
-
-        await ws.send(json.dumps({
-            "id": get_id(),
-            "method": "Page.navigate",
-            "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
-            # "params": {"url": "https://ferano.io"}
-        }))
-
-        print("Send navigate request")
-
-        while True:
-            response = await ws.recv()
-            data = json.loads(response)
-            if data.get("method") == "Page.loadEventFired":
-                break
-
-        print("Got loadEventFired event")
-        print("Get Document...")
-
-        await ws.send(json.dumps({
-            "id": get_id(),
-            "method": "DOM.getDocument"
-        }))
-
-        print("Woot")
-
-        document_id = id_count[0]  # Store the ID we just used
-        while True:
-            response = await ws.recv()
-            data = json.loads(response)
-
-            # Check if this is the response to our DOM.getDocument request
-            if data.get("id") == document_id:
-                root_node_id = data['result']['root']['nodeId']
-                await ws.send(json.dumps({
-                    "id": get_id(),
-                    "method": "DOM.getOuterHTML",
-                    "params": {"nodeId": root_node_id}
-                }))
-
-                html_id = id_count[0]
-                while True:
-                    response = await ws.recv()
-                    data = json.loads(response)
-                    if data.get("id") == html_id and "result" in data:
-                        html_content = data['result']['outerHTML']
-                        print(html_content)
-                        break
-            else:
-                print("Received event:", data)
-            print("Something happened")
-            break
-
-asyncio.run(scrape())
+import zipfile
+import undetected_chromedriver as uc
+import sys
+import argparse
+
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+
+parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
+
+parser.add_argument('website', help='Desired website you want to scrape')
+parser.add_argument('-b', '--browser-path', help='Path to browser binary',
+                    default='/usr/bin/google-chrome')
+parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
+parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
+
+args = parser.parse_args()
+
+options = uc.ChromeOptions()
+if args.browser_args:
+    for b_arg in args.browser_args:
+        options.add_argument(f'--{b_arg}')
+
+if args.proxy_url:
+    parsed = urlparse(args.proxy_url)
+
+    proxy_host = parsed.hostname
+    proxy_port = parsed.port
+    proxy_username = parsed.username
+    proxy_password = parsed.password
+
+    manifest_json = """
+    {
+        "version": "1.0.0",
+        "manifest_version": 2,
+        "name": "Chrome Proxy",
+        "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
+        "background": {"scripts": ["background.js"], "persistent": true},
+        "minimum_chrome_version": "76.0.0"
+    }
+    """
+
+    background_js = f"""
+    var config = {{
+        mode: "fixed_servers",
+        rules: {{
+            singleProxy: {{
+                scheme: "http",
+                host: "{proxy_host}",
+                port: parseInt({proxy_port})
+            }},
+            bypassList: ["localhost"]
+        }}
+    }};
+
+    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
+
+    function callbackFn(details) {{
+        return {{
+            authCredentials: {{
+                username: "{proxy_username}",
+                password: "{proxy_password}"
+            }}
+        }};
+    }}
+
+    chrome.webRequest.onAuthRequired.addListener(
+        callbackFn,
+        {{urls: ["<all_urls>"]}},
+        ['blocking']
+    );
+    """
+
+    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
+        zip_file.writestr("manifest.json", manifest_json)
+        zip_file.writestr("background.js", background_js)
+
+    options.add_extension('proxy_auth.zip')
+
+driver = uc.Chrome(
+    browser_executable_path=args.browser_path,
+    headless=True,
+    use_subprocess=False,
+    options=options
+)
+driver.get(args.website)
+
+data = driver.execute_cdp_cmd('DOM.getDocument', {})
+if data:
+    if 'root' in data:
+        root_node_id = data['root']['nodeId']
+        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
+        soup = BeautifulSoup(html['outerHTML'], 'html.parser')
+        print(soup.get_text())
+    else:
+        print("Got data without a root:", data)
+else:
+    print("Didn't get any data...")
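After this change, scrape.py is invoked the way driver.py was, roughly python scrape.py https://example.com -p http://user:pass@proxy.example:8080 (the site and proxy here are placeholders). The -p value is pulled apart with urllib.parse.urlparse, so the credentials and endpoint must be embedded in the URL itself; a quick illustrative check of what the script reads out of such a URL (values are made up):

from urllib.parse import urlparse

# Hypothetical proxy URL in the format the -p/--proxy-url help text documents.
parsed = urlparse('http://scrapeuser:s3cret@proxy.example:8080')

print(parsed.hostname)   # proxy.example
print(parsed.port)       # 8080
print(parsed.username)   # scrapeuser
print(parsed.password)   # s3cret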