Argument parsing for a better CLI experience
This commit is contained in:
parent
3700f26dc3
commit
84059a208a
52
driver.py
52
driver.py
@ -1,21 +1,35 @@
|
|||||||
import zipfile
|
import zipfile
|
||||||
import undetected_chromedriver as uc
|
import undetected_chromedriver as uc
|
||||||
import sys
|
import sys
|
||||||
from dotenv import load_dotenv
|
import argparse
|
||||||
|
|
||||||
|
from urllib.parse import urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
|
||||||
sys.exit("usage: driver.py <path-to-browser> <site-to-scrape>")
|
|
||||||
|
|
||||||
|
parser.add_argument('website', help='Desired website you want to scrape')
|
||||||
|
parser.add_argument('-b', '--browser-path', help='Path to browser binary',
|
||||||
|
default='/usr/bin/google-chrome')
|
||||||
|
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
|
||||||
|
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
|
||||||
|
|
||||||
load_dotenv()
|
args = parser.parse_args()
|
||||||
|
|
||||||
proxy_host = os.getenv('PROXY_HOST')
|
options = uc.ChromeOptions()
|
||||||
proxy_port = os.getenv('PROXY_PORT')
|
if args.browser_args:
|
||||||
username = os.getenv('PROXY_USERNAME')
|
for b_arg in args.browser_args:
|
||||||
password = os.getenv('PROXY_PASSWORD')
|
options.add_argument(f'--{b_arg}')
|
||||||
|
|
||||||
manifest_json = """
|
if args.proxy_url:
|
||||||
|
parsed = urlparse(args.proxy_url)
|
||||||
|
|
||||||
|
proxy_host = parsed.hostname
|
||||||
|
proxy_port = parsed.port
|
||||||
|
proxy_username = parsed.username
|
||||||
|
proxy_password = parsed.password
|
||||||
|
|
||||||
|
manifest_json = """
|
||||||
{
|
{
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"manifest_version": 2,
|
"manifest_version": 2,
|
||||||
@ -26,13 +40,13 @@ manifest_json = """
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
background_js = f"""
|
background_js = f"""
|
||||||
var config = {{
|
var config = {{
|
||||||
mode: "fixed_servers",
|
mode: "fixed_servers",
|
||||||
rules: {{
|
rules: {{
|
||||||
singleProxy: {{
|
singleProxy: {{
|
||||||
scheme: "http",
|
scheme: "http",
|
||||||
host: "{proxy_server}",
|
host: "{proxy_host}",
|
||||||
port: parseInt({proxy_port})
|
port: parseInt({proxy_port})
|
||||||
}},
|
}},
|
||||||
bypassList: ["localhost"]
|
bypassList: ["localhost"]
|
||||||
@ -44,8 +58,8 @@ chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
|
|||||||
function callbackFn(details) {{
|
function callbackFn(details) {{
|
||||||
return {{
|
return {{
|
||||||
authCredentials: {{
|
authCredentials: {{
|
||||||
username: "{username}",
|
username: "{proxy_username}",
|
||||||
password: "{password}"
|
password: "{proxy_password}"
|
||||||
}}
|
}}
|
||||||
}};
|
}};
|
||||||
}}
|
}}
|
||||||
@ -58,23 +72,19 @@ chrome.webRequest.onAuthRequired.addListener(
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
|
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
|
||||||
zip_file.writestr("manifest.json", manifest_json)
|
zip_file.writestr("manifest.json", manifest_json)
|
||||||
zip_file.writestr("background.js", background_js)
|
zip_file.writestr("background.js", background_js)
|
||||||
|
|
||||||
options = uc.ChromeOptions()
|
options.add_extension('proxy_auth.zip')
|
||||||
options.add_argument('--no-sandbox')
|
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
|
||||||
options.add_argument('--disable-gpu')
|
|
||||||
options.add_extension('proxy_auth.zip')
|
|
||||||
|
|
||||||
driver = uc.Chrome(
|
driver = uc.Chrome(
|
||||||
browser_executable_path=sys.argv[1],
|
browser_executable_path=args.browser_path,
|
||||||
headless=True,
|
headless=True,
|
||||||
use_subprocess=False,
|
use_subprocess=False,
|
||||||
options=options
|
options=options
|
||||||
)
|
)
|
||||||
driver.get(sys.argv[2])
|
driver.get(args.website)
|
||||||
|
|
||||||
data = driver.execute_cdp_cmd('DOM.getDocument', {})
|
data = driver.execute_cdp_cmd('DOM.getDocument', {})
|
||||||
if data:
|
if data:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user