Argument parsing for a better CLI experience

This commit is contained in:
Joseph Ferano 2025-07-30 18:43:04 +07:00
parent 3700f26dc3
commit 84059a208a

View File

@@ -1,21 +1,35 @@
import zipfile import zipfile
import undetected_chromedriver as uc import undetected_chromedriver as uc
import sys import sys
from dotenv import load_dotenv import argparse
from urllib.parse import urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
if len(sys.argv) < 3: parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
sys.exit("usage: driver.py <path-to-browser> <site-to-scrape>")
parser.add_argument('website', help='Desired website you want to scrape')
parser.add_argument('-b', '--browser-path', help='Path to browser binary',
default='/usr/bin/google-chrome')
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
load_dotenv() args = parser.parse_args()
proxy_host = os.getenv('PROXY_HOST') options = uc.ChromeOptions()
proxy_port = os.getenv('PROXY_PORT') if args.browser_args:
username = os.getenv('PROXY_USERNAME') for b_arg in args.browser_args:
password = os.getenv('PROXY_PASSWORD') options.add_argument(f'--{b_arg}')
manifest_json = """ if args.proxy_url:
parsed = urlparse(args.proxy_url)
proxy_host = parsed.hostname
proxy_port = parsed.port
proxy_username = parsed.username
proxy_password = parsed.password
manifest_json = """
{ {
"version": "1.0.0", "version": "1.0.0",
"manifest_version": 2, "manifest_version": 2,
@@ -26,13 +40,13 @@ manifest_json = """
} }
""" """
background_js = f""" background_js = f"""
var config = {{ var config = {{
mode: "fixed_servers", mode: "fixed_servers",
rules: {{ rules: {{
singleProxy: {{ singleProxy: {{
scheme: "http", scheme: "http",
host: "{proxy_server}", host: "{proxy_host}",
port: parseInt({proxy_port}) port: parseInt({proxy_port})
}}, }},
bypassList: ["localhost"] bypassList: ["localhost"]
@@ -44,8 +58,8 @@ chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
function callbackFn(details) {{ function callbackFn(details) {{
return {{ return {{
authCredentials: {{ authCredentials: {{
username: "{username}", username: "{proxy_username}",
password: "{password}" password: "{proxy_password}"
}} }}
}}; }};
}} }}
@@ -58,23 +72,19 @@ chrome.webRequest.onAuthRequired.addListener(
""" """
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file: with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
zip_file.writestr("manifest.json", manifest_json) zip_file.writestr("manifest.json", manifest_json)
zip_file.writestr("background.js", background_js) zip_file.writestr("background.js", background_js)
options = uc.ChromeOptions() options.add_extension('proxy_auth.zip')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_extension('proxy_auth.zip')
driver = uc.Chrome( driver = uc.Chrome(
browser_executable_path=sys.argv[1], browser_executable_path=args.browser_path,
headless=True, headless=True,
use_subprocess=False, use_subprocess=False,
options=options options=options
) )
driver.get(sys.argv[2]) driver.get(args.website)
data = driver.execute_cdp_cmd('DOM.getDocument', {}) data = driver.execute_cdp_cmd('DOM.getDocument', {})
if data: if data: