Get proxy working with dotenv, scrape with BeautifulSoup

This commit is contained in:
Joseph Ferano 2025-07-30 15:09:11 +07:00
parent 8d821c36af
commit 3700f26dc3
3 changed files with 69 additions and 2 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/.env
/proxy_auth.zip

View File

@ -4,7 +4,7 @@ USER root
RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*
RUN pip3 install --break-system-packages undetected-chromedriver
RUN pip3 install --break-system-packages undetected-chromedriver beautifulsoup4
COPY driver.py /app/
WORKDIR /app

View File

@ -1,13 +1,72 @@
import os
import sys
import zipfile

import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Scrape a site through an authenticated HTTP proxy using an undetected
# Chrome instance.  Chrome has no CLI flag for proxy username/password,
# so credentials are injected via a small generated MV2 extension.

if len(sys.argv) < 3:
    sys.exit("usage: driver.py <path-to-browser> <site-to-scrape>")

# Proxy settings come from a .env file (git-ignored, per this commit's
# .gitignore change) rather than being hard-coded.
# FIX: `os` was used below but never imported — added `import os` above.
load_dotenv()
proxy_host = os.getenv('PROXY_HOST')
proxy_port = os.getenv('PROXY_PORT')
username = os.getenv('PROXY_USERNAME')
password = os.getenv('PROXY_PASSWORD')

# Minimal manifest: "webRequestBlocking" is required so the
# onAuthRequired listener can answer the proxy's 407 synchronously.
manifest_json = """
{
"version": "1.0.0",
"manifest_version": 2,
"name": "Chrome Proxy",
"permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
"background": {"scripts": ["background.js"], "persistent": true},
"minimum_chrome_version": "76.0.0"
}
"""

# Background script: route all traffic through the fixed proxy and
# answer auth challenges with the credentials loaded above.
# FIX: the original interpolated `{proxy_server}`, an undefined name
# (NameError when this f-string is evaluated); the variable read from
# the environment is `proxy_host`.
background_js = f"""
var config = {{
mode: "fixed_servers",
rules: {{
singleProxy: {{
scheme: "http",
host: "{proxy_host}",
port: parseInt({proxy_port})
}},
bypassList: ["localhost"]
}}
}};
chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
function callbackFn(details) {{
return {{
authCredentials: {{
username: "{username}",
password: "{password}"
}}
}};
}}
chrome.webRequest.onAuthRequired.addListener(
callbackFn,
{{urls: ["<all_urls>"]}},
['blocking']
);
"""

# Chrome only accepts proxy-auth extensions as a packed .zip/.crx,
# so write the two files into an archive next to the script.
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
    zip_file.writestr("manifest.json", manifest_json)
    zip_file.writestr("background.js", background_js)

options = uc.ChromeOptions()
options.add_argument('--no-sandbox')           # required when running as root in the container
options.add_argument('--disable-dev-shm-usage')  # avoid small /dev/shm in Docker
options.add_argument('--disable-gpu')
options.add_extension('proxy_auth.zip')
driver = uc.Chrome(
browser_executable_path=sys.argv[1],
@ -22,4 +81,10 @@ if data:
if 'root' in data:
root_node_id = data['root']['nodeId']
html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
print(html)
soup = BeautifulSoup(html['outerHTML'], 'html.parser')
print(soup.get_text())
else:
print("Got data without a root:", data)
else:
print("Didn't get any data...")