Get proxy working with dotenv, scrape with BeautifulSoup
This commit is contained in:
parent
8d821c36af
commit
3700f26dc3
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
/.env
|
||||||
|
/proxy_auth.zip
|
@ -4,7 +4,7 @@ USER root
|
|||||||
|
|
||||||
RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*
|
RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN pip3 install --break-system-packages undetected-chromedriver
|
# driver.py imports undetected_chromedriver, bs4 (beautifulsoup4) and
# dotenv (python-dotenv); all three must be installed in the image.
RUN pip3 install --break-system-packages undetected-chromedriver beautifulsoup4 python-dotenv
|
||||||
|
|
||||||
COPY driver.py /app/
|
COPY driver.py /app/
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
67
driver.py
67
driver.py
@ -1,13 +1,72 @@
|
|||||||
|
import zipfile
|
||||||
import undetected_chromedriver as uc
|
import undetected_chromedriver as uc
|
||||||
import sys
|
import sys
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
if len(sys.argv) < 3:
|
||||||
sys.exit("usage: driver.py <path-to-browser> <site-to-scrape>")
|
sys.exit("usage: driver.py <path-to-browser> <site-to-scrape>")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Proxy configuration and Chrome proxy-auth extension --------------------
# Imported here so this section stands on its own: `os` is not among the
# module-level imports, and `json` is only needed to build the JS literals
# further down.
import json
import os

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

proxy_host = os.getenv('PROXY_HOST')
proxy_port = os.getenv('PROXY_PORT')
username = os.getenv('PROXY_USERNAME')
password = os.getenv('PROXY_PASSWORD')

# Host and port are interpolated into the extension's background script; an
# unset value would yield a broken proxy configuration, so stop early with a
# clear message instead. Credentials stay optional: they are only used when
# the proxy actually issues an authentication challenge.
missing_settings = [
    name
    for name, value in (('PROXY_HOST', proxy_host), ('PROXY_PORT', proxy_port))
    if not value
]
if missing_settings:
    sys.exit("missing proxy settings: " + ", ".join(missing_settings))

# Manifest of the generated extension: it requests the proxy and blocking
# webRequest permissions and registers background.js as a persistent
# background script.
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
    "background": {"scripts": ["background.js"], "persistent": true},
    "minimum_chrome_version": "76.0.0"
}
"""

# Background script: routes traffic through the configured HTTP proxy and
# answers proxy authentication challenges with the configured credentials.
# Values are embedded with json.dumps so quotes or backslashes in them cannot
# break out of the JavaScript string literals.
background_js = f"""
var config = {{
    mode: "fixed_servers",
    rules: {{
        singleProxy: {{
            scheme: "http",
            host: {json.dumps(proxy_host)},
            port: parseInt({json.dumps(proxy_port)})
        }},
        bypassList: ["localhost"]
    }}
}};

chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

function callbackFn(details) {{
    return {{
        authCredentials: {{
            username: {json.dumps(username)},
            password: {json.dumps(password)}
        }}
    }};
}}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {{urls: ["<all_urls>"]}},
    ['blocking']
);
"""

# Package both files as proxy_auth.zip in the working directory.
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
    zip_file.writestr("manifest.json", manifest_json)
    zip_file.writestr("background.js", background_js)
|
||||||
|
|
||||||
options = uc.ChromeOptions()
|
options = uc.ChromeOptions()
|
||||||
options.add_argument('--no-sandbox')
|
options.add_argument('--no-sandbox')
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
options.add_argument('--disable-gpu')
|
options.add_argument('--disable-gpu')
|
||||||
|
options.add_extension('proxy_auth.zip')
|
||||||
|
|
||||||
driver = uc.Chrome(
|
driver = uc.Chrome(
|
||||||
browser_executable_path=sys.argv[1],
|
browser_executable_path=sys.argv[1],
|
||||||
@ -22,4 +81,10 @@ if data:
|
|||||||
if 'root' in data:
|
if 'root' in data:
|
||||||
root_node_id = data['root']['nodeId']
|
root_node_id = data['root']['nodeId']
|
||||||
html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
|
html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
|
||||||
print(html)
|
soup = BeautifulSoup(html['outerHTML'], 'html.parser')
|
||||||
|
print(soup.get_text())
|
||||||
|
else:
|
||||||
|
print("Got data without a root:", data)
|
||||||
|
else:
|
||||||
|
print("Didn't get any data...")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user