# auto-scraper/scrape.py
# Scrape a website with undetected-chromedriver, optionally through an
# authenticated HTTP proxy, and write the page text to OUTPUT_FILE.
import zipfile
import undetected_chromedriver as uc
import os
import sys
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup
# Runtime configuration comes entirely from environment variables so the
# scraper can be driven by a container/orchestrator without CLI parsing.
website = os.environ.get('WEBSITE')
browser_path = os.environ.get('BROWSER_PATH', '/usr/bin/google-chrome')
browser_args = os.environ.get('BROWSER_ARGS', '').split(',') if os.environ.get('BROWSER_ARGS') else None
proxy_url = os.environ.get('PROXY_URL')
output_file = os.environ.get('OUTPUT_FILE', '/output/output.txt')

if not website:
    print("No website specified")
    sys.exit(1)

options = uc.ChromeOptions()
if browser_args:
    for b_arg in browser_args:
        b_arg = b_arg.strip()
        if b_arg:  # skip empty entries from trailing/double commas
            # Accept args given either bare ("headless") or already dashed
            # ("--headless") without producing "----headless".
            options.add_argument(f"--{b_arg.lstrip('-')}")

if proxy_url:
    # Chrome has no command-line flag for authenticated proxies, so package a
    # throwaway Manifest-V2 extension that configures the proxy and answers
    # the onAuthRequired challenge with the credentials from PROXY_URL.
    parsed = urlparse(proxy_url)
    proxy_host = parsed.hostname
    proxy_port = parsed.port
    proxy_username = parsed.username
    proxy_password = parsed.password
    if not proxy_host or proxy_port is None:
        # A missing host/port would be interpolated as None into
        # background.js (e.g. parseInt(None)) and silently break proxying.
        print(f"PROXY_URL must include host and port: {proxy_url}")
        sys.exit(1)
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": ["proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"],
        "background": {"scripts": ["background.js"], "persistent": true},
        "minimum_chrome_version": "76.0.0"
    }
    """
    # NOTE(review): if PROXY_URL carries no credentials these interpolate as
    # the literal string "None"; most proxies then reject auth — confirm
    # callers always supply username/password.
    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "http",
                host: "{proxy_host}",
                port: parseInt({proxy_port})
            }},
            bypassList: ["localhost"]
        }}
    }};
    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
    function callbackFn(details) {{
        return {{
            authCredentials: {{
                username: "{proxy_username}",
                password: "{proxy_password}"
            }}
        }};
    }}
    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: ["<all_urls>"]}},
        ['blocking']
    );
    """
    with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
        zip_file.writestr("manifest.json", manifest_json)
        zip_file.writestr("background.js", background_js)
    options.add_extension('proxy_auth.zip')

driver = uc.Chrome(
    browser_executable_path=browser_path,
    headless=True,
    use_subprocess=False,
    options=options
)
print(f"Cold start finished: {time.time()}")

output_text = ""
try:
    print(f"Start response time: {time.time()}")
    driver.get(website)
    # Pull the rendered DOM over CDP (rather than driver.page_source) so the
    # post-JavaScript document is captured.
    data = driver.execute_cdp_cmd('DOM.getDocument', {})
    if data:
        if 'root' in data:
            root_node_id = data['root']['nodeId']
            html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
            print(f"Stop response time: {time.time()}")
            soup = BeautifulSoup(html['outerHTML'], 'html.parser')
            output_text = soup.get_text()
        else:
            output_text = f"Got data without a root: {data}"
    else:
        output_text = "Didn't get any data..."
finally:
    # Always tear the browser down, even if navigation or CDP raised, so no
    # orphaned Chrome processes are left behind.
    driver.quit()

output_dir = os.path.dirname(output_file)
if output_dir:  # makedirs('') raises when OUTPUT_FILE is a bare filename
    os.makedirs(output_dir, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(output_text)