Start pod, split scripts into orchestrator/scrape, track perf metrics
This commit is contained in:
parent
4084ca505f
commit
2debe2aee1
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,2 +1,3 @@
|
|||||||
/.env
|
/.env
|
||||||
/proxy_auth.zip
|
/proxy_auth.zip
|
||||||
|
/output/
|
||||||
|
@ -6,9 +6,9 @@ RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/
|
|||||||
|
|
||||||
RUN pip3 install --break-system-packages undetected-chromedriver beautifulsoup4
|
RUN pip3 install --break-system-packages undetected-chromedriver beautifulsoup4
|
||||||
|
|
||||||
COPY driver.py /app/
|
COPY scrape.py /app/
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
EXPOSE 3000
|
VOLUME ["/output"]
|
||||||
|
|
||||||
CMD ["google-chrome", "--headless", "--no-sandbox", "--disable-gpu", "--remote-debugging-port=3000", "--remote-debugging-address=0.0.0.0"]
|
CMD ["python3", "scrape.py"]
|
||||||
|
@ -3,5 +3,5 @@
|
|||||||
docker stop search-api
|
docker stop search-api
|
||||||
docker rm search-api
|
docker rm search-api
|
||||||
docker build -t search-api .
|
docker build -t search-api .
|
||||||
docker run -d -p 3000:3000 --name search-api search-api
|
# docker run -d -p 3000:3000 --name search-api search-api
|
||||||
# docker exec -it search-api python driver.py /usr/bin/chromium-browser https://ferano.io
|
# docker exec -it search-api python driver.py /usr/bin/chromium-browser https://ferano.io
|
||||||
|
128
orchestrator.py
Normal file
128
orchestrator.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
import argparse
|
||||||
|
import docker
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import pprint
|
||||||
|
|
||||||
|
# Matches the timing markers that scrape.py prints to stdout inside the
# container. Each pattern is anchored to its marker text so that unrelated
# digits elsewhere on the line cannot be mistaken for the timestamp.
LOG_MARKER_PATTERNS = {
    'cold_start': re.compile(r'Cold start\D*(\d+(?:\.\d+)?)'),
    'scrape_start': re.compile(r'Start response\D*(\d+(?:\.\d+)?)'),
    'scrape_stop': re.compile(r'Stop response\D*(\d+(?:\.\d+)?)'),
}


def build_parser():
    """Return the command-line parser for the orchestrator."""
    parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
    parser.add_argument('website', help='Desired website you want to scrape')
    parser.add_argument('-b', '--browser-path', help='Path to browser binary',
                        default='/usr/bin/google-chrome')
    parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
    parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
    parser.add_argument('-i', '--image-name', help='Name of the docker image',
                        default='search-api')
    return parser


def cpu_percent_from_stats(stats):
    """Return the CPU usage percentage for one decoded stats sample.

    Returns None when the sample lacks the fields needed for the calculation
    or when the system CPU counter did not advance between the two readings.
    """
    cpu = stats.get('cpu_stats') or {}
    precpu = stats.get('precpu_stats') or {}
    online_cpus = cpu.get('online_cpus')
    if not online_cpus:
        return None

    cpu_delta = ((cpu.get('cpu_usage') or {}).get('total_usage', 0)
                 - (precpu.get('cpu_usage') or {}).get('total_usage', 0))
    system_delta = cpu.get('system_cpu_usage', 0) - precpu.get('system_cpu_usage', 0)
    if system_delta <= 0:
        return None
    return (cpu_delta / system_delta) * online_cpus * 100.0


def memory_mb_from_stats(stats):
    """Return the sample's memory usage in MB, or None if it is not reported."""
    usage = (stats.get('memory_stats') or {}).get('usage')
    if usage is None:
        return None
    return usage / 1024 / 1024


def network_totals_from_stats(stats):
    """Return (rx_bytes, tx_bytes) summed over every interface in the sample."""
    networks = stats.get('networks')
    if not isinstance(networks, dict):
        return 0, 0
    rx_bytes = sum(iface.get('rx_bytes', 0) for iface in networks.values())
    tx_bytes = sum(iface.get('tx_bytes', 0) for iface in networks.values())
    return rx_bytes, tx_bytes


def parse_log_timestamps(logs):
    """Extract the timing markers from the container logs.

    Returns a dict with the keys 'cold_start', 'scrape_start' and
    'scrape_stop'. A value is None when its marker never appears in the logs
    (for example because the scraper failed before reaching that point). When
    a marker appears more than once, the last occurrence wins.
    """
    found = dict.fromkeys(LOG_MARKER_PATTERNS)
    for line in logs.splitlines():
        for key, pattern in LOG_MARKER_PATTERNS.items():
            match = pattern.search(line)
            if match:
                found[key] = float(match.group(1))
    return found


def format_duration(start, stop):
    """Return 'X.XXX seconds' for the interval, or 'n/a' if either end is missing."""
    if start is None or stop is None:
        return "n/a"
    return f"{(stop - start):.3f} seconds"


def monitor_container(container):
    """Sample the container's stats stream until the container stops running.

    Returns a dict with the peak CPU percentage ('max_cpu_percent'), the peak
    memory usage in MB ('max_memory_mb') and the highest observed byte
    counters ('rx_bytes', 'tx_bytes').
    """
    peaks = {'max_cpu_percent': 0.0, 'max_memory_mb': 0.0, 'rx_bytes': 0, 'tx_bytes': 0}
    stats_stream = container.stats(stream=True)

    while True:
        # Check the status before asking for another sample so the loop ends
        # once the container is no longer running.
        container.reload()
        if container.status != 'running':
            break

        try:
            sample = json.loads(next(stats_stream).decode('utf-8'))
        except StopIteration:
            # The stream ended; this is a normal end of monitoring.
            break
        except Exception as e:
            # Monitoring is best-effort: report the problem and stop sampling,
            # but let the caller still collect the container's logs.
            print('Error:', e)
            break

        cpu_percent = cpu_percent_from_stats(sample)
        if cpu_percent is not None:
            peaks['max_cpu_percent'] = max(peaks['max_cpu_percent'], cpu_percent)

        memory_mb = memory_mb_from_stats(sample)
        if memory_mb is not None:
            peaks['max_memory_mb'] = max(peaks['max_memory_mb'], memory_mb)

        # NOTE(review): the byte counters are presumably cumulative, so the
        # largest sample is taken as the total -- confirm against the Docker
        # stats documentation.
        rx_bytes, tx_bytes = network_totals_from_stats(sample)
        peaks['rx_bytes'] = max(peaks['rx_bytes'], rx_bytes)
        peaks['tx_bytes'] = max(peaks['tx_bytes'], tx_bytes)

    return peaks


def main():
    """Run the scraper image once and report performance and timing figures."""
    total_start = time.time()
    args = build_parser().parse_args()

    docker_client = docker.from_env()

    try:
        docker_client.images.get(args.image_name)
    except docker.errors.ImageNotFound:
        print(f"Image {args.image_name} not found")
        sys.exit(1)

    env_vars = {
        'WEBSITE': args.website,
        'BROWSER_PATH': args.browser_path,
        'OUTPUT_FILE': '/output/output.txt',
    }
    if args.browser_args:
        env_vars['BROWSER_ARGS'] = ','.join(args.browser_args)
    if args.proxy_url:
        env_vars['PROXY_URL'] = args.proxy_url

    # The scraper writes its result into ./output next to this script.
    # NOTE(review): the directory is not created here; presumably Docker
    # creates a missing bind-mount source -- confirm.
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'output')

    container = None
    try:
        container_start_time = time.time()
        print("Container start time:", container_start_time)

        # The container is not auto-removed because its logs are read after
        # it exits; it is removed explicitly in the `finally` clause below.
        container = docker_client.containers.run(
            args.image_name,
            detach=True,
            environment=env_vars,
            # The `,z` part is needed to avoid SELinux issues
            volumes={output_dir: {'bind': '/output', 'mode': 'rw,z'}},
            user='0',
        )

        peaks = monitor_container(container)
        max_cpu_percent = peaks['max_cpu_percent']
        max_memory_mb = peaks['max_memory_mb']
        total_bytes = peaks['rx_bytes'] + peaks['tx_bytes']
        print(
            "Performance:\n"
            f"Max CPU: {max_cpu_percent:.2f}%\n"
            f"Max Memory: {max_memory_mb:.2f}MB\n"
            f"Total Transfer: {total_bytes / 1024} KBs"
        )

        container.wait()

        print("Container logs:")
        logs = container.logs().decode('utf-8')
        for line in logs.strip().split('\n'):
            print(f"\t{line}")

        # NOTE(review): the cold-start figure subtracts a host timestamp from
        # one printed inside the container, so it assumes both share a clock.
        markers = parse_log_timestamps(logs)
        print(f"Cold Start Time: {format_duration(container_start_time, markers['cold_start'])}")
        print(f"Time to Response: {format_duration(markers['scrape_start'], markers['scrape_stop'])}")
        print(f"Completed everything in: {(time.time() - total_start):.3f} seconds")

    except Exception as e:
        print(f"Error running container: {e}")
        sys.exit(1)
    finally:
        # Always clean up, including on failure, so repeated runs do not
        # accumulate stopped containers.
        if container is not None:
            try:
                container.remove(force=True)
            except Exception as e:
                print(f"Could not remove container: {e}")


if __name__ == "__main__":
    main()
|
56
scrape.py
56
scrape.py
@ -1,28 +1,30 @@
|
|||||||
import zipfile
|
import zipfile
|
||||||
import undetected_chromedriver as uc
|
import undetected_chromedriver as uc
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
import time
|
||||||
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
|
# Get parameters from environment variables
|
||||||
|
website = os.environ.get('WEBSITE')
|
||||||
|
browser_path = os.environ.get('BROWSER_PATH', '/usr/bin/google-chrome')
|
||||||
|
browser_args = os.environ.get('BROWSER_ARGS', '').split(',') if os.environ.get('BROWSER_ARGS') else None
|
||||||
|
proxy_url = os.environ.get('PROXY_URL')
|
||||||
|
output_file = os.environ.get('OUTPUT_FILE', '/output/output.txt')
|
||||||
|
|
||||||
parser.add_argument('website', help='Desired website you want to scrape')
|
if not website:
|
||||||
parser.add_argument('-b', '--browser-path', help='Path to browser binary',
|
print("No website specified")
|
||||||
default='/usr/bin/google-chrome')
|
sys.exit(1)
|
||||||
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
|
|
||||||
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
options = uc.ChromeOptions()
|
options = uc.ChromeOptions()
|
||||||
if args.browser_args:
|
if browser_args:
|
||||||
for b_arg in args.browser_args:
|
for b_arg in browser_args:
|
||||||
options.add_argument(f'--{b_arg}')
|
if b_arg.strip(): # Skip empty strings
|
||||||
|
options.add_argument(f'--{b_arg.strip()}')
|
||||||
|
|
||||||
if args.proxy_url:
|
if proxy_url:
|
||||||
parsed = urlparse(args.proxy_url)
|
parsed = urlparse(proxy_url)
|
||||||
|
|
||||||
proxy_host = parsed.hostname
|
proxy_host = parsed.hostname
|
||||||
proxy_port = parsed.port
|
proxy_port = parsed.port
|
||||||
@ -71,7 +73,6 @@ chrome.webRequest.onAuthRequired.addListener(
|
|||||||
);
|
);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
|
with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
|
||||||
zip_file.writestr("manifest.json", manifest_json)
|
zip_file.writestr("manifest.json", manifest_json)
|
||||||
zip_file.writestr("background.js", background_js)
|
zip_file.writestr("background.js", background_js)
|
||||||
@ -79,22 +80,35 @@ chrome.webRequest.onAuthRequired.addListener(
|
|||||||
options.add_extension('proxy_auth.zip')
|
options.add_extension('proxy_auth.zip')
|
||||||
|
|
||||||
driver = uc.Chrome(
|
driver = uc.Chrome(
|
||||||
browser_executable_path=args.browser_path,
|
browser_executable_path=browser_path,
|
||||||
headless=True,
|
headless=True,
|
||||||
use_subprocess=False,
|
use_subprocess=False,
|
||||||
options=options
|
options=options
|
||||||
)
|
)
|
||||||
driver.get(args.website)
|
print(f"Cold start finished: {time.time()}")
|
||||||
|
|
||||||
|
print(f"Start response time: {time.time()}")
|
||||||
|
driver.get(website)
|
||||||
|
|
||||||
data = driver.execute_cdp_cmd('DOM.getDocument', {})
|
data = driver.execute_cdp_cmd('DOM.getDocument', {})
|
||||||
|
output_text = ""
|
||||||
|
|
||||||
if data:
|
if data:
|
||||||
if 'root' in data:
|
if 'root' in data:
|
||||||
root_node_id = data['root']['nodeId']
|
root_node_id = data['root']['nodeId']
|
||||||
html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
|
html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
|
||||||
|
print(f"Stop response time: {time.time()}")
|
||||||
soup = BeautifulSoup(html['outerHTML'], 'html.parser')
|
soup = BeautifulSoup(html['outerHTML'], 'html.parser')
|
||||||
print(soup.get_text())
|
output_text = soup.get_text()
|
||||||
else:
|
else:
|
||||||
print("Got data without a root:", data)
|
output_text = f"Got data without a root: {data}"
|
||||||
else:
|
else:
|
||||||
print("Didn't get any data...")
|
output_text = "Didn't get any data..."
|
||||||
|
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
|
output_dir = os.path.dirname(output_file)
|
||||||
|
|
||||||
|
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
||||||
|
with open(output_file, 'w') as f:
|
||||||
|
f.write(output_text)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user