import argparse
import json
import os
import re
import subprocess
import sys
import time

import docker
from jinja2 import Template

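# Wall-clock reference for the "Total Runtime" metric reported at the end.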
total_start = time.time()

parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')

parser.add_argument('website', help='Website you want to scrape')
parser.add_argument('-b', '--browser-path', help='Path to the browser binary',
                    default='/usr/bin/google-chrome')
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional args to pass to the browser')
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
parser.add_argument('-i', '--image-name', help='Name of the Docker image',
                    default='search-api')

args = parser.parse_args()

docker_client = docker.from_env()

# Fail fast if the image has not been built locally.
try:
    docker_client.images.get(args.image_name)
except docker.errors.ImageNotFound:
    print(f"Image {args.image_name} not found")
    sys.exit(1)

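# Everything the container needs is passed in as environment variables;
# the image's entrypoint is expected to read these.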
env_vars = {
    'WEBSITE': args.website,
    'BROWSER_PATH': args.browser_path,
    'OUTPUT_FILE': '/output/output.txt'
}

output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'output')

# os.makedirs(output_dir, exist_ok=True)
# os.chmod(output_dir, 0o777)

if args.browser_args:
    env_vars['BROWSER_ARGS'] = ','.join(args.browser_args)

if args.proxy_url:
    env_vars['PROXY_URL'] = args.proxy_url

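# Run the container detached so its resource usage can be polled while it works.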
try:
    container_start_time = time.time()
    print("Container start time:", container_start_time)

    container = docker_client.containers.run(
        args.image_name,
        # remove=True,
        detach=True,
        environment=env_vars,
        # The `,z` part is needed to avoid SELinux issues
        volumes={output_dir: {'bind': '/output', 'mode': 'rw,z'}},
        user='0'
    )

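    # Poll the stats stream while the container runs. CPU percent is derived
    # the same way `docker stats` computes it: container CPU-time delta over
    # system CPU-time delta, scaled by the number of online CPUs.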
    max_cpu_percent = 0
    max_memory_mb = 0
    total_rx_bytes = 0
    total_tx_bytes = 0
    stats_stream = container.stats(stream=True)
    while container.status in ['running', 'created']:
        container.reload()
        if container.status != 'running':
            break
        try:
            stats = next(stats_stream)
            stats = json.loads(stats.decode('utf-8'))

            # Skip samples taken before the container is fully up.
            if 'online_cpus' not in stats['cpu_stats'] or 'usage' not in stats.get('memory_stats', {}):
                continue

            cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
            system_delta = stats['cpu_stats'].get('system_cpu_usage', 0) - stats['precpu_stats'].get('system_cpu_usage', 0)
            online_cpus = stats['cpu_stats']['online_cpus']

            if system_delta > 0:
                cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
                max_cpu_percent = max(max_cpu_percent, cpu_percent)

            memory_mb = stats['memory_stats']['usage'] / 1024 / 1024
            max_memory_mb = max(max_memory_mb, memory_mb)

            # rx/tx counters are cumulative, so the latest sample carries the totals.
            networks = stats.get('networks', {})
            if isinstance(networks, dict):
                for interface in networks.values():
                    total_rx_bytes = max(total_rx_bytes, interface.get('rx_bytes', 0))
                    total_tx_bytes = max(total_tx_bytes, interface.get('tx_bytes', 0))

        except Exception as e:
            print('Error reading stats:', e)
            break

    container.reload()  # Refresh container status

    container.wait()

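    # The scraper inside the container is expected to log "Cold start",
    # "Start response" and "Stop response" lines containing epoch timestamps;
    # those drive the timing metrics below. If a marker is missing, the
    # corresponding variable stays unset and the outer except reports it.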
    print("Container logs:")
    logs = container.logs().decode('utf-8')
    for line in logs.strip().split('\n'):
        if "Cold start" in line:
            coldstart_stop_time = float(re.search(r'([\d.]+)', line).group(1))
        if "Start response" in line:
            scrape_start_time = float(re.search(r'([\d.]+)', line).group(1))
        if "Stop response" in line:
            scrape_stop_time = float(re.search(r'([\d.]+)', line).group(1))
        print(f"\t{line}")

    container.remove()

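    # Summary metrics fed into the Jinja2 HTML report.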
    metrics = [
        {"name": "Max CPU", "value": f"{max_cpu_percent:.2f}", "unit": "%"},
        {"name": "Max Memory", "value": f"{max_memory_mb:.2f}", "unit": "MB"},
        {"name": "Total Transfer", "value": f"{(total_rx_bytes + total_tx_bytes) / 1024:.0f}", "unit": "KB"},
        {"name": "Cold Start Time", "value": f"{(coldstart_stop_time - container_start_time):.3f}", "unit": "seconds"},
        {"name": "Time to Response", "value": f"{(scrape_stop_time - scrape_start_time):.3f}", "unit": "seconds"},
        {"name": "Total Runtime", "value": f"{(time.time() - total_start):.3f}", "unit": "seconds"}
    ]

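    # The container wrote the scraped text to the bind-mounted output directory.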
    with open(os.path.join(output_dir, 'output.txt'), 'r', encoding='utf-8') as f:
        text_content = f.read()

    with open('report.html', 'r') as f:
        template = Template(f.read())

    rendered_html = template.render(metrics=metrics, text_content=text_content)

    report_path = os.path.join(output_dir, 'report.html')
    with open(report_path, 'w') as f:
        f.write(rendered_html)

    # Try `open` (macOS) to launch the HTML report automatically; fail silently elsewhere.
    try:
        subprocess.run(['open', report_path])
        print("Report opened in browser")
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass

    print(f"""Performance:
    Max CPU: {max_cpu_percent:.2f}%
    Max Memory: {max_memory_mb:.2f}MB
    Total Transfer: {(total_rx_bytes + total_tx_bytes) / 1024:.0f} KB""")

    print(f"Cold Start Time: {(coldstart_stop_time - container_start_time):.3f} seconds")
    print(f"Time to Response: {(scrape_stop_time - scrape_start_time):.3f} seconds")
    print(f"Completed everything in: {(time.time() - total_start):.3f} seconds")

except Exception as e:
    print(f"Error running container: {e}")
    sys.exit(1)