"""Run a containerized web browser against a target website, collect resource
metrics from the Docker stats API, and render an HTML report."""

import argparse
import json
import os
import re
import subprocess
import sys
import time

import docker
from jinja2 import Template

total_start = time.time()

parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
parser.add_argument('website', help='Website you want to scrape')
parser.add_argument('-b', '--browser-path', help='Path to the browser binary', default='/usr/bin/google-chrome')
parser.add_argument('-a', '--browser-args', nargs='+', help='Additional args to pass to the browser')
parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
parser.add_argument('-i', '--image-name', help='Name of the Docker image', default='search-api')
args = parser.parse_args()

docker_client = docker.from_env()

# Fail fast if the image has not been built or pulled yet.
try:
    docker_client.images.get(args.image_name)
except docker.errors.ImageNotFound:
    print(f"Image {args.image_name} not found")
    sys.exit(1)

# Configuration is passed to the container via environment variables.
env_vars = {
    'WEBSITE': args.website,
    'BROWSER_PATH': args.browser_path,
    'OUTPUT_FILE': '/output/output.txt',
}

script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, 'output')
os.makedirs(output_dir, exist_ok=True)
os.chmod(output_dir, 0o777)  # let the container user write to the bind mount

if args.browser_args:
    env_vars['BROWSER_ARGS'] = ','.join(args.browser_args)
if args.proxy_url:
    env_vars['PROXY_URL'] = args.proxy_url

try:
    container_start_time = time.time()
    container = docker_client.containers.run(
        args.image_name,
        # remove=True,
        detach=True,
        environment=env_vars,
        # The `,z` suffix is needed to avoid SELinux issues with bind mounts.
        volumes={output_dir: {'bind': '/output', 'mode': 'rw,z'}},
        user='0',  # run as root inside the container so it can write to /output
    )

    max_cpu_percent = 0
    max_memory_mb = 0
    total_rx_bytes = 0
    total_tx_bytes = 0

    # Stream resource-usage samples from the Docker stats API while the
    # container is running.
    stats_stream = container.stats(stream=True)

    while container.status in ('running', 'created'):
        container.reload()
        if container.status != 'running':
            break
        try:
            stats = json.loads(next(stats_stream).decode('utf-8'))
            # The first sample (and samples from a stopping container) can be
            # missing fields, so skip anything incomplete.
            if 'online_cpus' not in stats['cpu_stats'] or 'usage' not in stats.get('memory_stats', {}):
                continue

            # CPU percentage is derived from the deltas between this sample and
            # the previous one, scaled by the number of online CPUs.
            cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
            system_delta = stats['cpu_stats'].get('system_cpu_usage', 0) - stats['precpu_stats'].get('system_cpu_usage', 0)
            online_cpus = stats['cpu_stats']['online_cpus']
            if system_delta > 0:
                cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
                max_cpu_percent = max(max_cpu_percent, cpu_percent)

            # The 'usage' key was verified above, so this is safe to read.
            memory_mb = stats['memory_stats']['usage'] / 1024 / 1024
            max_memory_mb = max(max_memory_mb, memory_mb)

            # Network counters are cumulative, so keeping the max across
            # samples yields the final totals per interface.
            networks = stats.get('networks', {})
            if isinstance(networks, dict):
                for interface in networks.values():
                    total_rx_bytes = max(total_rx_bytes, interface.get('rx_bytes', 0))
                    total_tx_bytes = max(total_tx_bytes, interface.get('tx_bytes', 0))
        except Exception as e:
            print(f"Error reading stats: {e}")
            break

    container.reload()  # refresh container status
    container.wait()

    # The container is expected to log "Cold start", "Start response", and
    # "Stop response" markers, each containing a Unix timestamp. Fall back to
    # the launch time so the report still renders if a marker never appears.
    coldstart_stop_time = scrape_start_time = scrape_stop_time = container_start_time

    print("Container logs:")
    logs = container.logs().decode('utf-8')
    for line in logs.strip().split('\n'):
        if "Cold start" in line:
            coldstart_stop_time = float(re.search(r'([\d.]+)', line).group(1))
        if "Start response" in line:
            scrape_start_time = float(re.search(r'([\d.]+)', line).group(1))
        if "Stop response" in line:
            scrape_stop_time = float(re.search(r'([\d.]+)', line).group(1))
        print(f"\t{line}")

    container.remove()

    metrics = [
        {"name": "Max CPU", "value": f"{max_cpu_percent:.2f}", "unit": "%"},
        {"name": "Max Memory", "value": f"{max_memory_mb:.2f}", "unit": "MB"},
        {"name": "Total Transfer", "value": f"{(total_rx_bytes + total_tx_bytes) / 1024:.0f}", "unit": "KB"},
        {"name": "Cold Start Time", "value": f"{(coldstart_stop_time - container_start_time):.3f}", "unit": "seconds"},
        {"name": "Time to Response", "value": f"{(scrape_stop_time - scrape_start_time):.3f}", "unit": "seconds"},
        {"name": "Total Runtime", "value": f"{(time.time() - total_start):.3f}", "unit": "seconds"},
    ]

    # Read the scraped text back out of the bind mount and render the report.
    # Paths are anchored to the script directory so the script works from any
    # working directory.
    with open(os.path.join(output_dir, 'output.txt'), 'r', encoding='utf-8') as f:
        text_content = f.read()
    with open(os.path.join(script_dir, 'report.html'), 'r', encoding='utf-8') as f:
        template = Template(f.read())
    rendered_html = template.render(metrics=metrics, text_content=text_content)
    report_path = os.path.join(output_dir, 'report.html')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(rendered_html)

    # Try `open` (macOS) to launch the HTML report automatically; fail silently
    # elsewhere. check=True makes a non-zero exit raise CalledProcessError.
    try:
        subprocess.run(['open', report_path], check=True)
        print("Report opened in browser")
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass

    print(f"""Performance:
    Max CPU: {max_cpu_percent:.2f}%
    Max Memory: {max_memory_mb:.2f}MB
    Total Transfer: {(total_rx_bytes + total_tx_bytes) / 1024:.0f} KB""")
    print(f"Cold Start Time: {(coldstart_stop_time - container_start_time):.3f} seconds")
    print(f"Time to Response: {(scrape_stop_time - scrape_start_time):.3f} seconds")
    print(f"Completed everything in: {(time.time() - total_start):.3f} seconds")

except Exception as e:
    print(f"Error running container: {e}")
    sys.exit(1)
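
# The render call above assumes a Jinja2 template named `report.html` sitting
# next to this script, receiving `metrics` (a list of {name, value, unit}
# dicts) and `text_content` (the scraped text). The real template is not shown
# here; a minimal sketch of the shape it would need is:
#
#   <html>
#     <body>
#       <h1>Scrape Report</h1>
#       <ul>
#       {% for m in metrics %}
#         <li>{{ m.name }}: {{ m.value }} {{ m.unit }}</li>
#       {% endfor %}
#       </ul>
#       <pre>{{ text_content }}</pre>
#     </body>
#   </html>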
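#
# Example invocation (illustrative only; `scrape.py` is a hypothetical
# filename, and the `search-api` image is assumed to have been built already,
# e.g. with `docker build -t search-api .`):
#
#   python scrape.py https://example.com -i search-api -p http://user:pass@host:port
#
# The scraped text lands in <script dir>/output/output.txt and the rendered
# report in <script dir>/output/report.html.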