diff --git a/.gitignore b/.gitignore
index d4374ac..14573b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /.env
 /proxy_auth.zip
+/output/
diff --git a/Dockerfile b/Dockerfile
index 4e19ed2..8aafcde 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,9 +6,9 @@ RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/
 RUN pip3 install --break-system-packages undetected-chromedriver beautifulsoup4
 
-COPY driver.py /app/
+COPY scrape.py /app/
 
 WORKDIR /app
 
-EXPOSE 3000
+VOLUME ["/output"]
 
-CMD ["google-chrome", "--headless", "--no-sandbox", "--disable-gpu", "--remote-debugging-port=3000", "--remote-debugging-address=0.0.0.0"]
+CMD ["python3", "scrape.py"]
diff --git a/image-rebuild.sh b/image-rebuild.sh
index 72a4b01..b141179 100755
--- a/image-rebuild.sh
+++ b/image-rebuild.sh
@@ -3,5 +3,5 @@
 docker stop search-api
 docker rm search-api
 docker build -t search-api .
-docker run -d -p 3000:3000 --name search-api search-api
+# docker run -d -p 3000:3000 --name search-api search-api
 # docker exec -it search-api python driver.py /usr/bin/chromium-browser https://ferano.io
diff --git a/orchestrator.py b/orchestrator.py
new file mode 100644
index 0000000..b863785
--- /dev/null
+++ b/orchestrator.py
@@ -0,0 +1,128 @@
+import argparse
+import docker
+import sys
+import os
+import json
+import time
+import re
+import pprint
+
+total_start = time.time()
+
+parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
+
+parser.add_argument('website', help='Desired website you want to scrape')
+parser.add_argument('-b', '--browser-path', help='Path to browser binary',
+                    default='/usr/bin/google-chrome')
+parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
+parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
+parser.add_argument('-i', '--image-name', help='Name of the docker image',
+                    default='search-api')
+
+args = parser.parse_args()
+
+docker_client = docker.from_env()
+
+try:
+    docker_client.images.get(args.image_name)
+except docker.errors.ImageNotFound:
+    print(f"Image {args.image_name} not found")
+    sys.exit(1)
+
+env_vars = {
+    'WEBSITE': args.website,
+    'BROWSER_PATH': args.browser_path,
+    'OUTPUT_FILE': '/output/output.txt'
+}
+
+output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'output')
+
+# os.makedirs(output_dir, exist_ok=True)
+# os.chmod(output_dir, 0o777)
+
+if args.browser_args:
+    env_vars['BROWSER_ARGS'] = ','.join(args.browser_args)
+
+if args.proxy_url:
+    env_vars['PROXY_URL'] = args.proxy_url
+
+try:
+    container_start_time = time.time()
+    print("Container start time:", container_start_time)
+
+    container = docker_client.containers.run(
+        args.image_name,
+        # remove=True,
+        detach=True,
+        environment=env_vars,
+        # The `,z` part is needed to avoid SELinux issues
+        volumes={output_dir: {'bind': '/output', 'mode': 'rw,z'}},
+        user='0'
+    )
+
+    max_cpu_percent = 0
+    max_memory_mb = 0
+    total_rx_bytes = 0
+    total_tx_bytes = 0
+    stats_stream = container.stats(stream=True)
+    while container.status in ['running', 'created']:
+        container.reload()
+        if container.status != 'running':
+            break
+        try:
+            stats = next(stats_stream)
+            stats = json.loads(stats.decode('utf-8'))
+
+            if 'online_cpus' not in stats['cpu_stats'] or 'usage' not in stats.get('memory_stats', {}):
+                continue
+
+            cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
+            system_delta = stats['cpu_stats'].get('system_cpu_usage', 0) - stats['precpu_stats'].get('system_cpu_usage', 0)
+            online_cpus = stats['cpu_stats']['online_cpus']
+
+            if system_delta > 0:
+                cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
+                max_cpu_percent = max(max_cpu_percent, cpu_percent)
+
+            if 'usage' in stats.get('memory_stats', {}):
+                memory_mb = stats['memory_stats']['usage'] / 1024 / 1024
+                max_memory_mb = max(max_memory_mb, memory_mb)
+
+            networks = stats.get('networks', {})
+            if isinstance(networks, dict):
+                for interface in networks.values():
+                    total_rx_bytes = max(total_rx_bytes, interface.get('rx_bytes', 0))
+                    total_tx_bytes = max(total_tx_bytes, interface.get('tx_bytes', 0))
+
+        except Exception as e:
+            print('Error:', e)
+            break
+
+        container.reload()  # Refresh container status
+
+    print(f"""Performance:
+    Max CPU: {max_cpu_percent:.2f}%
+    Max Memory: {max_memory_mb:.2f}MB
+    Total Transfer: {(total_rx_bytes + total_tx_bytes) / 1024} KBs""")
+
+    container.wait()
+
+    print("Container logs:")
+    logs = container.logs().decode('utf-8')
+    for line in logs.strip().split('\n'):
+        if "Cold start" in line:
+            coldstart_stop_time = float(re.search(r'([\d.]+)', line).group(1))
+        if "Start response" in line:
+            scrape_start_time = float(re.search(r'([\d.]+)', line).group(1))
+        if "Stop response" in line:
+            scrape_stop_time = float(re.search(r'([\d.]+)', line).group(1))
+        print(f"\t{line}")
+
+    container.remove()
+    print(f"Cold Start Time: {(coldstart_stop_time - container_start_time):.3f} seconds")
+    print(f"Time to Response: {(scrape_stop_time - scrape_start_time):.3f} seconds")
+    print(f"Completed everything in: {(time.time() - total_start):.3f} seconds")
+
+except Exception as e:
+    print(f"Error running container: {e}")
+    sys.exit(1)
diff --git a/scrape.py b/scrape.py
index a3efd13..49ace68 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,28 +1,30 @@
 import zipfile
 import undetected_chromedriver as uc
+import os
 import sys
-import argparse
-
+import time
 from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 
-parser = argparse.ArgumentParser(description='Scrape websites with a containerized web browser')
+# Get parameters from environment variables
+website = os.environ.get('WEBSITE')
+browser_path = os.environ.get('BROWSER_PATH', '/usr/bin/google-chrome')
+browser_args = os.environ.get('BROWSER_ARGS', '').split(',') if os.environ.get('BROWSER_ARGS') else None
+proxy_url = os.environ.get('PROXY_URL')
+output_file = os.environ.get('OUTPUT_FILE', '/output/output.txt')
 
-parser.add_argument('website', help='Desired website you want to scrape')
-parser.add_argument('-b', '--browser-path', help='Path to browser binary',
-                    default='/usr/bin/google-chrome')
-parser.add_argument('-a', '--browser-args', nargs='+', help='Additional Args to pass the browser')
-parser.add_argument('-p', '--proxy-url', help='Proxy URL (e.g., http://user:pass@host:port)')
-
-args = parser.parse_args()
+if not website:
+    print("No website specified")
+    sys.exit(1)
 
 options = uc.ChromeOptions()
 
-if args.browser_args:
-    for b_arg in args.browser_args:
-        options.add_argument(f'--{b_arg}')
+if browser_args:
+    for b_arg in browser_args:
+        if b_arg.strip():  # Skip empty strings
+            options.add_argument(f'--{b_arg.strip()}')
 
-if args.proxy_url:
-    parsed = urlparse(args.proxy_url)
+if proxy_url:
+    parsed = urlparse(proxy_url)
     proxy_host = parsed.hostname
     proxy_port = parsed.port
@@ -71,7 +73,6 @@ chrome.webRequest.onAuthRequired.addListener(
 );
 """
 
-
     with zipfile.ZipFile('proxy_auth.zip', 'w') as zip_file:
         zip_file.writestr("manifest.json", manifest_json)
         zip_file.writestr("background.js", background_js)
@@ -79,22 +80,35 @@ chrome.webRequest.onAuthRequired.addListener(
     options.add_extension('proxy_auth.zip')
 
 driver = uc.Chrome(
-    browser_executable_path=args.browser_path,
+    browser_executable_path=browser_path,
     headless=True,
     use_subprocess=False,
     options=options
 )
 
-driver.get(args.website)
+print(f"Cold start finished: {time.time()}")
+
+print(f"Start response time: {time.time()}")
+driver.get(website)
 data = driver.execute_cdp_cmd('DOM.getDocument', {})
 
+output_text = ""
+
 if data:
     if 'root' in data:
         root_node_id = data['root']['nodeId']
         html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
+        print(f"Stop response time: {time.time()}")
         soup = BeautifulSoup(html['outerHTML'], 'html.parser')
-        print(soup.get_text())
+        output_text = soup.get_text()
     else:
-        print("Got data without a root:", data)
+        output_text = f"Got data without a root: {data}"
 else:
-    print("Didn't get any data...")
+    output_text = "Didn't get any data..."
+driver.quit()
+
+output_dir = os.path.dirname(output_file)
+
+os.makedirs(os.path.dirname(output_file), exist_ok=True)
+with open(output_file, 'w') as f:
+    f.write(output_text)
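Usage note: a minimal sketch of the new flow, assuming Docker is running, the docker Python SDK is installed on the host, and commands are run from the repository root. The proxy URL is a placeholder and the extra browser flags are just examples (they mirror the flags removed from the old Dockerfile CMD):

    # host-side dependency for orchestrator.py
    pip3 install docker

    # rebuild the search-api image so it picks up scrape.py and the new CMD
    ./image-rebuild.sh

    # create the bind-mounted output directory up front, since the
    # os.makedirs call in orchestrator.py is commented out
    mkdir -p output

    # run a scrape; -a values are joined into BROWSER_ARGS and scrape.py
    # adds the leading -- before passing them to Chrome
    python3 orchestrator.py https://ferano.io -a no-sandbox disable-gpu -p http://user:pass@host:port

    # the scraped text lands in the bind-mounted directory
    cat output/output.txt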