commit 875b8c35ac7f7a531ad74ee1941daa576fb0075f
Author: Joseph Ferano
Date:   Wed Jul 30 11:44:43 2025 +0700

    Scrape my website with zenika/alpine-chrome

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..27ac3ab
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,7 @@
+FROM zenika/alpine-chrome:latest
+
+# Expose port 3000 for remote debugging
+EXPOSE 3000
+
+# Override the default command to use port 3000
+CMD ["chromium-browser", "--headless", "--no-sandbox", "--disable-gpu", "--remote-debugging-port=3000", "--remote-debugging-address=0.0.0.0"]
diff --git a/Log.org b/Log.org
new file mode 100644
index 0000000..e002761
--- /dev/null
+++ b/Log.org
@@ -0,0 +1,25 @@
+* Setting up
+
+I read the following on https://hub.docker.com/r/browserless/chrome:
+
+#+begin_quote
+Getting Chrome running well in docker is also a challenge as there's quite a few packages you need in order to get Chrome running. Once that's done then there's still missing fonts, getting libraries to work with it, and having limitations on service reliability.
+#+end_quote
+
+That made me think twice about setting it up myself, so I just grabbed an existing image for now.
+
+- I realized soon enough that ws://localhost:3000 is browserless' own API, so I went
+  and tried to figure out how to get the websocket for the Chrome
+  DevTools; it turns out I need to launch an instance first.
+
+Browserless has an API, but after going through the documentation I quickly felt
+that using it would defeat the purpose of the exercise, so I instead
+used this:
+
+https://hub.docker.com/r/zenika/alpine-chrome
+
+Perhaps the exercise is looking for me to actually build an image from scratch,
+but let's make progress on all the other tasks before tackling that.
+
+
+
diff --git a/Task.org b/Task.org
new file mode 100644
index 0000000..6fe7e86
--- /dev/null
+++ b/Task.org
@@ -0,0 +1,43 @@
+* Task
+
+Build a Docker image that boots a minimal browser (Chromium, Firefox, Safari, or Edge all work). Then write a small script that uses the image to scrape the following URL:
+
+https://www.google.com/search?q=MINISFORUM+MS-A2
+
+Requirements:
+
+- Accept an optional proxy URL and optional browser launch flags
+
+* Estimate and report:
+
+- Cold start time
+- Total transfer size (bandwidth over the wire)
+- Time to response
+- CPU and memory usage
+
+- Save final HTML output to a file
+- Use any language you're comfortable with
+- We can provide a proxy URL, or you can use your own
+
+* Goal:
+
+Optimize for:
+
+- Low latency
+- Minimal bandwidth
+- High success rate (avoid bans, captchas, etc.)
+
+Then:
+
+Write a short design doc (max 4 pages) outlining how you'd scale this to 10k concurrent requests. No need to detail measurement tooling; just focus on next steps to evolve this into a full browser farm. Include:
+
+- Fingerprinting and TLS shaping
+- Crash recovery
+- Session pooling and management
+- Scaling and orchestration model
+- Anti-bot defenses
+- Unknowns and how you'd tackle them
+
+We want to see how you'd approach this independently and steer the project forward. You don’t need to know everything, but the plan should be grounded and reasonable.
+
+Time cap: 1–2 days max. Let us know if that sounds fair or if you'd prefer to tweak anything. We’re flexible, just aiming for something valuable and time-bounded.
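A note on the setup described above: with the image built from this Dockerfile and its port published on the host, Chrome's standard DevTools HTTP endpoints (/json/version and /json) are how the websocket URL mentioned in Log.org is discovered. Below is a minimal sketch of checking for targets before attaching; the host port mapping (docker run -p 3000:3000) and the choice to hit /json/version first are assumptions for illustration, not part of this commit.

#+begin_src python
import requests

# Assumes the container from the Dockerfile above is running with `-p 3000:3000`
BASE = "http://localhost:3000"

# Browser-level metadata; confirms the DevTools endpoint is reachable at all
version = requests.get(f"{BASE}/json/version").json()
print("Browser:", version.get("Browser"))

# Page-level targets; each carries the webSocketDebuggerUrl that scrape.py attaches to
targets = requests.get(f"{BASE}/json").json()
if not targets:
    print("No page targets yet; a tab has to exist before it can be driven over CDP")
for target in targets:
    print(target.get("type"), target.get("url"), target.get("webSocketDebuggerUrl"))
#+end_src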
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..b75db66
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,94 @@
+import requests
+import websockets
+import json
+import asyncio
+from pprint import pprint
+
+async def scrape():
+    id_count = [0]
+    def get_id():
+        id_count[0] += 1
+        return id_count[0]
+
+    response = requests.get("http://localhost:3000/json")
+    targets = response.json()
+
+    if not targets:
+        print("No active sessions found")
+        return
+
+    websocket_url = targets[0]['webSocketDebuggerUrl']
+    print(f"Connecting to: {websocket_url}")
+
+    async with websockets.connect(websocket_url) as ws:
+        for elem in ["DOM", "Page"]:
+            print("Enabling", elem)
+            await ws.send(json.dumps({
+                "id": get_id(),
+                "method": f"{elem}.enable"
+            }))
+            # await asyncio.sleep(1)
+            response = await ws.recv()
+            print(f"{elem} enabled:", json.loads(response))
+
+        print("Starting up")
+
+        await ws.send(json.dumps({
+            "id": get_id(),
+            "method": "Page.navigate",
+            # "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
+            "params": {"url": "https://ferano.io"}
+        }))
+
+        print("Sent navigate request")
+
+        while True:
+            response = await ws.recv()
+            data = json.loads(response)
+            if data.get("method") == "Page.loadEventFired":
+                break
+
+        print("Got loadEventFired event")
+        print("Get Document...")
+
+        await ws.send(json.dumps({
+            "id": get_id(),
+            "method": "DOM.getDocument"
+        }))
+
+        print("Sent DOM.getDocument request")
+
+
+        document_id = id_count[0]  # Store the ID we just used
+        while True:
+            response = await ws.recv()
+            data = json.loads(response)
+
+            # Check if this is the response to our DOM.getDocument request
+            if data.get("id") == document_id:
+                root_node_id = data['result']['root']['nodeId']
+                await ws.send(json.dumps({
+                    "id": get_id(),
+                    "method": "DOM.getOuterHTML",
+                    "params": {"nodeId": root_node_id}
+                }))
+
+                html_id = id_count[0]
+                while True:
+                    response = await ws.recv()
+                    data = json.loads(response)
+                    if data.get("id") == html_id and "result" in data:
+                        html_content = data['result']['outerHTML']
+                        print(html_content)
+                        break
+                    else:
+                        print("Received event:", data)
+                print("Retrieved the page's outer HTML")
+                break
+
+        # response = await ws.recv()
+        # root_data = json.loads(response)
+        # root_node_id = root_data["result"]["root"]["nodeId"]
+        # print(root_data)
+
+asyncio.run(scrape())
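Task.org also asks for the final HTML to be saved to a file and for a time-to-response figure, neither of which scrape.py does yet. Below is a minimal sketch of one way to wire that in, assuming scrape() is changed to return html_content instead of only printing it; the output.html filename and the monotonic-clock timing are illustrative choices, not something this commit specifies.

#+begin_src python
import asyncio
import time
from pathlib import Path

async def main():
    start = time.monotonic()
    html = await scrape()              # assumes scrape() now returns html_content
    elapsed = time.monotonic() - start

    # Save the final HTML and report a rough time-to-response, per Task.org
    Path("output.html").write_text(html, encoding="utf-8")
    print(f"Saved {len(html)} characters of HTML in {elapsed:.2f}s")

asyncio.run(main())
#+end_src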