import asyncio
import itertools
import json
from pprint import pprint

import requests
import websockets


def _raise_on_error(method, message):
    """Raise RuntimeError if *message* is a CDP error reply to *method*."""
    if "error" in message:
        raise RuntimeError(f"{method} failed: {message['error']}")


async def _send_command(ws, command_ids, method, params=None):
    """Send one CDP command over *ws* and return the id it was sent with."""
    command_id = next(command_ids)
    payload = {"id": command_id, "method": method}
    if params is not None:
        payload["params"] = params
    await ws.send(json.dumps(payload))
    return command_id


async def _wait_for_response(ws, command_id, method, *, echo_skipped=False):
    """Read messages from *ws* until the reply to *command_id* arrives.

    Messages that are not the awaited reply are discarded, or printed first
    when *echo_skipped* is true.

    Returns:
        The decoded reply message (the whole dict, including ``"id"``).

    Raises:
        RuntimeError: if the reply carries an ``"error"`` member.
    """
    while True:
        message = json.loads(await ws.recv())
        if message.get("id") == command_id:
            _raise_on_error(method, message)
            return message
        if echo_skipped:
            print("Received event:", message)


async def scrape(url="https://ferano.io",
                 endpoint="http://localhost:3000/json"):
    """Load *url* in a browser target over the DevTools protocol and print its HTML.

    The list of targets is fetched from *endpoint*, the first target's
    ``webSocketDebuggerUrl`` is opened, the ``DOM`` and ``Page`` domains are
    enabled, the page is navigated to *url*, and once ``Page.loadEventFired``
    is seen the outer HTML of the document root is printed to stdout.

    Args:
        url: Page to navigate to.
        endpoint: HTTP URL that returns the JSON list of debuggable targets.

    Returns:
        None. Returns early (after printing a notice) when the target list
        is empty.

    Raises:
        requests.RequestException: if the target list cannot be fetched
            (connection failure, timeout, or non-2xx status).
        RuntimeError: if the browser answers a command with an error, or
            reports that the navigation failed.
    """
    # requests never times out by default; bound the wait and reject error
    # statuses before trying to parse the body. Note this call is
    # synchronous and blocks the event loop while it runs.
    response = requests.get(endpoint, timeout=10)
    response.raise_for_status()
    targets = response.json()

    if not targets:
        print("No active sessions found")
        return

    websocket_url = targets[0]['webSocketDebuggerUrl']
    print(f"Connecting to: {websocket_url}")

    # Command ids start at 1 and increase by one per command sent.
    command_ids = itertools.count(1)

    # max_size=None lifts websockets' default 1 MiB cap on incoming
    # messages; the outerHTML of a whole page can be larger than that.
    async with websockets.connect(websocket_url, max_size=None) as ws:
        for elem in ("DOM", "Page"):
            print("Enabling", elem)
            method = f"{elem}.enable"
            command_id = await _send_command(ws, command_ids, method)
            # Match the reply by id: events may arrive before it.
            reply = await _wait_for_response(ws, command_id, method)
            print(f"{elem} enabled:", reply)

        print("Staring up")
        navigate_id = await _send_command(
            ws, command_ids, "Page.navigate", {"url": url}
        )
        print("Send navigate request")

        while True:
            data = json.loads(await ws.recv())
            if data.get("id") == navigate_id:
                # Surface a failed navigation instead of waiting for a load
                # event that may never be delivered.
                _raise_on_error("Page.navigate", data)
                error_text = data.get("result", {}).get("errorText")
                if error_text:
                    raise RuntimeError(f"Page.navigate failed: {error_text}")
            elif data.get("method") == "Page.loadEventFired":
                break
        print("Got loadEventFired event")

        print("Get Document...")
        document_id = await _send_command(ws, command_ids, "DOM.getDocument")
        print("Woot")
        document = await _wait_for_response(
            ws, document_id, "DOM.getDocument"
        )
        root_node_id = document['result']['root']['nodeId']

        html_id = await _send_command(
            ws, command_ids, "DOM.getOuterHTML", {"nodeId": root_node_id}
        )
        html_reply = await _wait_for_response(
            ws, html_id, "DOM.getOuterHTML", echo_skipped=True
        )
        print(html_reply['result']['outerHTML'])
        print("Something happened")

# Script entry point: run the scrape() coroutine to completion.
# NOTE: there is no `if __name__ == "__main__":` guard, so this also runs
# whenever the module is imported.
asyncio.run(scrape())