# auto-scraper/scrape.py
#
# 95 lines
# 2.8 KiB
# Python

import requests
import websockets
import json
import asyncio
from pprint import pprint
async def scrape(
    url="https://ferano.io",
    endpoint="http://localhost:3000/json",
    timeout=30.0,
):
    """Navigate a remote browser to *url* and print the page's outer HTML.

    The function asks the DevTools HTTP endpoint for its list of targets,
    connects to the first target that exposes a ``webSocketDebuggerUrl``,
    enables the ``DOM`` and ``Page`` domains, navigates to *url*, waits for
    ``Page.loadEventFired`` and finally prints the outer HTML of the
    document root.

    Args:
        url: Page to navigate to.
        endpoint: HTTP URL that returns the JSON list of debuggable targets.
        timeout: Seconds to wait for the HTTP request and for each protocol
            step (a command reply, or the load event). ``None`` waits
            indefinitely.

    Returns:
        None. Progress messages and the HTML are written to stdout.

    Raises:
        RuntimeError: If the browser answers a command with an ``error``
            payload, or reports that the navigation failed.
        asyncio.TimeoutError: If a protocol step exceeds *timeout*.
        requests.RequestException: If the target list cannot be fetched.
    """
    last_id = 0

    async def send(ws, method, params=None):
        """Send one protocol command and return the id it was sent with."""
        nonlocal last_id
        last_id += 1
        message = {"id": last_id, "method": method}
        if params is not None:
            message["params"] = params
        await ws.send(json.dumps(message))
        return last_id

    async def read_reply(ws, request_id, method):
        """Read messages until the reply to *request_id* arrives."""
        while True:
            data = json.loads(await ws.recv())
            if data.get("id") != request_id:
                # Events and unrelated replies share the socket with the
                # reply we want, so they are reported and skipped.
                print("Received event:", data)
                continue
            if "error" in data:
                raise RuntimeError(f"{method} failed: {data['error']}")
            return data

    async def wait_reply(ws, request_id, method):
        """Wait, bounded by *timeout*, for the reply to *request_id*."""
        return await asyncio.wait_for(
            read_reply(ws, request_id, method), timeout
        )

    async def call(ws, method, params=None):
        """Send a command and return its (successful) reply message."""
        request_id = await send(ws, method, params)
        return await wait_reply(ws, request_id, method)

    async def wait_for_load(ws, navigate_id):
        """Consume messages until ``Page.loadEventFired`` is seen.

        The reply to ``Page.navigate`` is inspected on the way past, so a
        rejected or failed navigation raises instead of being ignored.
        """
        while True:
            data = json.loads(await ws.recv())
            if data.get("method") == "Page.loadEventFired":
                return
            if data.get("id") == navigate_id:
                failure = data.get("error") or data.get("result", {}).get(
                    "errorText"
                )
                if failure:
                    raise RuntimeError(f"Page.navigate failed: {failure}")

    # NOTE: requests is synchronous, so this call blocks the event loop
    # until the target list has been fetched.
    response = requests.get(endpoint, timeout=timeout)
    response.raise_for_status()
    targets = response.json()

    # Take the first target that actually exposes a debugger URL rather than
    # assuming the first entry has one (presumably the key can be absent,
    # e.g. when another client is already attached -- TODO confirm).
    websocket_url = next(
        (
            target["webSocketDebuggerUrl"]
            for target in targets
            if "webSocketDebuggerUrl" in target
        ),
        None,
    )
    if websocket_url is None:
        print("No active sessions found")
        return

    print(f"Connecting to: {websocket_url}")
    # max_size=None lifts the library's default cap on incoming message
    # size, which a large page's outerHTML reply could otherwise exceed.
    async with websockets.connect(websocket_url, max_size=None) as ws:
        for domain in ("DOM", "Page"):
            print("Enabling", domain)
            reply = await call(ws, f"{domain}.enable")
            print(f"{domain} enabled:", reply)

        print("Staring up")
        navigate_id = await send(ws, "Page.navigate", {"url": url})
        print("Send navigate request")
        await asyncio.wait_for(wait_for_load(ws, navigate_id), timeout)
        print("Got loadEventFired event")

        print("Get Document...")
        document_id = await send(ws, "DOM.getDocument")
        print("Woot")
        document = await wait_reply(ws, document_id, "DOM.getDocument")
        root_node_id = document["result"]["root"]["nodeId"]

        html = await call(ws, "DOM.getOuterHTML", {"nodeId": root_node_id})
        print(html["result"]["outerHTML"])
        print("Something happened")
# Run the scraper only when this file is executed as a script, so that
# importing the module does not start a scrape as a side effect.
if __name__ == "__main__":
    asyncio.run(scrape())