92 lines
2.7 KiB
Python
92 lines
2.7 KiB
Python
import requests
|
|
import websockets
|
|
import json
|
|
import asyncio
|
|
from pprint import pprint
|
|
|
|
# TODO: Use the Docker tooling (SDK or CLI) to start the Docker service
|
|
# TODO: Accept command line args for docker image
|
|
|
|
async def _await_reply(ws, msg_id, *, echo_other=False):
    """Read frames from *ws* until the reply to command *msg_id* arrives.

    Events and replies to other commands arrive on the same socket as the
    reply being waited for, so frames are skipped until the ``id`` matches.

    Args:
        ws: Open websocket connection; must provide an awaitable ``recv()``.
        msg_id: The ``id`` that was sent with the command.
        echo_other: When true, print every skipped frame.

    Returns:
        The decoded reply message.

    Raises:
        RuntimeError: If the matching reply carries an ``error`` member.
    """
    while True:
        message = json.loads(await ws.recv())
        if message.get("id") != msg_id:
            if echo_other:
                print("Received event:", message)
            continue
        if "error" in message:
            # Fail loudly instead of waiting forever for a "result" that
            # will never come.
            raise RuntimeError(
                f"Command {msg_id} failed: {message['error']}"
            )
        return message


async def scrape(
    url="https://www.google.com/search?q=MINISFORUM+MS-A2",
    endpoint="http://localhost:3000/json",
    http_timeout=10,
):
    """Load *url* in a remote browser session and print the page's HTML.

    Steps: fetch the target list from *endpoint*, connect to the first
    target that exposes a ``webSocketDebuggerUrl``, enable the ``DOM`` and
    ``Page`` domains, navigate, wait for ``Page.loadEventFired``, then
    request the document root and print its outer HTML.

    Args:
        url: Page to navigate to.
        endpoint: HTTP URL that returns the JSON list of targets.
        http_timeout: Timeout in seconds for the target-list request.

    Returns:
        None. Progress messages and the HTML are written to stdout.

    Raises:
        requests.HTTPError: If the target-list request returns an error
            status.
        RuntimeError: If a command sent over the websocket is answered
            with an ``error`` reply.
    """
    last_id = 0

    async def send(ws, method, params=None):
        # Every command gets a fresh id so its reply can be told apart from
        # events and from replies to earlier commands.
        nonlocal last_id
        last_id += 1
        message = {"id": last_id, "method": method}
        if params is not None:
            message["params"] = params
        await ws.send(json.dumps(message))
        return last_id

    # requests is blocking: run it in the default executor so the event
    # loop is not stalled, and bound the wait with a timeout.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, lambda: requests.get(endpoint, timeout=http_timeout)
    )
    response.raise_for_status()
    targets = response.json()

    # Take the first target that actually exposes a debugger URL; an empty
    # list and a list without any such target are reported the same way.
    websocket_url = next(
        (
            target["webSocketDebuggerUrl"]
            for target in targets
            if "webSocketDebuggerUrl" in target
        ),
        None,
    )
    if websocket_url is None:
        print("No active sessions found")
        return

    print(f"Connecting to: {websocket_url}")

    async with websockets.connect(websocket_url) as ws:
        for domain in ("DOM", "Page"):
            print("Enabling", domain)
            enable_id = await send(ws, f"{domain}.enable")
            # Match on id: the next frame is not guaranteed to be the reply.
            reply = await _await_reply(ws, enable_id)
            print(f"{domain} enabled:", reply)

        print("Staring up")
        navigate_id = await send(ws, "Page.navigate", {"url": url})
        print("Send navigate request")

        while True:
            message = json.loads(await ws.recv())
            if message.get("id") == navigate_id and "error" in message:
                # A rejected navigation never fires the load event.
                raise RuntimeError(
                    f"Page.navigate failed: {message['error']}"
                )
            if message.get("method") == "Page.loadEventFired":
                break

        print("Got loadEventFired event")
        print("Get Document...")

        document_id = await send(ws, "DOM.getDocument")
        print("Woot")
        document = await _await_reply(ws, document_id)
        root_node_id = document["result"]["root"]["nodeId"]

        html_id = await send(
            ws, "DOM.getOuterHTML", {"nodeId": root_node_id}
        )
        html_reply = await _await_reply(ws, html_id, echo_other=True)
        print(html_reply["result"]["outerHTML"])
        print("Something happened")
|
|
|
|
if __name__ == "__main__":
    # Run the scrape only when executed as a script, so that importing this
    # module does not open network connections as a side effect.
    asyncio.run(scrape())
|