Get google scrape working with undetected-chromedriver, include some research images

2025-07-30 12:58:31 +07:00 · 2025-07-30 12:58:31 +07:00 · 25e8cd49c4
commit 25e8cd49c4
parent 875b8c35ac
6 changed files with 39 additions and 9 deletions
--- a/Log.org
+++ b/Log.org
@ -23,3 +23,19 @@ but let's make progress on all other tasks before tackling that.
 Ok, so I found this:
 https://github.com/ultrafunkamsterdam/undetected-chromedriver/
 This is how to pass the Brave executable to the driver:
 https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/806
 I could set this up in the docker container; however, I'm not sure this is the
 right thing to do.
 I found this resource;
 https://bot.incolumitas.com/#botChallenge
 Ok, so it works! I was able to scrape google with the =driver.py= script!
--- a/Task.org
+++ b/Task.org
@ -8,7 +8,7 @@ Requirements:
 - Accept optional proxy URL and optional browser launch flags
-* Estimate and report:
+** Estimate and report:
 - Cold start time
 - Total transfer size (bandwidth over the wire)
--- a/driver.py
+++ b/driver.py
@ -0,0 +1,17 @@
 import undetected_chromedriver as uc
 # Proof-of-concept: drive the Brave browser through undetected-chromedriver,
 # load a Google search page, save a screenshot, and fetch the page markup
 # via the Chrome DevTools Protocol (CDP).
 driver = uc.Chrome(
    browser_executable_path='/opt/brave.com/brave/brave',
    # headless=True,
    # use_subprocess=False
 )
 try:
    driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2')
    driver.save_screenshot('nowsecure.png')
    # Reference only: this line uses `await` at module level and names
    # (`iframe_tab`, `cdp_generator`) that are not defined in this script, so
    # it is a SyntaxError if left live. Kept as a note on requesting the full
    # tree (depth -1, pierce True) from DOM.getDocument.
    # doc = await iframe_tab.send(cdp_generator("DOM.getDocument", {"depth": -1, "pierce": True}))
    data = driver.execute_cdp_cmd('DOM.getDocument', {})
    # Only ask for the outer HTML when the response actually carries a root node.
    if data and 'root' in data:
        root_node_id = data['root']['nodeId']
        # `html` holds the raw CDP response; presumably the markup itself is
        # under its 'outerHTML' key -- confirm against the CDP DOM reference.
        html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
 finally:
    # Always shut the browser down, even if navigation or a CDP call fails.
    driver.quit()
--- a/google-store-search.png
+++ b/google-store-search.png
--- a/nowsecure.png
+++ b/nowsecure.png
--- a/scrape.py
+++ b/scrape.py
@ -4,6 +4,9 @@ import json
 import asyncio
 from pprint import pprint
 # TODO: Use docker thing to start a docker service
 # TODO: Accept command line args for docker image
 async def scrape():
    id_count = [0]
    def get_id():
@ -36,8 +39,8 @@ async def scrape():
        await ws.send(json.dumps({
            "id": get_id(),
            "method": "Page.navigate",
-            # "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
+            "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
-            "params": {"url": "https://ferano.io"}
+            # "params": {"url": "https://ferano.io"}
        }))
        print("Send navigate request")
@ -58,7 +61,6 @@ async def scrape():
        print("Woot")
        document_id = id_count[0]  # Store the ID we just used
        while True:
            response = await ws.recv()
@ -86,9 +88,4 @@ async def scrape():
                print("Something happened")
                break
        # response = await ws.recv()
        # root_data = json.loads(response)
        # root_node_id = root_data["result"]["root"]["nodeId"]
        # print(root_data)
 asyncio.run(scrape())