Get google scrape working with undetected-chromedriver, include some research images

Joseph Ferano 2025-07-30 12:58:31 +07:00
parent 875b8c35ac
commit 25e8cd49c4
6 changed files with 39 additions and 9 deletions

Log.org (+16)

@ -23,3 +23,19 @@ but let's make progress on all other tasks before tackling that.
Ok, so I found this:
https://github.com/ultrafunkamsterdam/undetected-chromedriver/
This is how to point the driver at the Brave executable:
https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/806
I could set this up in the Docker container; however, I'm not sure that's the
right approach.
I found this resource:
https://bot.incolumitas.com/#botChallenge
Ok, so it works! I was able to scrape Google with the =driver.py= script!
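For reference, a minimal sketch of the setup that issue describes: pointing undetected-chromedriver at a locally installed Brave binary (the path matches the one used in driver.py below) and loading the bot-challenge page linked above. The screenshot filename is only illustrative.

import undetected_chromedriver as uc

driver = uc.Chrome(browser_executable_path='/opt/brave.com/brave/brave')
driver.get('https://bot.incolumitas.com/#botChallenge')  # bot-detection test page
driver.save_screenshot('bot-challenge.png')  # illustrative output name
driver.quit()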


@ -8,7 +8,7 @@ Requirements:
- Accept optional proxy URL and optional browser launch flags
-* Estimate and report:
+** Estimate and report:
- Cold start time
- Total transfer size (bandwidth over the wire)
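A rough sketch of how the proxy/flags and cold-start requirements could be wired up with undetected-chromedriver; the make_driver helper, its parameters, and the Brave path are illustrative assumptions rather than part of this commit.

import time
import undetected_chromedriver as uc

def make_driver(proxy_url=None, extra_flags=None):
    # Optional proxy URL and optional browser launch flags, per the requirements.
    options = uc.ChromeOptions()
    if proxy_url:
        options.add_argument(f'--proxy-server={proxy_url}')
    for flag in (extra_flags or []):
        options.add_argument(flag)

    start = time.perf_counter()
    driver = uc.Chrome(options=options,
                       browser_executable_path='/opt/brave.com/brave/brave')
    cold_start = time.perf_counter() - start  # rough cold-start estimate in seconds
    return driver, cold_start

# Total transfer size would need CDP Network events (e.g. summing encodedDataLength
# from Network.loadingFinished responses); that part is not sketched here.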

driver.py (new file, +17)

@ -0,0 +1,17 @@
import undetected_chromedriver as uc

# Launch undetected-chromedriver against the local Brave binary instead of Chrome.
driver = uc.Chrome(
    browser_executable_path='/opt/brave.com/brave/brave',
    # headless=True,
    # use_subprocess=False
)
driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2')
driver.save_screenshot('nowsecure.png')

# Leftover from an earlier async CDP experiment: `iframe_tab` and `cdp_generator`
# are undefined here and `await` is invalid at module level, so it stays disabled.
# doc = await iframe_tab.send(cdp_generator("DOM.getDocument", {"depth": -1, "pierce": True}))

# Fetch the DOM root over CDP, then ask for its outer HTML.
data = driver.execute_cdp_cmd('DOM.getDocument', {})
if data and 'root' in data:
    root_node_id = data['root']['nodeId']
    html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
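A possible follow-up, assumed here rather than taken from the committed script: DOM.getOuterHTML returns a dict whose "outerHTML" key holds the full markup, so a couple of extra lines would persist it (the output filename is only illustrative).

with open('google-search.html', 'w', encoding='utf-8') as f:
    f.write(html['outerHTML'])  # "outerHTML" carries the page markup
driver.quit()  # shut Brave down once the page has been captured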

google-store-search.png (new binary file, 320 KiB)

nowsecure.png (new binary file, 1.1 MiB)


@ -4,6 +4,9 @@ import json
import asyncio
from pprint import pprint
# TODO: Use docker thing to start a docker service
# TODO: Accept command line args for docker image
async def scrape():
id_count = [0]
def get_id():
@ -36,8 +39,8 @@ async def scrape():
await ws.send(json.dumps({
"id": get_id(),
"method": "Page.navigate",
# "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
"params": {"url": "https://ferano.io"}
"params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
# "params": {"url": "https://ferano.io"}
}))
print("Send navigate request")
@ -58,7 +61,6 @@ async def scrape():
print("Woot")
document_id = id_count[0] # Store the ID we just used
while True:
response = await ws.recv()
@ -86,9 +88,4 @@ async def scrape():
print("Something happened")
break
# response = await ws.recv()
# root_data = json.loads(response)
# root_node_id = root_data["result"]["root"]["nodeId"]
# print(root_data)
asyncio.run(scrape())
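For context, a compact sketch of the full raw-CDP flow this script is building toward: find a page target over the DevTools HTTP endpoint, navigate, wait for the load event, then pull the DOM root and its outer HTML. The port 9222, the scrape_once name, and the message-id bookkeeping are assumptions, not details from the committed script.

import asyncio
import json
import urllib.request

import websockets

DEVTOOLS_HTTP = "http://localhost:9222"  # assumed DevTools endpoint

async def scrape_once(url):
    # Ask the browser for its open targets and take the first one's websocket URL.
    with urllib.request.urlopen(f"{DEVTOOLS_HTTP}/json") as resp:
        targets = json.loads(resp.read())
    ws_url = targets[0]["webSocketDebuggerUrl"]

    msg_id = 0
    def next_id():
        nonlocal msg_id
        msg_id += 1
        return msg_id

    async with websockets.connect(ws_url, max_size=None) as ws:
        # Enable page events so Page.loadEventFired is delivered.
        await ws.send(json.dumps({"id": next_id(), "method": "Page.enable"}))
        await ws.send(json.dumps({"id": next_id(), "method": "Page.navigate",
                                  "params": {"url": url}}))

        # Drain messages until the page has finished loading.
        while True:
            event = json.loads(await ws.recv())
            if event.get("method") == "Page.loadEventFired":
                break

        # Fetch the DOM root, then request its outer HTML by nodeId.
        doc_id = next_id()
        await ws.send(json.dumps({"id": doc_id, "method": "DOM.getDocument"}))
        while True:
            reply = json.loads(await ws.recv())
            if reply.get("id") == doc_id:
                root_node_id = reply["result"]["root"]["nodeId"]
                break

        html_id = next_id()
        await ws.send(json.dumps({"id": html_id, "method": "DOM.getOuterHTML",
                                  "params": {"nodeId": root_node_id}}))
        while True:
            reply = json.loads(await ws.recv())
            if reply.get("id") == html_id:
                return reply["result"]["outerHTML"]

# Example: asyncio.run(scrape_once("https://www.google.com/search?q=MINISFORUM+MS-A2"))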