Get google scrape working with undetected-chromedriver, include some research images

Joseph Ferano 2025-07-30 12:58:31 +07:00
parent 875b8c35ac
commit 25e8cd49c4
6 changed files with 39 additions and 9 deletions

Log.org (+16)

@ -23,3 +23,19 @@ but let's make progress on all other tasks before tackling that.
Ok, so I found this:
https://github.com/ultrafunkamsterdam/undetected-chromedriver/
This is how to point the driver at the Brave executable:
https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/806
I could set this up in the Docker container; however, I'm not sure that's the
right approach.
I found this resource:
https://bot.incolumitas.com/#botChallenge
Ok, so it works! I was able to scrape Google with the =driver.py= script!
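For reference, a minimal sketch of the setup that issue describes: pointing undetected-chromedriver at a locally installed Brave binary (the path matches the one used in driver.py below) and loading the bot-challenge page linked above. The screenshot filename is only illustrative.

import undetected_chromedriver as uc

driver = uc.Chrome(browser_executable_path='/opt/brave.com/brave/brave')
driver.get('https://bot.incolumitas.com/#botChallenge')  # bot-detection test page
driver.save_screenshot('bot-challenge.png')  # illustrative output name
driver.quit()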


@ -8,7 +8,7 @@ Requirements:
- Accept optional proxy URL and optional browser launch flags
-* Estimate and report:
+** Estimate and report:
- Cold start time
- Total transfer size (bandwidth over the wire)
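A rough sketch of how the proxy/flags and cold-start requirements could be wired up with undetected-chromedriver; the make_driver helper, its parameters, and the Brave path are illustrative assumptions rather than part of this commit.

import time
import undetected_chromedriver as uc

def make_driver(proxy_url=None, extra_flags=None):
    # Optional proxy URL and optional browser launch flags, per the requirements.
    options = uc.ChromeOptions()
    if proxy_url:
        options.add_argument(f'--proxy-server={proxy_url}')
    for flag in (extra_flags or []):
        options.add_argument(flag)

    start = time.perf_counter()
    driver = uc.Chrome(options=options,
                       browser_executable_path='/opt/brave.com/brave/brave')
    cold_start = time.perf_counter() - start  # rough cold-start estimate in seconds
    return driver, cold_start

# Total transfer size would need CDP Network events (e.g. summing encodedDataLength
# from Network.loadingFinished responses); that part is not sketched here.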

driver.py (new file, +17)

@ -0,0 +1,17 @@
import undetected_chromedriver as uc

# Launch undetected-chromedriver against the local Brave binary instead of Chrome.
driver = uc.Chrome(
    browser_executable_path='/opt/brave.com/brave/brave',
    # headless=True,
    # use_subprocess=False
)
driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2')
driver.save_screenshot('nowsecure.png')

# Leftover from an earlier async CDP experiment: `iframe_tab` and `cdp_generator`
# are undefined here and `await` is invalid at module level, so it stays disabled.
# doc = await iframe_tab.send(cdp_generator("DOM.getDocument", {"depth": -1, "pierce": True}))

# Fetch the DOM root over CDP, then ask for its outer HTML.
data = driver.execute_cdp_cmd('DOM.getDocument', {})
if data and 'root' in data:
    root_node_id = data['root']['nodeId']
    html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
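A possible follow-up, assumed here rather than taken from the committed script: DOM.getOuterHTML returns a dict whose "outerHTML" key holds the full markup, so a couple of extra lines would persist it (the output filename is only illustrative).

with open('google-search.html', 'w', encoding='utf-8') as f:
    f.write(html['outerHTML'])  # "outerHTML" carries the page markup
driver.quit()  # shut Brave down once the page has been captured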

google-store-search.png (new binary file, 320 KiB)

nowsecure.png (new binary file, 1.1 MiB)


@ -4,6 +4,9 @@ import json
import asyncio
from pprint import pprint
# TODO: Use docker thing to start a docker service
# TODO: Accept command line args for docker image
async def scrape():
id_count = [0]
def get_id():
@ -36,8 +39,8 @@ async def scrape():
await ws.send(json.dumps({
"id": get_id(),
"method": "Page.navigate",
# "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
"params": {"url": "https://ferano.io"}
"params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
# "params": {"url": "https://ferano.io"}
}))
print("Send navigate request")
@ -58,7 +61,6 @@ async def scrape():
print("Woot")
document_id = id_count[0] # Store the ID we just used
while True:
response = await ws.recv()
@ -86,9 +88,4 @@ async def scrape():
print("Something happened")
break
# response = await ws.recv()
# root_data = json.loads(response)
# root_node_id = root_data["result"]["root"]["nodeId"]
# print(root_data)
asyncio.run(scrape())
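For context, a compact sketch of the full raw-CDP flow this script is building toward: find a page target over the DevTools HTTP endpoint, navigate, wait for the load event, then pull the DOM root and its outer HTML. The port 9222, the scrape_once name, and the message-id bookkeeping are assumptions, not details from the committed script.

import asyncio
import json
import urllib.request

import websockets

DEVTOOLS_HTTP = "http://localhost:9222"  # assumed DevTools endpoint

async def scrape_once(url):
    # Ask the browser for its open targets and take the first one's websocket URL.
    with urllib.request.urlopen(f"{DEVTOOLS_HTTP}/json") as resp:
        targets = json.loads(resp.read())
    ws_url = targets[0]["webSocketDebuggerUrl"]

    msg_id = 0
    def next_id():
        nonlocal msg_id
        msg_id += 1
        return msg_id

    async with websockets.connect(ws_url, max_size=None) as ws:
        # Enable page events so Page.loadEventFired is delivered.
        await ws.send(json.dumps({"id": next_id(), "method": "Page.enable"}))
        await ws.send(json.dumps({"id": next_id(), "method": "Page.navigate",
                                  "params": {"url": url}}))

        # Drain messages until the page has finished loading.
        while True:
            event = json.loads(await ws.recv())
            if event.get("method") == "Page.loadEventFired":
                break

        # Fetch the DOM root, then request its outer HTML by nodeId.
        doc_id = next_id()
        await ws.send(json.dumps({"id": doc_id, "method": "DOM.getDocument"}))
        while True:
            reply = json.loads(await ws.recv())
            if reply.get("id") == doc_id:
                root_node_id = reply["result"]["root"]["nodeId"]
                break

        html_id = next_id()
        await ws.send(json.dumps({"id": html_id, "method": "DOM.getOuterHTML",
                                  "params": {"nodeId": root_node_id}}))
        while True:
            reply = json.loads(await ws.recv())
            if reply.get("id") == html_id:
                return reply["result"]["outerHTML"]

# Example: asyncio.run(scrape_once("https://www.google.com/search?q=MINISFORUM+MS-A2"))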