diff --git a/Log.org b/Log.org
index e002761..ea5d40c 100644
--- a/Log.org
+++ b/Log.org
@@ -23,3 +23,19 @@
 
 
 but let's make progress on all other tasks before tackling that.
+
+Ok, so I found this:
+
+https://github.com/ultrafunkamsterdam/undetected-chromedriver/
+
+This is how to pass the Brave executable path to the driver:
+https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/806
+
+I could set this up in the Docker container; however, I'm not sure that's the
+right approach.
+
+
+I found this resource:
+https://bot.incolumitas.com/#botChallenge
+
+Ok, so it works! I was able to scrape Google with the =driver.py= script!
diff --git a/Task.org b/Task.org
index 6fe7e86..6feae31 100644
--- a/Task.org
+++ b/Task.org
@@ -8,7 +8,7 @@
 Requirements:
 
 - Accept optional proxy URL and optional browser launch flags
 
-* Estimate and report:
+** Estimate and report:
 - Cold start time
 - Total transfer size (bandwidth over the wire)
diff --git a/driver.py b/driver.py
new file mode 100644
index 0000000..ea61a67
--- /dev/null
+++ b/driver.py
@@ -0,0 +1,18 @@
+import undetected_chromedriver as uc
+
+# Launch Brave through undetected-chromedriver instead of stock Chrome.
+driver = uc.Chrome(
+    browser_executable_path='/opt/brave.com/brave/brave',
+    # headless=True,
+    # use_subprocess=False
+)
+driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2')
+driver.save_screenshot('nowsecure.png')
+
+# Pull the full DOM over CDP, then serialize it back to HTML.
+data = driver.execute_cdp_cmd('DOM.getDocument', {'depth': -1, 'pierce': True})
+if data and 'root' in data:
+    root_node_id = data['root']['nodeId']
+    html = driver.execute_cdp_cmd('DOM.getOuterHTML', {'nodeId': root_node_id})
+
+driver.quit()
diff --git a/google-store-search.png b/google-store-search.png
new file mode 100644
index 0000000..d4ae159
Binary files /dev/null and b/google-store-search.png differ
diff --git a/nowsecure.png b/nowsecure.png
new file mode 100644
index 0000000..3ea97dc
Binary files /dev/null and b/nowsecure.png differ
diff --git a/scrape.py b/scrape.py
index b75db66..76d5f50 100644
--- a/scrape.py
+++ b/scrape.py
@@ -4,6 +4,9 @@
 import json
 import asyncio
 from pprint import pprint
+# TODO: Start the browser as a Docker service (see the sketch after this patch)
+# TODO: Accept command-line args for the Docker image
+
 async def scrape():
     id_count = [0]
     def get_id():
@@ -36,8 +39,8 @@
     await ws.send(json.dumps({
         "id": get_id(),
         "method": "Page.navigate",
-        # "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
-        "params": {"url": "https://ferano.io"}
+        "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
+        # "params": {"url": "https://ferano.io"}
     }))
     print("Send navigate request")
 
@@ -58,7 +61,6 @@
     print("Woot")
 
-    document_id = id_count[0] # Store the ID we just used
 
     while True:
         response = await ws.recv()
 
@@ -86,9 +88,4 @@
             print("Something happened")
             break
 
-    # response = await ws.recv()
-    # root_data = json.loads(response)
-    # root_node_id = root_data["result"]["root"]["nodeId"]
-    # print(root_data)
-
 asyncio.run(scrape())
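
For reference, a minimal self-contained sketch of the raw CDP-over-WebSocket flow
that =scrape.py= follows: navigate, then =DOM.getDocument= / =DOM.getOuterHTML=. It
assumes a browser is already running with --remote-debugging-port=9222 and the
=websockets= package is installed; the fixed sleep is a crude stand-in for waiting
on the Page.loadEventFired event.

import asyncio
import json
import urllib.request

import websockets

async def fetch_html(url):
    # Find a page target via the DevTools HTTP endpoint.
    with urllib.request.urlopen("http://localhost:9222/json") as resp:
        targets = json.loads(resp.read())
    ws_url = next(t["webSocketDebuggerUrl"] for t in targets if t["type"] == "page")

    async with websockets.connect(ws_url, max_size=None) as ws:
        msg_id = 0

        async def send(method, params=None):
            # Send one CDP command; skip event messages until our reply arrives.
            nonlocal msg_id
            msg_id += 1
            await ws.send(json.dumps({"id": msg_id, "method": method,
                                      "params": params or {}}))
            while True:
                reply = json.loads(await ws.recv())
                if reply.get("id") == msg_id:
                    return reply["result"]

        await send("Page.navigate", {"url": url})
        await asyncio.sleep(3)  # crude; Page.loadEventFired would be more robust

        doc = await send("DOM.getDocument", {"depth": -1, "pierce": True})
        html = await send("DOM.getOuterHTML", {"nodeId": doc["root"]["nodeId"]})
        return html["outerHTML"]

print(asyncio.run(fetch_html("https://www.google.com/search?q=MINISFORUM+MS-A2"))[:200])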
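
And a sketch of the Docker TODO in =scrape.py=, using the docker Python SDK
(pip install docker). The image name and port mapping are placeholders, not
something the repo defines.

import docker

client = docker.from_env()
# Hypothetical image that bundles Brave with remote debugging enabled.
container = client.containers.run(
    "brave-cdp:latest",
    detach=True,
    ports={"9222/tcp": 9222},  # expose the DevTools WebSocket endpoint
    shm_size="2g",             # browsers misbehave with the default 64MB /dev/shm
)
try:
    pass  # connect to ws://localhost:9222 and run scrape() here
finally:
    container.remove(force=True)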