Get google scrape working with undetected-chromedriver, include some research images
This commit is contained in:
parent
875b8c35ac
commit
25e8cd49c4
16
Log.org
16
Log.org
@ -23,3 +23,19 @@ but let's make progress on all other tasks before tackling that.
|
||||
|
||||
|
||||
|
||||
|
||||
Ok, so I found this:
|
||||
|
||||
https://github.com/ultrafunkamsterdam/undetected-chromedriver/
|
||||
|
||||
This is how to pass brave to the URL
|
||||
https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/806
|
||||
|
||||
I could set this up in the docker container, however, I'm not sure this is the
|
||||
right thing.
|
||||
|
||||
|
||||
I found this resource:
|
||||
https://bot.incolumitas.com/#botChallenge
|
||||
|
||||
Ok, so it works! I was able to scrape google with the =driver.py= script!
|
||||
|
2
Task.org
2
Task.org
@ -8,7 +8,7 @@ Requirements:
|
||||
|
||||
- Accept optional proxy URL and optional browser launch flags
|
||||
|
||||
* Estimate and report:
|
||||
** Estimate and report:
|
||||
|
||||
- Cold start time
|
||||
- Total transfer size (bandwidth over the wire)
|
||||
|
17
driver.py
Normal file
17
driver.py
Normal file
@ -0,0 +1,17 @@
|
||||
import undetected_chromedriver as uc
|
||||
|
||||
driver = uc.Chrome(
|
||||
browser_executable_path='/opt/brave.com/brave/brave',
|
||||
# headless=True,
|
||||
# use_subprocess=False
|
||||
)
|
||||
driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2')
|
||||
driver.save_screenshot('nowsecure.png')
|
||||
|
||||
doc = await iframe_tab.send(cdp_generator("DOM.getDocument", {"depth": -1, "pierce": True}))
|
||||
|
||||
data = driver.execute_cdp_cmd('DOM.getDocument', {})
|
||||
if data:
|
||||
if 'root' in data:
|
||||
root_node_id = data['root']['nodeId']
|
||||
html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
|
BIN
google-store-search.png
Normal file
BIN
google-store-search.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 320 KiB |
BIN
nowsecure.png
Normal file
BIN
nowsecure.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.1 MiB |
13
scrape.py
13
scrape.py
@ -4,6 +4,9 @@ import json
|
||||
import asyncio
|
||||
from pprint import pprint
|
||||
|
||||
# TODO: Use docker thing to start a docker service
|
||||
# TODO: Accept command line args for docker image
|
||||
|
||||
async def scrape():
|
||||
id_count = [0]
|
||||
def get_id():
|
||||
@ -36,8 +39,8 @@ async def scrape():
|
||||
await ws.send(json.dumps({
|
||||
"id": get_id(),
|
||||
"method": "Page.navigate",
|
||||
# "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
|
||||
"params": {"url": "https://ferano.io"}
|
||||
"params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
|
||||
# "params": {"url": "https://ferano.io"}
|
||||
}))
|
||||
|
||||
print("Send navigate request")
|
||||
@ -58,7 +61,6 @@ async def scrape():
|
||||
|
||||
print("Woot")
|
||||
|
||||
|
||||
document_id = id_count[0] # Store the ID we just used
|
||||
while True:
|
||||
response = await ws.recv()
|
||||
@ -86,9 +88,4 @@ async def scrape():
|
||||
print("Something happened")
|
||||
break
|
||||
|
||||
# response = await ws.recv()
|
||||
# root_data = json.loads(response)
|
||||
# root_node_id = root_data["result"]["root"]["nodeId"]
|
||||
# print(root_data)
|
||||
|
||||
asyncio.run(scrape())
|
||||
|
Loading…
x
Reference in New Issue
Block a user