Get google scrape working with undetected-chromedriver, include some research images
This commit is contained in:
parent
875b8c35ac
commit
25e8cd49c4
16
Log.org
16
Log.org
@ -23,3 +23,19 @@ but let's make progress on all other tasks before tackling that.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Ok, so I found this:
|
||||||
|
|
||||||
|
https://github.com/ultrafunkamsterdam/undetected-chromedriver/
|
||||||
|
|
||||||
|
This is how to point undetected-chromedriver at the Brave executable:
|
||||||
|
https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/806
|
||||||
|
|
||||||
|
I could set this up in the Docker container; however, I'm not sure this is the
|
||||||
|
right thing.
|
||||||
|
|
||||||
|
|
||||||
|
I found this resource:
|
||||||
|
https://bot.incolumitas.com/#botChallenge
|
||||||
|
|
||||||
|
Ok, so it works! I was able to scrape Google with the =driver.py= script!
|
||||||
|
2
Task.org
2
Task.org
@ -8,7 +8,7 @@ Requirements:
|
|||||||
|
|
||||||
- Accept optional proxy URL and optional browser launch flags
|
- Accept optional proxy URL and optional browser launch flags
|
||||||
|
|
||||||
* Estimate and report:
|
** Estimate and report:
|
||||||
|
|
||||||
- Cold start time
|
- Cold start time
|
||||||
- Total transfer size (bandwidth over the wire)
|
- Total transfer size (bandwidth over the wire)
|
||||||
|
17
driver.py
Normal file
17
driver.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
"""Scrape a Google results page with undetected-chromedriver driving Brave.

Loads the search page, saves a screenshot, then reads the document's outer
HTML through Chrome DevTools Protocol (CDP) commands.
"""
import undetected_chromedriver as uc

driver = uc.Chrome(
    browser_executable_path='/opt/brave.com/brave/brave',
    # headless=True,
    # use_subprocess=False
)
driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2')
driver.save_screenshot('nowsecure.png')

# Research note, kept as a comment only: this call cannot run in this script.
# `await` at module level is a SyntaxError, and neither `iframe_tab` nor
# `cdp_generator` is defined here.
# doc = await iframe_tab.send(cdp_generator("DOM.getDocument", {"depth": -1, "pierce": True}))

# Ask CDP for the document, then request the outer HTML of its root node.
data = driver.execute_cdp_cmd('DOM.getDocument', {})
if data and 'root' in data:
    root_node_id = data['root']['nodeId']
    html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id})
|
BIN
google-store-search.png
Normal file
BIN
google-store-search.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 320 KiB |
BIN
nowsecure.png
Normal file
BIN
nowsecure.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.1 MiB |
13
scrape.py
13
scrape.py
@ -4,6 +4,9 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
|
# TODO: Use docker thing to start a docker service
|
||||||
|
# TODO: Accept command line args for docker image
|
||||||
|
|
||||||
async def scrape():
|
async def scrape():
|
||||||
id_count = [0]
|
id_count = [0]
|
||||||
def get_id():
|
def get_id():
|
||||||
@ -36,8 +39,8 @@ async def scrape():
|
|||||||
await ws.send(json.dumps({
|
await ws.send(json.dumps({
|
||||||
"id": get_id(),
|
"id": get_id(),
|
||||||
"method": "Page.navigate",
|
"method": "Page.navigate",
|
||||||
# "params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
|
"params": {"url": "https://www.google.com/search?q=MINISFORUM+MS-A2"}
|
||||||
"params": {"url": "https://ferano.io"}
|
# "params": {"url": "https://ferano.io"}
|
||||||
}))
|
}))
|
||||||
|
|
||||||
print("Send navigate request")
|
print("Send navigate request")
|
||||||
@ -58,7 +61,6 @@ async def scrape():
|
|||||||
|
|
||||||
print("Woot")
|
print("Woot")
|
||||||
|
|
||||||
|
|
||||||
document_id = id_count[0] # Store the ID we just used
|
document_id = id_count[0] # Store the ID we just used
|
||||||
while True:
|
while True:
|
||||||
response = await ws.recv()
|
response = await ws.recv()
|
||||||
@ -86,9 +88,4 @@ async def scrape():
|
|||||||
print("Something happened")
|
print("Something happened")
|
||||||
break
|
break
|
||||||
|
|
||||||
# response = await ws.recv()
|
|
||||||
# root_data = json.loads(response)
|
|
||||||
# root_node_id = root_data["result"]["root"]["nodeId"]
|
|
||||||
# print(root_data)
|
|
||||||
|
|
||||||
asyncio.run(scrape())
|
asyncio.run(scrape())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user