diff --git a/Dockerfile b/Dockerfile index 27ac3ab..870ed99 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,14 @@ -FROM zenika/alpine-chrome:latest +FROM selenium/standalone-chrome:latest + +USER root + +RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/* + +RUN pip3 install --break-system-packages undetected-chromedriver + +COPY driver.py /app/ +WORKDIR /app -# Expose port 3000 for remote debugging EXPOSE 3000 -# Override the default command to use port 3000 -CMD ["chromium-browser", "--headless", "--no-sandbox", "--disable-gpu", "--remote-debugging-port=3000", "--remote-debugging-address=0.0.0.0"] +CMD ["google-chrome", "--headless", "--no-sandbox", "--disable-gpu", "--remote-debugging-port=3000", "--remote-debugging-address=0.0.0.0"] diff --git a/Log.org b/Log.org index ea5d40c..5f07b0b 100644 --- a/Log.org +++ b/Log.org @@ -21,6 +21,9 @@ https://hub.docker.com/r/zenika/alpine-chrome Perhaps the exercise is looking for me to actually build an image from scratch, but let's make progress on all other other tasks before tackling that. +I immediately hit bot detection when just running a normal websocket request to +the docker container, so I started researching what I would need to do to avoid detection. + @@ -39,3 +42,42 @@ I found this resource; https://bot.incolumitas.com/#botChallenge Ok, so it works! I was able to scrape google with the =driver.py= script! + +I could use this, but let's see if I can just build the docker container myself. 
+ +https://hub.docker.com/r/ultrafunk/undetected-chromedriver + +Setting up this with the underlying dockerfile, but I'm hitting this issue; +#+begin_quote +/app $ python driver.py /usr/bin/chromium-browser https://ferano.io +Traceback (most recent call last): + File "/app/driver.py", line 12, in + driver = uc.Chrome( + ^^^^^^^^^^ + File "/usr/lib/python3.11/site-packages/undetected_chromedriver/__init__.py", line 466, in __init__ + super(Chrome, self).__init__( + File "/usr/lib/python3.11/site-packages/selenium/webdriver/chrome/webdriver.py", line 47, in __init__ super().__init__( + File "/usr/lib/python3.11/site-packages/selenium/webdriver/chromium/webdriver.py", line 69, in __init__ + super().__init__(command_executor=executor, options=options) + File "/usr/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 261, in __init__ + self.start_session(capabilities) + File "/usr/lib/python3.11/site-packages/undetected_chromedriver/__init__.py", line 724, in start_session + super(selenium.webdriver.chrome.webdriver.WebDriver, self).start_session( + File "/usr/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 362, in start_session + response = self.execute(Command.NEW_SESSION, caps)["value"] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 454, in execute self.error_handler.check_response(response) + File "/usr/lib/python3.11/site-packages/selenium/webdriver/remote/errorhandler.py", line 232, in check_response + raise exception_class(message, screen, stacktrace) +selenium.common.exceptions.SessionNotCreatedException: Message: session not created: cannot connect to chrome at 127.0.0.1:48747 +from session not created: This version of ChromeDriver only supports Chrome version 138 +Current browser version is 124.0.6367.78; For documentation on this error, please visit: 
https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception +#+end_quote + +So now I need a new docker image, https://hub.docker.com/r/selenium/standalone-chrome + +Updated the Dockerfile. Now this works, I get back my website's HTML! + +#+begin_src sh +docker exec -it search-api python driver.py /usr/bin/google-chrome https://ferano.io +#+end_src diff --git a/driver.py b/driver.py index ea61a67..26bafb9 100644 --- a/driver.py +++ b/driver.py @@ -1,17 +1,25 @@ import undetected_chromedriver as uc +import sys + +if len(sys.argv) < 3: + sys.exit("usage: driver.py ") + +options = uc.ChromeOptions() +options.add_argument('--no-sandbox') +options.add_argument('--disable-dev-shm-usage') +options.add_argument('--disable-gpu') driver = uc.Chrome( - browser_executable_path='/opt/brave.com/brave/brave', - # headless=True, - # use_subprocess=False + browser_executable_path=sys.argv[1], + headless=True, + use_subprocess=False, + options=options ) -driver.get('https://www.google.com/search?q=MINISFORUM+MS-A2') -driver.save_screenshot('nowsecure.png') - -doc = await iframe_tab.send(cdp_generator("DOM.getDocument", {"depth": -1, "pierce": True})) +driver.get(sys.argv[2]) data = driver.execute_cdp_cmd('DOM.getDocument', {}) if data: if 'root' in data: root_node_id = data['root']['nodeId'] html = driver.execute_cdp_cmd('DOM.getOuterHTML', {"nodeId": root_node_id}) + print(html) diff --git a/image-rebuild.sh b/image-rebuild.sh new file mode 100755 index 0000000..72a4b01 --- /dev/null +++ b/image-rebuild.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +docker stop search-api +docker rm search-api +docker build -t search-api . +docker run -d -p 3000:3000 --name search-api search-api +# docker exec -it search-api python driver.py /usr/bin/chromium-browser https://ferano.io