Categories
Development Guest posting

Scrape ‘Ticketmaster’ using Selenium with Python

We’ve got some code, provided by Akash D., that works on ticketmaster.co.uk. He automates the browser (Chrome as well as Edge) using Selenium with Python. Rotating authenticated proxies are used to stay undetected. Note, however, that the site is protected by Distil Networks.

How to do HTTP proxy authentication with ChromeDriver in Selenium?

See the code below:

import os
import zipfile

from selenium import webdriver

PROXY_HOST = 'x.botproxy.net'  # rotating proxy
PROXY_PORT = 8080
PROXY_USER = 'proxy-user'
PROXY_PASS = 'proxy-password'


# Manifest of the helper extension that is zipped and loaded into Chrome.
# It requests the "proxy" and "webRequest"/"webRequestBlocking" permissions
# that background.js needs to set the proxy and answer its auth challenge.
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

# Template for the extension's background script. The three string slots are
# filled with complete, already-quoted JavaScript string literals.
_BACKGROUND_JS_TEMPLATE = """
var config = {
        mode: "fixed_servers",
        rules: {
          singleProxy: {
            scheme: "http",
            host: %s,
            port: parseInt(%d)
          },
          bypassList: ["localhost"]
        }
      };

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: %s,
            password: %s
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
);
"""


def build_background_js(host, port, user, password):
    """Return the ``background.js`` source for the proxy helper extension.

    The script points Chrome at an HTTP proxy and answers the proxy's
    authentication challenge with the given credentials.

    Args:
        host: Proxy host name or IP address.
        port: Proxy port; an int or a numeric string.
        user: Proxy user name.
        password: Proxy password.

    Returns:
        The JavaScript source as a string.

    Raises:
        ValueError: If ``port`` is not numeric.
    """
    import json  # local import so this snippet needs no extra top-level import

    # json.dumps emits a quoted, fully escaped literal, so quotes or
    # backslashes inside a credential can no longer break the script.
    return _BACKGROUND_JS_TEMPLATE % (
        json.dumps(str(host)),
        int(port),
        json.dumps(str(user)),
        json.dumps(str(password)),
    )


background_js = build_background_js(PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS)


def get_chromedriver(use_proxy=False, user_agent=None):
    """Start Chrome through the ``chromedriver`` binary next to this script.

    Args:
        use_proxy: When true, zip ``manifest_json`` and ``background_js`` into
            ``proxy_auth_plugin.zip`` (in the current working directory) and
            load that archive as an extension so the authenticated proxy is
            used.
        user_agent: Optional value for Chrome's ``--user-agent`` switch.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    chrome_options = webdriver.ChromeOptions()
    if use_proxy:
        pluginfile = 'proxy_auth_plugin.zip'

        with zipfile.ZipFile(pluginfile, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        chrome_options.add_extension(pluginfile)
    if user_agent:
        chrome_options.add_argument('--user-agent=%s' % user_agent)
    # `options=` replaces the deprecated `chrome_options=` keyword.
    # NOTE(review): recent Selenium releases also dropped the positional
    # driver path in favour of a Service object - confirm against the
    # installed version.
    driver = webdriver.Chrome(
        os.path.join(path, 'chromedriver'),
        options=chrome_options)
    return driver

def main():
    """Launch Chrome through the proxy and load a page that echoes the caller's IP."""
    browser = get_chromedriver(use_proxy=True)
    # Alternative check: 'https://www.google.com/search?q=my+ip+address'
    browser.get('https://httpbin.org/ip')

# Run the proxy demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

Source.

The whole code

Here is the complete code for scraping ticketmaster.co.uk using Selenium with Python.

 from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, SessionNotCreatedException
# from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains	
from selenium.webdriver.common.keys import Keys
import zipfile
# from webdriver_manager.chrome import EdgeChromiumDriverManager
from msedge.selenium_tools import EdgeOptions
from msedge.selenium_tools import Edge
from time import sleep
import undetected_chromedriver as uc
import random




# Manifest of the proxy helper extension; it is written as "manifest.json"
# into proxy_auth_plugin.zip next to the generated background.js.
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

# Add your proxies here, one string per proxy in the form
# "host:port:user:password" (the format tm_bot.get_background_js splits on ":").
# One entry is picked at random for each Chrome session.
proxies = [

]

class tm_bot():
    """Selenium session that opens a Ticketmaster event page through a proxy.

    Constructing the object starts Chrome (see ``get_chrome_driver``), applies
    a user-agent override and navigates to the event page. Proxies come from
    the module-level ``proxies`` list; each entry is a
    ``"host:port:user:password"`` string.
    """

    # User agent applied through the DevTools override and the Edge switch.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    # Event page opened when the constructor is given no URL.
    DEFAULT_URL = "https://www.ticketmaster.co.uk/freedom-starring-joe-mcelderry-portsmouth-05-10-2022/event/1F005B56BB598D3B"
    # Extension archive generated in the current working directory.
    PLUGIN_FILE = 'proxy_auth_plugin.zip'

    # background.js template; the three string slots receive complete,
    # already-quoted JavaScript string literals.
    _BACKGROUND_JS_TEMPLATE = """
var config = {
        mode: "fixed_servers",
        rules: {
          singleProxy: {
            scheme: "http",
            host: %s,
            port: parseInt(%d)
          },
          bypassList: ["foobar.com"]
        }
      };

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: %s,
            password: %s
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
);
"""

    def __init__(self, url=None):
        """Start Chrome and open the event page.

        Args:
            url: Page to open; defaults to ``DEFAULT_URL``.
        """
        self.driver = self.get_chrome_driver()

        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": self.USER_AGENT})
        print('excuting script')
        print(self.driver.execute_script("return navigator.userAgent;"))

        sleep(2)
        self.driver.get(url or self.DEFAULT_URL)

    def get_background_js(self, proxy):
        """Build the extension's ``background.js`` for one proxy.

        Also stores the parsed parts on the instance (``prox``,
        ``PROXY_HOST``, ``PROXY_PORT``, ``PROXY_USER``, ``PROXY_PASS``) and
        the generated script as ``background_js``.

        Args:
            proxy: ``"host:port:user:password"``. The password may itself
                contain colons.

        Returns:
            The JavaScript source as a string.

        Raises:
            ValueError: If ``proxy`` has fewer than four fields or the port
                is not numeric.
        """
        import json  # local import keeps the class free of new file-level imports

        # maxsplit=3 keeps any colons inside the password intact.
        self.prox = proxy.split(":", 3)
        if len(self.prox) != 4:
            # The offending value is not echoed: it may contain credentials.
            raise ValueError("proxy must be formatted as 'host:port:user:password'")
        self.PROXY_HOST, port, self.PROXY_USER, self.PROXY_PASS = self.prox
        self.PROXY_PORT = int(port)
        # json.dumps yields quoted, escaped literals, so quotes or backslashes
        # in a credential cannot break the generated script.
        self.background_js = self._BACKGROUND_JS_TEMPLATE % (
            json.dumps(self.PROXY_HOST),
            self.PROXY_PORT,
            json.dumps(self.PROXY_USER),
            json.dumps(self.PROXY_PASS),
        )
        return self.background_js

    def _pick_proxy(self):
        """Return a random entry of ``proxies``; raise ValueError if it is empty."""
        if not proxies:
            raise ValueError(
                "no proxies configured: add 'host:port:user:password' entries to `proxies`")
        return random.choice(proxies)

    def _write_proxy_plugin(self, proxy):
        """Write the proxy extension archive for ``proxy`` and return its path."""
        with zipfile.ZipFile(self.PLUGIN_FILE, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", self.get_background_js(proxy))
        return self.PLUGIN_FILE

    def get_edge_driver(self, use_proxy=True):
        """Start Chromium-based Edge and store it as ``self.driver``.

        Args:
            use_proxy: When true, load the proxy extension. The proxy already
                stored in ``self.PROXY`` is reused; if there is none, one is
                picked from ``proxies``.

        Returns:
            The started Edge driver.

        Raises:
            ValueError: If a proxy is needed and ``proxies`` is empty.
        """
        self.edge_options = EdgeOptions()
        self.edge_options.use_chromium = True
        self.edge_options.add_argument('--disable-gpu')
        self.edge_options.add_argument("start-maximized")
        self.edge_options.add_argument("disable-infobars")
        self.edge_options.add_experimental_option("detach", True)
        self.edge_options.add_argument('--DBUS_SESSION_BUS_ADDRESS=/dev/null')
        self.edge_options.add_argument('--disable-blink-features=AutomationControlled')
        self.edge_options.add_experimental_option("excludeSwitches", ["disable-automation"])
        self.edge_options.add_experimental_option('useAutomationExtension', True)
        # The switch is spelled with a hyphen; "--user_agent" is not recognised.
        self.edge_options.add_argument('--user-agent=' + self.USER_AGENT)
        if use_proxy:
            print("using proxy")
            # Previously this read self.PROXY unconditionally and failed with
            # AttributeError when no Chrome session had set it.
            if getattr(self, "PROXY", None) is None:
                self.PROXY = self._pick_proxy()
            self.edge_options.add_extension(self._write_proxy_plugin(self.PROXY))

        self.driver = Edge(executable_path='msedgedriver.exe', options=self.edge_options)
        return self.driver

    def get_chrome_driver(self, use_proxy=True):
        """Start Chrome and store it as ``self.driver``.

        Args:
            use_proxy: When true, pick a random proxy from ``proxies``, store
                it as ``self.PROXY`` and load the proxy extension. When false,
                no proxy is needed and ``proxies`` may be empty.

        Returns:
            The started Chrome driver.

        Raises:
            ValueError: If ``use_proxy`` is true and ``proxies`` is empty.
        """
        # The attribute keeps its historical name although it holds Chrome options.
        self.edge_options = webdriver.ChromeOptions()
        for switch in (
            '--no-sandbox',
            'start-maximized',
            '--disable-infobars',
            '--disable-dev-shm-usage',
            '--disable-browser-side-navigation',
            '--disable-gpu',
            '--disable-blink-features=AutomationControlled',
        ):
            self.edge_options.add_argument(switch)
        self.edge_options.add_experimental_option("useAutomationExtension", False)
        if use_proxy:
            self.PROXY = self._pick_proxy()
            print(self.PROXY)
            print("using proxy")
            # The archive used to be written but never loaded, so the session
            # silently ran without the proxy.
            self.edge_options.add_extension(self._write_proxy_plugin(self.PROXY))

        self.driver = webdriver.Chrome(options=self.edge_options)
        return self.driver

    def clear_cookies(self, timeout=10):
        """Accept the OneTrust cookie banner if it shows up.

        Despite the name, no cookies are deleted; the method clicks the
        banner's accept button. The previous loop never ran (its condition
        started out false) and never clicked anything.

        Args:
            timeout: Seconds to wait for the button to become clickable.

        Returns:
            True if the button was clicked, False if it did not appear in time.
        """
        from selenium.common.exceptions import TimeoutException

        try:
            accept_button = WebDriverWait(self.driver, timeout).until(
                ec.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
        except TimeoutException:
            return False
        accept_button.click()
        return True


# Script entry point: constructing tm_bot starts the browser session, then
# clear_cookies() is called on it.
if __name__ == "__main__":
    test = tm_bot()    
    test.clear_cookies()

One reply on “Scrape ‘Ticketmaster’ using Selenium with Python”

Leave a Reply

Your email address will not be published.

This site uses Akismet to reduce spam. Learn how your comment data is processed.