handle Cookie banner with Selenium
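
On the first page load, wait for the OneTrust cookie banner and click its
reject-all button before scraping; later pages in the same browser session
reuse that consent choice, so the banner only needs to be handled once.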
parse_urls.py (Normal file, 130 lines)
@@ -0,0 +1,130 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
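# The latter two flags are typically only needed when Chrome runs headless in
# a container or CI environment (no sandbox support, small /dev/shm); they are
# harmless on a desktop machine.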

# Set path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'

# Set up driver (Selenium 4.6+ resolves a matching chromedriver automatically)
driver = webdriver.Chrome(options=chrome_options)

# Parallel result lists, one entry per parsed URL
names = []
addresses = []
links = []
emails = []
phones = []
urls = []

# True until the cookie banner has been dismissed once for this browser session
first_visit = True
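# An alternative to the parallel lists (a sketch, not the original design)
# would be to collect one dict per page and build the DataFrame from those:
#
#   rows = []
#   # inside parse_url: rows.append({'Company': name, 'Location': address, 'URL': url})
#   # at the end:       df = pd.DataFrame(rows)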

# Parse a single exhibitor page and collect its details
def parse_url(url):
    global first_visit
    # Load the page
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # On the first visit, dismiss the OneTrust cookie banner by clicking its
    # "Reject All" button; the consent choice persists for the rest of the session
    if first_visit:
        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        reject_button.click()
        first_visit = False
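    # If some pages never show the banner, the wait above would raise
    # TimeoutException; a more defensive variant (a sketch, not the original
    # flow) would swallow that case:
    #
    #   from selenium.common.exceptions import TimeoutException
    #   try:
    #       wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler'))).click()
    #   except TimeoutException:
    #       pass  # no banner on this page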
    # Wait until the page heading has rendered before scraping
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h1')))

    # For dynamic pages, an extra wait before scraping can help
    # driver.implicitly_wait(5)

    # Get the rendered page source (the browser is only closed at the very end)
    page_source = driver.page_source

    # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')

    # Company name is the page heading
    name = soup.find('h1').get_text(strip=True)
    names.append(name)

    # Address lines are individual <span>s inside the address container;
    # find_all returns a (possibly empty) list, never None
    address_div = soup.find('div', id='exhibitor_details_address')
    if address_div is None:
        addresses.append('-')
    else:
        address_spans = address_div.find_all('span')
        if not address_spans:
            addresses.append('-')
        else:
            # Join the spans with newlines, without a trailing newline
            addresses.append('\n'.join(span.get_text() for span in address_spans))

    # Website, email, and phone each live in an <a> inside their detail <div>
    link = soup.find('div', id='exhibitor_details_website')
    if link is None:
        links.append('-')
    else:
        links.append(link.find('a').get_text())

    email = soup.find('div', id='exhibitor_details_email')
    if email is None:
        emails.append('-')
    else:
        emails.append(email.find('a').get_text())

    phone = soup.find('div', id='exhibitor_details_phone')
    if phone is None:
        phones.append('-')
    else:
        phones.append(phone.find('a').get_text())

    urls.append(url)
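# Note: the three detail lookups above share one shape; a small helper (a
# sketch, not part of the original script) would remove the repetition and
# also survive a detail <div> that lacks an <a>:
#
#   def detail_text(soup, div_id):
#       div = soup.find('div', id=div_id)
#       a = div.find('a') if div else None
#       return a.get_text() if a else '-'
#
#   links.append(detail_text(soup, 'exhibitor_details_website'))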

# Path to the file containing URLs (one per line; blank lines are skipped)
file_path = './urls.txt'

# Read each line from the file and parse the URL; quit the browser even if a
# page fails to parse
try:
    with open(file_path, 'r') as file:
        for line in file:
            url = line.strip()
            if url:
                parse_url(url)
finally:
    driver.quit()

# Collect the scraped columns into a DataFrame
df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})

# The data is now in a pandas DataFrame and can be manipulated as needed
# print(df)

# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)
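# Expected input: ./urls.txt with one exhibitor-page URL per line. Output:
# output_test.csv with the columns Company, Location, Website, Email, Phone, URL.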