Handle cookie banner with Selenium

Andre Heber
2024-01-15 10:36:36 +01:00
parent 24b64ec248
commit a1a1bc757b
5 changed files with 208 additions and 18 deletions

README.md Normal file

@@ -0,0 +1,10 @@
# Web Scraper
Simple web scraping with Beautiful Soup 4 (BS4) and Selenium (headless browser).

The cookie banner can be handled; see `parse_urls.py`. The idea: wait until the banner's button has loaded, click it, and then wait again until the page content has loaded. This is only needed for the first URL; for the following URLs the cookies are already set.
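In outline, the handling looks roughly like this (the button id and the `h1` wait are the ones `parse_urls.py` uses; the URL is only a placeholder):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://example.com')  # placeholder URL

wait = WebDriverWait(driver, 10)
# wait until the cookie-banner button is loaded, then click it away
button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
button.click()
# wait again until the actual page content is loaded
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h1')))
```
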
With BS4 it is easy to scrape information out of HTML. To get a `div` by its id, write `elem = soup.find('div', id='my_id')`. To find children (or children of children, etc.) of that element, write `children = elem.findAll('span')`.
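A tiny self-contained example of both calls:

```python
from bs4 import BeautifulSoup

html = "<div id='my_id'><span>first</span><span>second</span></div>"
soup = BeautifulSoup(html, 'html.parser')

elem = soup.find('div', id='my_id')              # the div with that id
children = elem.findAll('span')                  # all span descendants of it
print([child.get_text() for child in children])  # ['first', 'second']
```
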
## ChromeDriver
To use Selenium with Chrome, you need to download ChromeDriver (just search for it online; make sure its version matches your installed Chrome).
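If ChromeDriver is not on your `PATH`, you can point Selenium at the downloaded binary explicitly (Selenium 4 style; the path below is just an example):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

chrome_options = Options()
chrome_options.add_argument("--headless")  # run without a visible browser window

service = Service('/path/to/chromedriver')  # example location of the downloaded binary
driver = webdriver.Chrome(service=service, options=chrome_options)
```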

main.py

@@ -12,37 +12,57 @@ def pad_list(lst, length, pad_value='Unknown'):
 # Replace 'your_local_file.html' with the path to your local HTML file
-with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
+with open('/Users/sbzuc0a/Downloads/scrape_new/index.html', 'r', encoding='utf-8') as html_file:
     content = html_file.read()

 # Parse the HTML content
 soup = BeautifulSoup(content, 'html.parser')

-companies = strip(soup.find_all('h3', class_='media__body__head'))
-locations = strip(soup.find_all('h5'))
-places = strip(soup.find_all('span', class_='link-fix--text'))
-urls_T = soup.find_all('a', class_='media-module__link')
+companies = []
+urls = []
+
+# Find all the company-info divs
+companies_div = soup.find_all('div', class_='company-info')
+
+# Loop through each company and extract the information
+for company in companies_div:
+    link_tag = company.find('a')
+    if link_tag and link_tag['href']:
+        link = link_tag['href']
+        company_name = link_tag.find('h3')
+        # if company_name:
+        #     print(f"Company Name: {company_name.text.strip()}, Link: {link}")
+        companies.append(company_name.text.strip())
+        urls.append(link)
+
+# companies = strip(soup.find_all('h3', class_='company-info'))
+# urls = strip(soup.find_all('a', class_='company-info'))
+# places = strip(soup.find_all('span', class_='link-fix--text'))
+# urls_T = soup.find_all('a', class_='media-module__link')

 # companies = []
 # for company in companies_t:
 # companies.append(company.text.strip())
-urls = []
-for url in urls_T:
-    urls.append(url.get('href'))
+# urls = []
+# for url in urls_T:
+#     urls.append(url.get('href'))

 # print(companies)
-# print(locations)
+# print(urls)
 # print(places)
 # print(urls)
+
+for url in urls:
+    print(url)

 # print the length of the lists
 # print(len(companies))
 # print(len(locations))
 # print(len(places))
 # print(len(urls))

-locations = pad_list(locations, len(companies))
+# locations = pad_list(locations, len(companies))

 # # Find the data you want to scrape. For example, let's say you want to scrape a table:
 # tables = soup.find_all('table')
@@ -53,15 +73,15 @@ locations = pad_list(locations, len(companies))
 # # Use pandas to read the table
 # data_frame = pd.read_html(str(table))[0]

-df = pd.DataFrame({
-    'Company': companies,
-    'Location': locations,
-    'Place': places,
-    'URL': urls
-})
+# df = pd.DataFrame({
+#     'Company': companies,
+#     'Location': locations,
+#     'Place': places,
+#     'URL': urls
+# })

 # # Now you have the data in a pandas DataFrame, you can manipulate it as you wish
-print(df)
+# print(df)

 # # Optionally, save the data to a CSV file
-df.to_csv('output_bandage.csv', index=False)
+# df.to_csv('output_bandage.csv', index=False)

parse_urls.py Normal file

@@ -0,0 +1,130 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Set path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'
# Set up driver
driver = webdriver.Chrome(options=chrome_options)
names = []
addresses = []
links = []
emails = []
phones = []
urls = []
first_visit = True
# Function to parse a single URL
def parse_url(url):
    global first_visit
    # print("Parse " + url)

    # Send a request to the website
    # Load webpage
    driver.get(url)

    wait = WebDriverWait(driver, 10)
    if first_visit:
        accept_button = wait.until(method=EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        accept_button.click()
        first_visit = False

    accept_button = wait.until(method=EC.element_to_be_clickable((By.CSS_SELECTOR, 'h1')))

    # It's a good idea to let the script wait for a few seconds before scraping, especially for dynamic websites
    # driver.implicitly_wait(5)

    # Get page source and close the browser
    page_source = driver.page_source
    # print(page_source)

    # Check if the request was successful
    # if response.status_code == 200:
    #     print(response._content)

    # # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')

    name = soup.find('h1').get_text(strip=True)
    # print(name)
    names.append(name)

    address_spans = soup.find('div', id='exhibitor_details_address')
    if address_spans is None:
        addresses.append('-')
    else:
        address_spans = address_spans.findAll('span')
        if address_spans is None:
            addresses.append('-')
        else:
            address = ""
            for line in address_spans:
                # print(line.get_text())
                address = address + line.get_text() + '\n'
            if address.endswith('\n'):
                address = address[:-1]
            # print(address)
            addresses.append(address)

    link = soup.find('div', id='exhibitor_details_website')
    if link is None:
        links.append('-')
    else:
        link = link.find('a').get_text()
        links.append(link)
        # print(link)

    email = soup.find('div', id='exhibitor_details_email')
    if email is None:
        emails.append('-')
    else:
        email = email.find('a').get_text()
        emails.append(email)
        # print(email)

    phone = soup.find('div', id='exhibitor_details_phone')
    if phone is None:
        phones.append('-')
    else:
        phone = phone.find('a').get_text()
        phones.append(phone)
        # print(phone)

    urls.append(url)

# Path to the file containing URLs
file_path = './urls.txt'
# Read each line from the file and parse the URL
with open(file_path, 'r') as file:
    for line in file:
        url = line.strip()
        if url:
            parse_url(url)

driver.quit()
df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})
# Now you have the data in a pandas DataFrame, you can manipulate it as you wish
# print(df)
# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)
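`parse_urls.py` reads its input from `./urls.txt`, one URL per line; blank lines are skipped. A hypothetical example file:

```
https://example.com/exhibitors/company-a
https://example.com/exhibitors/company-b
```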

parse_website.py Normal file

@@ -0,0 +1,28 @@
from bs4 import BeautifulSoup
with open('./website.txt', 'r', encoding='utf-8') as file:
    file_contents = file.read()

soup = BeautifulSoup(file_contents, 'html.parser')

name = soup.find('h1').get_text(strip=True)
print(name)

address_spans = soup.find('div', id='exhibitor_details_address').findAll('span')
address = ""
for line in address_spans:
    # print(line.get_text())
    address = address + line.get_text() + '\n'
if address.endswith('\n'):
    address = address[:-1]
print(address)

link = soup.find('div', id='exhibitor_details_website').find('a').get_text()
print(link)

email = soup.find('div', id='exhibitor_details_email').find('a').get_text()
print(email)

phone = soup.find('div', id='exhibitor_details_phone').find('a').get_text()
print(phone)

requirements.txt

@@ -1,2 +1,4 @@
 beautifulsoup4==4.12.2
 pandas==2.1.2
+requests==2.31
+selenium==4.16