Handle cookie banner with Selenium

Andre Heber
2024-01-15 10:36:36 +01:00
parent 24b64ec248
commit a1a1bc757b
5 changed files with 208 additions and 18 deletions

README.md Normal file

@@ -0,0 +1,10 @@
# Web Scraper
Simple web scraping with Beautiful Soup 4 (BS4) and Selenium (headless browser).

The cookie banner can be handled; see `parse_urls.py`. The idea: wait until the banner's button has loaded, click it, and then wait again until the page content has loaded. This is only needed for the first URL; for the following URLs the cookies are already set.
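In outline, the handling looks roughly like this (the button id and the `h1` wait are the ones `parse_urls.py` uses; the URL is only a placeholder):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://example.com')  # placeholder URL

wait = WebDriverWait(driver, 10)
# wait until the cookie-banner button is loaded, then click it away
button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
button.click()
# wait again until the actual page content is loaded
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h1')))
```
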
With BS4 it is easy to scrape information out of HTML. To get a `div` by its id, write `elem = soup.find('div', id='my_id')`. To find children (or children of children, etc.) of that element, write `children = elem.findAll('span')`.
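A tiny self-contained example of both calls:

```python
from bs4 import BeautifulSoup

html = "<div id='my_id'><span>first</span><span>second</span></div>"
soup = BeautifulSoup(html, 'html.parser')

elem = soup.find('div', id='my_id')              # the div with that id
children = elem.findAll('span')                  # all span descendants of it
print([child.get_text() for child in children])  # ['first', 'second']
```
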
## ChromeDriver
To use Selenium with Chrome, you need to download ChromeDriver (just search for it online; make sure its version matches your installed Chrome).
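If ChromeDriver is not on your `PATH`, you can point Selenium at the downloaded binary explicitly (Selenium 4 style; the path below is just an example):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

chrome_options = Options()
chrome_options.add_argument("--headless")  # run without a visible browser window

service = Service('/path/to/chromedriver')  # example location of the downloaded binary
driver = webdriver.Chrome(service=service, options=chrome_options)
```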

main.py

@@ -12,37 +12,57 @@ def pad_list(lst, length, pad_value='Unknown'):
 # Replace 'your_local_file.html' with the path to your local HTML file
-with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
+with open('/Users/sbzuc0a/Downloads/scrape_new/index.html', 'r', encoding='utf-8') as html_file:
     content = html_file.read()

 # Parse the HTML content
 soup = BeautifulSoup(content, 'html.parser')

-companies = strip(soup.find_all('h3', class_='media__body__head'))
-locations = strip(soup.find_all('h5'))
-places = strip(soup.find_all('span', class_='link-fix--text'))
-urls_T = soup.find_all('a', class_='media-module__link')
+companies = []
+urls = []
+
+# Find all the company-info divs
+companies_div = soup.find_all('div', class_='company-info')
+
+# Loop through each company and extract the information
+for company in companies_div:
+    link_tag = company.find('a')
+    if link_tag and link_tag['href']:
+        link = link_tag['href']
+        company_name = link_tag.find('h3')
+        # if company_name:
+        #     print(f"Company Name: {company_name.text.strip()}, Link: {link}")
+        companies.append(company_name.text.strip())
+        urls.append(link)
+
+# companies = strip(soup.find_all('h3', class_='company-info'))
+# urls = strip(soup.find_all('a', class_='company-info'))
+# places = strip(soup.find_all('span', class_='link-fix--text'))
+# urls_T = soup.find_all('a', class_='media-module__link')

 # companies = []
 # for company in companies_t:
 # companies.append(company.text.strip())
-urls = []
-for url in urls_T:
-    urls.append(url.get('href'))
+# urls = []
+# for url in urls_T:
+#     urls.append(url.get('href'))

 # print(companies)
-# print(locations)
+# print(urls)
 # print(places)
 # print(urls)
+
+for url in urls:
+    print(url)

 # print the length of the lists
 # print(len(companies))
 # print(len(locations))
 # print(len(places))
 # print(len(urls))

-locations = pad_list(locations, len(companies))
+# locations = pad_list(locations, len(companies))

 # # Find the data you want to scrape. For example, let's say you want to scrape a table:
 # tables = soup.find_all('table')
@@ -53,15 +73,15 @@ locations = pad_list(locations, len(companies))
 # # Use pandas to read the table
 # data_frame = pd.read_html(str(table))[0]

-df = pd.DataFrame({
-    'Company': companies,
-    'Location': locations,
-    'Place': places,
-    'URL': urls
-})
+# df = pd.DataFrame({
+#     'Company': companies,
+#     'Location': locations,
+#     'Place': places,
+#     'URL': urls
+# })

 # # Now you have the data in a pandas DataFrame, you can manipulate it as you wish
-print(df)
+# print(df)

 # # Optionally, save the data to a CSV file
-df.to_csv('output_bandage.csv', index=False)
+# df.to_csv('output_bandage.csv', index=False)

parse_urls.py Normal file

@@ -0,0 +1,130 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Set path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'
# Set up driver
driver = webdriver.Chrome(options=chrome_options)
names = []
addresses = []
links = []
emails = []
phones = []
urls = []
first_visit = True
# Function to parse a single URL
def parse_url(url):
    global first_visit
    # print("Parse " + url)

    # Send a request to the website
    # Load webpage
    driver.get(url)

    wait = WebDriverWait(driver, 10)
    if first_visit:
        accept_button = wait.until(method=EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        accept_button.click()
        first_visit = False

    accept_button = wait.until(method=EC.element_to_be_clickable((By.CSS_SELECTOR, 'h1')))

    # It's a good idea to let the script wait for a few seconds before scraping, especially for dynamic websites
    # driver.implicitly_wait(5)

    # Get page source and close the browser
    page_source = driver.page_source
    # print(page_source)

    # Check if the request was successful
    # if response.status_code == 200:
    #     print(response._content)

    # # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')

    name = soup.find('h1').get_text(strip=True)
    # print(name)
    names.append(name)

    address_spans = soup.find('div', id='exhibitor_details_address')
    if address_spans is None:
        addresses.append('-')
    else:
        address_spans = address_spans.findAll('span')
        if address_spans is None:
            addresses.append('-')
        else:
            address = ""
            for line in address_spans:
                # print(line.get_text())
                address = address + line.get_text() + '\n'
            if address.endswith('\n'):
                address = address[:-1]
            # print(address)
            addresses.append(address)

    link = soup.find('div', id='exhibitor_details_website')
    if link is None:
        links.append('-')
    else:
        link = link.find('a').get_text()
        links.append(link)
        # print(link)

    email = soup.find('div', id='exhibitor_details_email')
    if email is None:
        emails.append('-')
    else:
        email = email.find('a').get_text()
        emails.append(email)
        # print(email)

    phone = soup.find('div', id='exhibitor_details_phone')
    if phone is None:
        phones.append('-')
    else:
        phone = phone.find('a').get_text()
        phones.append(phone)
        # print(phone)

    urls.append(url)

# Path to the file containing URLs
file_path = './urls.txt'
# Read each line from the file and parse the URL
with open(file_path, 'r') as file:
    for line in file:
        url = line.strip()
        if url:
            parse_url(url)

driver.quit()
df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})
# Now you have the data in a pandas DataFrame, you can manipulate it as you wish
# print(df)
# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)
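`parse_urls.py` reads its input from `./urls.txt`, one URL per line; blank lines are skipped. A hypothetical example file:

```
https://example.com/exhibitors/company-a
https://example.com/exhibitors/company-b
```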

parse_website.py Normal file

@@ -0,0 +1,28 @@
from bs4 import BeautifulSoup
with open('./website.txt', 'r', encoding='utf-8') as file:
    file_contents = file.read()

soup = BeautifulSoup(file_contents, 'html.parser')

name = soup.find('h1').get_text(strip=True)
print(name)

address_spans = soup.find('div', id='exhibitor_details_address').findAll('span')
address = ""
for line in address_spans:
    # print(line.get_text())
    address = address + line.get_text() + '\n'
if address.endswith('\n'):
    address = address[:-1]
print(address)

link = soup.find('div', id='exhibitor_details_website').find('a').get_text()
print(link)

email = soup.find('div', id='exhibitor_details_email').find('a').get_text()
print(email)

phone = soup.find('div', id='exhibitor_details_phone').find('a').get_text()
print(phone)

requirements.txt

@@ -1,2 +1,4 @@
 beautifulsoup4==4.12.2
 pandas==2.1.2
+requests==2.31
+selenium==4.16