handle Cookie banner with Selenium
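
On the first page load, wait for the OneTrust cookie banner and click its
reject-all button before scraping; later pages in the same browser session
reuse that consent choice, so the banner only needs to be handled once.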
parse_urls.py (Normal file, 130 lines)
@@ -0,0 +1,130 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
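# The latter two flags are typically only needed when Chrome runs headless in
# a container or CI environment (no sandbox support, small /dev/shm); they are
# harmless on a desktop machine.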

# Set path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'

# Set up driver (Selenium 4.6+ resolves a matching chromedriver automatically)
driver = webdriver.Chrome(options=chrome_options)

# Parallel result lists, one entry per parsed URL
names = []
addresses = []
links = []
emails = []
phones = []
urls = []

# True until the cookie banner has been dismissed once for this browser session
first_visit = True
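# An alternative to the parallel lists (a sketch, not the original design)
# would be to collect one dict per page and build the DataFrame from those:
#
#   rows = []
#   # inside parse_url: rows.append({'Company': name, 'Location': address, 'URL': url})
#   # at the end:       df = pd.DataFrame(rows)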

# Parse a single exhibitor page and collect its details
def parse_url(url):
    global first_visit
    # Load the page
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # On the first visit, dismiss the OneTrust cookie banner by clicking its
    # "Reject All" button; the consent choice persists for the rest of the session
    if first_visit:
        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        reject_button.click()
        first_visit = False
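    # If some pages never show the banner, the wait above would raise
    # TimeoutException; a more defensive variant (a sketch, not the original
    # flow) would swallow that case:
    #
    #   from selenium.common.exceptions import TimeoutException
    #   try:
    #       wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler'))).click()
    #   except TimeoutException:
    #       pass  # no banner on this page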
    # Wait until the page heading has rendered before scraping
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h1')))

    # For dynamic pages, an extra wait before scraping can help
    # driver.implicitly_wait(5)

    # Get the rendered page source (the browser is only closed at the very end)
    page_source = driver.page_source

    # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')

    # Company name is the page heading
    name = soup.find('h1').get_text(strip=True)
    names.append(name)

    # Address lines are individual <span>s inside the address container;
    # find_all returns a (possibly empty) list, never None
    address_div = soup.find('div', id='exhibitor_details_address')
    if address_div is None:
        addresses.append('-')
    else:
        address_spans = address_div.find_all('span')
        if not address_spans:
            addresses.append('-')
        else:
            # Join the spans with newlines, without a trailing newline
            addresses.append('\n'.join(span.get_text() for span in address_spans))

    # Website, email, and phone each live in an <a> inside their detail <div>
    link = soup.find('div', id='exhibitor_details_website')
    if link is None:
        links.append('-')
    else:
        links.append(link.find('a').get_text())

    email = soup.find('div', id='exhibitor_details_email')
    if email is None:
        emails.append('-')
    else:
        emails.append(email.find('a').get_text())

    phone = soup.find('div', id='exhibitor_details_phone')
    if phone is None:
        phones.append('-')
    else:
        phones.append(phone.find('a').get_text())

    urls.append(url)
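# Note: the three detail lookups above share one shape; a small helper (a
# sketch, not part of the original script) would remove the repetition and
# also survive a detail <div> that lacks an <a>:
#
#   def detail_text(soup, div_id):
#       div = soup.find('div', id=div_id)
#       a = div.find('a') if div else None
#       return a.get_text() if a else '-'
#
#   links.append(detail_text(soup, 'exhibitor_details_website'))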

# Path to the file containing URLs (one per line; blank lines are skipped)
file_path = './urls.txt'

# Read each line from the file and parse the URL; quit the browser even if a
# page fails to parse
try:
    with open(file_path, 'r') as file:
        for line in file:
            url = line.strip()
            if url:
                parse_url(url)
finally:
    driver.quit()

# Collect the scraped columns into a DataFrame
df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})

# The data is now in a pandas DataFrame and can be manipulated as needed
# print(df)

# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)
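# Expected input: ./urls.txt with one exhibitor-page URL per line. Output:
# output_test.csv with the columns Company, Location, Website, Email, Phone, URL.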