web_scraper/parse_urls.py

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Set path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'
# Set up driver
driver = webdriver.Chrome(options=chrome_options)
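# A minimal alternative sketch (assumes Selenium 4+): to use a specific chromedriver
# binary instead of the one Selenium resolves automatically, pass a Service object:
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)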
# Accumulators: one entry per parsed URL, combined into a DataFrame at the end
names = []
addresses = []
links = []
emails = []
phones = []
urls = []
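# first_visit tracks whether the OneTrust cookie banner still needs to be dismissed;
# the script assumes the banner only appears on the first page load of the session.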
first_visit = True
# Function to parse a single URL
def parse_url(url):
    global first_visit
    # Load the page in the headless browser
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    if first_visit:
        # Dismiss the OneTrust cookie banner ("Reject All") the first time a page is loaded
        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        reject_button.click()
        first_visit = False
    # Wait for the exhibitor name heading so dynamically rendered content is available
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))
    # Grab the fully rendered HTML from the driver
    page_source = driver.page_source
    # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')
    # Company name is the page's <h1> heading
    name = soup.find('h1').get_text(strip=True)
    names.append(name)
    # Address is split across several <span> elements, one per line
    address_div = soup.find('div', id='exhibitor_details_address')
    if address_div is None:
        addresses.append('-')
    else:
        address_spans = address_div.find_all('span')
        if not address_spans:  # find_all returns an empty list, never None
            addresses.append('-')
        else:
            address = '\n'.join(span.get_text() for span in address_spans)
            addresses.append(address)
    # Website, email and phone each sit in their own detail <div>, wrapped in an <a>
    link = soup.find('div', id='exhibitor_details_website')
    if link is None:
        links.append('-')
    else:
        links.append(link.find('a').get_text())
    email = soup.find('div', id='exhibitor_details_email')
    if email is None:
        emails.append('-')
    else:
        emails.append(email.find('a').get_text())
    phone = soup.find('div', id='exhibitor_details_phone')
    if phone is None:
        phones.append('-')
    else:
        phones.append(phone.find('a').get_text())
    # Record the source URL so each row can be traced back to its page
    urls.append(url)
# Path to the file containing URLs
file_path = './urls.txt'
# Read each line from the file and parse the URL
with open(file_path, 'r') as file:
    for line in file:
        url = line.strip()
        if url:
            parse_url(url)
driver.quit()
df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})
# The data is now in a pandas DataFrame and can be manipulated as needed
# print(df)
# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)
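# Usage sketch (assumptions: Chrome plus a compatible chromedriver are installed, and
# ./urls.txt exists with one exhibitor URL per line):
#   python web_scraper/parse_urls.py
# The scraped rows end up in output_test.csv in the working directory.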