from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without a GUI
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Set the path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'

# Set up the driver
driver = webdriver.Chrome(options=chrome_options)

names = []
addresses = []
links = []
emails = []
phones = []
urls = []

# The cookie banner only appears on the first page load
first_visit = True


def extract_detail(soup, div_id):
    """Return the text of the first <a> inside the given detail div, or '-' if absent."""
    container = soup.find('div', id=div_id)
    if container is None:
        return '-'
    anchor = container.find('a')
    if anchor is None:
        return '-'
    return anchor.get_text(strip=True)


# Parse a single exhibitor detail page
def parse_url(url):
    global first_visit

    # Load the page in the browser so JavaScript-rendered content is available
    driver.get(url)

    wait = WebDriverWait(driver, 10)
    if first_visit:
        # Dismiss the OneTrust cookie banner by rejecting all cookies
        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        reject_button.click()
        first_visit = False

    # Wait until the page heading has rendered before scraping
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))

    # Parse the rendered HTML
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # The company name is the page heading
    name = soup.find('h1').get_text(strip=True)
    names.append(name)

    # The address is split across several <span> elements; join them line by line
    address_div = soup.find('div', id='exhibitor_details_address')
    if address_div is None:
        addresses.append('-')
    else:
        address_spans = address_div.find_all('span')
        if not address_spans:
            addresses.append('-')
        else:
            address = '\n'.join(span.get_text(strip=True) for span in address_spans)
            addresses.append(address)

    # Website, email, and phone each live in an <a> inside their own detail div
    links.append(extract_detail(soup, 'exhibitor_details_website'))
    emails.append(extract_detail(soup, 'exhibitor_details_email'))
    phones.append(extract_detail(soup, 'exhibitor_details_phone'))

    urls.append(url)


# Path to the file containing the URLs to scrape, one per line
file_path = './urls.txt'

# Read each line from the file and parse the URL
with open(file_path, 'r') as file:
    for line in file:
        url = line.strip()
        if url:
            parse_url(url)

driver.quit()

df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})

# The data is now in a pandas DataFrame; manipulate it as you wish
# print(df)

# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)