web_scraper/parse_urls.py

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Set path to chromedriver as per your installation
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'
# Set up driver
driver = webdriver.Chrome(options=chrome_options)
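# A minimal alternative sketch (assumes Selenium 4+): to use a specific chromedriver
# binary instead of the one Selenium resolves automatically, pass a Service object:
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)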
# Accumulators: one entry per parsed URL, combined into a DataFrame at the end
names = []
addresses = []
links = []
emails = []
phones = []
urls = []
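# first_visit tracks whether the OneTrust cookie banner still needs to be dismissed;
# the script assumes the banner only appears on the first page load of the session.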
first_visit = True
# Function to parse a single URL
def parse_url(url):
    global first_visit
    # Load the page in the headless browser
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    if first_visit:
        # Dismiss the OneTrust cookie banner ("Reject All") the first time a page is loaded
        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        reject_button.click()
        first_visit = False
    # Wait for the exhibitor name heading so dynamically rendered content is available
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))
    # Grab the fully rendered HTML from the driver
    page_source = driver.page_source
    # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')
    # Company name is the page's <h1> heading
    name = soup.find('h1').get_text(strip=True)
    names.append(name)
    # Address is split across several <span> elements, one per line
    address_div = soup.find('div', id='exhibitor_details_address')
    if address_div is None:
        addresses.append('-')
    else:
        address_spans = address_div.find_all('span')
        if not address_spans:  # find_all returns an empty list, never None
            addresses.append('-')
        else:
            address = '\n'.join(span.get_text() for span in address_spans)
            addresses.append(address)
    # Website, email and phone each sit in their own detail <div>, wrapped in an <a>
    link = soup.find('div', id='exhibitor_details_website')
    if link is None:
        links.append('-')
    else:
        links.append(link.find('a').get_text())
    email = soup.find('div', id='exhibitor_details_email')
    if email is None:
        emails.append('-')
    else:
        emails.append(email.find('a').get_text())
    phone = soup.find('div', id='exhibitor_details_phone')
    if phone is None:
        phones.append('-')
    else:
        phones.append(phone.find('a').get_text())
    # Record the source URL so each row can be traced back to its page
    urls.append(url)
# Path to the file containing URLs
file_path = './urls.txt'
# Read each line from the file and parse the URL
with open(file_path, 'r') as file:
    for line in file:
        url = line.strip()
        if url:
            parse_url(url)
driver.quit()
df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})
# The data is now in a pandas DataFrame and can be manipulated as needed
# print(df)
# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)
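# Usage sketch (assumptions: Chrome plus a compatible chromedriver are installed, and
# ./urls.txt exists with one exhibitor URL per line):
#   python web_scraper/parse_urls.py
# The scraped rows end up in output_test.csv in the working directory.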