from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")  # Needed when Chrome runs as root, e.g. in containers
chrome_options.add_argument("--disable-dev-shm-usage")  # Avoid crashes from a small /dev/shm in containers
# Set the path to chromedriver to match your installation.
# With Selenium >= 4.6, Selenium Manager resolves a matching driver
# automatically, so the explicit path is only a fallback.
# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'
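
# A minimal sketch of wiring that path in explicitly, assuming Selenium 4's
# Service API (kept commented out; the path above is machine-specific):
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)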

# Set up the driver
driver = webdriver.Chrome(options=chrome_options)

names = []
addresses = []
links = []
emails = []
phones = []
urls = []

first_visit = True

# Parse a single exhibitor detail page and append its fields to the lists above
def parse_url(url):
    global first_visit

    # Load the webpage
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # The OneTrust cookie banner only appears on the first page load;
    # dismiss it once by clicking "Reject All"
    if first_visit:
        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
        reject_button.click()
        first_visit = False
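
    # A more tolerant variant, assuming the banner may not always appear,
    # would swallow the timeout instead of failing (sketch, not wired in):
    # from selenium.common.exceptions import TimeoutException
    # try:
    #     wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler'))).click()
    # except TimeoutException:
    #     pass  # no banner on this page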

    # Wait until the page heading has rendered before reading the page source
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))

    # For dynamic pages it can help to give the script a few extra seconds before scraping
    # driver.implicitly_wait(5)

    # Grab the rendered page source (the browser is closed once, at the end of the run)
    page_source = driver.page_source

    # Parse the HTML content
    soup = BeautifulSoup(page_source, 'html.parser')
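
    # Note: 'html.parser' is the stdlib parser; BeautifulSoup(page_source, 'lxml')
    # is a faster drop-in if the optional lxml package is installed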

    name = soup.find('h1').get_text(strip=True)
    names.append(name)

    address_div = soup.find('div', id='exhibitor_details_address')
    if address_div is None:
        addresses.append('-')
    else:
        address_spans = address_div.find_all('span')
        if not address_spans:
            # The div exists but holds no <span> lines
            addresses.append('-')
        else:
            # Join the address lines with newlines, without a trailing one
            address = '\n'.join(span.get_text() for span in address_spans)
            addresses.append(address)

    link_div = soup.find('div', id='exhibitor_details_website')
    if link_div is None:
        links.append('-')
    else:
        links.append(link_div.find('a').get_text())

    email_div = soup.find('div', id='exhibitor_details_email')
    if email_div is None:
        emails.append('-')
    else:
        emails.append(email_div.find('a').get_text())

    phone_div = soup.find('div', id='exhibitor_details_phone')
    if phone_div is None:
        phones.append('-')
    else:
        phones.append(phone_div.find('a').get_text())
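
    # The three lookups above share one shape; a hypothetical helper could
    # collapse them (sketch, not wired in):
    # def detail_text(soup, div_id):
    #     div = soup.find('div', id=div_id)
    #     anchor = div.find('a') if div is not None else None
    #     return anchor.get_text() if anchor is not None else '-'
    # links.append(detail_text(soup, 'exhibitor_details_website'))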

    urls.append(url)


# Path to the file containing the URLs, one per line
file_path = './urls.txt'
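
# urls.txt is expected to look like this (hypothetical example URLs):
# https://www.example.com/exhibitors/acme-corp
# https://www.example.com/exhibitors/globex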

# Read each line from the file and parse the URL
with open(file_path, 'r') as file:
    for line in file:
        url = line.strip()
        if url:
            parse_url(url)

driver.quit()
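
# A sketch of a more defensive loop, assuming any page may fail to load,
# so the browser still closes on an error:
# try:
#     with open(file_path, 'r') as file:
#         for line in file:
#             url = line.strip()
#             if url:
#                 parse_url(url)
# finally:
#     driver.quit()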

df = pd.DataFrame({
    'Company': names,
    'Location': addresses,
    'Website': links,
    'Email': emails,
    'Phone': phones,
    'URL': urls,
})
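
# One optional post-processing step, assuming urls.txt may list the same
# exhibitor more than once (sketch):
# df = df.drop_duplicates(subset='Company')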

# The data is now in a pandas DataFrame and can be manipulated as needed
# print(df)

# Optionally, save the data to a CSV file
df.to_csv('output_test.csv', index=False)