diff --git a/README.md b/README.md
new file mode 100644
index 0000000..57ce3cd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# Web Scraper
+
+Simple web scraping with Beautiful Soup 4 (BS4) and Selenium (headless browser).
+
+Cookie banners can be handled; see `parse_urls.py`: wait until the banner's button has loaded, click it, then wait again until the page content has loaded. This is only necessary for the first URL; for the following URLs the cookie preference is already set.
+
+BS4 makes it easy to extract information from HTML. To get a `div` by its id, write `elem = soup.find('div', id='my_id')`. To find descendants (children, children of children, etc.) of that element, write `children = elem.findAll('span')`.
+
+## Chromedriver
+To use Selenium with Chrome, you need to download the ChromeDriver that matches your installed Chrome version.
diff --git a/main.py b/main.py
index a3c91a5..4692f71 100644
--- a/main.py
+++ b/main.py
@@ -12,37 +12,57 @@ def pad_list(lst, length, pad_value='Unknown'):
 
 
 # Replace 'your_local_file.html' with the path to your local HTML file
-with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
+with open('/Users/sbzuc0a/Downloads/scrape_new/index.html', 'r', encoding='utf-8') as html_file:
     content = html_file.read()
 
 # Parse the HTML content
 soup = BeautifulSoup(content, 'html.parser')
 
-companies = strip(soup.find_all('h3', class_='media__body__head'))
-locations = strip(soup.find_all('h5'))
-places = strip(soup.find_all('span', class_='link-fix--text'))
-urls_T = soup.find_all('a', class_='media-module__link')
+companies = []
+urls = []
+
+# Find all the company-info divs
+companies_div = soup.find_all('div', class_='company-info')
+
+# Loop through each company and extract the information
+for company in companies_div:
+    link_tag = company.find('a')
+    if link_tag and link_tag.get('href'):
+        link = link_tag.get('href')
+        company_name = link_tag.find('h3')
+        # if company_name:
+        #     print(f"Company Name: {company_name.text.strip()}, Link: {link}")
+        companies.append(company_name.text.strip())
+        urls.append(link)
+
+# companies = strip(soup.find_all('h3', class_='company-info'))
+# urls = strip(soup.find_all('a', class_='company-info'))
+# places = strip(soup.find_all('span', class_='link-fix--text'))
+# urls_T = soup.find_all('a', class_='media-module__link')
 
 # companies = []
 # for company in companies_t:
 #     companies.append(company.text.strip())
 
-urls = []
-for url in urls_T:
-    urls.append(url.get('href'))
+# urls = []
+# for url in urls_T:
+#     urls.append(url.get('href'))
 
 # print(companies)
-# print(locations)
+# print(urls)
 # print(places)
 # print(urls)
 
+for url in urls:
+    print(url)
+
 # print the length of the lists
 # print(len(companies))
 # print(len(locations))
 # print(len(places))
 # print(len(urls))
 
-locations = pad_list(locations, len(companies))
+# locations = pad_list(locations, len(companies))
 
 # # Find the data you want to scrape. For example, let's say you want to scrape a table:
 # tables = soup.find_all('table')
@@ -53,15 +73,15 @@ locations = pad_list(locations, len(companies))
 # # Use pandas to read the table
 # data_frame = pd.read_html(str(table))[0]
 
-df = pd.DataFrame({
-    'Company': companies,
-    'Location': locations,
-    'Place': places,
-    'URL': urls
-})
+# df = pd.DataFrame({
+#     'Company': companies,
+#     'Location': locations,
+#     'Place': places,
+#     'URL': urls
+# })
 
 # # Now you have the data in a pandas DataFrame, you can manipulate it as you wish
-print(df)
+# print(df)
 
 # # Optionally, save the data to a CSV file
-df.to_csv('output_bandage.csv', index=False)
+# df.to_csv('output_bandage.csv', index=False)
diff --git a/parse_urls.py b/parse_urls.py
new file mode 100644
index 0000000..568e25c
--- /dev/null
+++ b/parse_urls.py
@@ -0,0 +1,130 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import pandas as pd
+
+# Set up Chrome options
+chrome_options = Options()
+chrome_options.add_argument("--headless")  # Ensure GUI is off
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--disable-dev-shm-usage")
+
+# Set path to chromedriver as per your installation
+# chrome_driver_path = '/Users/sbzuc0a/Downloads/chromedriver_mac_arm64/chromedriver'
+
+# Set up driver
+driver = webdriver.Chrome(options=chrome_options)
+
+names = []
+addresses = []
+links = []
+emails = []
+phones = []
+urls = []
+
+first_visit = True
+
+# Function to parse a single URL
+def parse_url(url):
+    global first_visit
+    # print("Parse " + url)
+    # Send a request to the website
+    # Load webpage
+    driver.get(url)
+    wait = WebDriverWait(driver, 10)
+    if first_visit:
+        reject_button = wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler')))
+        reject_button.click()
+        first_visit = False
+    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))  # wait until the page content has loaded
+
+    # It's a good idea to let the script wait for a few seconds before scraping, especially for dynamic websites
+    # driver.implicitly_wait(5)
+
+    # Get the rendered page source
+    page_source = driver.page_source
+    # print(page_source)
+
+    # Check if the request was successful
+    # if response.status_code == 200:
+    #     print(response._content)
+    # # Parse the HTML content
+    soup = BeautifulSoup(page_source, 'html.parser')
+
+    name = soup.find('h1').get_text(strip=True)
+    # print(name)
+    names.append(name)
+
+    address_spans = soup.find('div', id='exhibitor_details_address')
+    if address_spans is None:
+        addresses.append('-')
+    else:
+        address_spans = address_spans.findAll('span')
+        if not address_spans:
+            addresses.append('-')
+        else:
+            address = ""
+            for line in address_spans:
+                # print(line.get_text())
+                address = address + line.get_text() + '\n'
+            if address.endswith('\n'):
+                address = address[:-1]
+            # print(address)
+            addresses.append(address)
+
+
+    link = soup.find('div', id='exhibitor_details_website')
+    if link is None:
+        links.append('-')
+    else:
+        link = link.find('a').get_text()
+        links.append(link)
+        # print(link)
+
+    email = soup.find('div', id='exhibitor_details_email')
+    if email is None:
+        emails.append('-')
+    else:
+        email = email.find('a').get_text()
+        emails.append(email)
+        # print(email)
+
+    phone = soup.find('div', id='exhibitor_details_phone')
+    if phone is None:
+        phones.append('-')
+    else:
+        phone = phone.find('a').get_text()
+        phones.append(phone)
+        # print(phone)
+
+    urls.append(url)
+
+# Path to the file containing URLs
+file_path = './urls.txt'
+
+# Read each line from the file and parse the URL
+with open(file_path, 'r') as file:
+    for line in file:
+        url = line.strip()
+        if url:
+            parse_url(url)
+
+driver.quit()
+
+df = pd.DataFrame({
+    'Company': names,
+    'Location': addresses,
+    'Website': links,
+    'Email': emails,
+    'Phone': phones,
+    'URL': urls,
+})
+
+# Now you have the data in a pandas DataFrame, you can manipulate it as you wish
+# print(df)
+
+# Optionally, save the data to a CSV file
+df.to_csv('output_test.csv', index=False)
\ No newline at end of file
diff --git a/parse_website.py b/parse_website.py
new file mode 100644
index 0000000..b903df2
--- /dev/null
+++ b/parse_website.py
@@ -0,0 +1,28 @@
+from bs4 import BeautifulSoup
+
+with open('./website.txt', 'r', encoding='utf-8') as file:
+    file_contents = file.read()
+
+soup = BeautifulSoup(file_contents, 'html.parser')
+
+name = soup.find('h1').get_text(strip=True)
+print(name)
+
+address_spans = soup.find('div', id='exhibitor_details_address').findAll('span')
+address = ""
+for line in address_spans:
+    # print(line.get_text())
+    address = address + line.get_text() + '\n'
+if address.endswith('\n'):
+    address = address[:-1]
+print(address)
+
+
+link = soup.find('div', id='exhibitor_details_website').find('a').get_text()
+print(link)
+
+email = soup.find('div', id='exhibitor_details_email').find('a').get_text()
+print(email)
+
+phone = soup.find('div', id='exhibitor_details_phone').find('a').get_text()
+print(phone)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f01ee25..605d4b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 beautifulsoup4==4.12.2
 pandas==2.1.2
+requests==2.31.0
+selenium==4.16.0
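
Below is a minimal, self-contained sketch of the cookie-banner handling described in `README.md`. The `onetrust-reject-all-handler` button id and the explicit-wait pattern are taken from `parse_urls.py`; the URL is a placeholder and assumes the target page actually shows a OneTrust banner.

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

driver.get("https://www.example.com")  # placeholder URL; assumes a OneTrust cookie banner is shown
wait = WebDriverWait(driver, 10)

# Wait for the banner button to become clickable, dismiss it,
# then wait until the page content (the h1 heading) has loaded.
banner_button = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler")))
banner_button.click()
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1")))

print(driver.page_source[:500])  # the rendered HTML is now ready for BeautifulSoup
driver.quit()
```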
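
The BS4 lookups mentioned in `README.md` (`find` by id, then `findAll` on the returned element) as a runnable snippet; the HTML string is made up purely for illustration.

```python
from bs4 import BeautifulSoup

# Made-up HTML, just to illustrate the README example.
html = '<div id="my_id"><span>Alpha GmbH</span><span>Berlin</span></div>'
soup = BeautifulSoup(html, 'html.parser')

elem = soup.find('div', id='my_id')    # the div with the given id
children = elem.findAll('span')        # all span descendants of that div
print([span.get_text() for span in children])  # ['Alpha GmbH', 'Berlin']
```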