web_scraper/main.py

from bs4 import BeautifulSoup
import pandas as pd


def strip(tags):
    # Collect the stripped text content of each tag in an iterable of tags
    strings = []
    for s in tags:
        strings.append(s.text.strip())
    return strings


def pad_list(lst, length, pad_value='Unknown'):
    # Pad a list with pad_value until it reaches the given length
    return lst + [pad_value] * (length - len(lst))
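
# A hypothetical usage sketch of pad_list, padding a short list so it lines
# up with a longer one (example values assumed, not from the scraped page):
# pad_list(['Berlin', 'Munich'], 4)  ->  ['Berlin', 'Munich', 'Unknown', 'Unknown']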

# Replace the path below with the path to your local HTML file
with open('/Users/sbzuc0a/Downloads/scrape_new/index.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()

# Parse the HTML content
soup = BeautifulSoup(content, 'html.parser')

companies = []
urls = []

# Find all the company-info divs
companies_div = soup.find_all('div', class_='company-info')
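
# The selectors above and the loop below assume markup roughly like this
# (a hypothetical example inferred from the selectors, not the actual page):
# <div class="company-info">
#     <a href="https://example.com"><h3>Example Corp</h3></a>
# </div>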

# Loop through each company and extract the name and link
for company in companies_div:
    link_tag = company.find('a')
    # .get() avoids a KeyError when the anchor has no href attribute
    if link_tag and link_tag.get('href'):
        link = link_tag['href']
        company_name = link_tag.find('h3')
        # Only append when an <h3> was found, so the two lists stay aligned
        if company_name:
            # print(f"Company Name: {company_name.text.strip()}, Link: {link}")
            companies.append(company_name.text.strip())
            urls.append(link)

# Earlier extraction attempts, kept for reference:
# companies = strip(soup.find_all('h3', class_='company-info'))
# urls = strip(soup.find_all('a', class_='company-info'))
# places = strip(soup.find_all('span', class_='link-fix--text'))

# urls_T = soup.find_all('a', class_='media-module__link')
# companies = []
# for company in companies_t:
#     companies.append(company.text.strip())
# urls = []
# for url in urls_T:
#     urls.append(url.get('href'))

# print(companies)
# print(urls)
# print(places)

# Print the extracted URLs
for url in urls:
    print(url)
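
# If the extracted hrefs turn out to be relative, they could be resolved
# against the page's base URL first; a hedged sketch, assuming a hypothetical
# base URL of 'https://example.com/':
# from urllib.parse import urljoin
# urls = [urljoin('https://example.com/', u) for u in urls]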

# Print the length of the lists as a sanity check
# print(len(companies))
# print(len(locations))
# print(len(places))
# print(len(urls))

# locations = pad_list(locations, len(companies))

# Find the data you want to scrape. For example, to scrape the first table:
# tables = soup.find_all('table')
# table = tables[0]
# Use pandas to read the table
# data_frame = pd.read_html(str(table))[0]

# df = pd.DataFrame({
#     'Company': companies,
#     'Location': locations,
#     'Place': places,
#     'URL': urls,
# })
# Now the data is in a pandas DataFrame and can be manipulated as needed
# print(df)
# Optionally, save the data to a CSV file
# df.to_csv('output_bandage.csv', index=False)
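
# A minimal working version of the commented-out sketch above, using only the
# two lists this script actually collects; the filename 'companies.csv' is an
# arbitrary choice, not taken from the original code:
df = pd.DataFrame({'Company': companies, 'URL': urls})
print(df)
df.to_csv('companies.csv', index=False)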