"""Scrape company names and URLs from a saved HTML page.

Reads a local HTML file, finds every ``div.company-info`` element, and
prints the ``href`` of each company link found inside it.
"""

from bs4 import BeautifulSoup  # third-party HTML parser
import pandas as pd  # kept for the planned DataFrame/CSV export (see TODO in main)

# Default location of the saved page; override by passing a path to main().
DEFAULT_HTML_PATH = '/Users/sbzuc0a/Downloads/scrape_new/index.html'


def strip(string):
    """Return the whitespace-stripped text of each tag in *string*.

    *string* is an iterable of bs4 Tag objects (e.g. a find_all result).
    """
    return [s.text.strip() for s in string]


def pad_list(lst, length, pad_value='Unknown'):
    """Right-pad *lst* with *pad_value* until it has *length* items.

    If *lst* already has *length* or more items it is returned unchanged
    (the multiplier is zero or negative, so nothing is appended).
    """
    return lst + [pad_value] * (length - len(lst))


def _extract_companies(soup):
    """Return parallel (companies, urls) lists from every ``div.company-info``.

    Entries are skipped when the link has no href or no <h3> company name,
    so the two lists always stay the same length.
    """
    companies = []
    urls = []
    for company in soup.find_all('div', class_='company-info'):
        link_tag = company.find('a')
        # .get() avoids a KeyError when the <a> tag has no href attribute;
        # the original indexed link_tag['href'] directly and could crash.
        if not link_tag or not link_tag.get('href'):
            continue
        company_name = link_tag.find('h3')
        if company_name is None:
            # Bug fix: the original dereferenced company_name.text without
            # this guard and crashed on links lacking an <h3> child.
            continue
        companies.append(company_name.text.strip())
        urls.append(link_tag['href'])
    return companies, urls


def main(html_path=DEFAULT_HTML_PATH):
    """Parse *html_path* and print every scraped company URL."""
    with open(html_path, 'r', encoding='utf-8') as html_file:
        content = html_file.read()

    soup = BeautifulSoup(content, 'html.parser')
    companies, urls = _extract_companies(soup)

    for url in urls:
        print(url)

    # TODO(review): once location/place scraping is re-enabled, build the
    # export with something like:
    #   df = pd.DataFrame({'Company': companies, 'URL': urls})
    #   df.to_csv('output_bandage.csv', index=False)


if __name__ == '__main__':
    main()