"""Scrape company entries from a locally saved HTML page into a CSV file.

Reconstructed from the patch that added ``main.py`` in commit
24b64ec24891eb085fa6ec4009f6ef5dc4e96ffb (Andre Heber, Sun Jan 14
19:53:13 2024 +0100, "ready for medica").  The same commit added a
``requirements.txt`` pinning::

    beautifulsoup4==4.12.2
    pandas==2.1.2
"""

import sys

import pandas as pd

# Defaults reproduce the paths that were hard-coded in the original script.
DEFAULT_HTML_PATH = '/Users/sbzuc0a/Downloads/Scrape/index.html'
DEFAULT_CSV_PATH = 'output_bandage.csv'


def strip(string):
    """Return the whitespace-stripped ``.text`` of every item in *string*.

    Despite its name, *string* is an iterable of objects exposing a ``text``
    attribute (in this script: the tags returned by ``soup.find_all``).
    """
    return [s.text.strip() for s in string]


def pad_list(lst, length, pad_value='Unknown'):
    """Return a new list: *lst* followed by *pad_value* up to *length* items.

    *lst* is not modified.  When *lst* already holds *length* or more items,
    an unchanged copy is returned; nothing is truncated.
    """
    return lst + [pad_value] * (length - len(lst))


def extract_columns(soup):
    """Collect the four output columns from a parsed page.

    Args:
        soup: A parsed document offering ``find_all`` (a ``BeautifulSoup``
            object in this script).

    Returns:
        A dict mapping column name to a list of values, in output order.
        ``URL`` entries are ``None`` for links without an ``href``.
    """
    links = soup.find_all('a', class_='media-module__link')
    return {
        'Company': strip(soup.find_all('h3', class_='media__body__head')),
        'Location': strip(soup.find_all('h5')),
        'Place': strip(soup.find_all('span', class_='link-fix--text')),
        'URL': [link.get('href') for link in links],
    }


def build_frame(columns, pad_value='Unknown'):
    """Build a DataFrame from *columns*, padding short columns.

    ``pd.DataFrame`` rejects columns of unequal length, so every column is
    padded with *pad_value* to the length of the longest one.  The original
    script padded only ``Location``, and crashed when any other column was
    shorter.

    NOTE(review): padding is appended at the end, so if an entry in the
    middle of the page lacks a field the rows below it will be shifted.
    Confirm against the page structure whether per-entry extraction is
    needed.
    """
    longest = max((len(values) for values in columns.values()), default=0)
    return pd.DataFrame({
        name: pad_list(list(values), longest, pad_value)
        for name, values in columns.items()
    })


def main(html_path=DEFAULT_HTML_PATH, csv_path=DEFAULT_CSV_PATH):
    """Parse *html_path*, print the resulting table and save it to *csv_path*.

    Returns:
        The DataFrame that was printed and written.
    """
    # Imported here so the helpers above can be imported without bs4.
    from bs4 import BeautifulSoup

    with open(html_path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'html.parser')

    df = build_frame(extract_columns(soup))
    print(df)
    df.to_csv(csv_path, index=False)
    return df


if __name__ == '__main__':
    # Optional overrides: main.py [HTML_PATH [CSV_PATH]]
    main(*sys.argv[1:3])