ready for medica
main.py (Normal file, 67 lines)
@@ -0,0 +1,67 @@
from bs4 import BeautifulSoup
import pandas as pd


def strip(string):
    # Return the whitespace-stripped text of every element in a bs4 result set.
    strings = []
    for s in string:
        strings.append(s.text.strip())
    return strings


def pad_list(lst, length, pad_value='Unknown'):
    # Right-pad lst with pad_value until it is `length` items long.
    return lst + [pad_value] * (length - len(lst))
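# Illustrative usage of the helpers above (values are made up for the example):
# pad_list(['Dallas'], 3) -> ['Dallas', 'Unknown', 'Unknown']
# strip(<result of soup.find_all(...)>) -> list of each tag's stripped text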
# Replace 'your_local_file.html' with the path to your local HTML file
with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()

# Parse the HTML content
soup = BeautifulSoup(content, 'html.parser')

# Pull each field out of the page by tag and class
companies = strip(soup.find_all('h3', class_='media__body__head'))
locations = strip(soup.find_all('h5'))
places = strip(soup.find_all('span', class_='link-fix--text'))
urls_T = soup.find_all('a', class_='media-module__link')
# companies = []
# for company in companies_t:
#     companies.append(company.text.strip())

urls = []
for url in urls_T:
    urls.append(url.get('href'))
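# Note (not part of the committed script): url.get('href') returns None when the
# attribute is missing, and the scraped hrefs may be relative. A defensive
# variant could look like this (the base URL is a placeholder assumption):
# from urllib.parse import urljoin
# urls = [urljoin('https://example.com/', a.get('href') or '') for a in urls_T]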
# print(companies)
# print(locations)
# print(places)
# print(urls)

# print the length of the lists
# print(len(companies))
# print(len(locations))
# print(len(places))
# print(len(urls))
# Pad locations so every column fed to the DataFrame below has the same length
locations = pad_list(locations, len(companies))
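# If places or urls could also come up short, padding them the same way would
# keep the DataFrame constructor below from raising on unequal lengths (a
# sketch under that assumption; the committed script only pads locations):
# places = pad_list(places, len(companies))
# urls = pad_list(urls, len(companies))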
# # Find the data you want to scrape. For example, let's say you want to scrape a table:
# tables = soup.find_all('table')

# # Assuming you want to scrape the first table
# table = tables[0]

# # Use pandas to read the table
# data_frame = pd.read_html(str(table))[0]
# Collect the scraped columns into a single DataFrame
df = pd.DataFrame({
    'Company': companies,
    'Location': locations,
    'Place': places,
    'URL': urls
})

# Now you have the data in a pandas DataFrame, you can manipulate it as you wish
print(df)

# Optionally, save the data to a CSV file
df.to_csv('output_bandage.csv', index=False)
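# Possible follow-up (a sketch, not in the committed script): drop rows with no
# URL before saving, since entries without a link are not useful downstream:
# df.dropna(subset=['URL']).to_csv('output_bandage.csv', index=False)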
requirements.txt (Normal file, 2 lines)
@@ -0,0 +1,2 @@
beautifulsoup4==4.12.2
pandas==2.1.2