ready for medical
This commit is contained in:
67
main.py
Normal file
67
main.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def strip(string):
    """Return the whitespace-stripped text of each element in *string*.

    Parameters
    ----------
    string : iterable
        Iterable of objects exposing a ``.text`` attribute (e.g. the
        BeautifulSoup ``Tag`` objects returned by ``soup.find_all``).

    Returns
    -------
    list[str]
        ``element.text.strip()`` for each element, in input order.
    """
    # A comprehension replaces the manual append loop — same order, same
    # result, one line.
    return [s.text.strip() for s in string]
|
||||||
|
|
||||||
|
def pad_list(lst, length, pad_value='Unknown'):
    """Return a new list of at least *length* items based on *lst*.

    Missing positions are filled with *pad_value*.  *lst* itself is never
    modified; if it already has *length* or more items, a plain copy is
    returned unchanged.
    """
    padded = list(lst)
    # A negative repeat count yields an empty list, so nothing is added
    # when the input is already long enough.
    padded.extend([pad_value] * (length - len(padded)))
    return padded
|
||||||
|
|
||||||
|
|
||||||
|
# Replace the path below with the path to your local HTML file.
with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
    content = html_file.read()

# Parse the HTML content.
soup = BeautifulSoup(content, 'html.parser')

# Extract the visible text of each field of interest.
companies = strip(soup.find_all('h3', class_='media__body__head'))
locations = strip(soup.find_all('h5'))
places = strip(soup.find_all('span', class_='link-fix--text'))

# URLs come from the anchor's href attribute, not its text content.
urls = [a.get('href') for a in soup.find_all('a', class_='media-module__link')]

# pandas requires every column to have the same length.  The original code
# padded only `locations` to len(companies), so a page where `places` or
# `urls` came up short raised ValueError from the DataFrame constructor.
# Pad every list to the longest one instead.
longest = max(len(companies), len(locations), len(places), len(urls))
companies = pad_list(companies, longest)
locations = pad_list(locations, longest)
places = pad_list(places, longest)
urls = pad_list(urls, longest)

df = pd.DataFrame({
    'Company': companies,
    'Location': locations,
    'Place': places,
    'URL': urls,
})

# Show the scraped table, then persist it as CSV.
print(df)

df.to_csv('output_bandage.csv', index=False)
|
||||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
beautifulsoup4==4.12.2
|
||||||
|
pandas==2.1.2
|
||||||
Reference in New Issue
Block a user