handle Cookie banner with Selenium
main.py (56 lines changed)
@@ -12,37 +12,57 @@ def pad_list(lst, length, pad_value='Unknown'):
 # Replace 'your_local_file.html' with the path to your local HTML file
-with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
+with open('/Users/sbzuc0a/Downloads/scrape_new/index.html', 'r', encoding='utf-8') as html_file:
     content = html_file.read()
 
 # Parse the HTML content
 soup = BeautifulSoup(content, 'html.parser')
 
-companies = strip(soup.find_all('h3', class_='media__body__head'))
 locations = strip(soup.find_all('h5'))
 places = strip(soup.find_all('span', class_='link-fix--text'))
 urls_T = soup.find_all('a', class_='media-module__link')
+companies = []
+urls = []
+
+# Find all the company-info divs
+companies_div = soup.find_all('div', class_='company-info')
+
+# Loop through each company and extract the information
+for company in companies_div:
+    link_tag = company.find('a')
+    if link_tag and link_tag['href']:
+        link = link_tag['href']
+        company_name = link_tag.find('h3')
+        # if company_name:
+        #     print(f"Company Name: {company_name.text.strip()}, Link: {link}")
+        companies.append(company_name.text.strip())
+        urls.append(link)
+
+# companies = strip(soup.find_all('h3', class_='company-info'))
+# urls = strip(soup.find_all('a', class_='company-info'))
+# places = strip(soup.find_all('span', class_='link-fix--text'))
+# urls_T = soup.find_all('a', class_='media-module__link')
+
+# companies = []
+# for company in companies_t:
+#     companies.append(company.text.strip())
 
-urls = []
-for url in urls_T:
-    urls.append(url.get('href'))
+# urls = []
+# for url in urls_T:
+#     urls.append(url.get('href'))
 
 # print(companies)
 # print(locations)
 # print(urls)
 # print(places)
 # print(urls)
 
+for url in urls:
+    print(url)
+
 # print the length of the lists
 # print(len(companies))
 # print(len(locations))
 # print(len(places))
 # print(len(urls))
 
-locations = pad_list(locations, len(companies))
+# locations = pad_list(locations, len(companies))
 
 # # Find the data you want to scrape. For example, let's say you want to scrape a table:
 # tables = soup.find_all('table')
@@ -53,15 +73,15 @@ locations = pad_list(locations, len(companies))
 # # Use pandas to read the table
 # data_frame = pd.read_html(str(table))[0]
 
-df = pd.DataFrame({
-    'Company': companies,
-    'Location': locations,
-    'Place': places,
-    'URL': urls
-})
+# df = pd.DataFrame({
+#     'Company': companies,
+#     'Location': locations,
+#     'Place': places,
+#     'URL': urls
+# })
 
 # # Now you have the data in a pandas DataFrame, you can manipulate it as you wish
-print(df)
+# print(df)
 
 # # Optionally, save the data to a CSV file
-df.to_csv('output_bandage.csv', index=False)
+# df.to_csv('output_bandage.csv', index=False)
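The helper pad_list is defined outside these hunks; only its signature survives in the hunk header. Judging from that signature and the call locations = pad_list(locations, len(companies)), it presumably right-pads a short list so the scraped columns line up. A minimal sketch under that assumption, not the file's actual body:

def pad_list(lst, length, pad_value='Unknown'):
    # Right-pad lst with pad_value until it reaches the target length,
    # so shorter scraped columns match the Company column for the DataFrame.
    return lst + [pad_value] * max(0, length - len(lst))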
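The commit message says the cookie banner is handled with Selenium, but that code falls outside the hunks shown here. For context, a minimal sketch of such a step, assuming a Chrome driver and a hypothetical accept-button ID (neither is taken from this commit):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get('https://example.com')  # placeholder URL, not from this commit

try:
    # Wait up to 10 s for the banner's accept button, then dismiss it.
    accept = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'accept-cookies'))  # assumed button ID
    )
    accept.click()
except TimeoutException:
    pass  # no banner appeared; carry on with the page as-is

# Save the rendered HTML so main.py can parse it with BeautifulSoup.
with open('index.html', 'w', encoding='utf-8') as f:
    f.write(driver.page_source)
driver.quit()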
||||