handle Cookie banner with Selenium

This commit is contained in:
Andre Heber
2024-01-15 10:36:36 +01:00
parent 24b64ec248
commit a1a1bc757b
5 changed files with 208 additions and 18 deletions

56
main.py
View File

@@ -12,37 +12,57 @@ def pad_list(lst, length, pad_value='Unknown'):
# Replace 'your_local_file.html' with the path to your local HTML file
with open('/Users/sbzuc0a/Downloads/Scrape/index.html', 'r', encoding='utf-8') as html_file:
with open('/Users/sbzuc0a/Downloads/scrape_new/index.html', 'r', encoding='utf-8') as html_file:
content = html_file.read()
# Parse the HTML content
soup = BeautifulSoup(content, 'html.parser')
companies = strip(soup.find_all('h3', class_='media__body__head'))
locations = strip(soup.find_all('h5'))
places = strip(soup.find_all('span', class_='link-fix--text'))
urls_T = soup.find_all('a', class_='media-module__link')
companies = []
urls = []
# Find all the company-info divs
companies_div = soup.find_all('div', class_='company-info')
# Loop through each company and extract the information
for company in companies_div:
link_tag = company.find('a')
if link_tag and link_tag['href']:
link = link_tag['href']
company_name = link_tag.find('h3')
# if company_name:
# print(f"Company Name: {company_name.text.strip()}, Link: {link}")
companies.append(company_name.text.strip())
urls.append(link)
# companies = strip(soup.find_all('h3', class_='company-info'))
# urls = strip(soup.find_all('a', class_='company-info'))
# places = strip(soup.find_all('span', class_='link-fix--text'))
# urls_T = soup.find_all('a', class_='media-module__link')
# companies = []
# for company in companies_t:
# companies.append(company.text.strip())
urls = []
for url in urls_T:
urls.append(url.get('href'))
# urls = []
# for url in urls_T:
# urls.append(url.get('href'))
# print(companies)
# print(locations)
# print(urls)
# print(places)
# print(urls)
for url in urls:
print(url)
# print the length of the lists
# print(len(companies))
# print(len(locations))
# print(len(places))
# print(len(urls))
locations = pad_list(locations, len(companies))
# locations = pad_list(locations, len(companies))
# # Find the data you want to scrape. For example, let's say you want to scrape a table:
# tables = soup.find_all('table')
@@ -53,15 +73,15 @@ locations = pad_list(locations, len(companies))
# # Use pandas to read the table
# data_frame = pd.read_html(str(table))[0]
df = pd.DataFrame({
'Company': companies,
'Location': locations,
'Place': places,
'URL': urls
})
# df = pd.DataFrame({
# 'Company': companies,
# 'Location': locations,
# 'Place': places,
# 'URL': urls
# })
# # Now you have the data in a pandas DataFrame, you can manipulate it as you wish
print(df)
# print(df)
# # Optionally, save the data to a CSV file
df.to_csv('output_bandage.csv', index=False)
# df.to_csv('output_bandage.csv', index=False)