import requests
from selectolax.parser import HTMLParser
import csv
def get_html(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            dead host cannot hang the whole scrape (new, defaulted —
            existing callers are unaffected).

    Returns:
        The page HTML (str) on success, ``False`` on any request or
        HTTP-status error (original falsy-failure contract preserved).
    """
    try:
        result = requests.get(url, timeout=timeout)
        result.raise_for_status()
        return result.text
    except (requests.RequestException, ValueError):
        # Best-effort scraper: report and signal failure with a falsy value
        # rather than aborting the whole URL list.
        print('Server error')
        return False
def write_csv(data):
    """Append one scraped record to ``firms.csv``.

    Emits the header row first when the file is empty, so the append-mode
    CSV stays self-describing across repeated runs without duplicating
    the header.

    Args:
        data: Mapping with keys url/name/city/category/site/social/phone.
    """
    order = ['url', 'name', 'city', 'category', 'site', 'social', 'phone']
    with open('firms.csv', 'a', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=order)
        if file.tell() == 0:  # fresh or empty file -> write header once
            writer.writeheader()
        writer.writerow(data)
def get_data(html, url):
    """Parse one firm page and append the extracted record to the CSV.

    Args:
        html: Raw HTML of a business-card page.
        url: Page address, stored alongside the scraped fields.

    Raises:
        AttributeError: If the page has no ``.business-card-view`` node or
            no ``h1`` (those are treated as hard failures, as before).
    """
    dom = HTMLParser(html)
    business_card_view = dom.css_first('.business-card-view')
    breadcrumbs = business_card_view.css('.breadcrumbs-view__breadcrumb')
    # Breadcrumb layout appears to be [.., city, .., category] — pages with
    # fewer crumbs raise IndexError, hence the empty-string fallback.
    try:
        city = breadcrumbs[1].text()
        category = breadcrumbs[3].text()
    except IndexError:
        city = ''
        category = ''
    name = business_card_view.css_first('h1').text()
    # css_first returns None when a selector misses, so attribute access
    # raises AttributeError; each optional field falls back to ''.
    # (Narrowed from bare `except:` which also swallowed KeyboardInterrupt
    # and genuine bugs.)
    try:
        site = business_card_view.css_first('.business-urls-view__url > a').attrs['href']
    except (AttributeError, KeyError):
        site = ''
    try:
        social = business_card_view.css_first('.business-contacts-view__social-button > a').attrs['href']
    except (AttributeError, KeyError):
        social = ''
    try:
        phone = business_card_view.css_first('.card-phones-view__number > span').text()
    except AttributeError:
        phone = ''
    data = {
        'url': url,
        'name': name,
        'city': city,
        'category': category,
        'site': site,
        'social': social,
        'phone': phone,
    }
    write_csv(data)
    print(f'{city} | {category} | {name}')
def main():
    """Read firm URLs from ``firms-url.txt`` (one per line) and scrape each.

    Prints a running 1-based counter so long runs show progress. Pages
    whose download failed are skipped instead of being fed to the parser
    (previously ``False`` was passed to ``get_data`` and crashed it).
    """
    with open('firms-url.txt', 'r') as file:
        urls = [line.strip() for line in file]
    for n, url in enumerate(urls, start=1):
        print(n)
        html = get_html(url)
        if not html:
            # get_html already reported the failure; move on to the next URL.
            continue
        get_data(html, url)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()