When parsing the page, the Cyrillic alphabet is displayed incorrectly. How do I fix it?
When parsing the site, all Cyrillic text comes out like this: "Ñ\x80еÑ\x81Ñ\x82айлинг".
In some places I can replace it with something readable, but in others that is impossible.
How do I fix it?
And one more thing: when I export to .csv, the Cyrillic I write there also comes out as "Klingon".
import requests
from bs4 import BeautifulSoup
import csv

URL = "https://cars.av.by/subaru"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36", "Accept": "*/*"}
FILE = "cars.csv"

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS)
    return r

def get_pages_count(html):
    soup = BeautifulSoup(html, "html.parser")
    pagination = soup.find_all("li", class_="pages-arrows-index")
    if pagination:
        return int(soup.find("li", class_="pages-arrows-index").get_text().replace("1 из ", ""))
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", class_="listing-item")
    cars = []
    for item in items:
        cars.append({
            "title": item.find("div", class_="listing-item-title").find("a").get_text().replace("\n ", "").replace(" ", "").replace("(Ñ\x80еÑ\x81Ñ\x82айлинг)", "(рестайлинг)"),
            "link": item.find("div", class_="listing-item-title").find("a").get("href"),
            "bny": item.find("div", class_="listing-item-price").find("strong").get_text().replace("Ñ\x80.", "бел.руб"),
            "usd": item.find("div", class_="listing-item-price").find("small").get_text() + " $",
            # "city": item.find("div", class_="listing-item-location").find("p").get_text(),  # not sure what to do here
        })
    return cars

def save_files(items, path):
    with open(path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter=";")
        writer.writerow(["Марка", "Ссылка", "Цена в BNY", "Цена в $"])
        for item in items:
            writer.writerow([item["title"], item["link"], item["bny"], item["usd"]])

def parse():
    html = get_html(URL)
    print(html.url)
    if html.status_code == 200:
        cars = []
        pages_count = get_pages_count(html.text)
        for page in range(1, pages_count + 1):
            # html = get_html(URL, params={"page": page})
            html = get_html(URL + f"/page/{page}")
            print(f"Парсинг страницы {page} из {pages_count}...{html.url}")
            cars.extend(get_content(html.text))
        save_files(cars, FILE)
        print(cars)
        print(f"Получено {len(cars)} автомобилей")
    else:
        print("Error")

parse()
This is how you get "readable" html:
url = 'https://cars.av.by/subaru'
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding
print(response.text)
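Building on that, the same two lines can go straight into the question's get_html(), so every page is already decoded before BeautifulSoup sees it. A minimal sketch, assuming the HEADERS dict from the question (shortened here):

import requests

HEADERS = {"User-Agent": "Mozilla/5.0", "Accept": "*/*"}  # use the full User-Agent from the question

def get_html(url, params=None):
    # Fetch the page, then override the encoding requests guessed from the
    # response headers (often ISO-8859-1 when no charset is declared) with
    # the one detected from the body, so .text comes back as proper Cyrillic.
    r = requests.get(url, headers=HEADERS, params=params)
    r.encoding = r.apparent_encoding
    return r

With the text decoded correctly, the mojibake workarounds in get_content(), such as .replace("(Ñ\x80еÑ\x81Ñ\x82айлинг)", "(рестайлинг)"), should no longer be needed.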
replace("1 из ",""))
Before outputting item to csv add
.encode('latin1').decode('utf8')
Sometimes
writer.writerow([item["city"].encode('latin1').decode('utf8')]) helps