When parsing the page, the Cyrillic alphabet is displayed incorrectly. How do I fix it?
When parsing the site, all Cyrillic text comes out like this: "Ñ\x80еÑ\x81Ñ\x82айлинг".
In some places I can replace it with something readable, but in others that is impossible.
How do I fix it?
And one more thing: when I export to .csv, the Cyrillic I write there also comes out as "Klingon".
import requests
from bs4 import BeautifulSoup
import csv

URL = "https://cars.av.by/subaru"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36", "Accept": "*/*"}
FILE = "cars.csv"

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS)
    return r

def get_pages_count(html):
    soup = BeautifulSoup(html, "html.parser")
    pagination = soup.find_all("li", class_="pages-arrows-index")
    if pagination:
        return int(soup.find("li", class_="pages-arrows-index").get_text().replace("1 из ", ""))
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", class_="listing-item")
    cars = []
    for item in items:
        cars.append({
            "title": item.find("div", class_="listing-item-title").find("a").get_text().replace("\n ", "").replace(" ", "").replace("(Ñ\x80еÑ\x81Ñ\x82айлинг)", "(рестайлинг)"),
            "link": item.find("div", class_="listing-item-title").find("a").get("href"),
            "bny": item.find("div", class_="listing-item-price").find("strong").get_text().replace("Ñ\x80.", "бел.руб"),
            "usd": item.find("div", class_="listing-item-price").find("small").get_text() + " $",
            # "city": item.find("div", class_="listing-item-location").find("p").get_text(),  # not sure what to do here
        })
    return cars

def save_files(items, path):
    with open(path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter=";")
        writer.writerow(["Марка", "Ссылка", "Цена в BNY", "Цена в $"])
        for item in items:
            writer.writerow([item["title"], item["link"], item["bny"], item["usd"]])

def parse():
    html = get_html(URL)
    print(html.url)
    if html.status_code == 200:
        cars = []
        pages_count = get_pages_count(html.text)
        for page in range(1, pages_count + 1):
            # html = get_html(URL, params={"page": page})
            html = get_html(URL + f"/page/{page}")
            print(f"Парсинг страницы {page} из {pages_count}...{html.url}")
            cars.extend(get_content(html.text))
        save_files(cars, FILE)
        print(cars)
        print(f"Получено {len(cars)} автомобилей")
    else:
        print("Error")

parse()
This is how you get "readable" html:
url = 'https://cars.av.by/subaru'
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding
print(response.text)
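Building on that, the same two lines can go straight into the question's get_html(), so every page is already decoded before BeautifulSoup sees it. A minimal sketch, assuming the HEADERS dict from the question (shortened here):

import requests

HEADERS = {"User-Agent": "Mozilla/5.0", "Accept": "*/*"}  # use the full User-Agent from the question

def get_html(url, params=None):
    # Fetch the page, then override the encoding requests guessed from the
    # response headers (often ISO-8859-1 when no charset is declared) with
    # the one detected from the body, so .text comes back as proper Cyrillic.
    r = requests.get(url, headers=HEADERS, params=params)
    r.encoding = r.apparent_encoding
    return r

With the text decoded correctly, the mojibake workarounds in get_content(), such as .replace("(Ñ\x80еÑ\x81Ñ\x82айлинг)", "(рестайлинг)"), should no longer be needed.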
replace("1 из ",""))
Before outputting item to csv add
.encode('latin1').decode('utf8')
Sometimes
writer.writerow([item["city"].encode('latin1').decode('utf8')]) helps