A
A
Andrei Sayevich2020-06-02 15:58:01
Python
Andrei Sayevich, 2020-06-02 15:58:01

When prasing the page, the Cyrillic alphabet is displayed incorrectly. How to fix?

When parsing the site, all text in Cyrillic is displayed in this form "Ñ\x80еÑ\x81Ñ\x82айлинг".
if in some places it is possible to replace it with something readable, but in some places it is impossible ....

How to fix it?

And yes ... again .. when uploading to .csv, what I indicate in Cyrillic is also displayed in "Klingon"

import requests
from bs4 import BeautifulSoup
import csv

URL = "https://cars.av.by/subaru"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36" , "Accept": "*/*"}
FILE = "cars.csv"

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS,)
    return r


def get_pages_count(html):
    soup = BeautifulSoup(html, "html.parser")
    pagination = soup.find_all("li",class_="pages-arrows-index")
    if pagination:
        return int(soup.find("li",class_="pages-arrows-index").get_text().replace("1 из ",""))
    else:
        return 1


def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all('div', class_="listing-item")
    cars = []
    for item in items:
        cars.append({
            "title":    item.find("div", class_="listing-item-title").find("a").get_text().replace("\n                            ","").replace("                        ","").replace("(Ñ\x80еÑ\x81Ñ\x82айлинг)","(рестайлинг)"),
            "link":     item.find("div", class_="listing-item-title").find("a").get("href"),
            "bny":      item.find("div", class_="listing-item-price").find("strong").get_text().replace("Ñ\x80.","бел.руб"),
            "usd":      item.find("div", class_="listing-item-price").find("small").get_text() + " $",
            #"сity":     item.find("div", class_="listing-item-location").find("p").get_text(), ТУТ ХЗ ЧЕГО 
        })
    return cars


def save_files(items, path):
    with open(path, "w", newline="", encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=";")
        writer.writerow(["Марка", "Ссылка", "Цена в BNY", "Цена в $"])
        for item in items:
            writer.writerow([item["title"], item["link"], item["bny"], item["usd"]])


def parse():
    html = get_html(URL)
    print(html.url)
    if html.status_code == 200:
        cars = []
        pages_count = get_pages_count(html.text)
        for page in range(1, pages_count + 1):
            #html = get_html(URL, params={"page":page})
            html = get_html(URL + f'/page/{page}')
            print(f"Парсинг страницы {page} из {pages_count}...{html.url}")
            cars.extend(get_content(html.text))

        save_files(cars, FILE)
        print(cars)
        print(f'Получено {len(cars)} автомобилей')
    else:
        print("Error")
parse()

Answer the question

In order to leave comments, you need to log in

3 answer(s)
S
Sergey Karbivnichy, 2020-06-02
@Soerrrrrrr

This is how you get "readable" html:

url = 'https://cars.av.by/subaru'
response = requests.get(url,headers=headers)
response.encoding = response.apparent_encoding
print(response.text)

Only you need to correct the lines of the form in the parser: replace the lines of the form '1 из' with Russian ones.
replace("1 из ",""))

H
HemulGM, 2020-06-02
@HemulGM

UTF8 encoding because

D
Dmitry Kirey, 2020-09-17
@neck0081

Before outputting item to csv add
.encode('latin1').decode('utf8')
Sometimes
writer.writerow([item["city"].encode('latin1').decode('utf8')]) helps

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question