Is there a solution to the problem of changing the number of ads on the page when parsing?

V

Vladimir Smolyarenko2021-02-01 14:28:10

Python

Vladimir Smolyarenko, 2021-02-01 14:28:10

import requests
from bs4 import BeautifulSoup
import csv

# Listing page the scraper starts from; also used as the base for paged requests.
URL = 'https://auto.ria.com/uk/car/lancia/'

# Headers attached to every outgoing HTTP request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'accept': '*/*',
}

# Path of the CSV file the collected listings are written to.
FILE = 'cars.csv'


def get_html(url, params=None, timeout=30):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests`` can wait indefinitely on an
            unresponsive host, hanging the whole script.

    Returns:
        The ``requests`` response; its ``status_code`` and ``text`` are left
        for the caller to inspect.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(url, headers=HEADERS, params=params, timeout=timeout)


def get_pages_count(html):
    """Return the page count read from the pagination markup of *html*.

    The text of the last ``<span class="mhide">`` element is converted to an
    integer. When the page contains no such element, 1 is returned.
    """
    page_spans = BeautifulSoup(html, 'html.parser').find_all(
        'span', class_='mhide')
    if not page_spans:
        return 1
    last_label = page_spans[-1].get_text()
    return int(last_label)


def _tag_text(tag, strip=False):
    """Return the text of *tag*, or '' when the element was not found."""
    return tag.get_text(strip=strip) if tag is not None else ''


def get_content(html):
    """Extract car listings from one page of search results.

    Each ``<div class="content-bar">`` element is treated as one listing.
    A field whose element is missing from a listing is stored as an empty
    string, so a single incomplete listing no longer raises
    ``AttributeError`` and aborts the whole run.

    Args:
        html: Markup of a listing page.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``usd_price``,
        ``uah_price`` and ``city``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    cars = []
    for item in soup.find_all('div', class_='content-bar'):
        # Look the anchor up once; it provides both the title and the link.
        address = item.find('a', class_='address')
        usd_price = item.find('span', class_='bold green size22')
        uah_price = item.find('span', class_='i-block')
        city = item.find('li', class_='item-char view-location js-location')

        # Compare with None explicitly: bs4 documents find() as returning
        # None when nothing matches.
        cars.append({
            'title': _tag_text(address, strip=True),
            'link': address.get('href') if address is not None else '',
            # Only append the currency sign when a price was actually found.
            'usd_price': (usd_price.get_text() + ' $'
                          if usd_price is not None else ''),
            # Replace non-breaking spaces with ordinary ones.
            'uah_price': _tag_text(uah_price).replace('\xa0', ' '),
            'city': _tag_text(city, strip=True).replace('(від)', ''),
        })
    return cars


def save_file(items, path):
    """Write the collected listings to a semicolon-delimited CSV file.

    The file is encoded as UTF-8 with a byte-order mark (``utf-8-sig``).
    Excel uses the BOM to recognise the file as UTF-8; without it the
    Cyrillic header and values are shown as garbage characters.

    Args:
        items: Iterable of dicts with the keys ``title``, ``link``,
            ``usd_price``, ``uah_price`` and ``city``.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(['Заглавие', 'ссылка', 'Цена в USD', 'Цена в UAH',
                         'Город'])
        writer.writerows(
            [item['title'], item['link'], item['usd_price'],
             item['uah_price'], item['city']]
            for item in items
        )


def parse():
    """Scrape every listing page and save the results to ``FILE``.

    The first request determines the number of pages. Each page is then
    requested in turn and its listings are accumulated. A page that does
    not answer with HTTP 200 is reported and skipped rather than parsed as
    if it were a listing page. The CSV file is written once, after all
    pages are processed, instead of being rewritten after every page.
    """
    html = get_html(URL)
    if html.status_code != 200:
        print('Error')
        return

    cars = []
    pages_count = get_pages_count(html.text)
    print(pages_count)
    for page in range(1, pages_count + 1):
        print(f'Парсинг страницы {page} из {pages_count}...')
        html = get_html(URL, params={'page': page})
        if html.status_code != 200:
            # An error page holds no listings; report it and move on.
            print(f'Error: page {page} returned status {html.status_code}')
            continue
        cars.extend(get_content(html.text))
        print(len(cars))

    save_file(cars, FILE)
    print(f'Получено {len(cars)} автомобилей')


# Start scraping as soon as the module is executed (this also runs on import).
parse()

1. The script creates a CSV file in which the Russian text is unreadable (garbled characters). I tried specifying encoding='utf8' in the save_file function, but it did not help. I am on a Mac: the Quick Look preview (pressing the space bar on the CSV file) shows the Russian text correctly, but when I open the file in Excel, the Russian text is replaced with garbled characters. I also tried changing the text encoding to UTF-8 in the Excel settings, but the result was the same, only with different garbled characters. Is there a solution to this problem? Can anyone tell me?
2. The number of pages is determined incorrectly, because the number of ads shown on one page can change, and not all of the ads are collected from a page: from a page of 10 ads only 9 are taken. The site lists 53 ads, but I get only 28 in total (9 from the first page instead of 10). Can anyone explain what is going on, and is there a solution?