V
V
Vladimir Smolyarenko2021-02-01 14:28:10
Python
Vladimir Smolyarenko, 2021-02-01 14:28:10

How can I handle a varying number of ads per page when parsing a site?

import requests
from bs4 import BeautifulSoup
import csv

# Listing URL that parse() fetches; the page number is passed separately
# as the `page` query parameter.
URL = 'https://auto.ria.com/uk/car/lancia/'
# HTTP headers that get_html() sends with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'accept': '*/*'}
# Path of the CSV file that parse() hands to save_file().
FILE = 'cars.csv'


def get_html(url, params=None, timeout=10):
    """Send a GET request for *url* and return the response object.

    The request carries the module-level ``HEADERS``.

    Args:
        url: Address to fetch.
        params: Optional mapping of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a stalled
            connection would hang the whole script.

    Returns:
        The ``requests.Response`` for the request; the caller is
        responsible for checking ``status_code``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    return requests.get(url, headers=HEADERS, params=params, timeout=timeout)


def get_pages_count(html):
    """Return the number of result pages advertised by the pagination.

    The count is read from the last ``<span class="mhide">`` element of
    the document.

    Args:
        html: HTML source of a listing page.

    Returns:
        The page count as an int, or 1 when the page has no pagination
        element or its last item contains no number.
    """
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('span', class_='mhide')
    if not pagination:
        return 1

    # Keep only the decimal digits so that thousands separators
    # (regular or non-breaking spaces) or stray characters in the label
    # do not make int() raise ValueError.
    digits = ''.join(
        ch for ch in pagination[-1].get_text() if ch.isdecimal())
    return int(digits) if digits else 1


def _node_text(node, strip=False):
    """Return the text of *node*, or an empty string if it is missing."""
    if node is None:
        return ''
    return node.get_text(strip=strip)


def get_content(html):
    """Extract the car ads found in a listing page.

    Every ``<div class="content-bar">`` is treated as one ad card.

    Args:
        html: HTML source of a listing page.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``usd_price``,
        ``uah_price`` and ``city``. Cards without a title link are
        skipped; any other missing element yields an empty string, so one
        incomplete card no longer aborts the whole page with an
        AttributeError.
    """
    soup = BeautifulSoup(html, 'html.parser')

    cars = []
    for item in soup.find_all('div', class_='content-bar'):
        # Look the title anchor up once; it provides both title and link.
        address = item.find('a', class_='address')
        if address is None:
            # Without a title link there is nothing useful to record.
            continue

        usd_node = item.find('span', class_='bold green size22')
        uah_node = item.find('span', class_='i-block')
        city_node = item.find(
            'li', class_='item-char view-location js-location')

        cars.append({
            'title': address.get_text(strip=True),
            'link': address.get('href'),
            # Append the currency sign only when a price was found.
            'usd_price': (usd_node.get_text() + ' $'
                          if usd_node is not None else ''),
            # The site separates digit groups with non-breaking spaces.
            'uah_price': _node_text(uah_node).replace('\xa0', ' '),
            'city': _node_text(city_node, strip=True).replace('(від)', ''),
        })
    return cars


def save_file(items, path):
    """Write the scraped ads to a semicolon-delimited CSV file.

    The file is encoded as UTF-8 with a byte order mark (``utf-8-sig``).
    Excel relies on that BOM to recognise UTF-8; without it the Cyrillic
    text is decoded with a legacy code page and shows up garbled.

    Args:
        items: Iterable of dicts with the keys ``title``, ``link``,
            ``usd_price``, ``uah_price`` and ``city``.
        path: Destination file path; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(['Заглавие', 'ссылка', 'Цена в USD', 'Цена в UAH',
                         'Город'])
        writer.writerows(
            [item['title'], item['link'], item['usd_price'],
             item['uah_price'], item['city']]
            for item in items
        )


def parse():
    """Scrape every listing page of URL and save the ads to FILE.

    The first request is used to read the page count; each page is then
    fetched with the ``page`` query parameter, parsed with get_content()
    and the accumulated result is written with save_file() after every
    page, so the pages collected so far survive a later failure.

    Pages that do not answer with HTTP 200 are reported and skipped
    instead of being parsed as if they were listings.
    """
    html = get_html(URL)
    if html.status_code != 200:
        print('Error')
        return

    cars = []
    pages_count = get_pages_count(html.text)
    print(pages_count)
    for page in range(1, pages_count + 1):
        print(f'Парсинг страницы {page} из {pages_count}...')
        html = get_html(URL, params={'page': page})
        if html.status_code != 200:
            # An error page contains no ads; do not feed it to the parser.
            print(f'Error: page {page} returned status {html.status_code}')
            continue
        cars.extend(get_content(html.text))
        save_file(cars, FILE)
        print(len(cars))
    print(f'Получено {len(cars)} автомобилей')


# Start scraping as soon as this module is executed (this also happens
# when the module is imported, because the call is not guarded).
parse()


1. The script creates a CSV file in which the Russian text is unreadable (garbled characters) in Excel. I tried specifying encoding='utf8' in the save_file function, but it did not help. I am on macOS: the Quick Look preview (pressing Space on the CSV file) shows the Russian text correctly, but when I open the file in Excel, the Russian text is replaced with garbled characters. I also tried changing the text encoding to UTF-8 in the Excel settings, but the same thing happened, only with different garbled characters. Is there a solution to this problem? Can anyone tell me?
2. The script determines the number of pages incorrectly, because the number of ads shown on one page can be changed, and not all of the information is collected from a page: from a page of 10 ads I get only 9. The site lists 53 ads, but I get 28 in total, with 9 from the first page instead of 10. Can anyone explain what is going on? Is there a solution?

Answer the question

In order to leave comments, you need to log in

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question