Python
jecer_inside, 2021-08-11 13:20:54

How to return to the previous page?

There is a site. The parser takes the subcategory names from the breadcrumbs, but on a product page the last subcategory is not present in them.
How do I go back up one level from the product page and grab the name of that product's subcategory?

# -*- coding: utf-8 -*-
import json
import xlsxwriter

import requests
from bs4 import BeautifulSoup

PAGES_COUNT = 1
OUT_FILENAME = 'out1.json'
OUT_XLSX_FILENAME = 'out.xlsx'


def dump_to_json(filename, data, **kwargs):
    kwargs.setdefault('ensure_ascii', False)
    kwargs.setdefault('indent', 1)

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, **kwargs)


def dump_to_xlsx(filename, data):
    if not len(data):
        return None

    with xlsxwriter.Workbook(filename) as workbook:
        ws = workbook.add_worksheet()
        bold = workbook.add_format({'bold': True})

        headers = ['Название товара', 'Цена', 'Категория', 'Подкатегория', 'ПодПодкатегория']
        # headers.extend(data[0]['techs'].keys())

        for col, h in enumerate(headers):
            ws.write_string(0, col, h, cell_format=bold)

        for row, item in enumerate(data, start=1):
            ws.write_string(row, 0, item['name'])
            ws.write_string(row, 1, item['amount'])
            ws.write_string(row, 2, item['category'])
            ws.write_string(row, 3, item['category2'])
            ws.write_string(row, 4, item['category3'])
            # for prop_name, prop_value in item['techs'].items():
            #     col = headers.index(prop_name)
            #     ws.write_string(row, col, prop_value)


def get_soup(url, **kwargs):
    response = requests.get(url, **kwargs)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, features='html.parser')
    else:
        soup = None
    return soup


def crawl_products(pages_count):
    """
    Собирает со страниц с 1 по pages_count включительно ссылки на товары.
    :param pages_count:     номер последней страницы с товарами.
    :return:                список URL товаров.
    """
    urls = []
    fmt = 'https://autostol63.ru/granta/grantatun/?page={page}'

    for page_n in range(1, 1 + pages_count):
        print('page: {}'.format(page_n))

        page_url = fmt.format(page=page_n)
        soup = get_soup(page_url)
        if soup is None:
            break

        for tag in soup.select('div.caption > a'):
            href = tag.attrs['href']
            urls.append(href)

    return urls



def parse_products(urls):
    """
    Парсинг полей:
        название, цена и таблица характеристик
    по каждому товару.
    :param urls:            список URL на карточки товаров.
    :return:                массив спарсенных данных по каждому из товаров.
    """
    data = []

    for url in urls:
        print('\tproduct: {}'.format(url))

        soup = get_soup(url)
        if soup is None:
            break

        name = soup.select_one('h1 > span').text.strip()
        amount = soup.select_one('span.autocalc-product-price').text.strip()
        category = soup.select_one('ul.breadcrumb > li:nth-child(2) > a > span').text.strip()

        def get_text(el) -> str:
            # The breadcrumb item may be absent on some pages, so fall back to an empty string.
            if not el:
                return ""
            return el.get_text(strip=True)

        category2 = get_text(soup.select_one('ul.breadcrumb > li:nth-child(3) > a > span'))
        category3 = get_text(soup.select_one('#main_content > div:nth-child(1) > ul > li:nth-child(4)'))


        # techs = {}
        # for row in soup.select('div.attribute > div'):
        #     cols = row.select('span')
        #     cols = [c.text.strip() for c in cols]
        #     techs[cols[0]] = cols[1]

        item = {
            'name': name,
            'amount': amount,
            'category': category,
            'category2': category2,
            'category3': category3,
        }

        data.append(item)

    return data


def main():
    urls = crawl_products(PAGES_COUNT)
    data = parse_products(urls)
    dump_to_json(OUT_FILENAME, data)
    dump_to_xlsx(OUT_XLSX_FILENAME, data)


if __name__ == '__main__':
    main()

1 answer(s)

KrimsN, 2021-08-16

Prepare data incrementally. Come up with a template for the final data, for example:

{
    "name":  ...,
    "amount":  ...,
    "url": ...
}

Fill it in as you obtain the data. That way you never have to go back to a previous page and spend extra loading time on it.
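
A minimal sketch of that idea, built on top of the code from the question: it reuses get_soup() and the 'div.caption > a' selector, while the h1 selector for the subcategory name on the listing page is an assumption and may need adjusting to the real markup. The listing page already knows the subcategory, so record it together with every product URL, and only fetch the product page for the fields that actually live there.

def crawl_products_with_category(pages_count):
    """Return partially filled items: URL + the subcategory known from the listing page."""
    items = []
    fmt = 'https://autostol63.ru/granta/grantatun/?page={page}'

    for page_n in range(1, 1 + pages_count):
        soup = get_soup(fmt.format(page=page_n))
        if soup is None:
            break

        # The subcategory name is visible on the listing page itself,
        # e.g. as the page heading (this selector is an assumption).
        heading = soup.select_one('h1')
        subcategory = heading.get_text(strip=True) if heading else ''

        for tag in soup.select('div.caption > a'):
            items.append({
                'url': tag.attrs['href'],
                'category3': subcategory,  # filled in right away
                'name': None,              # filled in later from the product page
                'amount': None,
            })

    return items


def fill_from_product_pages(items):
    """Complete each item with the fields that live on the product card."""
    for item in items:
        soup = get_soup(item['url'])
        if soup is None:
            continue
        name_el = soup.select_one('h1 > span')
        price_el = soup.select_one('span.autocalc-product-price')
        item['name'] = name_el.get_text(strip=True) if name_el else ''
        item['amount'] = price_el.get_text(strip=True) if price_el else ''
    return items

With this shape, main() becomes fill_from_product_pages(crawl_products_with_category(PAGES_COUNT)), and no request ever has to go back up a level.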
