Python
lexansk, 2020-02-21 05:49:15

Parsing in Python. How to fix pagination?

Hello! I have a program that scrapes products from a link:

import requests
import csv
from bs4 import BeautifulSoup as bs


headers = {'accept': '*/*', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0'}
MAIN_URL = 'https://www.yoox.com' # used to build the full product link

base_url = 'https://www.yoox.com/ru/%D0%B4%D0%BB%D1%8F%20%D0%BC%D1%83%D0%B6%D1%87%D0%B8%D0%BD/%D0%BE%D0%B4%D0%B5%D0%B6%D0%B4%D0%B0/shoponline#/dept=clothingmen&gender=U&page=1&season=X'


def yoox_parse(base_url, headers):
    session = requests.Session()
    request = session.get(base_url, headers=headers)
    clothes = []
    urls = []
    urls.append(base_url)

    if request.status_code == 200:
        soup = bs(request.content, 'html.parser')
        try:
            pagination = soup.find_all('li', attrs={'class': 'text-light'})
            count = int(pagination[-1].text)
            for i in range(1,count):
                url = f'https://www.yoox.com/ru/%D0%B4%D0%BB%D1%8F%20%D0%BC%D1%83%D0%B6%D1%87%D0%B8%D0%BD/%D0%BE%D0%B4%D0%B5%D0%B6%D0%B4%D0%B0/shoponline#/dept=clothingmen&gender=U&page={i}&season=X'
                if url not in urls:
                    urls.append(url)


        except:
            pass

    for url in urls:
        request = session.get(url, headers=headers)
        soup = bs(request.content, 'html.parser')
        divs = soup.find_all('div', attrs={'class': 'col-8-24'})


        for div in divs:
            brand = div.find('div', attrs={'class': 'brand font-bold text-uppercase'})
            group = div.find('div', attrs={'class': 'microcategory font-sans'})
            old_price = div.find('span', attrs={'class': 'oldprice text-linethrough text-light'})
            new_price = div.find('span', attrs={'class': 'newprice font-bold'})
            price = div.find('span', attrs={'class': 'fullprice font-bold'})
            sizes = div.find_all('span', attrs={'class': 'aSize'})
            href = div.find('a', attrs={'class': 'itemlink'})
            art = div.find('div', attrs={'class': ''})

            if brand and group and new_price: # new_price is present only for discounted items, so only sale items get through
                clothes.append({
                    'art': art,
                    'href': MAIN_URL + href.get('href'),
                    'sizes': [size.get_text() for size in sizes],
                    'brand': brand.get_text(),
                    'group': group.get_text(strip=True),
                    'old_price': old_price.get_text().replace(' ', '').replace('руб', '') if old_price else None,
                    'new_price': new_price.get_text().replace(' ', '').replace('руб', '') if new_price else None,
                    'price': price.get_text().replace(' ', '').replace('руб', '') if price else None,
                })
        print(len(clothes))
    else:
        print('ERROR or Done')

    return clothes

def files_writer(clothes):
    with open('parsed_yoox_man_clothes.csv', 'w', newline='') as file:
        a_pen = csv.writer(file)
        a_pen.writerow(('Артикул', 'Ссылка', 'Размер', 'Марка', 'Категория', 'Старая цена', 'Новая цена', 'Цена'))
        for clothe in clothes:
            a_pen.writerow((clothe['art'], clothe['href'], clothe['sizes'], clothe['brand'], clothe['group'], clothe['old_price'], clothe['new_price'], clothe['price']))




clothes = yoox_parse(base_url, headers)
files_writer(clothes)


When I started looking at what was actually being parsed, I realized that the program collects data only from base_url, just repeated as many times as there are pages on the site.

Then I dug into the code, adding intermediate prints of the results, and found that the program generates correct links to all the pages but still collects data only from the first one.

I'm out of ideas, so all that is left is to ask the collective mind for help.
If you run the script, it's better to replace for i in range(1,count): with for i in range(1,5): so you don't have to wait long for the result.
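
For anyone who wants to reproduce the symptom quickly, here is a minimal sketch (it needs network access, and the digests may differ slightly if the page embeds dynamic content):

import hashlib
import requests

# Fetch two of the generated "page" links and compare the response bodies.
# If the digests match, every link really returns the same document.
headers = {'user-agent': 'Mozilla/5.0'}
for i in (1, 2):
    url = f'https://www.yoox.com/ru/%D0%B4%D0%BB%D1%8F%20%D0%BC%D1%83%D0%B6%D1%87%D0%B8%D0%BD/%D0%BE%D0%B4%D0%B5%D0%B6%D0%B4%D0%B0/shoponline#/dept=clothingmen&gender=U&page={i}&season=X'
    body = requests.get(url, headers=headers).content
    print(i, hashlib.md5(body).hexdigest())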

Thanks in advance. I'm still just learning, and learning through practice seems more interesting to me.


1 answer
Sergey Karbivnichy, 2020-02-21
@lexansk

You are using the wrong link. You need this one: https://www.yoox.com/RU/shoponline?dept=clothingmen&gender=U&page={x}&season=X&clientabt=SmsMultiChannel_ON%2CSrRecommendations_ON%2CNewDelivery_ON%2CRecentlyViewed_ON%2CmyooxNew_ON . In your link the page number sits after the # sign, and everything after # is a fragment that the browser handles on the client side; it is never sent to the server, so requests receives the same first page every time. Note that only discounted items are written to the file!
Here is the working code, maybe it will be useful for someone:

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below requires the lxml package to be installed
import csv

url = 'https://www.yoox.com/ru/%D0%B4%D0%BB%D1%8F%20%D0%BC%D1%83%D0%B6%D1%87%D0%B8%D0%BD/%D0%BE%D0%B4%D0%B5%D0%B6%D0%B4%D0%B0/shoponline#/dept=clothingmen&gender=U&page=1&season=X'

headers = {'user-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
      'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

def getClothes(url,page_id):
  clothes = []
  response = requests.get(url,headers=headers)
  soup = BeautifulSoup(response.text,'lxml')

  # each results page lives in a container with id 'srpage{page_id}'
  mainContent = soup.find('div',id=f'srpage{page_id}')
  products = mainContent.find_all('div',class_='col-8-24')

  for product in products:

    brand = product.find('div',class_='itemContainer')['data-brand'] # brand name
    cod10 = product.find('div',class_='itemContainer')['data-current-cod10'] # used to build the link yoox.com/ru/{cod10}/item
    category = product.find('div',class_='itemContainer')['data-category'] # category
    oldPrice = product.find('span',class_='oldprice text-linethrough text-light') # old price (may be absent)
    newPrice = product.find('span',class_='newprice font-bold') # new price (may be absent)
    
    if oldPrice is not None:
      # This block runs only when the item has a discount
      sizes = product.find_all('div',class_='size text-light')
      str_sizes = ''

      for x in sizes:
        str_sizes += x.text.strip().replace('\n',';')
      clothes.append({'art':cod10,
              'brand':brand,
              'category':category,
              'url':f'https://yoox.com/ru/{cod10}/item',
              'oldPrice':oldPrice.text,
              'newPrice':newPrice.text,
              'sizes':str_sizes
              })
  return clothes


def getLastPage(url):
  response = requests.get(url,headers=headers)
  soup = BeautifulSoup(response.text,'lxml')
  last_page = soup.find_all('li', class_ = 'text-light')[2]

  return int(last_page.a['data-total-page']) + 1

def writeCsvHeader():
  with open('yoox_man_clothes.csv', 'a', newline='') as file:
    a_pen = csv.writer(file)
    a_pen.writerow(('Артикул', 'Ссылка', 'Размеры', 'Бренд', 'Категория', 'Старая цена', 'Новая цена'))	

def files_writer(clothes):
  with open('yoox_man_clothes.csv', 'a', newline='') as file:
    a_pen = csv.writer(file)
    for clothe in clothes:
      a_pen.writerow((clothe['art'], clothe['url'], clothe['sizes'], clothe['brand'], clothe['category'], clothe['oldPrice'], clothe['newPrice']))


if __name__ == '__main__':
  writeCsvHeader() # write the header row to the csv file
  lastPage = getLastPage(url) # get the number of the last page
  for x in range(1,lastPage): # instead of 1 and lastPage you can specify any page range; do not start parsing from page zero!
    print(f'Downloading: {x} of {lastPage-1}')
    url = f'https://www.yoox.com/RU/shoponline?dept=clothingmen&gender=U&page={x}&season=X&clientabt=SmsMultiChannel_ON%2CSrRecommendations_ON%2CNewDelivery_ON%2CRecentlyViewed_ON%2CmyooxNew_ON'
    files_writer(getClothes(url,x)) # parse and write the data to csv in one pass
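
A small side note on the CSV part (not required for the fix): csv.DictWriter from the standard library can replace the separate header function, and an explicit encoding='utf-8' avoids locale issues with the Russian header on Windows. A minimal sketch, using the answer's dict keys as column names:

import csv

FIELDS = ['art', 'url', 'sizes', 'brand', 'category', 'oldPrice', 'newPrice']

def files_writer_dict(clothes, path='yoox_man_clothes.csv'):
    # DictWriter maps dict keys to columns, so the header and the rows
    # cannot drift apart; the header is written only once, for a new file.
    with open(path, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=FIELDS)
        if file.tell() == 0:
            writer.writeheader()
        writer.writerows(clothes)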

