Python
Gullveig, 2021-03-13 09:50:09

How to parse all articles from an infinite-scroll page with BeautifulSoup + Selenium?

The idea is a parser that collects only the articles without pictures, i.e. those whose img src points to a file with an .svg extension.

The page does scroll all the way down, but only the first 30 articles get parsed.

import bs4
import collections
import csv
import logging
import time

import requests
from selenium import webdriver
from bs4 import BeautifulSoup as bs

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('wb')

ParseResult = collections.namedtuple(
    'ParseResult',
    (
        'brand_name',
        'url_image',
    ),
)

HEADERS = (
    'Brand',
    'Link',
)

driver = webdriver.Chrome(
    'C:/Users/roman/AppData/Local/Programs/Python/Python37-32/Lib/site-packages/selenium/common/chromedriver_win32(1)/chromedriver.exe')
driver.get('https://upakovka-spb.ru/category/2-odnorazovaya-po...')
SCROLL_PAUSE_TIME = 0.5

# Scroll down by an increasing offset on each pass so lazy-loaded products render
counter = 0
for _ in range(8):
    driver.execute_script("window.scrollBy(0, arguments[0]);", counter)
    counter += 1000
    time.sleep(2)

source_data = driver.page_source
soup = bs(source_data, 'lxml')


class Client:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
        self.result = []

    def load_page(self, page: int = None):
        url = 'https://upakovka-spb.ru/category/2-odnorazovaya-po...'
        res = self.session.get(url=url)
        res.raise_for_status()
        return res.text

    def parse_page(self, text: str):
        soup = bs4.BeautifulSoup(text, 'lxml')
        container = soup.select('div.s-product-block')
        for block in container:
            self.parse_block(block=block)

    def parse_block(self, block):
        # Keep only products whose image src ends in "svg" (the no-photo placeholder)
        url_image = block.select_one('img[src$="svg"]')
        if not url_image:
            logger.error('no svg image in block')
            return
        image = url_image.get('src')
        if not image:
            logger.error('img tag has no src attribute')
            return
        brand_name = block.select_one('h5.s-product-header')
        if not brand_name:
            logger.error('no brand_name in block')
            return
        brand_name = brand_name.text
        brand_name = brand_name.replace('/', '').strip()
        logger.info('%s, %s', image, brand_name)
        self.result.append(ParseResult(brand_name=brand_name, url_image=image))

    def save_result(self):
        path = 'C:/Users/roman/PycharmProjects/new/product_scraper/result.csv'
        with open(path, 'w', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            writer.writerow(HEADERS)
            for item in self.result:
                writer.writerow(item)

    def run(self):
        text = self.load_page()
        self.parse_page(text=text)
        self.save_result()


if __name__ == '__main__':
    parser = Client()
    parser.run()

1 answer
Sergey Karbivnichy, 2021-03-13
@gullveig

I don't recommend scrolling! Imagine there are 100500+ pages: if you keep scrolling, the RAM will eventually run out, and the whole computer, not just the parser, will freeze.
Instead, it's better to append the parameter "?page=1" to the link, where the number is the page number, and increase that number in a loop until the page shows a message like "There are no products in this category."
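
A minimal sketch of that loop, assuming the category URL accepts a ?page=N query parameter and that the empty-category marker text matches what the site actually renders (STOP_TEXT below is an assumption, and the URL stays truncated as in the question):

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://upakovka-spb.ru/category/2-odnorazovaya-po...'  # truncated in the question
STOP_TEXT = 'There are no products in this category.'  # assumed marker text; check the real page

session = requests.Session()
page = 1
while True:
    res = session.get(BASE_URL, params={'page': page})  # requests builds ?page=N for us
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    # Stop as soon as the "empty category" message shows up
    if STOP_TEXT in soup.get_text():
        break
    for block in soup.select('div.s-product-block'):
        pass  # hand each block to Client.parse_block from the question
    page += 1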
