Why might Scrapy's memory consumption go up?
prefs() shows that Selector() and Response() objects account for most of the memory.
The usage is high: 10-12 GB after a few hours of running.
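For context, prefs() is the telnet-console shortcut for Scrapy's live-object tracking; a minimal sketch of getting the same report from code, assuming a standard Scrapy install:

    from scrapy.utils.trackref import print_live_refs

    # Prints the number of live Request, Response, Selector, Item and Spider
    # objects plus the age of the oldest one, i.e. the same report that prefs()
    # shows in the telnet console (telnet localhost 6023 by default).
    print_live_refs()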
I work with Scrapy like this (base spider):
    from collections import deque
    from datetime import datetime
    from typing import Any, Callable, Optional

    from scrapy import Request, Selector
    from scrapy.http import Response

    # BaseSpider, the mixins, the loaders and the *_MODEL helpers come from the
    # project's own modules.


    class BaseAcrossSearchSpider(BaseAcrossSearchMixin, BaseSpider):
        ITEMS_OBJECTS: str = ''
        ITEM_URL_OBJECT: str = ''
        NEXT_BUTTON_OBJECT: str = ''
        CONTINUE_IF_NEXT_BUTTON_OBJECT_IS: bool = True

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._last_page: int = self.START_PAGE

        def start_requests(self) -> None:
            yield Request(self._search_url(self.START_PAGE), callback=self.parse)

        def parse(self, response: Response) -> Any:
            for short_item in response.css(self.ITEMS_OBJECTS):
                yield Request(
                    self._page_url(short_item.css(self.ITEM_URL_OBJECT).extract_first()),
                    self._process_item(short_item),
                )
            if self.CONTINUE_IF_NEXT_BUTTON_OBJECT_IS is bool(response.css(self.NEXT_BUTTON_OBJECT)):
                yield Request(self._search_url(self._last_page + 1), self.parse)

        def _process_item(self, short_item: Selector) -> Callable:
            def wrapper(response: Response):
                """
                A downloader middleware checks whether the spider has already
                visited this URL (redis). If it has, it returns an empty Response().
                """
                if response.body:
                    return self._parse(self.FULL_MODEL,
                                       self.full_loader,
                                       response=response,
                                       url=response.url,
                                       utc_created_at=datetime.utcnow(),
                                       utc_actually_at=datetime.utcnow())
                else:
                    return self._parse(self.SHORT_MODEL,
                                       self.short_loader,
                                       selector=short_item,
                                       url=response.url,
                                       utc_actually_at=datetime.utcnow())
            return wrapper

        def _parse(self,
                   model: dict,
                   loader,
                   selector: Selector = None,
                   response: Response = None,
                   **kwargs):
            if not selector and response:
                selector = response.selector
            loader = loader(item=self.item(), selector=selector)
            for element, handler in model.items():
                if callable(handler):
                    # Consume the map to call add_value for each (field, value) pair.
                    deque(map(loader.add_value, element, handler(selector)))
                else:
                    loader.add_css(element, handler)
            for k, v in kwargs.items():
                loader.add_value(k, v)
            return loader.load_item()

        def _search_url(self, page: Optional[int]) -> str:
            ...
    class ChildaSpider(ChildaMixin, BaseAcrossSearchSpider):
        SHORT_MODEL = {
            ('price_base', 'price_total'): _get_prices,
        }
        FULL_MODEL = {
            'price_base': 'p.basePrice__price span::text',
            'price_total': 'p.totalPrice__price span::text',
            # ...
            ('body_type', 'color', 'vin', 'engine_size', 'engine_type', 'drive_type',
             'steering_location', 'transmission', 'passengers_count', 'doors_count'):
                _get_elements_from_table(range(1, 11), 2),
        }
        ITEMS_OBJECTS = 'div.casetMain'
        ITEM_URL_OBJECT = 'a::attr("href")'
        NEXT_BUTTON_OBJECT = 'button.btnFunc pager__btn__next[disabled]'
        CONTINUE_IF_NEXT_BUTTON_OBJECT_IS = False
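Not something from the spider itself, but Scrapy's built-in memory extensions can be used to watch this kind of growth; a minimal settings sketch with illustrative thresholds:

    # settings.py
    MEMUSAGE_ENABLED = True       # scrapy.extensions.memusage.MemoryUsage (needs a POSIX platform)
    MEMUSAGE_WARNING_MB = 2048    # log a warning once the process passes ~2 GB
    MEMUSAGE_LIMIT_MB = 8192      # close the spider once it passes ~8 GB
    MEMDEBUG_ENABLED = True       # scrapy.extensions.memdebug.MemoryDebugger: dump trackref stats on close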
Answer:
In general, as far as I can tell, the memory was not actually leaking.
The problem was the algorithm in the parse() method combined with the settings I used for the crawl: my concurrency values were too high, and DEPTH_PRIORITY was left at its default (0).
As a result, the search pages were parsed faster than the item requests generated from them could be processed, so a long queue of pending requests built up and memory kept growing. Setting DEPTH_PRIORITY = 1 helped.
However, the parsing speed unfortunately decreased.
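For reference, a minimal settings.py sketch of the change: DEPTH_PRIORITY = 1 is what actually helped, the FIFO queue settings are the companion knobs Scrapy's documentation suggests for breadth-first crawling, and the concurrency numbers are purely illustrative:

    # settings.py
    # Prefer breadth-first order so item pages from a listing are fetched
    # before more listing pages pile up in the scheduler queue.
    DEPTH_PRIORITY = 1
    SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
    SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

    # Lower concurrency (illustrative values, not the ones from my project).
    CONCURRENT_REQUESTS = 8
    CONCURRENT_REQUESTS_PER_DOMAIN = 4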
Starting two spiders in two different processes improved things a bit.
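A rough sketch of that two-process setup, assuming both spiders are registered in the same project; the spider names here are placeholders:

    from multiprocessing import Process

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings


    def run_spider(spider_name: str) -> None:
        # Each OS process gets its own CrawlerProcess (and its own reactor),
        # so the two crawls do not share one Python heap.
        crawler = CrawlerProcess(get_project_settings())
        crawler.crawl(spider_name)
        crawler.start()


    if __name__ == '__main__':
        # 'childa_a' and 'childa_b' are placeholder spider names.
        workers = [Process(target=run_spider, args=(name,))
                   for name in ('childa_a', 'childa_b')]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()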