Why might Scrapy's memory consumption go up?
prefs() shows that Selector() and Response() objects account for most of the memory.
The usage is high: 10-12 GB after a few hours of running.
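For context, prefs() is the telnet-console shortcut for Scrapy's live-object tracking; a minimal sketch of getting the same report from code, assuming a standard Scrapy install:

    from scrapy.utils.trackref import print_live_refs

    # Prints the number of live Request, Response, Selector, Item and Spider
    # objects plus the age of the oldest one, i.e. the same report that prefs()
    # shows in the telnet console (telnet localhost 6023 by default).
    print_live_refs()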
I work with Scrapy like this (base spider):
    from collections import deque
    from datetime import datetime
    from typing import Any, Callable, Optional

    from scrapy import Request, Selector
    from scrapy.http import Response

    # BaseSpider, the mixins, the loaders and the *_MODEL helpers come from the
    # project's own modules.


    class BaseAcrossSearchSpider(BaseAcrossSearchMixin, BaseSpider):
        ITEMS_OBJECTS: str = ''
        ITEM_URL_OBJECT: str = ''
        NEXT_BUTTON_OBJECT: str = ''
        CONTINUE_IF_NEXT_BUTTON_OBJECT_IS: bool = True

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._last_page: int = self.START_PAGE

        def start_requests(self) -> None:
            yield Request(self._search_url(self.START_PAGE), callback=self.parse)

        def parse(self, response: Response) -> Any:
            for short_item in response.css(self.ITEMS_OBJECTS):
                yield Request(
                    self._page_url(short_item.css(self.ITEM_URL_OBJECT).extract_first()),
                    self._process_item(short_item),
                )
            if self.CONTINUE_IF_NEXT_BUTTON_OBJECT_IS is bool(response.css(self.NEXT_BUTTON_OBJECT)):
                yield Request(self._search_url(self._last_page + 1), self.parse)

        def _process_item(self, short_item: Selector) -> Callable:
            def wrapper(response: Response):
                """
                A downloader middleware checks whether the spider has already
                visited this URL (redis). If it has, it returns an empty Response().
                """
                if response.body:
                    return self._parse(self.FULL_MODEL,
                                       self.full_loader,
                                       response=response,
                                       url=response.url,
                                       utc_created_at=datetime.utcnow(),
                                       utc_actually_at=datetime.utcnow())
                else:
                    return self._parse(self.SHORT_MODEL,
                                       self.short_loader,
                                       selector=short_item,
                                       url=response.url,
                                       utc_actually_at=datetime.utcnow())
            return wrapper

        def _parse(self,
                   model: dict,
                   loader,
                   selector: Selector = None,
                   response: Response = None,
                   **kwargs):
            if not selector and response:
                selector = response.selector
            loader = loader(item=self.item(), selector=selector)
            for element, handler in model.items():
                if callable(handler):
                    # Consume the map to call add_value for each (field, value) pair.
                    deque(map(loader.add_value, element, handler(selector)))
                else:
                    loader.add_css(element, handler)
            for k, v in kwargs.items():
                loader.add_value(k, v)
            return loader.load_item()

        def _search_url(self, page: Optional[int]) -> str:
            ...
    class ChildaSpider(ChildaMixin, BaseAcrossSearchSpider):
        SHORT_MODEL = {
            ('price_base', 'price_total'): _get_prices,
        }
        FULL_MODEL = {
            'price_base': 'p.basePrice__price span::text',
            'price_total': 'p.totalPrice__price span::text',
            # ...
            ('body_type', 'color', 'vin', 'engine_size', 'engine_type', 'drive_type',
             'steering_location', 'transmission', 'passengers_count', 'doors_count'):
                _get_elements_from_table(range(1, 11), 2),
        }
        ITEMS_OBJECTS = 'div.casetMain'
        ITEM_URL_OBJECT = 'a::attr("href")'
        NEXT_BUTTON_OBJECT = 'button.btnFunc pager__btn__next[disabled]'
        CONTINUE_IF_NEXT_BUTTON_OBJECT_IS = False
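Not something from the spider itself, but Scrapy's built-in memory extensions can be used to watch this kind of growth; a minimal settings sketch with illustrative thresholds:

    # settings.py
    MEMUSAGE_ENABLED = True       # scrapy.extensions.memusage.MemoryUsage (needs a POSIX platform)
    MEMUSAGE_WARNING_MB = 2048    # log a warning once the process passes ~2 GB
    MEMUSAGE_LIMIT_MB = 8192      # close the spider once it passes ~8 GB
    MEMDEBUG_ENABLED = True       # scrapy.extensions.memdebug.MemoryDebugger: dump trackref stats on close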
Answer:
In general, as far as I can tell, the memory was not actually leaking.
The problem was the algorithm in the parse() method combined with the settings I used for the crawl: my concurrency values were too high, and DEPTH_PRIORITY was left at its default (0).
As a result, the search pages were parsed faster than the item requests generated from them could be processed, so a long queue of pending requests built up and memory kept growing. Setting DEPTH_PRIORITY = 1 helped.
However, the parsing speed unfortunately decreased.
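For reference, a minimal settings.py sketch of the change: DEPTH_PRIORITY = 1 is what actually helped, the FIFO queue settings are the companion knobs Scrapy's documentation suggests for breadth-first crawling, and the concurrency numbers are purely illustrative:

    # settings.py
    # Prefer breadth-first order so item pages from a listing are fetched
    # before more listing pages pile up in the scheduler queue.
    DEPTH_PRIORITY = 1
    SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
    SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

    # Lower concurrency (illustrative values, not the ones from my project).
    CONCURRENT_REQUESTS = 8
    CONCURRENT_REQUESTS_PER_DOMAIN = 4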
Starting two spiders in two different processes improved things a bit.
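A rough sketch of that two-process setup, assuming both spiders are registered in the same project; the spider names here are placeholders:

    from multiprocessing import Process

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings


    def run_spider(spider_name: str) -> None:
        # Each OS process gets its own CrawlerProcess (and its own reactor),
        # so the two crawls do not share one Python heap.
        crawler = CrawlerProcess(get_project_settings())
        crawler.crawl(spider_name)
        crawler.start()


    if __name__ == '__main__':
        # 'childa_a' and 'childa_b' are placeholder spider names.
        workers = [Process(target=run_spider, args=(name,))
                   for name in ('childa_a', 'childa_b')]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()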