Python
eellazy, 2021-03-22 16:33:24

Why does a thread in QThreads crash?

Hello!
Maybe someone has run into this problem. I'm running the latest Python and the latest version of the Beautiful Soup library.
I want to run the page parsing in a separate thread. Without Beautiful Soup the thread works like a charm, but as soon as I plug in the parsing functionality, an error pops up:
QThread: Destroyed while thread is still running

The most interesting thing is that when I start it, the loop gets through two iterations and only then crashes with this error.
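
For reference, this message comes from the QThread destructor: Qt prints it whenever a QThread object is deleted while its thread is still running. A minimal, hypothetical sketch (not my actual code) that triggers it:

from PyQt5 import QtCore


def leak_a_thread():
    # The QThread object is only a local variable, so when this function
    # returns it is garbage-collected while the thread is still running,
    # and Qt reports "QThread: Destroyed while thread is still running"
    # (and the process typically aborts).
    thread = QtCore.QThread()
    thread.start()


app = QtCore.QCoreApplication([])
leak_a_thread()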

Thread Creation Class

from PyQt5 import QtCore


class ParserController:
    def __init__(self, model, view):
        self._model = model
        self._view = view

    def query_add(self, data):
        if self._model.insert_data(data):
            print('Record added to the database')
        else:
            print('Record NOT added to the database')

    def start_parse_autovia(self):
        self.thread_1 = QtCore.QThread()
        self.autovia_thread = ParseAutovia()
        self.autovia_thread.moveToThread(self.thread_1)
        self.thread_1.started.connect(self.autovia_thread.run)
        self.autovia_thread.query.connect(self.query_add)
        self.autovia_thread.finished.connect(self.thread_1.quit)
        self.thread_1.start()


The class of the parser itself
import requests
from bs4 import BeautifulSoup
from PyQt5 import QtCore


class ParseAutovia(QtCore.QObject):
    finished = QtCore.pyqtSignal()
    query = QtCore.pyqtSignal(dict)

    START_PAGE = 'https://www.autovia.sk/osobne-auta'
    PAGINATION_PAGE = 'https://www.autovia.sk/osobne-auta/?p[page]='

    def get_data(self, html):
        # Get title
        try:
            title = html.find('h1').text.strip()
        except:
            title = ''

        # Get phone
        try:
            phone = html.find('div', class_='resp-buttons').find_all('a')[0]['href']
        except:
            phone = ''

        # Get name
        try:
            name = html.find('div', class_='resp-contact-top').find('span', class_='resp-subject').text.strip()
        except:
            name = ''

        # Get location
        try:
            location = html.find('div', class_='resp-contact-bottom').find('div', class_='resp-location').find('span').text.strip()
        except:
            location = ''

        # Get date
        try:
            date = html.find('div', class_='resp-meta').find_all('div', class_='col-6')[0].text.strip()
            date = date.replace('Aktualizované: ', '')
        except:
            date = ''

        # Get login
        try:  # Seller's login
            login = html.find('div', class_='resp-contact-top').find('a', class_='resp-subject').text.strip()
        except:
            login = ''

        # Get login link
        try:
            login_link = html.find('div', class_='resp-contact-top').find('a', class_='resp-subject')['href']
        except:
            login_link = ''

        # Get page link
        try:
            page_link = html.find('meta', attrs={'property': 'og:url'})['content']
        except:
            page_link = ''

        data = {
            'title': title,
            'phone': phone,
            'name': name,
            'location': location,
            'date': date,
            'login': login,
            'login_link': login_link,
            'page_link': page_link
        }

        return data

    def get_html(self, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        return soup

    def get_pagination_count(self):
        soup = self.get_html(self.START_PAGE)
        count = soup.find('div', class_='resp-pager').find('label', class_='resp-after').text.strip()
        count = int(count.replace('z', '').replace(' ', ''))
        return count

    def get_pagination_links(self, pagination):
        url = self.PAGINATION_PAGE + str(pagination)
        soup = self.get_html(url)

        links = []
        items = soup.find('section', class_='resp-search-results').find_all('div', class_='resp-item')

        for item in items:
            link = item.find('h2').find('a').get('href')
            links.append(link)

        return links

    def run(self):
        pagination_count = self.get_pagination_count()
        pagination = 1

        while pagination <= pagination_count:
            pagination_links = self.get_pagination_links(pagination)

            for url in pagination_links:
                html = self.get_html(url)
                data = self.get_data(html)

                self.query.emit(data)
                print(data['phone'])

            pagination += 1

        self.finished.emit()


If I run this in the thread instead, it works without any problems:
def run(self):
    for i in range(100):
        print(i)
        i += 1
        time.sleep(2)


Attached is the entry point to the program
import sys

from PyQt5 import QtWidgets
from model.ParserModel import ParserModel
from controller.ParserController import ParserController


class AppParser(QtWidgets.QApplication):
    def __init__(self, sys_args):
        super(AppParser, self).__init__(sys_args)
        self._model = ParserModel()
        self._controller = ParserController(self._model)

def main():
    app = AppParser(sys.argv)
    sys.exit(app.exec())

if __name__ == '__main__':
    main()

1 answer
HemulGM, 2021-03-22
@HemulGM

You have to keep the program running. Otherwise, after the threads are started, it simply runs to completion, the thread objects get destroyed while the threads are still running, and that is exactly the error you see.
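
For what it's worth, with the worker/moveToThread setup from the question the usual cure is to make sure the QThread object is never destroyed while its thread is still running: keep self.thread_1 and self.autovia_thread on an object that stays alive for the whole run, and quit/wait the thread before the program exits. A minimal sketch, assuming a hypothetical shutdown method added to ParserController:

class ParserController:
    # ... __init__, query_add and start_parse_autovia as in the question ...

    def shutdown(self):
        # Stop the worker thread and block until it has actually finished,
        # so the QThread object is never destroyed while still running.
        # Note: quit() only takes effect after ParseAutovia.run() returns,
        # so a long-running worker also needs its own cancellation flag.
        thread = getattr(self, 'thread_1', None)
        if thread is not None and thread.isRunning():
            thread.quit()
            thread.wait()

In AppParser.__init__ this could then be hooked up with something like self.aboutToQuit.connect(self._controller.shutdown), so the thread is stopped cleanly before the application is torn down.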
