Python
emeryee, 2021-03-31 16:18:51

Python invalid literal for int() with base 10: ''?

import requests
from bs4 import BeautifulSoup
from time import sleep


print('Telegram Parser v1.4\nCreator: vk.com/lucifer\nLast update: 07.03.2021\n')
print('\nStarting the bot...\n')

class code():
    def __init__(self):
        with open('cookie.txt',mode='r') as file2:
            cookie = file2.read()
        self.headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
            "Cookie":cookie
        }
        self.session = requests.session()
        self.page_count = int(self.count_page())
        self.parse_page()
    def zapis_in_file(self,telegram):
        with open('list.txt',mode='a') as file:
            file.write(f'{telegram}\n')
    def parse_tg_from_profile(self,massiv):
        for b in massiv:
            try:
                info2 = self.session.get(b,headers=self.headers).text
                main_2 = BeautifulSoup(info2,'lxml')
                simp = main_2.find_all('div',class_='count')[1].get_text()
                if int(simp) >= 180:
                    telega = main_2.find_all('a',rel="nofollow noopener")
                    if str(telega) == '[]':
                        pass
                    else:
                        if 'tg:' in str(telega[0]): 
                            self.zapis_in_file(telega[0]['href'].split('=')[1])
                            print('{} | {} симпатий | TG: {}'.format(b,simp,telega[0]['href'].split('=')[1]))
                        elif len(telega) == 2:
                            if 'tg:' in str(telega[1]):
                                self.zapis_in_file(telega[1]['href'].split('=')[1])
                                print('{} | @{}'.format(b,telega[1]['href'].split('=')[1]))
                sleep(0.8)
            except:
                sleep(15)
    def parse_page(self):
        for stranitsa in range(1,self.page_count+1):
            try:
                info = self.session.get(f'https://lolz.guru/online/?type=registered&page={stranitsa}',headers=self.headers)
                main_2 = BeautifulSoup(info.text,'lxml')
                links_to_profile = main_2.find_all('a',class_='username StatusTooltip')
                links_lolz = []
                for link in links_to_profile:
                    links_lolz.append('https://lolz.guru/{}'.format(link['href']))
                self.parse_tg_from_profile(links_lolz)
            except:
                sleep(15)
        
    def count_page(self):
        page_count = self.session.get('https://lolz.guru/online/?type=registered&page=1',headers=self.headers)
        main_text = BeautifulSoup(page_count.text,'лксмл')
        result = main_text.find_all('a',_class="")[81].get_text()
        print('Got {} pages. Starting the parse:'.format(result))
        return result
        
code()

It gives errors: https://imgur.com/kb5qyGB
Please help.
I've already looked everywhere, even on StackOverflow, but couldn't find a solution.
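For reference, int() raises exactly this ValueError whenever it is handed an empty string; in the script above that happens when count_page() returns '' and int(self.count_page()) tries to convert it. A minimal sketch (the to_int helper is hypothetical, not part of the original script):

```python
# int() cannot parse an empty string - this is exactly the reported error.
try:
    int('')
except ValueError as e:
    print(e)  # invalid literal for int() with base 10: ''

# Hypothetical defensive wrapper: fall back to a default instead of crashing.
def to_int(value, default=0):
    try:
        return int(value)
    except (TypeError, ValueError):
        return default

print(to_int(''))    # 0
print(to_int('42'))  # 42
```

The real fix, of course, is to make count_page() return an actual number rather than masking the empty result.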


2 answers
Alexa2007, 2021-03-31
@emeryee

I disabled cookies and changed the User-Agent to "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36".
This method got a result:

def count_page(self):
    page_count = self.session.get('https://lolz.guru/online/?type=registered&page=1', headers=self.headers)
    print(page_count.text)

<!doctype html><html><head><script src="/process-qv9ypsgmv9.js"></script></head><body><script>window.onload=function(){process();}</script><noscript><p>Please enable JavaScript and Cookies in your browser.</p></noscript></body></html>
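That dump is the anti-bot challenge page, so parsing it yields empty strings. A minimal sketch of detecting the challenge and attaching browser cookies - the marker string is taken from the dump above, and the cookie names in the usage note are placeholders:

```python
import requests

# Fragment of the challenge page from the dump above.
CHALLENGE_HTML = ('<noscript><p>Please enable JavaScript and Cookies '
                  'in your browser.</p></noscript>')

def is_js_challenge(html):
    # Marker string taken from the dump; the site may change it.
    return 'Please enable JavaScript and Cookies' in html

def make_session(cookie_string):
    # Attach cookies copied from the browser plus a desktop User-Agent.
    s = requests.Session()
    s.headers.update({
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'),
        'Cookie': cookie_string,
    })
    return s

print(is_js_challenge(CHALLENGE_HTML))  # True -> cookies missing or expired
```

Usage: build the session with make_session(open('cookie.txt').read().strip()) and check is_js_challenge(resp.text) before parsing any response, so an expired cookie fails loudly instead of producing empty strings.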

I had to dig the cookies out of Chrome manually, and it seems to work now. One question, though: which class are you searching for? It looks like you are trying to find the element that holds the maximum page number and then parse all the pages in turn.
In short, your code works; you are just grabbing the wrong element. I tried to catch it manually, but it didn't work out. The quick route would be Selenium, and for the actual parsing, use a class. Alternatively, I'd suggest simply fetching the pages in multiple threads:
import requests
from multiprocessing.dummy import Pool as ThreadPool

url = 'https://lolz.guru/online/?type=registered&page='
# Don't forget to attach the headers and cookies, otherwise it definitely won't work
urls = [url + str(i) for i in range(1, 8)]  # from the first page to the last one
print(urls)

def get_url(url):
    r = requests.get(url)
    print(r.text)

pool = ThreadPool(20)  # Thread count - nominally one per core; you can set more, but it will still run by the number of cores
results = pool.map(get_url, urls)
pool.close()
pool.join()

Sergey Karbivnichy, 2021-03-31
@hottabxp

1) It's better to write code using Latin characters only. The line:
main_text = BeautifulSoup(page_count.text,'лксмл')
will work if 'лксмл' is replaced by 'lxml'.
2)
result = main_text.find_all('a',_class="")[81].get_text()
I get an IndexError here, most likely because the cookie.txt file is empty (note also that the keyword argument is class_, not _class). This is not a very reliable way of finding links.
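One less fragile alternative to indexing find_all(...)[81] is to read the page count from the pagination block itself. A sketch, assuming XenForo-style markup with a PageNav div and a data-last attribute - verify the selector against the real lolz.guru HTML:

```python
from bs4 import BeautifulSoup

# Assumed pagination markup (XenForo-style); verify against the real page.
SAMPLE_HTML = '''
<div class="PageNav" data-last="7">
  <a href="?page=1">1</a> <a href="?page=7">7</a>
</div>
'''

def count_pages(html, default=1):
    soup = BeautifulSoup(html, 'lxml')  # note: 'lxml', in Latin letters
    nav = soup.find('div', class_='PageNav')
    if nav and nav.get('data-last', '').isdigit():
        return int(nav['data-last'])  # explicit attribute, no magic [81]
    # Fallback: take the largest numeric page link inside the nav.
    numbers = [int(a.get_text()) for a in soup.select('div.PageNav a')
               if a.get_text().isdigit()]
    return max(numbers, default=default)

print(count_pages(SAMPLE_HTML))  # 7
```

Returning a default instead of raising also means int(self.count_page()) never sees an empty string.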
