How do I stay logged in on a site when scraping it with Python 3 Requests?

D

Daniil2018-05-05 23:10:52

Python

Daniil, 2018-05-05 23:10:52

Good evening. There is, for example, a site with a login form:
https://lolzteam.net/login
I don't quite understand how to stay logged in while web scraping this resource. For example, I want to collect information from several pages, but the site responds with "You can't view the text because you're not authorized." I'd like to understand how to get around this. If possible, with an example, please.

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Browser-like request headers. In this script they are passed only to the
# per-link requests made in the final loop.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': 'keep-alive',
    'X-Requested-With': 'XMLHttpRequest'
}

# Form fields posted to the login URL (placeholder credentials).
payload = {
    'login': 'Some name',
    'password': 'Some pass'
}

# Page whose links are collected.
blacklist_lolz = 'https://lolzteam.net/forums/774/'
# Site root; each collected href is appended to it to build the request URL.
main_lolz = 'https://lolzteam.net/'

def beautifulsoup(url_content):
    """Parse *url_content* with BeautifulSoup using the 'lxml' parser."""
    return BeautifulSoup(url_content, 'lxml')


# Post the login form and fetch the listing page through one session, so the
# cookies stored by the session after the login are sent with the listing
# request.
with requests.Session() as s:
    p = s.post('https://lolzteam.net/login', data=payload)
    print(p.text)

    req = s.get(blacklist_lolz)
    print(req.text)

    soup = beautifulsoup(req.content)

    # Select <a> elements by class (note the trailing space in the filter).
    divs_all = soup.find_all('a', class_='PreviewTooltip ')

    links_all = [div['href'] for div in divs_all]

# NOTE: this loop runs after the `with` block has closed the session and calls
# the module-level requests.get, so the session's cookies are not sent here.
for link in links_all:
    number_list = []
    link_page = requests.get(main_lolz + link, headers=header)
    link_soup = beautifulsoup(link_page.content)
   # Some code ...

Why do I feel like I'm doing something wrong? In the loop I request each link and sometimes get "You are not logged in, cannot read the text", and so on.

Reply

Answer the question

In order to leave comments, you need to log in

1 answer(s)

F

FeNUMe, 2018-05-06
@FeNUMe

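Your last loop is outside the session: it calls requests.get instead of s.get, so those requests are sent without the login cookies. Move the loop inside the with block and use the session for every request. In theory, this should help: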

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Site root and the forum page whose links are collected.
main_lolz = 'https://lolzteam.net/'
blacklist_lolz = 'https://lolzteam.net/forums/774/'

# Form fields posted to the login URL (placeholder credentials).
payload = dict(login='Some name', password='Some pass')

# Browser-like headers attached to the per-link requests.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
    'Connection': 'keep-alive',
    'X-Requested-With': 'XMLHttpRequest',
}

def beautifulsoup(url_content):
    """Return a BeautifulSoup tree for *url_content*, parsed with lxml."""
    tree = BeautifulSoup(url_content, features='lxml')
    return tree


# Seconds to wait for the server before giving up. Requests has no default
# timeout, so without one a stalled connection would hang the script.
REQUEST_TIMEOUT = 30

# Every request goes through the same session, so the cookies stored after
# the login POST accompany the later page fetches.
with requests.Session() as s:
    p = s.post('https://lolzteam.net/login', data=payload,
               timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error status instead of carrying on after a failed
    # login request. A 2xx status alone does not prove the login succeeded.
    p.raise_for_status()
    print(p.text)

    req = s.get(blacklist_lolz, timeout=REQUEST_TIMEOUT)
    req.raise_for_status()
    print(req.text)

    soup = beautifulsoup(req.content)

    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError in the comprehension below.
    # NOTE(review): the class filter ends with a space -- confirm that it
    # matches the site's markup as intended.
    divs_all = soup.find_all('a', class_='PreviewTooltip ', href=True)

    links_all = [div['href'] for div in divs_all]

    for link in links_all:
        number_list = []
        # urljoin handles relative, root-relative and already-absolute
        # hrefs; plain concatenation is only correct for the first kind.
        link_page = s.get(urljoin(main_lolz, link), headers=header,
                          timeout=REQUEST_TIMEOUT)
        link_soup = beautifulsoup(link_page.content)
        # Some code ...