Why do I get garbled characters instead of HTML when I run the parser on Google App Engine?

M

maam83, 2015-09-02 10:52:51

Flask

maam83, 2015-09-02 10:52:51

There is a parser:

# -*- coding: utf-8 -*-
import requests
import lxml.html

class Rutor:
    """Search rutor.org for a movie and collect its torrent and magnet links.

    After ``parse_it()`` has run, ``result`` maps a running integer index to
    a dict with the keys ``'title'``, ``'torrent_file'`` and ``'magnet'``,
    and ``count`` holds the number of entries stored so far.
    """

    def __init__(self, title, year='', qu=''):
        """Remember the search terms.

        title -- movie title to search for.
        year  -- optional release year, appended to the query when non-empty.
        qu    -- optional quality tag (e.g. '1080p'), appended when non-empty.
        """
        self.title = title
        self.year = year
        self.qu = qu
        self.main_domain = 'http://www.rutor.org/'
        self.search_params = '/search/0/1/100/0/'  # only New movies
        self.search_text = ""
        self.count = 0
        self.result = {}

    def construct_search_text(self):
        """Build, store and return the space-separated query string.

        Empty components (the default ``year``/``qu``) are left out so the
        query never contains doubled or trailing spaces.
        """
        parts = [part for part in (self.title, self.year, self.qu) if part]
        self.search_text = " ".join(parts)
        return self.search_text

    def construct_search_url(self):
        """Return the full search URL and print it for debugging.

        The domain ends with '/' and the search path starts with '/', so the
        pieces are stripped before joining to avoid a '//' inside the path.
        """
        search_link = "/".join((
            self.main_domain.rstrip('/'),
            self.search_params.strip('/'),
            self.construct_search_text(),
        ))
        print(search_link)
        return search_link

    def get_page_sourse(self):
        """Download the search page and return its body as raw bytes.

        The bytes are returned exactly as received. Decoding to text and
        re-encoding with ``r.encoding`` only reproduces the same bytes in the
        good case and raises when the server sends no charset
        (``r.encoding`` is None), so the round trip is skipped.
        """
        r = requests.get(self.construct_search_url())
        # r.encoding may be None; fall back so the debug line cannot raise.
        encoding = r.encoding or r.apparent_encoding or 'unknown'
        print("encoding is: " + encoding)
        return r.content

    def parse_it(self):
        """Fetch the search page once and fill ``result`` with found torrents.

        For every torrent anchor inside ``div#index`` whose leading text does
        not contain the word 'трейлер' (trailer), one entry is stored. The
        magnet link and the .torrent link are read from the two elements
        immediately preceding the anchor; anchors without those siblings or
        without an ``href`` on them are skipped.
        """
        all_torrent_links_xpath = "//div[@id='index']//a[starts-with(@href, '/torrent')]"
        # Download only once: each call to get_page_sourse() is an HTTP request.
        page_source = self.get_page_sourse()
        page = lxml.html.document_fromstring(page_source)
        print(page_source)  # debug dump of the markup that was actually parsed
        for link in page.xpath(all_torrent_links_xpath):
            print(link)
            # link.text is None when the anchor starts with a child element.
            if u'трейлер' in (link.text or u'').lower():
                continue  # we don't need trailers
            magnet_link = link.getprevious()
            if magnet_link is None:
                continue
            torrent_link = magnet_link.getprevious()
            if torrent_link is None:
                continue
            magnet = magnet_link.get('href')
            torrent_file = torrent_link.get('href')
            if magnet is None or torrent_file is None:
                continue
            title = link.text_content()
            # text_content() returns lxml's 'lxml.etree._ElementUnicodeResult';
            # slicing with [:] yields a plain string instead.
            self.result[self.count] = {'title': title[:], 'torrent_file': torrent_file, 'magnet': magnet}
            self.count += 1


if __name__ == '__main__':
    # Manual smoke run: search for one release, then show what was collected.
    scraper = Rutor(title='Avengers: Age of Ultron', year='2015', qu='1080p')
    scraper.parse_it()
    print(scraper.result)

If I run it on my machine, I get well-formed HTML and m.result is populated.
However, if I run it through Google App Engine (Flask):

# Import the class, not just the module: a bare ``import Rutor`` binds the
# module object, and calling it as ``Rutor(...)`` raises TypeError.
from Rutor import Rutor
...
@app.route('/test')
def test():
    """Run one hard-coded search, pretty-print the result and return 'test'.

    NOTE(review): ``app`` and ``pprint`` are assumed to be defined in the
    elided part of this file -- confirm.
    """
    m = Rutor('Avengers: Age of Ultron', '2015', '1080p')
    m.parse_it()
    pprint(m.result)
    return 'test'

Then, instead of the page's source code, I get garbled characters (mojibake):

and an empty m.result

How can I fix this?

Reply

Answer the question

In order to leave comments, you need to log in

1 answer(s)

D

dollar, 2018-09-15
@dollar

rutor.org has long been blocked.
Now the main addresses are rutor.info and rutor.is