Criticize the RSS script code, please.

A

akosarev2012-05-23 05:21:28

Python

akosarev, 2012-05-23 05:21:28

Good afternoon, or rather, good morning!

I am gradually learning Python 3 and trying to build something useful with it. But I often catch myself realizing that I cannot yet fully “think in a new way” in this language, and the code is still far from ideal. So I am asking the Habr community to point out the problem areas.

The idea of the script: a simple RSS parser that can either print the feed entries to the console or download the linked articles, more or less cleaned of markup, into separate files.

     1 #!/usr/bin/python3
     2 
     3 ###
     4 ###  TODO: clean_text: NOT WORKING WELL
     5 ###  TODO: \x7f problem needs tests
     6 ###
     7 
     8 import codecs
     9 import sys
    10 import os
    11 import feedparser
    12 import urllib.request, urllib.error, urllib.parse
    13 import re
    14 import types
    15 from bs4 import BeautifulSoup
    16 from html.parser import HTMLParser
    17 
    18 stop_string_list=("li><a","<div","href","<par","</div")
    19 
    20 def clean_text(html_text):
    21     def char_from_entity(match):
    22         code = html.entities.name2codepoint.get(match.group(1), 0xFFFD)
    23         return chr(code)
    24 
    25     def clean_str(tmp_str):
    26         while tmp_str.find("\x7f")>0:
    27             pos=tmp_str.find("\x7f")
    28             tmp_str=tmp_str[0:pos-1]+tmp_str[pos+1:]    
    29         return tmp_str    
    30 
    31     text = re.sub(r"<!--(?:.|\n)*?-->", "", html_text)
    32     text = re.sub(r"<[Pp][^>]*?(?!</)>", "\n\n", text)
    33     text = re.sub(r"<[^>]*?>", "", text)
    34     text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
    35     text = re.sub(r"&([A-Za-z]+);", char_from_entity, text)
    36     text = re.sub(r"\n(?:[ \xA0\t]+\n)+", "\n", text)
    37     t = re.sub(r"\n\n+", "\n\n", text.strip()) 
    38     return clean_str(t)
    39 
    40 def visible(element):
    41     if element.parent.name in ['style', 'script', '[document]', 'head', 'title', 'dates']:
    42         return False
    43     elif re.match('<!--.*-->', str(element),re.UNICODE):
    44         return False
    45     return True
    46 
    47 if(sys.argv[1:] == []):
    48 
    49     print("Simple RSS Parser")
    50     print("")
    51     print("Usage:")
    52     print("")
    53     print("    rss_parser (rss_link)(mode) [encoding]")
    54     print("")
    55     print("Possible modes:")
    56     print("")
    57     print("    line    -  line by line output(format: YYYY-MM-DD|header|body|link)")
    58     print("    pandoc  -  document compatible with Pandoc")
    59     print("    plain   -  plain text output")
    60     print("    dump    -  dump HTML content using the specified encoding")
    61     print("    fplain  -  dump date, header, body and link to separate files using the specified encoding")
    62     print("    fdump   -  dump HTML content to separate files using the specified encoding")
    63     print("")
    64     
    65 else:
    66 
    67     try:
    68         rss_link = sys.argv[1]    
    69         out_format = sys.argv[2]
    70     except:
    71         sys.exit("ERROR: Some argument is missing.")
    72         
    73     try:
    74         d = feedparser.parse(rss_link)
    75     except:
    76         sys.exit("ERROR: Unable to parse RSS file. Check the RSS link and internet connection.")
    77 
    78     if(out_format == "line"):
    79 
    80         for entry in d["entries"]:
    81             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
    82             print(time_stamp + " | " + entry.title + " | " + clean_text(entry.summary) + " | " + entry.link)
    83 
    84     elif(out_format == "pandoc"):
    85 
    86         for entry in d["entries"]:
    87             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
    88             print("#", time_stamp, " ",entry.title, "#")
    89             print("")
    90             print(clean_text(entry.summary))
    91             print("")
    92             print(entry.link)
    93             print("")
    94             print("")
    95 
    96     elif(out_format == "plain"):
    97 
    98         for entry in d["entries"]:
    99             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
   100             print(time_stamp, " ", entry.title)
   101             print(clean_text(entry.summary))
   102             print(entry.link)
   103             print("")
   104             print("")
   105 
   106     elif(out_format == "dump"):
   107 
   108         try:
   109             page_enc = sys.argv[3]
   110         except:
   111             sys.exit("ERROR: Encoding not specified.")
   112 
   113         for entry in d["entries"]:                                        
   114             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
   115             print(time_stamp, " ", entry.title)
   116             print(clean_text(entry.summary))
   117             print(entry.link)
   118             f=urllib.request.urlopen(entry.link)
   119             print(clean_text(f.read().decode(page_enc)))
   120             print("")
   121             print("")       
   122     
   123     elif(out_format == "fplain"):
   124 
   125         wrk_dir = os.getcwd()
   126 
   127         for entry in d["entries"]:                                        
   128             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
   129             # prepare file name for file: date + 8 chars of title
   130             f_name = entry.title
   131             f_name = time_stamp + f_name.replace(" ", "")[0:8]
   132                     
   133             not_exist=False        
   134             try:
   135                 open(wrk_dir + "/" + f_name)
   136             except IOError:
   137                 not_exist=True
   138             
   139             if (not_exist):
   140                 f_hndl = codecs.open(wrk_dir + "/" + f_name,"w","utf-8")
   141                 
   142                 f_hndl.write(time_stamp + " " + entry.title + "\n\n")
   143                 f_hndl.write(clean_text(entry.summary) + "\n\n")
   144                 f_hndl.write(entry.link + "\n")
   145                 f_hndl.close()
   146 
   147     elif(out_format == "fdump"):
   148 
   149         try:
   150             page_enc = sys.argv[3]
   151         except:
   152             sys.exit("ERROR: Encoding not specified.")
   153 
   154         wrk_dir = os.getcwd()
   155 
   156         for entry in d["entries"]:                                        
   157             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
   158             # prepare file name for file: date + 8 chars of title
   159             f_name = entry.title
   160             f_name = time_stamp + f_name.replace(" ", "")[0:8]
   161             
   162             not_exist=False        
   163             try:
   164                 open(wrk_dir + "/" + f_name)
   165             except IOError:
   166                 not_exist=True
   167             
   168             if (not_exist):
   169                 f_hndl = codecs.open(wrk_dir + "/" + f_name,"w","utf-8")
   170                 
   171                 f_hndl.write(time_stamp + " " + entry.title + "\n\n")
   172                 f_hndl.write(clean_text(entry.summary) + "\n\n")
   173                 f_hndl.write(entry.link + "\n\n")
   174                 
   175                 web_page=urllib.request.urlopen(entry.link)
   176                 page=web_page.read().decode(page_enc)
   177                 
   178                 soup=BeautifulSoup(page)
   179                 texts=soup.findAll(text=True)
   180                 visible_texts = filter(visible, texts)
   181                 
   182                 out_file = ""
   183                 for item in visible_texts:
   184                     st_item=str(item)
   185                     not_in_stop=True
   186                     for stop_item in stop_string_list:
   187                         not_in_stop=not_in_stop and (st_item.find(stop_item)<0)
   188                     if not_in_stop:
   189                         out_file+=item
   190                 
   191                 out_file.replace("\t","")
   192                 out_file=re.sub("\n\s*\n*", "\n", out_file)
   193                 
   194                 f_hndl.write(out_file)
   195                 f_hndl.close()
   196             
   197     else:
   198         print("ERROR: Argument not defined.")
   199

Same on pastebin: pastebin.com/fjedL8pe

Reply

Answer the question

In order to leave comments, you need to log in

3 answer(s)

A

Arsen, 2012-05-23
@mekegi

Get rid of the copy-paste in the huge if-elif-else block.
Alternatively, create a dictionary with templates like:

{ 
'line': '{TimeStamp}  | {Title}  | {Summary}  | {Link}',
'plain': '{TimeStamp}  {Title}\n{Summary}\n{Link}\n\n',
# and so on...
}

A

avalak, 2012-05-23
@avalak

#!/usr/bin/env python3.2
# coding: utf-8

Use argparse (optparse is deprecated, may it rest in peace).
Avoid open(wrk_dir + "/" + f_name) and use os.path.join() instead.
You have a lot of regular expressions. It is worth giving them meaningful names (via re.compile) or adding comments that explain them.
In many places the code can be made more elegant or rewritten.

# consistency: report all errors the same way
sys.exit("ERROR: Encoding not specified.")
print("ERROR: Argument not defined.")

def visible(element):
  if cond1:
    return False
  elif cond2:
    return False
  return True

  # why not simply
  if cond1 or cond2:
    return False
  return True

That is the bare minimum. In my opinion, though, there are many more places that should be rewritten.

G

gelas, 2012-05-23
@gelas

Also, read up on string formatting and on working with strings in general:
http://docs.python.org/library/string.html#formatstrings
http://google-styleguide.googlecode.com/svn/trunk/pyguide.html#Strings
For example:
time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
can be rewritten as
time_stamp = "{0.tm_year}-{0.tm_mon}-{0.tm_mday}".format(entry.updated_parsed)