Answer the question
In order to leave comments, you need to log in
Criticize the RSS script code, please.
Good afternoon, or rather, good morning!
Gradually mastering the 3rd Python, trying to do something useful on it. But I often catch myself thinking that it is not yet possible to completely “think in a new way” on this PL. And the code is still far from ideal. Therefore, I ask the help of habrazhiteley to point out problem areas.
The idea of the script: a simple RSS parser with the ability to output both lines and download more or less cleaned articles via links into separate files.
1 #!/usr/bin/python3
2
3 ###
4 ### TODO: clean_text: NOT WORKING WELL
5 ### TODO: \x7f problem needs tests
6 ###
7
8 import codecs
9 import sys
10 import os
11 import feedparser
12 import urllib.request, urllib.error, urllib.parse
13 import re
14 import types
15 from bs4 import BeautifulSoup
16 from html.parser import HTMLParser
17
18 stop_string_list=("li><a","<div","href","<par","</div")
19
20 def clean_text(html_text):
21 def char_from_entity(match):
22 code = html.entities.name2codepoint.get(match.group(1), 0xFFFD)
23 return chr(code)
24
25 def clean_str(tmp_str):
26 while tmp_str.find("\x7f")>0:
27 pos=tmp_str.find("\x7f")
28 tmp_str=tmp_str[0:pos-1]+tmp_str[pos+1:]
29 return tmp_str
30
31 text = re.sub(r"<!--(?:.|\n)*?-->", "", html_text)
32 text = re.sub(r"<[Pp][^>]*?(?!</)>", "\n\n", text)
33 text = re.sub(r"<[^>]*?>", "", text)
34 text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
35 text = re.sub(r"&([A-Za-z]+);", char_from_entity, text)
36 text = re.sub(r"\n(?:[ \xA0\t]+\n)+", "\n", text)
37 t = re.sub(r"\n\n+", "\n\n", text.strip())
38 return clean_str(t)
39
40 def visible(element):
41 if element.parent.name in ['style', 'script', '[document]', 'head', 'title', 'dates']:
42 return False
43 elif re.match('<!--.*-->', str(element),re.UNICODE):
44 return False
45 return True
46
47 if(sys.argv[1:] == []):
48
49 print("Simple RSS Parser")
50 print("")
51 print("Usage:")
52 print("")
53 print(" rss_parser (rss_link)(mode) [encoding]")
54 print("")
55 print("Possible modes:")
56 print("")
57 print(" line - line by line output(format: YYYY-MM-DD|header|body|link)")
58 print(" pandoc - document compatible with Pandoc")
59 print(" plain - plain text output")
60 print(" dump - dump HTML content using the specified encoding")
61 print(" fplain - dump date, header, body and link to separate files using the specified encoding")
62 print(" fdump - dump HTML content to separate files using the specified encoding")
63 print("")
64
65 else:
66
67 try:
68 rss_link = sys.argv[1]
69 out_format = sys.argv[2]
70 except:
71 sys.exit("ERROR: Some argument is missing.")
72
73 try:
74 d = feedparser.parse(rss_link)
75 except:
76 sys.exit("ERROR: Unable to parse RSS file. Check the RSS link and internet connection.")
77
78 if(out_format == "line"):
79
80 for entry in d["entries"]:
81 time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
82 print(time_stamp + " | " + entry.title + " | " + clean_text(entry.summary) + " | " + entry.link)
83
84 elif(out_format == "pandoc"):
85
86 for entry in d["entries"]:
87 time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
88 print("#", time_stamp, " ",entry.title, "#")
89 print("")
90 print(clean_text(entry.summary))
91 print("")
92 print(entry.link)
93 print("")
94 print("")
95
96 elif(out_format == "plain"):
97
98 for entry in d["entries"]:
99 time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
100 print(time_stamp, " ", entry.title)
101 print(clean_text(entry.summary))
102 print(entry.link)
103 print("")
104 print("")
105
106 elif(out_format == "dump"):
107
108 try:
109 page_enc = sys.argv[3]
110 except:
111 sys.exit("ERROR: Encoding not specified.")
112
113 for entry in d["entries"]:
114 time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
115 print(time_stamp, " ", entry.title)
116 print(clean_text(entry.summary))
117 print(entry.link)
118 f=urllib.request.urlopen(entry.link)
119 print(clean_text(f.read().decode(page_enc)))
120 print("")
121 print("")
122
123 elif(out_format == "fplain"):
124
125 wrk_dir = os.getcwd()
126
127 for entry in d["entries"]:
128 time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
129 # prepare file name for file: date + 8 chars of title
130 f_name = entry.title
131 f_name = time_stamp + f_name.replace(" ", "")[0:8]
132
133 not_exist=False
134 try:
135 open(wrk_dir + "/" + f_name)
136 except IOError:
137 not_exist=True
138
139 if (not_exist):
140 f_hndl = codecs.open(wrk_dir + "/" + f_name,"w","utf-8")
141
142 f_hndl.write(time_stamp + " " + entry.title + "\n\n")
143 f_hndl.write(clean_text(entry.summary) + "\n\n")
144 f_hndl.write(entry.link + "\n")
145 f_hndl.close()
146
147 elif(out_format == "fdump"):
148
149 try:
150 page_enc = sys.argv[3]
151 except:
152 sys.exit("ERROR: Encoding not specified.")
153
154 wrk_dir = os.getcwd()
155
156 for entry in d["entries"]:
157 time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
158 # prepare file name for file: date + 8 chars of title
159 f_name = entry.title
160 f_name = time_stamp + f_name.replace(" ", "")[0:8]
161
162 not_exist=False
163 try:
164 open(wrk_dir + "/" + f_name)
165 except IOError:
166 not_exist=True
167
168 if (not_exist):
169 f_hndl = codecs.open(wrk_dir + "/" + f_name,"w","utf-8")
170
171 f_hndl.write(time_stamp + " " + entry.title + "\n\n")
172 f_hndl.write(clean_text(entry.summary) + "\n\n")
173 f_hndl.write(entry.link + "\n\n")
174
175 web_page=urllib.request.urlopen(entry.link)
176 page=web_page.read().decode(page_enc)
177
178 soup=BeautifulSoup(page)
179 texts=soup.findAll(text=True)
180 visible_texts = filter(visible, texts)
181
182 out_file = ""
183 for item in visible_texts:
184 st_item=str(item)
185 not_in_stop=True
186 for stop_item in stop_string_list:
187 not_in_stop=not_in_stop and (st_item.find(stop_item)<0)
188 if not_in_stop:
189 out_file+=item
190
191 out_file.replace("\t","")
192 out_file=re.sub("\n\s*\n*", "\n", out_file)
193
194 f_hndl.write(out_file)
195 f_hndl.close()
196
197 else:
198 print("ERROR: Argument not defined.")
199
Same on pastebin: pastebin.com/fjedL8pe
Answer the question
In order to leave comments, you need to log in
Get rid of the copy-paste in the huge if-elif-else block.
Alternatively, create a dictionary with templates like:
{
'line': '{TimeStamp} | {Title} | {Summary} | {Link}',
'plain': '{TimeStamp} {Title}\n{Summary}\n{Link}\n\n',
//и тд...
}
#!/usr/bin/env python3.2
# coding: utf-8
open(wrk_dir + "/" + f_name)
and use os.path.join()
# единообразие
sys.exit("ERROR: Encoding not specified.")
print("ERROR: Argument not defined.")
def visible(element):
if cond1:
return False
elif cond2:
return False
return True
# почему не просто
if cond1 or cond2:
return False
return True
And more about formatting and generally working with strings, read something
http://docs.python.org/library/string.html#formatstrings
http://google-styleguide.googlecode.com/svn/trunk/pyguide.html# Strings
For example:
time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
can be rewritten as
time_stamp = "{0.tm_year} -{0.tm_mon}-{0.tm_mday}".format(entry.updated_parsed)
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question