Answer the question
In order to leave comments, you need to log in
Encoding Chinese characters when parsing?
I am writing a parser for a Chinese online store.
from urllib.request import urlopen
from urllib.parse import urljoin
from lxml.html import fromstring
URL = 'http://list.suning.com/0-258003-0.html'
ITEM_PATH = '.clearfix .product .border-out .border-in .wrap .res-info .sell-point'
def parse_items():
f = urlopen(URL)
list_html = f.read().decode('utf-8')
list_doc = fromstring(list_html)
for elem in list_doc.cssselect(ITEM_PATH):
a = elem.cssselect('a')[0]
href = a.get('href')
title = a.text
em = elem.cssselect('em')[0]
title2 = em.text
print(href, title, title2)
def main():
parse_items()
if __name__ == '__main__':
main()
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2: ordinal not in range(128)
Answer the question
In order to leave comments, you need to log in
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question