Answer the question
In order to leave comments, you need to log in
Why doesn't Python save to Excel as it should?
There is a code that parses the forum page.
In short: it parses each user's name, the link to their profile and the message the user left, and saves all of this to an Excel file.
The problem is this: the user names and the links to their profiles are written to the file correctly, but the message column is wrong — every row gets only the first message from the page:
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from time import sleep
from lxml.html import fromstring
from lxml.etree import XMLSyntaxError
import xlsxwriter
# Forum thread page that parse_posts() downloads; also used as the base for
# resolving relative profile links.
URL = 'http://www.banki.ru/forum/?PAGE_NAME=read&FID=51&TID=150146'
# CSS selector for the element that holds a post author's profile link.
ITEM_PATH = '.forum-user-name'
# CSS selector for the element whose text is saved as the post message.
DESCR_PATH = '.forum-post-entry'
def parse_posts():
    """Download the forum thread at URL and extract one record per post.

    The author link and the message are both looked up inside the same post
    container, so each author is paired with their own message instead of
    with the first message found on the page.

    Returns:
        list[dict]: one dict per post, in page order, with the keys
        'name' (author's display name), 'url' (absolute profile link) and
        'descr' (text of the message).  Posts that lack an author link or
        a message element are skipped.
    """
    # Selector for the element wrapping a single post (author + message).
    # NOTE(review): assumes both ITEM_PATH and DESCR_PATH elements are
    # nested inside this container -- confirm against the live markup.
    post_path = '.forum-post-table'

    # Release the HTTP connection as soon as the body has been read.
    with urlopen(URL) as response:
        list_html = response.read().decode('cp1251')
    list_doc = fromstring(list_html)

    posts = []
    for post_elem in list_doc.cssselect(post_path):
        # Search within this post only; querying list_doc here would always
        # return the first match of the whole page.
        links = post_elem.cssselect(ITEM_PATH + ' a')
        entries = post_elem.cssselect(DESCR_PATH)
        if not links or not entries:
            # Incomplete post block (no profile link or no message body).
            continue

        author = links[0]
        posts.append({
            'name': author.text_content(),
            'url': urljoin(URL, author.get('href')),
            'descr': entries[0].text_content(),
        })
    return posts
def export_excel(filename, posts):
    """Write the scraped posts to an .xlsx workbook.

    Row 0 holds the bold column headers; every following row holds one
    post, in the order given.

    Args:
        filename: path of the workbook to create.
        posts: iterable of dicts with the keys 'name', 'url' and 'descr'.
    """
    # One entry per spreadsheet column: (header text, key in the post dict).
    columns = (
        ('Название темы', 'name'),
        ('URL', 'url'),
        ('ДиФ', 'descr'),
    )

    workbook = xlsxwriter.Workbook(filename)
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format({'bold': True})

    for col_index, (header, _key) in enumerate(columns):
        worksheet.write(0, col_index, header, header_format)

    # Data rows start right below the header row.
    row_index = 0
    for post in posts:
        row_index += 1
        for col_index, (_header, key) in enumerate(columns):
            worksheet.write(row_index, col_index, post[key])

    # The file is only written to disk when the workbook is closed.
    workbook.close()
def main():
    """Scrape the forum thread and save the result to 'posts1.xlsx'."""
    export_excel('posts1.xlsx', parse_posts())


# Run the export only when the file is executed as a script.
if __name__ == '__main__':
    main()
# The offending line: the selector is applied to the whole document and [0]
# takes the first match, so every row receives the same (first) message.
descr = list_doc.cssselect(DESCR_PATH)[0].text_content()
Answer the question
In order to leave comments, you need to log in
# Walk the page one post at a time so that the author and the message are
# always read from the same post.
for post in list_doc.cssselect('.forum-post-table'):
    user = post.cssselect('.forum-user-name a')[0]
    name = user.text_content()
    url = urljoin(URL, user.get('href'))
    # The link's title attribute can be read the same way if it is needed:
    # title = user.get('title')
    descr = post.cssselect('.forum-post-text')[0].text_content()
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question