How to optimize write in excel?

A

Anton272021-09-27 14:03:18

Python

Anton27, 2021-09-27 14:03:18

Hello everyone, I ran into such a problem - I have 26,000 xml files in my folder, I check each file and write it to an excel table.

from bs4 import BeautifulSoup
import requests as req
from fake_useragent import *
import xml.etree.ElementTree as ET
import os
import openpyxl

myDOC = r'XMLfiles/1.xml'
#
# beginTree = ET.parse(myDOC)
# file_root = beginTree.getroot()
# elem = file_root.find('vendor')
# print(elem.text)
allFiles = os.listdir('C:/Content/SplitFiles')
Description_item = 'У данного товара нет этого поля'
DOP_Description_item = 'У данного товара нет этого поля'
BRAND_item = 'Информация не указана'
NAME_item = 'Название товара не указанно'
print(len(allFiles))
statusList = []
countList = []
priceList = []
barcodeList = []
itemIDlist = []
urlList = []

nameList = []
brandList = []
descriptionList = []
DOPDescriptionList = []
book = openpyxl.Workbook()
sheet = book.active
for i in range(2, len(allFiles)):
    beginTree = ET.parse(r'C:/Content/SplitFiles/' + allFiles[i])

    file_root = beginTree.getroot()
    barcode = file_root.find('barcode')
    if barcode is not None:
        bar = barcode.text
        sheet['D' + str(i)].value = bar
    else:
        sheet['D' + str(i)].value = 'Информация отсутствует'
    #
    elem = file_root.find('price')
    PRICE_item = elem.text
    sheet['B' + str(i)].value = PRICE_item
    #
    count = file_root.find('count')
    if count is not None:
        cnt = count.text
        sheet['E' + str(i)].value = cnt
    else:
        sheet['E' + str(i)].value = 'Информация отсутствует'
    #
    elem = file_root.find('name')
    if elem is not None:
        NAME_item = elem.text
        sheet['A' + str(i)].value = NAME_item
    #
    status = file_root.find('status')
    if status is not None:
        if status.text == '200' or status.text == '2':
            sheet['C' + str(i)].value = 'Нет в наличии'
        if status.text == '100' or status.text == '1':
            sheet['C' + str(i)].value = 'Нет в наличии'
    #
    urlItem = file_root.find('url')
    link = urlItem.text
    res = req.get(link, headers={'User-Agent': UserAgent().chrome})
    html = BeautifulSoup(res.text, 'html.parser')
    IDs = html.select('#catalog-model-supertop #catalog-item-code')
    for ID in IDs:
        a = ID.text.replace('Код товара: ','')
        sheet['F' + str(i)].value = a
        print('Выполнено {0} из {1}'.format(i, len(allFiles)))
    #
    elem = file_root.find('url')
    URL_item = elem.text
    sheet['G' + str(i)].value = URL_item
    book.save('items1.xlsx')


sheet['A1'], sheet['B1'], sheet['C1'] = 'Название', 'Цена', 'Статус',
sheet['D1'], sheet['E1'], sheet['F1'] =  'Бар-код', 'Count', 'ID'
sheet['G1'] = 'URL'

and downloading even 4,000 files took me about 2-3 hours, can you tell me what's wrong?

Reply

Answer the question

In order to leave comments, you need to log in

1 answer(s)

S

Soul1, 2021-09-28
@Soul1

Why are you saving the book on each iteration of the for loop? Try to put the save after the loop, maybe it will give an increase in speed.