Answer the question
In order to leave comments, you need to log in
How to solve IndexError: index list out of range python?
I have a code that takes data ( page content pull rule ) for parsing from a database. This data is then passed to different functions.
import requests
from bs4 import BeautifulSoup
import pymysql
def get_html(url):
r = requests.get(url)
return r.text
# < Получаем ссылки
def get_resource_links(resource_page,links_rule,resource_domain):
resource_links = []
soup = BeautifulSoup(resource_page,'lxml')
resource_links_blocks = soup.findAll(links_rule[0],{links_rule[1]:links_rule[2]})
for resource_link_block in resource_links_blocks:
a_tag = resource_link_block .find("a")
if a_tag:
link = a_tag.get("href")
resource_links.append(resource_domain + link)
return resource_links
# < Собираем заголовки.
def get_item_title(item_page,title_rule):
soup = BeautifulSoup(item_page,'lxml')
item_title = soup.find(title_rule[0],{title_rule[1]:title_rule[2]})
return item_title
# < Собираем даты.
def get_item_datetime(item_page,datetime_rule,datetime1_rule):
soup = BeautifulSoup(item_page,'lxml')
item_datetime = soup.find(datetime_rule[0],{datetime_rule[1]:datetime_rule[2]})
item_datetime1= soup.find(datetime1_rule[0],{datetime1_rule[1]:datetime1_rule[2]})
return item_datetime,item_datetime1
# < Подключение к базе данных.
connection = pymysql.connect(host='localhost',
user='root',
password='',
db='news_portal',
charset='utf8',
autocommit=True)
cursor = connection.cursor()
# < Запрос правил выдергивания контента.
cursor.execute('SELECT * FROM `resource`')
resources=cursor.fetchall()
# < Цикл для перебора из кортежа.
for resource in resources:
resource_name=resource[1]
resource_link=resource[2]
resource_url=resource[3]
link_rule=resource[4]
title_rule=resource[6]
datetime_rule=resource[7]
datetime1_rule=resource[8]
text_rule=resource[9]
text1_rule=resource[10]
print(resource_name)
resource_domain=resource_link
# < Разбиваю данные из кортежа в массив
links_rule=link_rule.split(',')
title_rule=title_rule.split(',')
datetime_rule=datetime_rule.split(',')
datetime1_rule=datetime1_rule.split(',')
text_rule=text_rule.split(',')
text1_rule=text1_rule.split(',')
resource_page = get_html(resource_url)
resource_links = get_resource_links(resource_page,links_rule,resource_domain)
print('кол-во ссылок: '+str(len(resource_links)))
for resource_link in resource_links:
item_page = get_html(resource_link)
item_title = get_item_title(item_page,title_rule)
item_datetime= get_item_datetime(item_page,datetime_rule,datetime1_rule)
print(item_datetime)
connection.close()
Traceback (most recent call last): File "C:/Users/Администратор/PycharmProjects/Task/sql_parser.py", line 70, in item_datetime= get_item_datetime(item_page,datetime_rule,datetime1_rule) File "C:/Users/Администратор/PycharmProjects/Task/sql_parser.py", line 28, in get_item_datetime item_datetime1= soup.find(datetime1_rule[0],{datetime1_rule[1]:datetime1_rule[2]}) IndexError: list index out of range
Process finished with exit code 1
Answer the question
In order to leave comments, you need to log in
Found the answer, just created a condition:
def get_item_datetime(item_page,datetime_rule,datetime1_rule):
soup = BeautifulSoup(item_page,'lxml')
item_datetime = soup.find(datetime_rule[0],{datetime_rule[1]:datetime_rule[2]})
if item_datetime is not None:
item_datetime = soup.find(datetime_rule[0],{datetime_rule[1]:datetime_rule[2]}).text
item_datetime=dateparser.parse(item_datetime, date_formats=['%d %B %Y %H'])
else:
item_datetime = soup.find(datetime1_rule[0],{datetime1_rule[1]:datetime1_rule[2]}).text
item_datetime=dateparser.parse(item_datetime, date_formats=['%d %B %Y %H'])
return str(item_datetime)
You have an error in that you are trying to access the elements of a list in a variable that does not contain a list.
Check what you have in datetime_rule and datetime1_rule here
def get_item_datetime(item_page,datetime_rule,datetime1_rule):
soup = BeautifulSoup(item_page,'lxml')
--->print(datetime_rule, datetime1_rule)
item_datetime = soup.find(datetime_rule[0],{datetime_rule[1]:datetime_rule[2]})
item_datetime1= soup.find(datetime1_rule[0],{datetime1_rule[1]:datetime1_rule[2]})
return item_datetime,item_datetime1
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question