Answer the question
In order to leave comments, you need to log in
How to parse a site if it has the same class?
I'm learning to parse, I ran into a problem. They have the same name, and I will only get the content of the element that was before. (i.e. if there are 10 of them I will get only 1 the other 9 are ignored)
In my case
<span itemprop="value"> 4/4 </span>
<span itemprop="value"> Китай </span>
# https://www.glinki.ru/catalog/strunnye/gitary/klassicheskie-gitary/gitara-klassicheskaya-flight-c-100-4-4/
import requests
from bs4 import BeautifulSoup
import time
from random import randrange
import json
headers = {
'Accept': '',
'User-Agent': ''
}
def get_articles_urls(url):
s = requests.Session()
responce = s.get(url=url, headers=headers)
articles_urls_list = []
for page in range(1, 37, 1):
responce = s.get(url=f'https://www.glinki.ru/catalog/strunnye/gitary/?PAGEN_1={page}', headers=headers)
soup = BeautifulSoup(responce.text, 'lxml')
articles_urls = soup.find_all('a', class_='thumb shine')
for au in articles_urls:
art_url = 'https://www.glinki.ru' + au.get('href')
articles_urls_list.append(art_url)
print(f'Обработал {page}')
with open('articles_urls.txt', 'w') as file:
for url in articles_urls_list:
file.write(f'{url}\n')
return 'Работа по сбору ссылок выполнена!'
fest_list_result = []
def get_data(file_path):
with open(file_path) as file:
urls_list = [line.strip() for line in file.readlines()]
s = requests.Session()
for url in urls_list[:4]:
response = s.get(url=url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
article_title = soup.find('div', class_='page-top-main').find('h1', id='pagetitle').text.strip()
article_harac = soup.find('div', class_="tabs_section").find('table', class_="props_list").text.strip()
print(article_title, article_harac)
# fest_list_result.append(
# {
# "Fest name": article_title,
# "Fest date": article_harac
# }
# )
# with open("fest_list_result.json", "w", encoding="utf-8") as file:
# json.dump(fest_list_result, file, indent=2, ensure_ascii=False)
# with open('index.html', 'w', encoding='utf8') as file:
# file.write(responce.text)
def main():
#print(get_articles_urls(url='https://www.glinki.ru/catalog/strunnye/gitary/'))
get_data('articles_urls.txt')
if __name__ == '__main__':
main()
article_harac = soup.find('div', class_="tabs_section").find('table', class_="props_list").text.strip()
"Fest name": "Гитара классическая Flight C 100 4/4",
"Fest date": "Размер\n\n\n\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t4/4\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\n\n\n\n\nСтрана (фирма)\n\n\n\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tКитай"
Answer the question
In order to leave comments, you need to log in
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question