How to parse a site if it has the same class?

G

Groslog2021-11-15 11:39:09

Python

Groslog, 2021-11-15 11:39:09

I'm learning to parse, I ran into a problem. They have the same name, and I will only get the content of the element that was before. (i.e. if there are 10 of them I will get only 1 the other 9 are ignored)
In my case

<span itemprop="value"> 4/4 </span>
<span itemprop="value"> Китай </span>

# https://www.glinki.ru/catalog/strunnye/gitary/klassicheskie-gitary/gitara-klassicheskaya-flight-c-100-4-4/
import requests
from bs4 import BeautifulSoup
import time
from random import randrange
import json

headers = {
    'Accept': '',
    'User-Agent': ''
}


def get_articles_urls(url):
    s = requests.Session()
    responce = s.get(url=url, headers=headers)

    articles_urls_list = []
    for page in range(1, 37, 1):
        responce = s.get(url=f'https://www.glinki.ru/catalog/strunnye/gitary/?PAGEN_1={page}', headers=headers)
        soup = BeautifulSoup(responce.text, 'lxml')

        articles_urls = soup.find_all('a', class_='thumb shine')

        for au in articles_urls:
            art_url = 'https://www.glinki.ru' + au.get('href')
            articles_urls_list.append(art_url)

            print(f'Обработал {page}')

    with open('articles_urls.txt', 'w') as file:
        for url in articles_urls_list:
            file.write(f'{url}\n')

        return 'Работа по сбору ссылок выполнена!'

fest_list_result = []

def get_data(file_path):
    with open(file_path) as file:
        urls_list = [line.strip() for line in file.readlines()]

        s = requests.Session()

        for url in urls_list[:4]:
            response = s.get(url=url, headers=headers)
            soup = BeautifulSoup(response.text, 'lxml')

            article_title = soup.find('div', class_='page-top-main').find('h1', id='pagetitle').text.strip()
            article_harac = soup.find('div', class_="tabs_section").find('table', class_="props_list").text.strip()
            print(article_title, article_harac)
        #     fest_list_result.append(
        #             {
        #             "Fest name": article_title,
        #             "Fest date": article_harac
        #             }
        #         )
        # with open("fest_list_result.json", "w", encoding="utf-8") as file:
        #     json.dump(fest_list_result, file, indent=2, ensure_ascii=False)
    # with open('index.html', 'w', encoding='utf8') as file:
    #     file.write(responce.text)




def main():
    #print(get_articles_urls(url='https://www.glinki.ru/catalog/strunnye/gitary/'))
    get_data('articles_urls.txt')


if __name__ == '__main__':
    main()

Parsil in this way

article_harac = soup.find('div', class_="tabs_section").find('table', class_="props_list").text.strip()

I tried other methods, all lines were parsed, but output something like this:

"Fest name": "Гитара классическая Flight C 100 4/4",
"Fest date": "Размер\n\n\n\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t4/4\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n\n\n\n\n\nСтрана (фирма)\n\n\n\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tКитай"

Thanks in advance =)

Reply

Answer the question

In order to leave comments, you need to log in

0 answer(s)