Python
Mr. Anderson, 2021-06-19 10:21:47

How do I generate IDs for articles?

Good day.

I have a small script that parses a news site and stores the news in a JSON file. The problem is with my IDs. Right now I build them from the time (I strip out everything except the digits, which gives something like 202116061234). That is very cumbersome; I just want the IDs to start from 0 and increase by 1 for each article.

The first run is fine: the articles are numbered from 0. But I run into a problem when checking for updates. How can I read back the latest ID, assign it to a variable, and continue counting from there?
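
Roughly, I imagine something like this (just a sketch, I am not sure it is correct):

import json

with open("src/test_dict.json") as file:
    irk_news_dict = json.load(file)

# JSON object keys are always strings, so convert them before taking the maximum
last_id = max(int(k) for k in irk_news_dict)
ids = last_id + 1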

Please help. My Python is at a very beginner level; I took this script from a video and adapted it for myself.

Below is a listing under the spoiler.

test.py

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import json

headers = {
    # Paste User-Agent
}

url = "https://www.irk.ru/news/"
r = requests.get(url=url, headers=headers)
soup = BeautifulSoup(r.text, "lxml")
articles_cards = soup.find_all("li", class_="b-news-article-list-item")


# Function that fetches the news for the first time
# News from the irk.ru site
def get_first_news_irk():
    # Dictionary for news from irk.ru
    irk_news_dict = {}

    # I want to assign IDs from 0 so that every news item has a clear ID
    ids = 0

    for article in articles_cards:
        # Get the news title
        article_title = article.find("a").text.strip()
        # Get the news description
        article_desc = article.find("p").text.strip()
        # Get the news URL
        article_url = f'https://www.irk.ru{article.find("a").get("href")}'

        # Get the news time
        article_date_time = article.find("time").get("datetime")
        # Convert the time
        date_from_iso = datetime.fromisoformat(article_date_time)
        date_time = datetime.strftime(date_from_iso, "%Y-%m-%d %H:%M:%S")
        article_date_timestamp = time.mktime(datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S").timetuple())

        # Build the ID from the news date
        # article_id = "".join(
        #     [article_date_time[i] for i in range(len(article_date_time)) if article_date_time[i] in '0123456789']
        # )

        # Fill the news dictionary on each iteration
        # irk_news_dict[article_id] = {
        #     "article_date_timestamp": article_date_timestamp,
        #     "article_title": article_title,
        #     "article_url": article_url,
        #     "article_desc": article_desc
        # }

        irk_news_dict[ids] = {
            "article_date_timestamp": article_date_timestamp,
            "article_title": article_title,
            "article_url": article_url,
            "article_desc": article_desc
        }

        ids += 1

    # Write the result to a JSON file (once, after the loop)
    with open("src/test_dict.json", "w") as file:
        json.dump(irk_news_dict, file, indent=4, ensure_ascii=False)


# Function that checks for new news items
def check_irk_news_update():
    with open("src/test_dict.json") as file:
        irk_news_dict = json.load(file)

    # Dictionary to be filled with fresh news
    irk_fresh_news_dict = {}
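    # Pull the last ID out of the loaded dictionary
    # (note: keys loaded from JSON come back as strings)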
    for k, v in irk_news_dict.items():
        ids = k
    for article in articles_cards:
        article_date_time = article.find("time").get("datetime")

        # Build the ID from the news date
        # A clumsy solution; I have not come up with a replacement yet
        # article_id = "".join(
        #     [article_date_time[i] for i in range(len(article_date_time)) if article_date_time[i] in '0123456789']
        # )

        # If the ID is already in the loaded dictionary, skip the item
        # Otherwise add the news item to the fresh-news dictionary
        # and write the fresh news to the JSON file
        if ids in irk_news_dict:
            continue
        else:
            article_url = f'https://www.irk.ru{article.find("a").get("href")}'

            article_title = article.find("a").text.strip()
            article_desc = article.find("p").text.strip()

            article_date_time = article.find("time").get("datetime")
            date_from_iso = datetime.fromisoformat(article_date_time)
            date_time = datetime.strftime(date_from_iso, "%Y-%m-%d %H:%M:%S")
            article_date_timestamp = time.mktime(datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S").timetuple())

            irk_news_dict[ids] = {
                "article_date_timestamp": article_date_timestamp,
                "article_title": article_title,
                "article_url": article_url,
                "article_desc": article_desc
            }

            irk_fresh_news_dict[ids] = {
                "article_date_timestamp": article_date_timestamp,
                "article_title": article_title,
                "article_url": article_url,
                "article_desc": article_desc
            }

            ids += 1

    # Write the result to a JSON file
    with open("src/test_dict.json", "w") as file:
        json.dump(irk_news_dict, file, indent=4, ensure_ascii=False)

    # The function returns the dictionary with fresh news
    return irk_fresh_news_dict


def main():
    # The first function is called once to fetch the initial news
    # get_first_news_irk()
    # The second function checks the site for updates;
    # if a news item is not in the JSON yet, it adds it to the dictionary and prints it
    print(check_irk_news_update())


if __name__ == '__main__':
    main()

2 answers
Sergey Sokolov, 2021-06-19
@roman_tonkoshkurov

I would take my cue from the source: their articles are identified by a date and a short word (a slug): for example, in /news/20210619/party/ the "key" of the article is 20210619/party. Why not take that as the unique key? As a bonus, lexicographic sorting will arrange the articles chronologically, by date.
On the next run, find the most recent date for which you already have downloaded material, and fetch again starting from that same day, skipping or overwriting the already-existing articles of that day.
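
Pulling that key out of the href from the question's markup might look like this (a sketch; it assumes hrefs of the form /news/20210619/party/ and that article is one of the cards from your code):

# Sketch: derive a unique key from the article URL
href = article.find("a").get("href")            # e.g. "/news/20210619/party/"
article_key = href.strip("/").split("/", 1)[1]  # -> "20210619/party"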

Daniil Shevkunov, 2021-06-19
@danila763

The problem here is that when you rerun the script, it builds the IDs for the old news items from their dates and checks whether each item is already in the dictionary. You could, of course, continue numbering sequentially on every run (as you want), but then how would you check whether a news item is already in the dictionary?
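
If you really want sequential numbers, one way out is to keep the counter but check for duplicates by something stable, such as the URL; a rough sketch, reusing irk_news_dict and articles_cards from your code:

# Sketch: sequential IDs, but duplicates are detected by URL, not by ID
seen_urls = {v["article_url"] for v in irk_news_dict.values()}
next_id = max((int(k) for k in irk_news_dict), default=-1) + 1

for article in articles_cards:
    article_url = f'https://www.irk.ru{article.find("a").get("href")}'
    if article_url in seen_urls:
        continue  # this article is already stored
    seen_urls.add(article_url)
    irk_news_dict[next_id] = {"article_url": article_url}  # plus the other fields
    next_id += 1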

My version of your code, simplified

import requests
from bs4 import BeautifulSoup
import json


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

def get_news_info(card):
    """
    Get the information for a single news item.
    """

    # Get the news title
    article_title = card.find("a").text.strip()

    # Get the news description
    article_desc = card.find("p").text.strip()

    # Get the news URL
    article_url = f'https://www.irk.ru{card.find("a").get("href")}'

    # Get the news time
    article_date_time = card.find("time").get("datetime")

    # Strip the separators from the datetime to get a numeric ID
    # (named article_id to avoid shadowing the built-in id)
    article_id = article_date_time.replace(' ', '').replace('-', '').replace(':', '')

    news = {
        "article_date_timestamp": article_date_time,
        "article_title": article_title,
        "article_url": article_url,
        "article_desc": article_desc
    }

    return (article_id, news)


def get_site_news(file_news_dict, articles_cards):
    """
    Add the new news items to the dictionary;
    returns the dictionary with all news and the dictionary with new news.
    """

    new_news_dict = dict()

    for article in articles_cards:

        article_id, news = get_news_info(article)

        if article_id not in file_news_dict:

            file_news_dict[article_id] = news

            new_news_dict[article_id] = news

    return (file_news_dict, new_news_dict)



def main():
    url = "https://www.irk.ru/news/"
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    articles_cards = soup.find_all("li", class_="b-news-article-list-item")

    # Open the news file
    with open("src/test_dict.json", "r+") as news_file:

        # Read the news dictionary from the file
        try:
            file_news_dict = json.load(news_file)
        except json.JSONDecodeError:
            # If the file does not hold valid JSON, start with an empty dictionary
            file_news_dict = dict()

        # Update the news
        file_news_dict, new_news_dict = get_site_news(file_news_dict, articles_cards)

        # Save the news: rewind and truncate first, otherwise json.dump
        # would append after the old contents and corrupt the file
        news_file.seek(0)
        news_file.truncate()
        json.dump(file_news_dict, news_file, indent=4, ensure_ascii=False)

        print(new_news_dict)


if __name__ == '__main__':
    main()
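
A note on the file handling: after json.load the position in the "r+" file is at the end, so the seek(0) and truncate() calls are what keep the saved file valid JSON; an alternative is to close the file and reopen it with "w" just for writing.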
