Python
r4khic, 2019-08-26 12:59:38

How to automate parser launch in Python?

Hello everyone! I have a parser that scrapes news resources. What is the best way to automate it? That is, you start it once in the morning, it parses news all day, and then shuts down in the evening.

Here is the parser code:
import requests
import pymysql
import dateparser
from bs4 import BeautifulSoup

# Fetch the HTML of a page.
def get_html(url):
    r = requests.get(url)
    return r.text

# Extract article links from a resource page.
def get_resource_links(resource_page,links_rule,resource_domain):
    resource_links = []
    soup = BeautifulSoup(resource_page,'lxml')
    resource_links_blocks = soup.find_all(links_rule[0],{links_rule[1]:links_rule[2]})
    for resource_link_block in resource_links_blocks:
        a_tag = resource_link_block.find("a")
        if a_tag:
            link = a_tag.get("href")
            resource_links.append(resource_domain + link)
    return resource_links

# Extract the title from an article page.
def get_item_title(item_page,title_rule):
    soup = BeautifulSoup(item_page, 'lxml')
    item_title = soup.find(title_rule[0],{title_rule[1]:title_rule[2]})
    return item_title['content']

# Extract the publication datetime from an article page.
def get_item_datetime(item_page,datetime_rule,datetime1_rule):
    soup = BeautifulSoup(item_page, 'lxml')
    item_datetime = soup.find(datetime_rule[0],{datetime_rule[1]:datetime_rule[2]})
    if item_datetime is not None:
        item_datetime = dateparser.parse(item_datetime.text, date_formats=['%d %B %Y %H'])
    elif len(datetime1_rule) == 3:
        # Fall back to the alternative datetime rule.
        item_datetime = soup.find(datetime1_rule[0],{datetime1_rule[1]:datetime1_rule[2]}).text
        item_datetime = dateparser.parse(item_datetime, date_formats=['%d %B %Y %H'])
    else:
        item_datetime = ''
    return item_datetime

# Extract the article text from a page.
def get_text_content(item_page,text_rule,text1_rule):
    soup = BeautifulSoup(item_page, 'lxml')
    item_text = soup.find(text_rule[0],{text_rule[1]:text_rule[2]})
    if item_text is not None:
        item_text = item_text.text
    elif len(text1_rule) == 3:
        # Fall back to the alternative text rule.
        item_text = soup.find(text1_rule[0],{text1_rule[1]:text1_rule[2]}).text
    else:
        item_text = ''
    return item_text

# Connect to the MySQL database.
connection = pymysql.connect(host = 'localhost',
                             user = 'root',
                             password = '',
                             db = 'news_portal',
                             charset = 'utf8',
                             autocommit = True)
cursor = connection.cursor()

# Fetch the scraping rules for each resource from the `resource` table.
cursor.execute('SELECT * FROM `resource`')
resources = cursor.fetchall()

# Iterate over the fetched resources.
for resource in resources:
    resource_name = resource[1]
    resource_link = resource[2]
    resource_url = resource[3]
    link_rule = resource[4]
    title_rule = resource[5]
    datetime_rule = resource[6]
    datetime1_rule = resource[7]
    text_rule = resource[8]
    text1_rule = resource[9]
    print(resource_name)
    resource_domain = resource_link
# Split the comma-separated rule strings into lists.
    links_rule = link_rule.split(',')
    title_rule = title_rule.split(',')
    datetime_rule = datetime_rule.split(',')
    datetime1_rule = datetime1_rule.split(',')
    text_rule = text_rule.split(',')
    text1_rule = text1_rule.split(',')
    resource_page = get_html(resource_url)
    resource_links = get_resource_links(resource_page,links_rule,resource_domain)
    print('number of links: '+str(len(resource_links)))

# Loop over the links and parse each article.
    for resource_link in resource_links:
        item_page = get_html(resource_link)
        item_title = get_item_title(item_page,title_rule)
        item_datetime = get_item_datetime(item_page,datetime_rule,datetime1_rule)
        item_text_content = get_text_content(item_page,text_rule,text1_rule)

        try:
            # Write the news item to the database.
            sql = "insert into items (`item_link`,`item_title`,`item_datetime`,`item_text_content`) values (%s,%s,%s,%s)"
            cursor = connection.cursor()
            cursor.execute(sql,(str(resource_link),str(item_title),str(item_datetime),str(item_text_content)))
            print('Successfully written to the database!')
        except pymysql.err.IntegrityError:
            print('Duplicate entry, skipping!')
        except pymysql.err.InternalError:
            print('Internal database error!')
connection.close()



5 answer(s)
Gsetko, 2019-08-27
@r4khic

Use the schedule library in Python. It is very convenient: you can run your functions on a timetable by wrapping them in the scheduler's decorators.
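For reference, a minimal sketch of the idea from the question (run the job all day, stop in the evening). Since the third-party schedule library may not be installed everywhere, this sketch reproduces the same pattern with only the standard library; the window times, interval, and `run_forever`/`within_working_hours` names are illustrative, not part of the original code:

```python
import time
from datetime import datetime, time as dtime

# Illustrative working-hours window: run only between 08:00 and 20:00.
START, END = dtime(8, 0), dtime(20, 0)

def within_working_hours(now):
    """Return True if `now` falls inside the morning-to-evening window."""
    return START <= now.time() <= END

def run_forever(job, interval_seconds=1800):
    """Call `job` every `interval_seconds`, but only during working hours."""
    while True:
        if within_working_hours(datetime.now()):
            job()
        time.sleep(interval_seconds)
```

With the schedule library the same thing is expressed as `schedule.every(30).minutes.do(job)` plus a loop that calls `schedule.run_pending()`.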

OnYourLips, 2019-08-26
@OnYourLips

Write a systemd unit.
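A sketch of what that might look like (the unit name, paths, and times are made up for illustration): a service unit for the parser plus a timer that starts it every morning, with `RuntimeMaxSec` stopping it after 12 hours for the evening shutdown:

```ini
# /etc/systemd/system/news-parser.service  (illustrative name and paths)
[Unit]
Description=News parser

[Service]
Type=simple
ExecStart=/usr/bin/python3 /opt/news-parser/parser.py
# Stop the parser automatically after 12 hours (evening shutdown).
RuntimeMaxSec=12h

# /etc/systemd/system/news-parser.timer
[Unit]
Description=Start the news parser every morning

[Timer]
OnCalendar=*-*-* 08:00:00
Persistent=true

[Install]
WantedBy=timers.target
```

Enable it with `systemctl enable --now news-parser.timer`.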

Dug Face, 2019-08-26
@DugFace

It is convenient to run such scripts through cron.
Add an entry to your crontab to run the script once an hour.
You can play around with the cron intervals as needed.
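For example (the script path is illustrative), a crontab entry that runs the parser at the start of every hour between 08:00 and 20:00, matching the morning-to-evening schedule from the question:

```
# Edit with: crontab -e
# min  hour  day  month  weekday  command
0 8-20 * * * /usr/bin/python3 /home/user/parser.py
```

The `8-20` hour range is what keeps the script from running at night; adjust it or the minute field to change the interval.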

One Miay, 2019-08-27
@Miay

Subscribe to these resources via email / twitter / vk / fb ... and trigger your script from there.
