juventino96, 2020-09-25 13:17:10

How to parse big data in Python?

How do you parse large amounts of data in Python? For example, if you need to parse 1000 sources, how do you do it quickly? What technology should be used to scrape sites fast? With the code below it takes about a minute to parse the sites, but I need it to run faster.

rssParser.py

import feedparser

def rssParser(url):
    # Download and parse a single RSS feed.
    parse = feedparser.parse(url)

    articles = []

    for item in parse.entries:
        # Not every feed provides every field, so fall back to empty strings.
        summary = getattr(item, 'summary', '')
        thumbnail = getattr(item, 'media_content', '')

        articles.append({
            'title': item.title,
            'link': item.link,
            'summary': summary,
            'thumbnail': thumbnail,
            'published': getattr(item, 'published', '')
        })

    return articles


views.py

from django.http import JsonResponse
import logging, sys
from api.rssParser import rssParser

def index(request):
    chicagoTribuneBusiness = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/business/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribuneEntertainment = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/entertainment/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribuneLifestyles = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/lifestyles/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribuneWorld = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/nation-world/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribuneOpinion = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/opinion/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribunePolitics = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/politics/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribuneRealEstate = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/real-estate/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
    chicagoTribuneSports = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/sports/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')

    nyTimesUS = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml')
    nyTimesWorld = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/World.xml')
    nyTimesNYRegion = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/NYRegion.xml')
    nyTimesBusiness = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Business.xml')
    nyTimesTechnology = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml')
    nyTimesSports = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml')
    nyTimesScience = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Science.xml')
    nyTimesClimate = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Climate.xml')
    nyTimesSpace = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Space.xml')
    nyTimesArts = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml')
    nyTimesFashionandStyle = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml')
    nyTimesTravel = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Travel.xml')
    nyTimesRealEstate = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/RealEstate.xml')
    nyTimesAutomobiles = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Automobiles.xml')

    return JsonResponse({
        'chicagoTribuneBusiness': chicagoTribuneBusiness,
        'chicagoTribuneEntertainment': chicagoTribuneEntertainment,
        'chicagoTribuneLifestyles': chicagoTribuneLifestyles,
        'chicagoTribuneWorld': chicagoTribuneWorld,
        'chicagoTribuneOpinion': chicagoTribuneOpinion,
        'chicagoTribunePolitics': chicagoTribunePolitics,
        'chicagoTribuneRealEstate': chicagoTribuneRealEstate,
        'chicagoTribuneSports': chicagoTribuneSports,
        'nyTimesUS': nyTimesUS,
        'nyTimesWorld': nyTimesWorld,
        'nyTimesNYRegion': nyTimesNYRegion,
        'nyTimesBusiness': nyTimesBusiness,
        'nyTimesTechnology': nyTimesTechnology,
        'nyTimesSports': nyTimesSports,
        'nyTimesScience': nyTimesScience,
        'nyTimesClimate': nyTimesClimate,
        'nyTimesSpace': nyTimesSpace,
        'nyTimesArts': nyTimesArts,
        'nyTimesFashionandStyle': nyTimesFashionandStyle,
        'nyTimesTravel': nyTimesTravel,
        'nyTimesRealEstate': nyTimesRealEstate,
        'nyTimesAutomobiles': nyTimesAutomobiles
    })


3 answers
Alexander, 2015-11-25
@Grebenshchikov_Alex

No bank will let you enter payment data on your own site until you have at least a PCI DSS certificate.

Dr. Bacon, 2020-09-25
@bacon

All that wall of code can be replaced by iterating over a dictionary of feeds, and the speed-up comes from multithreading or asynchrony; examples are easy to google.
P.S. An article like this is worth working through on your own: https://chriskiehl.com/article/parallelism-in-one-line
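
Roughly like this (a minimal sketch, assuming the rssParser from the question and a hypothetical FEEDS dict that maps response keys to feed URLs; ThreadPoolExecutor stands in for the Pool used in the linked article):

from concurrent.futures import ThreadPoolExecutor

from django.http import JsonResponse

from api.rssParser import rssParser

# Hypothetical mapping: JSON response key -> feed URL
# (fill in the remaining feeds from views.py).
FEEDS = {
    'nyTimesUS': 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
    'nyTimesWorld': 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
}

def index(request):
    # Fetching feeds is I/O-bound, so a thread pool downloads them
    # concurrently instead of one after another.
    with ThreadPoolExecutor(max_workers=20) as pool:
        results = dict(zip(FEEDS, pool.map(rssParser, FEEDS.values())))
    return JsonResponse(results)

With a couple of dozen feeds the total time drops to roughly the slowest single request instead of the sum of all of them.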

Vladimir, 2020-09-25
@AstraVlad

This video walks through speeding up bulk requests step by step:
https://www.youtube.com/watch?v=R4Oz8JUuM4s
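
If you go the asyncio route instead, here is a minimal sketch, assuming aiohttp is installed (it is not part of the question's code), a hypothetical FEEDS dict of key -> URL, and the fact that feedparser.parse also accepts the downloaded XML as a string:

import asyncio

import aiohttp
import feedparser

# Hypothetical dict of feeds: response key -> feed URL.
FEEDS = {
    'nyTimesUS': 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
    'nyTimesWorld': 'https://rss.nytimes.com/services/xml/rss/nyt/World.xml',
}

async def fetch(session, key, url):
    # Download the raw XML; feedparser.parse accepts a string as well as a URL.
    async with session.get(url) as response:
        text = await response.text()
    return key, feedparser.parse(text)

async def fetch_all(feeds):
    # One session, all requests in flight at the same time.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, key, url) for key, url in feeds.items()]
        return dict(await asyncio.gather(*tasks))

results = asyncio.run(fetch_all(FEEDS))
# Each parsed feed's entries can then be reduced to the article dicts
# exactly as rssParser.py already does.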
