Answer the question
In order to leave comments, you need to log in
How to parse big data in python?
How to parse big data in python for example you need to parse from 1000 sources how to do it quickly. What technology should be used to quickly scrape a site? I use this code to parse sites in 1 minute, but I need to do it quickly.
rssParser.py
import feedparser
def rssParser(url) :
parse = feedparser.parse(url)
articles = []
for item in parse.entries:
summary = item.summary if hasattr(item, 'summary') else ''
thumbnail = ''
if hasattr(item, 'media_content'):
thumbnail = item.media_content
articles.append({
'title': item.title,
'link': item.link,
'summary': summary,
'thumbnail': thumbnail,
'published': item.published
})
return articles
from django.http import JsonResponse
import logging, sys
from api.rssParser import rssParser
def index(request) :
chicagoTribuneBusiness = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/business/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribuneEntertainment = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/entertainment/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribuneLifestyles = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/lifestyles/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribuneWorld = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/nation-world/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribuneOpinion = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/opinion/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribunePolitics = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/politics/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribuneRealEstate = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/real-estate/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
chicagoTribuneSports = rssParser(url='https://www.chicagotribune.com/arcio/rss/category/sports/?query=display_date:%5Bnow-2d+TO+now%5D+AND+revision.published:true&sort=display_date:desc#nt=instory-link')
nyTimesUS = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml')
nyTimesWorld = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/World.xml')
nyTimesNYRegion = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/NYRegion.xml')
nyTimesBusiness = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Business.xml')
nyTimesTechnology = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml')
nyTimesSports = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml')
nyTimesScience = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Science.xml')
nyTimesClimate = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Climate.xml')
nyTimesSpace = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Space.xml')
nyTimesArts = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml')
nyTimesFashionandStyle = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/FashionandStyle.xml')
nyTimesTravel = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Travel.xml')
nyTimesRealEstate = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/RealEstate.xml')
nyTimesAutomobiles = rssParser(url='https://rss.nytimes.com/services/xml/rss/nyt/Automobiles.xml')
return JsonResponse({
'chicagoTribuneBusiness': chicagoTribuneBusiness,
'chicagoTribuneEntertainment': chicagoTribuneEntertainment,
'chicagoTribuneLifestyles': chicagoTribuneLifestyles,
'chicagoTribuneWorld': chicagoTribuneWorld,
'chicagoTribuneOpinion': chicagoTribuneOpinion,
'chicagoTribunePolitics': chicagoTribunePolitics,
'chicagoTribuneRealEstate': chicagoTribuneRealEstate,
'chicagoTribuneSports': chicagoTribuneSports,
'nyTimesUS': nyTimesUS,
'nyTimesWorld': nyTimesWorld,
'nyTimesNYRegion': nyTimesNYRegion,
'nyTimesBusiness': nyTimesBusiness,
'nyTimesTechnology': nyTimesTechnology,
'nyTimesSports': nyTimesSports,
'nyTimesScience': nyTimesScience,
'nyTimesClimate': nyTimesClimate,
'nyTimesSpace': nyTimesSpace,
'nyTimesArts': nyTimesArts,
'nyTimesFashionandStyle': nyTimesFashionandStyle,
'nyTimesTravel': nyTimesTravel,
'nyTimesRealEstate': nyTimesRealEstate,
'nyTimesAutomobiles': nyTimesAutomobiles
})
Answer the question
In order to leave comments, you need to log in
Not one bank will let you enter data on their site until you receive at least a PCI DSS certificate.
All this footcloth is replaced by iteration through the dictionary. And acceleration is done due to multithreading or asynchrony, examples are easily googled.
Threat, you need to go to such an article on your own https://chriskiehl.com/article/parallelism-in-one-line
Didn't find what you were looking for?
Ask your questionAsk a Question
731 491 924 answers to any question