U
U
u on2021-01-25 20:15:48
Python
u on, 2021-01-25 20:15:48

What is the fastest way to download many images (Python)?

I have an array with links to images of the format

site.com/image.png

Right now I'm using requests, code snippet from the loop:
# inside the loop over image URLs
img = requests.get(img_url)
# Context manager guarantees the handle is closed even if write() raises,
# instead of relying on a manual close() call.
with open(filename, 'wb') as img_file:
    img_file.write(img.content)

What is the fastest way to download all images from a list of links?

Answer the question

In order to leave comments, you need to log in

3 answers
U
u on, 2021-01-25
@mihalik35

Solved the problem using httplib2 with per-thread caching.

S
Sergey Gornostaev, 2021-01-25
@sergey-gornostaev

Use asynchrony, of course.

A
Andrey Dugin, 2021-01-25
@adugin

I recently wrote a script, you can take as a basis:

#!/usr/bin/env python
# coding: utf-8

import asyncio
import aiohttp
import xml.etree.ElementTree as ET

from zipfile import ZipFile
from collections import Counter
from pathlib import Path
from itertools import islice

TARGET = 'plate_number_image_url'
# TARGET = 'photo_url'

def url_to_filename(url, base_dir=TARGET, last_n_parts=3) -> Path:
    """Map a URL to a local file path under *base_dir*.

    Keeps the last *last_n_parts* segments of the URL's path and joins
    them below *base_dir*.  As a side effect, the parent directory of
    the resulting path is created if it does not exist yet.
    """
    segments = url.strip().split('/')[-last_n_parts:]
    target = Path(base_dir, *segments)
    # ensure the destination directory tree exists before any write
    target.parent.mkdir(parents=True, exist_ok=True)
    return target

async def download_content_as_bytes(url: str) -> bytes:
    """Fetch *url* and return the response body as bytes.

    Returns None when the connection could not be established
    (``ClientConnectorError``); the caller treats that as "skip for now"
    and the outer retry loop will attempt the URL again on the next pass.

    BUG FIXED: the original had ``return content`` inside ``finally``,
    which silently swallowed *every* exception raised in the ``try``
    body — including ``asyncio.CancelledError`` and HTTP errors — not
    just the one listed in ``except``.  Now only connection failures are
    tolerated; anything else propagates to the caller.

    NOTE(review): a new ClientSession is created per URL, which defeats
    connection pooling — consider sharing one session across downloads.
    """
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                return await response.read()
    except aiohttp.client_exceptions.ClientConnectorError:
        # best-effort: unreachable host now, retried by the outer loop
        return None

async def write_bytes_to_file(content: bytes, filename: str) -> None:
    """Write *content* to *filename*; do nothing when content is falsy.

    A falsy *content* (None from a failed download, or empty bytes) is
    skipped so failed fetches never leave zero-byte files behind.

    BUG FIXED: the progress message was ``f'SAVED (unknown)'`` — an
    f-string with no placeholder — so the saved path was never shown;
    it now interpolates *filename*.
    """
    if content:
        with open(filename, 'wb') as file:
            file.write(content)
        print(f'SAVED {filename}')

async def file_download_task(url: str) -> None:
    """Download *url* to its mapped local path, skipping existing files.

    The exists() check makes each pass of the outer retry loop
    idempotent: only missing files are fetched again.
    """
    target = url_to_filename(url)
    if target.exists():
        # already on disk from a previous pass — nothing to do
        return
    data = await download_content_as_bytes(url)
    await write_bytes_to_file(data, target)

async def main(batch_size=1000) -> None:
    """Read URLs from '<TARGET>.txt' and download them in batches.

    Downloads run *batch_size* at a time to bound the number of
    simultaneous connections.

    BUG FIXED: ``asyncio.wait()`` was called with bare coroutine
    objects — deprecated since Python 3.8 and a ``TypeError`` on
    3.11+ (it requires Task objects).  ``asyncio.gather`` accepts
    coroutines directly and also propagates exceptions instead of
    silently collecting them.
    """
    with open(f'{TARGET}.txt') as f:
        urls = [line.strip() for line in f]
        print(f'TOTAL: {len(urls)}')
    while urls:
        await asyncio.gather(*map(file_download_task, urls[:batch_size]))
        del urls[:batch_size]

if __name__ == '__main__':
    # Re-run the whole batch forever: file_download_task skips files that
    # already exist, so each pass effectively retries only the URLs that
    # failed on previous passes.
    # NOTE(review): this never terminates, even once every file is
    # downloaded — confirm the endless loop is intended.
    while True:
        asyncio.run(main())

Then a second script verifies the downloaded files and re-downloads any that are broken:
#!/usr/bin/env python
# coding: utf-8

import cv2
import requests
from PIL import Image
from pathlib import Path

# Walk every downloaded file; keep it if it is a readable image,
# otherwise re-fetch it from the origin server.
for root in ['photo_url', 'plate_number_image_url']:
    for path in Path(root).glob('**/*.*'):
        if path.stat().st_size > 0:
            try:
                # PIL validates the image header; cv2.imread NEVER raises —
                # it returns None on a corrupt file — so its result must be
                # checked explicitly (the original assigned it to an unused
                # variable, which validated nothing).
                if any(Image.open(path).size) and cv2.imread(str(path)) is not None:
                    continue  # file is a valid, decodable image — keep it
            except Exception:
                # unreadable by PIL — fall through and re-download
                # (narrowed from a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit)
                pass
        # Rebuild the source URL: the local path mirrors the URL's last
        # segments, with the host prepended in place of the root folder.
        url = list(path.parts)
        url[0] = 'http://img03.platesmania.com'
        url = '/'.join(url)
        response = requests.get(url)
        if response.ok:
            with open(path, 'wb') as fo:
                fo.write(response.content)
                print(f'LOADED {path}')
        else:
            # Corrupted or missing file that the server no longer serves
            print(path)

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question