Source code for stable_datasets.images.imagenet

# Based on a script by Seiya Tokui, with the following copyright:
# Copyright (c) 2014 Seiya Tokui
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# Given the wnid of a synset, the wnid of hyponym synsets can be obtained at
# http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=[wnid]
#
# To obtain the full hyponym list (the synsets of the whole subtree starting
# from wnid), you can request
# http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=[wnid]&full=1
#
# To get the words of a synset:
# http://www.image-net.org/api/text/wordnet.synset.getwords?wnid=[wnid]
#
# Given the wnid of a synset, the URLs of its images can be obtained at
# http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=[wnid]
#
# Mapping from all synsets to words:
# http://image-net.org/archive/words.txt
#
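# As a concrete illustration (a sketch, not part of the original script),
# fetching the direct hyponyms of a synset through the endpoint above could
# look like this, assuming the legacy text API is still reachable (the wnid
# is only an example value):
#
#   import urllib.request
#
#   wnid = "n02084071"  # example query synset
#   url = "http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=" + wnid
#   with urllib.request.urlopen(url) as f:
#       # The response is whitespace-separated wnids; child entries are
#       # prefixed with a dash, which main() below strips off.
#       hyponym_wnids = f.read().decode().split()
#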


import argparse
import imghdr
import math
import os
import sys
import threading
import time
import urllib.error
import urllib.parse
import urllib.request


class DownloadError(Exception):
    """Base class for exceptions in this module."""

    def __init__(self, message=""):
        self.message = message


def download(n_images, min_size, n_threads, wnids_list, out_dir):
    # Split the wnid list into roughly equal chunks, one per thread.
    wnid_thread_lists = []
    wnid_list_len = len(wnids_list)
    wnid_thread_sizes = int(math.ceil(float(wnid_list_len) / n_threads))
    for i in range(n_threads):
        wnid_thread_lists.append(wnids_list[i * wnid_thread_sizes : (i + 1) * wnid_thread_sizes])

    # Worker run by each thread: download every wnid in its chunk.
    def downloader(wnid_list):
        for wnid in wnid_list:
            dir_name = wnid
            print("Downloading " + dir_name)
            dir_path = os.path.join(out_dir, dir_name)
            if os.path.isdir(dir_path):
                print("skipping: already have " + dir_name)
            else:
                image_url_list = get_image_urls(wnid)
                download_images(dir_path, image_url_list, n_images, min_size)

    # Initialize and start the threads.
    print(wnid_thread_lists[0])
    download_threads = [
        threading.Thread(target=downloader, args=(wnid_thread_lists[i],)) for i in range(n_threads)
    ]
    for t in download_threads:
        t.start()

    # Poll until every thread has finished (is_alive replaces the old
    # isAlive alias, which was removed in Python 3.9).
    is_alive = True
    while is_alive:
        is_alive = False
        for t in download_threads:
            is_alive = is_alive or t.is_alive()
        time.sleep(0.1)
    for t in download_threads:
        t.join()
    print("finished")


def mkdir(path):
    if not os.path.isdir(path):
        os.makedirs(path)


def get_url_request_list_function(request_url):
    def get_url_request_list(wnid, timeout=5, retry=3):
        # `retry` is accepted but never used; `timeout` is now passed
        # through to urlopen instead of being silently ignored.
        url = request_url + wnid
        f = urllib.request.urlopen(url, timeout=timeout)
        response = f.read().decode()
        f.close()
        print("response: " + response)
        return response.split()

    return get_url_request_list


get_image_urls = get_url_request_list_function("http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=")
get_subtree_wnid = get_url_request_list_function("http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=")
get_full_subtree_wnid = get_url_request_list_function(
    "http://www.image-net.org/api/text/wordnet.structure.hyponym?full=1&wnid="
)


def get_words_wnid(wnid):
    url = "http://www.image-net.org/api/text/wordnet.synset.getwords?wnid=" + wnid
    f = urllib.request.urlopen(url)
    content = f.read().decode()
    f.close()
    return content


def download_images(dir_path, image_url_list, n_images, min_size):
    mkdir(dir_path)
    image_count = 0
    for url in image_url_list:
        if image_count == n_images:
            break
        try:
            f = urllib.request.urlopen(url)
            image = f.read()
            f.close()
            extension = imghdr.what("", image)  # check that the payload is a valid image
            if extension is None:
                print("skipping ", url)
                continue
            if extension == "jpeg":
                extension = "jpg"
            if len(image) > min_size:  # actual byte count, not Python object overhead
                image_name = "image_" + str(image_count) + "." + extension
                image_path = os.path.join(dir_path, image_name)
                with open(image_path, "wb") as image_file:
                    image_file.write(image)
                image_count += 1
        except Exception:
            print("skipping ", url)


def main(wnid, out_dir, n_threads, n_images, fullsubtree, noroot, nosubtree, min_size):
    wnids_list = []
    # First build the list of wnids to download.
    if not noroot:
        wnids_list.append(wnid)
    if not nosubtree:
        if fullsubtree:
            subtree = get_full_subtree_wnid(wnid)
        else:
            subtree = get_subtree_wnid(wnid)
        for i in range(1, len(subtree)):
            subtree[i] = subtree[i][1:]  # strip the leading dash from child wnids
        wnids_list.extend(subtree)
    # Create the root output directory.
    mkdir(out_dir)
    download(n_images, min_size, n_threads, wnids_list, out_dir)


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("wnid", help="ImageNet wnid, for example n03489162")
    p.add_argument("outdir", help="Output directory")
    p.add_argument(
        "--jobs",
        "-j",
        type=int,
        default=1,
        help="Number of parallel download threads",
    )
    p.add_argument(
        "--images",
        "-i",
        type=int,
        default=20,
        metavar="N_IMAGES",
        help="Number of images per category to download",
    )
    p.add_argument("--fullsubtree", "-F", action="store_true", help="Download the full subtree")
    p.add_argument("--noroot", "-R", action="store_true", help="Do not download the root")
    p.add_argument("--nosubtree", "-S", action="store_true", help="Do not download the subtree")
    p.add_argument(
        "--humanreadable",
        "-H",
        action="store_true",
        help="Make the folder names human readable",
    )
    p.add_argument(
        "--minsize",
        "-m",
        type=float,
        default=7000,
        help="Minimum size of the images in bytes",
    )
    args = p.parse_args()
    main(
        wnid=args.wnid,
        out_dir=args.outdir,
        n_threads=args.jobs,
        n_images=args.images,
        fullsubtree=args.fullsubtree,
        noroot=args.noroot,
        nosubtree=args.nosubtree,
        min_size=args.minsize,
    )
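
# Example invocation (a sketch, not from the original file; the module path
# follows the package shown above and the flags are those defined by the
# parser, with the example wnid taken from the help text):
#
#   python -m stable_datasets.images.imagenet n03489162 ./out --jobs 4 --images 50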