# Based on a script by Seiya Tokui, with the following copyright:
# Copyright (c) 2014 Seiya Tokui
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# Given the wnid of a synset, the wnid of hyponym synsets can be obtained at
# http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=[wnid]
#
# To obtain the full hyponym (the synset of the whole subtree starting
# from wnid), you can request
# http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=[wnid]&full=1
#
# To get the words of a synset:
# http://www.image-net.org/api/text/wordnet.synset.getwords?wnid=[wnid]
#
# Given the wnid of a synset, the URLs of its images can be obtained at
# http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=[wnid]
#
# Mapping from all synsets to words:
# http://image-net.org/archive/words.txt
#
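# The hyponym responses are assumed to contain one wnid per line, the child
# synsets prefixed with one dash per depth level (main() below strips the
# dashes), for example:
#
#   n03489162
#   -nXXXXXXXX    (direct hyponym)
#   --nXXXXXXXX   (appears only with full=1)
#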
import argparse
import imghdr  # deprecated in Python 3.11 and removed in 3.13; used to sniff image type
import math
import os
import threading
import time
import urllib.error
import urllib.parse
import urllib.request


class DownloadError(Exception):
    """Base class for exceptions in this module."""

    def __init__(self, message=""):
        self.message = message


def download(n_images, min_size, n_threads, wnids_list, out_dir):
    # Split the wnid list into one chunk per thread.
    wnid_thread_lists = []
    wnid_thread_size = math.ceil(len(wnids_list) / n_threads)
    for i in range(n_threads):
        wnid_thread_lists.append(wnids_list[i * wnid_thread_size : (i + 1) * wnid_thread_size])

    # Worker run by each thread: fetch the URL list for every wnid in its
    # chunk and download the images, skipping directories that already exist.
    def downloader(wnid_list):
        for wnid in wnid_list:
            dir_name = wnid
            print("Downloading " + dir_name)
            dir_path = os.path.join(out_dir, dir_name)
            if os.path.isdir(dir_path):
                print("skipping: already have " + dir_name)
            else:
                image_url_list = get_image_urls(wnid)
                download_images(dir_path, image_url_list, n_images, min_size)

    download_threads = [threading.Thread(target=downloader, args=(chunk,)) for chunk in wnid_thread_lists]
    for t in download_threads:
        t.start()
    # Poll until every thread has finished (Thread.isAlive() was removed in
    # Python 3.9; is_alive() is the current name), then join them.
    while any(t.is_alive() for t in download_threads):
        time.sleep(0.1)
    for t in download_threads:
        t.join()
    print("finished")


def mkdir(path):
    if not os.path.isdir(path):
        os.makedirs(path)


def get_url_request_list_function(request_url):
    # Build a fetcher for one API endpoint: the returned closure appends the
    # wnid to request_url and splits the whitespace-separated response into a
    # list, retrying failed requests up to `retry` times.
    def get_url_request_list(wnid, timeout=5, retry=3):
        url = request_url + wnid
        for attempt in range(retry):
            try:
                f = urllib.request.urlopen(url, timeout=timeout)
                response = f.read().decode()
                f.close()
                print("response: " + response)
                return response.split()
            except urllib.error.URLError:
                if attempt == retry - 1:
                    raise
                time.sleep(1)

    return get_url_request_list


get_image_urls = get_url_request_list_function("http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=")
get_subtree_wnid = get_url_request_list_function("http://www.image-net.org/api/text/wordnet.structure.hyponym?wnid=")
get_full_subtree_wnid = get_url_request_list_function(
"http://www.image-net.org/api/text/wordnet.structure.hyponym?full=1&wnid="
)
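
# Example use of the closures (a sketch; requires network access to the legacy
# image-net.org text API, which may no longer be online):
#   urls = get_image_urls("n03489162")    # image URLs for one synset
#   tree = get_subtree_wnid("n03489162")  # the synset plus its dash-prefixed hyponyms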


def get_words_wnid(wnid):
    url = "http://www.image-net.org/api/text/wordnet.synset.getwords?wnid=" + wnid
    f = urllib.request.urlopen(url)
    content = f.read().decode()
    f.close()
    return content
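
# Sketch of use (note that nothing in main() calls this yet, which is also why
# the --humanreadable flag below currently has no effect):
#   words = get_words_wnid("n03489162")  # raw response; assumed one word or phrase per line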


def download_images(dir_path, image_url_list, n_images, min_size):
    mkdir(dir_path)
    image_count = 0
    for url in image_url_list:
        if image_count == n_images:
            break
        try:
            f = urllib.request.urlopen(url)
            image = f.read()
            f.close()
            extension = imghdr.what("", image)  # sniff the bytes; None means not a recognized image
            if extension is None:
                continue
            if extension == "jpeg":
                extension = "jpg"
            # len(image), not sys.getsizeof(image): we want the raw byte count,
            # not the bytes object's in-memory footprint.
            if len(image) > min_size:
                image_name = "image_" + str(image_count) + "." + extension
                image_path = os.path.join(dir_path, image_name)
                with open(image_path, "wb") as image_file:
                    image_file.write(image)
                image_count += 1
        except Exception:
            print("skipping ", url)


def main(wnid, out_dir, n_threads, n_images, fullsubtree, noroot, nosubtree, min_size):
    # First build the list of wnids to download.
    wnids_list = []
    if not noroot:
        wnids_list.append(wnid)
    if not nosubtree:
        if fullsubtree:
            subtree = get_full_subtree_wnid(wnid)
        else:
            subtree = get_subtree_wnid(wnid)
        for i in range(1, len(subtree)):
            # Hyponym entries carry one leading dash per depth level; strip
            # them all so the wnids can be used in further requests.
            subtree[i] = subtree[i].lstrip("-")
        # subtree[0] is the queried wnid itself, already handled by --noroot above.
        wnids_list.extend(subtree[1:])
    # Create the root output directory, then download everything.
    mkdir(out_dir)
    download(n_images, min_size, n_threads, wnids_list, out_dir)


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("wnid", help="ImageNet wnid, for example n03489162")
    p.add_argument("outdir", help="Output directory")
    p.add_argument(
        "--jobs",
        "-j",
        type=int,
        default=1,
        help="Number of parallel download threads",
    )
    p.add_argument(
        "--images",
        "-i",
        type=int,
        default=20,
        metavar="N_IMAGES",
        help="Number of images to download per category",
    )
    p.add_argument("--fullsubtree", "-F", action="store_true", help="Download the full subtree")
    p.add_argument("--noroot", "-R", action="store_true", help="Do not download the root synset")
    p.add_argument("--nosubtree", "-S", action="store_true", help="Do not download the subtree")
    p.add_argument(
        "--humanreadable",
        "-H",
        action="store_true",
        help="Make the folder names human readable (currently unused)",
    )
    p.add_argument(
        "--minsize",
        "-m",
        type=int,
        default=7000,
        help="Minimum image size in bytes",
    )
    args = p.parse_args()
    main(
        wnid=args.wnid,
        out_dir=args.outdir,
        n_threads=args.jobs,
        n_images=args.images,
        fullsubtree=args.fullsubtree,
        noroot=args.noroot,
        nosubtree=args.nosubtree,
        min_size=args.minsize,
    )
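
# Example invocation (a sketch; the script filename is assumed, and the wnid is
# the sample from the help text above):
#   python download_imagenet_images.py n03489162 ./images -j 4 -i 50 --fullsubtree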