# Source code for stable_datasets.images.imagenette

import importlib.util
import sys
from pathlib import Path

import datasets
from huggingface_hub import snapshot_download


_IN10_classes = [
    "n01440764",
    "n02102040",
    "n02979186",
    "n03000684",
    "n03028079",
    "n03394916",
    "n03417042",
    "n03425413",
    "n03445777",
    "n03888257",
]
_IN100_CLASSES = [
    "n02869837",
    "n01749939",
    "n02488291",
    "n02107142",
    "n13037406",
    "n02091831",
    "n04517823",
    "n04589890",
    "n03062245",
    "n01773797",
    "n01735189",
    "n07831146",
    "n07753275",
    "n03085013",
    "n04485082",
    "n02105505",
    "n01983481",
    "n02788148",
    "n03530642",
    "n04435653",
    "n02086910",
    "n02859443",
    "n13040303",
    "n03594734",
    "n02085620",
    "n02099849",
    "n01558993",
    "n04493381",
    "n02109047",
    "n04111531",
    "n02877765",
    "n04429376",
    "n02009229",
    "n01978455",
    "n02106550",
    "n01820546",
    "n01692333",
    "n07714571",
    "n02974003",
    "n02114855",
    "n03785016",
    "n03764736",
    "n03775546",
    "n02087046",
    "n07836838",
    "n04099969",
    "n04592741",
    "n03891251",
    "n02701002",
    "n03379051",
    "n02259212",
    "n07715103",
    "n03947888",
    "n04026417",
    "n02326432",
    "n03637318",
    "n01980166",
    "n02113799",
    "n02086240",
    "n03903868",
    "n02483362",
    "n04127249",
    "n02089973",
    "n03017168",
    "n02093428",
    "n02804414",
    "n02396427",
    "n04418357",
    "n02172182",
    "n01729322",
    "n02113978",
    "n03787032",
    "n02089867",
    "n02119022",
    "n03777754",
    "n04238763",
    "n02231487",
    "n03032252",
    "n02138441",
    "n02104029",
    "n03837869",
    "n03494278",
    "n04136333",
    "n03794056",
    "n03492542",
    "n02018207",
    "n04067472",
    "n03930630",
    "n03584829",
    "n02123045",
    "n04229816",
    "n02100583",
    "n03642806",
    "n04336792",
    "n03259280",
    "n02116738",
    "n02108089",
    "n03424325",
    "n01855672",
    "n02090622",
]


class Imagenette(datasets.GeneratorBasedBuilder):
    """Dataset builder for ImageNet and two of its class subsets.

    Configurations:

    * ``"imagenette"`` (default): 10-class subset read from the
      ``imagenette2.tgz`` archive; labels come from ``_IN10_classes``.
    * ``"imagenet"``: 1000-class version, delegated to the loading script
      shipped in the ``ILSVRC/imagenet-1k`` Hub dataset repository.
    * ``"imagenet100"``: 100-class version; labels come from
      ``_IN100_CLASSES``. Only the metadata is available, example
      generation is not implemented (see ``_split_generators``).
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="imagenet",
            version=VERSION,
            description="1000-class version",
        ),
        datasets.BuilderConfig(
            name="imagenette",
            version=VERSION,
            description="10-class version",
        ),
        datasets.BuilderConfig(
            name="imagenet100",
            version=VERSION,
            description="100-class version",
        ),
    ]

    DEFAULT_CONFIG_NAME = "imagenette"

    def _get_in1k_module(self):
        """Import the ``ILSVRC/imagenet-1k`` snapshot as a Python package.

        The repository snapshot is fetched with ``snapshot_download``; its
        loading script is renamed to ``imagenet_1k.py`` (a hyphen is not
        importable) and an ``__init__.py`` is created so that the snapshot
        directory can be imported by name. The imported package is cached on
        the instance as ``_in_mod`` and the snapshot directory is stored as
        ``_DATA_URL``.

        Returns:
            The imported package; the loading script is its ``imagenet_1k``
            attribute.
        """
        if hasattr(self, "_in_mod"):
            return self._in_mod

        path = Path(snapshot_download(repo_id="ILSVRC/imagenet-1k", repo_type="dataset"))

        script = path / "imagenet_1k.py"
        if not script.is_file():
            (path / "imagenet-1k.py").rename(script)

        init_file = path / "__init__.py"
        if not init_file.is_file():
            init_file.write_text("from . import imagenet_1k", encoding="utf-8")

        # The snapshot directory itself is the package, so its parent must be
        # importable. Avoid growing sys.path on every new builder instance.
        parent = str(path.parent)
        if parent not in sys.path:
            sys.path.append(parent)

        self._in_mod = importlib.import_module(path.name)
        self._DATA_URL = path
        return self._in_mod

    def _info(self):
        """Return the ``DatasetInfo`` for the selected configuration.

        ``"imagenet"`` reuses the info of the upstream ``Imagenet1k`` builder.
        The other configurations expose an ``image`` column and a ``label``
        column whose class names are the subset's identifiers.
        """
        if self.config.name == "imagenet":
            mod = self._get_in1k_module()
            return mod.imagenet_1k.Imagenet1k()._info()

        if self.config.name == "imagenet100":
            names = _IN100_CLASSES
            # ImageNet-derived subset: reuse the ImageNet homepage/license
            # values this method previously held in an unreachable branch.
            # NOTE(review): confirm the license string for ImageNet data.
            homepage = "https://www.image-net.org/update-mar-11-2021.php"
            license_name = "CC BY 2.0"
        else:
            names = _IN10_classes
            homepage = "https://github.com/fastai/imagenette"
            license_name = "Apache 2.0"

        features = datasets.Features(
            {
                "image": datasets.Image(),
                "label": datasets.ClassLabel(names=names),
            }
        )
        return datasets.DatasetInfo(
            description="Imagenet and its variants",
            features=features,
            homepage=homepage,
            license=license_name,
        )

    def _split_generators(self, dl_manager):
        """Return the split generators for the selected configuration.

        Args:
            dl_manager: The download manager supplied by ``datasets``.

        Returns:
            For ``"imagenet"``: train/validation/test generators whose
            ``gen_kwargs`` (``archives``, ``split``) are consumed by the
            upstream ``Imagenet1k._generate_examples``. Otherwise:
            train/test generators whose ``files`` entry lists the JPEG paths
            of the extracted ``imagenette2`` archive.

        Raises:
            NotImplementedError: For the ``"imagenet100"`` configuration.
        """
        if self.config.name == "imagenet":
            mod = self._get_in1k_module()
            # Point the upstream script's relative archive paths at the local
            # snapshot. Re-running this is harmless: joining onto an already
            # absolute path returns that path unchanged.
            mod.imagenet_1k._DATA_URL = {
                fold: [(self._DATA_URL / p) for p in mod.imagenet_1k._DATA_URL[fold]]
                for fold in mod.imagenet_1k._DATA_URL
            }
            archives = dl_manager.download(mod.imagenet_1k._DATA_URL)

            # (split name, key in the upstream archive mapping, upstream split id)
            split_specs = (
                (datasets.Split.TRAIN, "train", "train"),
                (datasets.Split.VALIDATION, "val", "validation"),
                (datasets.Split.TEST, "test", "test"),
            )
            return [
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "archives": [dl_manager.iter_archive(archive) for archive in archives[fold]],
                        "split": split,
                    },
                )
                for split_name, fold, split in split_specs
            ]

        if self.config.name == "imagenet100":
            # Fail early and explicitly, before any download is attempted.
            raise NotImplementedError(
                "Example generation for the 'imagenet100' configuration is not implemented; "
                "use the 'imagenet' or 'imagenette' configuration instead."
            )

        urls = "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz"
        data_dir = Path(dl_manager.download_and_extract(urls))
        train_path = data_dir / "imagenette2" / "train"
        test_path = data_dir / "imagenette2" / "val"
        # Sorted lists (not one-shot generators in filesystem order) keep the
        # example keys deterministic and allow the kwargs to be iterated again.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": sorted(train_path.rglob("*.JPEG"))},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"files": sorted(test_path.rglob("*.JPEG"))},
            ),
        ]

    def _generate_examples(self, **kwargs):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            **kwargs: The ``gen_kwargs`` built in ``_split_generators``:
                ``archives`` and ``split`` for ``"imagenet"``, ``files``
                otherwise.

        Yields:
            For ``"imagenet"``, whatever the upstream
            ``Imagenet1k._generate_examples`` yields. Otherwise the running
            index and a dict with the image path (``image``) and the name of
            the file's parent directory (``label``).
        """
        if self.config.name == "imagenet":
            mod = self._get_in1k_module()
            # This method is a generator function, so the upstream generator
            # must be re-yielded; a plain ``return`` would produce no examples.
            yield from mod.imagenet_1k.Imagenet1k()._generate_examples(**kwargs)
            return

        for key, file in enumerate(kwargs["files"]):
            # Images are stored in one directory per class identifier.
            yield key, {"image": str(file), "label": file.parent.name}