Source code for stable_datasets.images.svhn
import datasets
import numpy as np
import scipy.io as sio
[docs]
class SVHN(datasets.GeneratorBasedBuilder):
    """SVHN (Street View House Numbers) dataset builder for image classification.

    Downloads ``train_32x32.mat`` and ``test_32x32.mat`` and exposes them as the
    ``train`` and ``test`` splits. Every example carries an ``image`` and a
    ``label`` drawn from the ten class names ``"0"`` .. ``"9"``.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: features, supervised keys, homepage, citation."""
        features = datasets.Features(
            {
                "image": datasets.Image(),
                "label": datasets.ClassLabel(names=[str(i) for i in range(10)]),
            }
        )
        return datasets.DatasetInfo(
            description="The SVHN dataset contains images of digits obtained from house numbers in Google Street View "
            "images. It has over 600,000 labeled digit images.",
            features=features,
            supervised_keys=("image", "label"),
            homepage="http://ufldl.stanford.edu/housenumbers/",
            citation="""@inproceedings{netzer2011reading,
title={Reading digits in natural images with unsupervised feature learning},
author={Netzer, Yuval and Wang, Tao and Coates, Adam and Bissacco, Alessandro and Wu, Baolin and Ng, Andrew Y and others},
booktitle={NIPS workshop on deep learning and unsupervised feature learning},
volume={2011},
number={2},
pages={4},
year={2011},
organization={Granada}
}""",
        )

    def _split_generators(self, dl_manager):
        """Download the train/test MAT files and describe one split per file.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` is called with a ``{split: url}`` dict
                and the result is indexed by the same split keys.

        Returns:
            A list with the TRAIN and TEST ``SplitGenerator`` objects, each
            passing the local ``file_path`` on to ``_generate_examples``.
        """
        urls = {
            "train": "http://ufldl.stanford.edu/housenumbers/train_32x32.mat",
            "test": "http://ufldl.stanford.edu/housenumbers/test_32x32.mat",
        }
        downloaded_files = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"file_path": downloaded_files["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"file_path": downloaded_files["test"]},
            ),
        ]

    def _generate_examples(self, file_path):
        """Yield ``(index, example)`` pairs read from one SVHN MAT file.

        Args:
            file_path: Path to a MAT file holding the variables ``X`` (images)
                and ``y`` (labels).

        Yields:
            Tuples ``(idx, {"image": ndarray, "label": int})`` where ``idx`` is
            the running position of the sample in the file.

        Raises:
            ValueError: If ``X`` and ``y`` do not hold the same number of samples.
        """
        data = sio.loadmat(file_path)

        # Move the fourth axis of X to the front so that iterating the array
        # yields one image at a time.
        # NOTE(review): assumes X is laid out as (height, width, channels, N) - confirm.
        images = data["X"].transpose([3, 0, 1, 2])

        # Flatten instead of np.squeeze: squeeze would turn a single-sample
        # label array into a 0-d array, which cannot be iterated.
        raw_labels = data["y"].reshape(-1)

        if len(images) != len(raw_labels):
            # zip() would silently drop the surplus entries and hide a corrupt file.
            raise ValueError(
                f"SVHN file {file_path!r} holds {len(images)} images but {len(raw_labels)} labels"
            )

        # The files store the digit '0' as class 10; remap it to 0 without
        # mutating the array returned by loadmat.
        labels = np.where(raw_labels == 10, 0, raw_labels)

        for idx, (image, label) in enumerate(zip(images, labels)):
            # Hand the feature encoder a plain Python int rather than a NumPy scalar.
            yield idx, {"image": image, "label": int(label)}