# Source code for stable_datasets.images.svhn

import datasets
import numpy as np
import scipy.io as sio



# [docs]
class SVHN(datasets.GeneratorBasedBuilder):
    """SVHN (Street View House Numbers) Dataset for image classification.

    Downloads the ``train_32x32.mat`` and ``test_32x32.mat`` MATLAB files and
    exposes them as the TRAIN and TEST splits. Each example is a dict with an
    ``"image"`` array and a ``"label"`` in one of ten digit classes ("0"-"9").
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: description, feature schema, homepage and citation."""
        return datasets.DatasetInfo(
            description="The SVHN dataset contains images of digits obtained from house numbers in Google Street View "
            "images. It has over 600,000 labeled digit images.",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=[str(i) for i in range(10)]),
                }
            ),
            supervised_keys=("image", "label"),
            homepage="http://ufldl.stanford.edu/housenumbers/",
            citation="""@inproceedings{netzer2011reading,
                          title={Reading digits in natural images with unsupervised feature learning},
                          author={Netzer, Yuval and Wang, Tao and Coates, Adam and Bissacco, Alessandro and Wu, Baolin and Ng, Andrew Y and others},
                          booktitle={NIPS workshop on deep learning and unsupervised feature learning},
                          volume={2011},
                          number={2},
                          pages={4},
                          year={2011},
                          organization={Granada}
                        }""",
        )

    def _split_generators(self, dl_manager):
        """Download the train/test ``.mat`` files and describe one generator per split.

        Args:
            dl_manager: Download manager; its ``download_and_extract`` is called with a
                ``{split_name: url}`` dict and the result is indexed by the same keys.

        Returns:
            A list with two ``datasets.SplitGenerator`` objects (TRAIN, then TEST), each
            passing the downloaded file location to ``_generate_examples`` as ``file_path``.
        """
        urls = {
            "train": "http://ufldl.stanford.edu/housenumbers/train_32x32.mat",
            "test": "http://ufldl.stanford.edu/housenumbers/test_32x32.mat",
        }
        downloaded_files = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"file_path": downloaded_files["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"file_path": downloaded_files["test"]},
            ),
        ]

    def _generate_examples(self, file_path):
        """Yield ``(index, example)`` pairs read from one SVHN ``.mat`` file.

        Args:
            file_path: Path of a MATLAB file containing an ``"X"`` image array and a
                ``"y"`` label array.

        Yields:
            ``(idx, {"image": image, "label": label})`` where ``idx`` is the running
            example index, ``image`` is one slice of ``"X"`` along its last axis and
            ``label`` is a plain ``int`` with the stored class 10 remapped to 0.

        Raises:
            ValueError: If ``"X"`` and ``"y"`` do not hold the same number of samples.
        """
        data = sio.loadmat(file_path)

        # "X" keeps the sample index on its last axis; move it to the front so that
        # iterating over the array yields one image at a time.
        images = data["X"].transpose([3, 0, 1, 2])

        # Flatten rather than squeeze: squeeze would collapse a single-sample label
        # array to 0-d, which cannot be iterated by zip() below.
        labels = np.asarray(data["y"]).reshape(-1)

        # zip() would silently drop trailing samples on a length mismatch, so fail loudly.
        if len(images) != len(labels):
            raise ValueError(
                f"Mismatched sample counts in {file_path!r}: "
                f"{len(images)} images but {len(labels)} labels."
            )

        # Convert '0' label from 10 to 0 (without mutating the loaded array in place).
        labels = np.where(labels == 10, 0, labels)

        for idx, (image, label) in enumerate(zip(images, labels)):
            # Cast the NumPy scalar to a built-in int for the ClassLabel feature.
            yield idx, {"image": image, "label": int(label)}