Source code for stable_datasets.images.arabic_digits

import io
from zipfile import ZipFile

import datasets
from PIL import Image
from tqdm import tqdm



[docs]
class ArabicDigits(datasets.GeneratorBasedBuilder):
    """Arabic Handwritten Digits Dataset."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description="""Arabic Handwritten Digits Dataset, composed of images of Arabic digits handwritten
                           by participants. This dataset is structured for use in machine learning tasks such
                           as digit classification.""",
            features=datasets.Features(
                {"image": datasets.Image(), "label": datasets.ClassLabel(names=[str(i) for i in range(10)])}
            ),
            supervised_keys=("image", "label"),
            homepage="https://github.com/mloey/Arabic-Handwritten-Digits-Dataset",
            citation="""@inproceedings{el2016cnn,
                        title={CNN for handwritten arabic digits recognition based on LeNet-5},
                        author={El-Sawy, Ahmed and Hazem, EL-Bakry and Loey, Mohamed},
                        booktitle={International conference on advanced intelligent systems and informatics},
                        pages={566--575},
                        year={2016},
                        organization={Springer}
                        }""",
        )

    def _split_generators(self, dl_manager):
        urls = {
            "train": "https://github.com/mloey/Arabic-Handwritten-Digits-Dataset/raw/master/Train%20Images.zip",
            "test": "https://github.com/mloey/Arabic-Handwritten-Digits-Dataset/raw/master/Test%20Images.zip",
        }
        downloaded_files = dl_manager.download(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"archive_path": downloaded_files["train"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"archive_path": downloaded_files["test"], "split": "test"},
            ),
        ]

    def _generate_examples(self, archive_path, split):
        """Generate examples from the ZIP archives of images and labels."""
        with ZipFile(archive_path, "r") as archive:
            for entry in tqdm(archive.infolist(), desc=f"Processing {split} set"):
                if entry.filename.endswith(".png"):
                    content = archive.read(entry)
                    image = Image.open(io.BytesIO(content))
                    label = int(entry.filename.split("_")[-1][:-4])  # Extract label from filename

                    yield entry.filename, {"image": image, "label": label}