Source code for stable_datasets.images.arabic_digits
import io
from zipfile import ZipFile
import datasets
from PIL import Image
from tqdm import tqdm
[docs]
class ArabicDigits(datasets.GeneratorBasedBuilder):
    """Builder for the Arabic Handwritten Digits image dataset.

    Every example pairs one PNG image with a class label named ``"0"`` to
    ``"9"``. The train and test splits each come from their own ZIP archive,
    and an image's label is read from the tail of its file name.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata (features, homepage, citation)."""
        digit_names = list(map(str, range(10)))
        schema = datasets.Features(
            {"image": datasets.Image(), "label": datasets.ClassLabel(names=digit_names)}
        )
        # The continuation lines of the two literals below start at column 0
        # on purpose: any indentation would become part of the string value.
        summary = """Arabic Handwritten Digits Dataset, composed of images of Arabic digits handwritten
by participants. This dataset is structured for use in machine learning tasks such
as digit classification."""
        reference = """@inproceedings{el2016cnn,
title={CNN for handwritten arabic digits recognition based on LeNet-5},
author={El-Sawy, Ahmed and Hazem, EL-Bakry and Loey, Mohamed},
booktitle={International conference on advanced intelligent systems and informatics},
pages={566--575},
year={2016},
organization={Springer}
}"""
        return datasets.DatasetInfo(
            description=summary,
            features=schema,
            supervised_keys=("image", "label"),
            homepage="https://github.com/mloey/Arabic-Handwritten-Digits-Dataset",
            citation=reference,
        )

    def _split_generators(self, dl_manager):
        """Download both archives and describe the train and test splits.

        Args:
            dl_manager: Download manager; its ``download`` method is given a
                ``{split key: url}`` mapping and its result is indexed by the
                same keys.

        Returns:
            A list with one ``datasets.SplitGenerator`` for the train split
            followed by one for the test split.
        """
        urls = {
            "train": "https://github.com/mloey/Arabic-Handwritten-Digits-Dataset/raw/master/Train%20Images.zip",
            "test": "https://github.com/mloey/Arabic-Handwritten-Digits-Dataset/raw/master/Test%20Images.zip",
        }
        archives = dl_manager.download(urls)

        # Pair each split identifier with the key used both to look up its
        # archive and to label the progress bar in _generate_examples.
        split_table = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"archive_path": archives[key], "split": key},
            )
            for split_name, key in split_table
        ]

    def _generate_examples(self, archive_path, split):
        """Yield ``(key, example)`` pairs for every PNG member of an archive.

        Args:
            archive_path: Path of the ZIP archive to read.
            split: Split name; only used in the progress-bar description.

        Yields:
            Tuples of the member's file name and a dict holding the opened
            image under ``"image"`` and the integer label under ``"label"``.
        """
        with ZipFile(archive_path, "r") as bundle:
            members = bundle.infolist()
            for member in tqdm(members, desc=f"Processing {split} set"):
                name = member.filename
                if not name.endswith(".png"):
                    continue
                raw_bytes = bundle.read(member)
                picture = Image.open(io.BytesIO(raw_bytes))
                # The label is the text after the last underscore with the
                # four-character ".png" suffix removed.
                tail = name.rsplit("_", 1)[-1]
                digit = int(tail[:-4])
                yield name, {"image": picture, "label": digit}