Source code for stable_datasets.images.linnaeus5
from io import BytesIO
import datasets
import rarfile
[docs]
class Linnaeus5(datasets.GeneratorBasedBuilder):
    """Linnaeus 5 Dataset: RGB images (256x256) for classification across 5 categories."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: features, supervised keys, homepage and citation."""
        return datasets.DatasetInfo(
            description="""Linnaeus 5 dataset contains RGB images (256x256) for classification across 5 categories:
berry, bird, dog, flower, and other (negative set). It includes 1200 training images
and 400 test images per class.""",
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=["berry", "bird", "dog", "flower", "other"]),
                }
            ),
            supervised_keys=("image", "label"),
            homepage="http://chaladze.com/l5/",
            citation="""@article{chaladze2017linnaeus,
title={Linnaeus 5 dataset for machine learning},
author={Chaladze, G and Kalatozishvili, L},
journal={chaladze. com},
year={2017}}
""",
        )

    def _split_generators(self, dl_manager):
        """Download the RAR archive once and declare the train and test splits.

        Both splits read from the same downloaded archive; the ``split``
        keyword tells ``_generate_examples`` which members to keep.
        """
        archive_path = dl_manager.download("http://chaladze.com/l5/img/Linnaeus%205%20256X256.rar")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"archive_path": archive_path, "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"archive_path": archive_path, "split": "test"},
            ),
        ]

    def _generate_examples(self, archive_path, split):
        """Yield ``(key, example)`` pairs for the ``.jpg`` members of one split.

        Args:
            archive_path: Local path of the downloaded RAR archive.
            split: Name of the split directory to read (``"train"`` or ``"test"``).

        Yields:
            Tuples of the member's path inside the archive (used as the unique
            key) and a dict with the encoded image and its label name.
        """
        with rarfile.RarFile(archive_path) as rar:
            for member in rar.infolist():
                filename = member.filename
                if not filename.endswith(".jpg"):
                    continue
                parts = filename.split("/")
                # Match the split against whole directory components rather
                # than as a substring of the full path, so a file or class
                # name that merely contains "train"/"test" is not picked up.
                if split not in parts[:-1]:
                    continue
                # The label is read from the third path component, i.e. the
                # code expects members shaped like "<root>/<split>/<label>/<file>.jpg".
                label = parts[2]
                with rar.open(member) as file:
                    image_bytes = file.read()
                # Hand the Image feature a {"path", "bytes"} dict holding the
                # raw bytes; a BytesIO wrapper is not one of the value types
                # the Image feature encodes.
                yield (
                    filename,
                    {
                        "image": {"path": filename, "bytes": image_bytes},
                        "label": label,
                    },
                )