Source code for stable_datasets.images.hasy_v2

import os
import tempfile

import datasets
import pandas as pd
from PIL import Image


class HASYv2(datasets.GeneratorBasedBuilder):
    """Dataset builder for the HASYv2 handwritten symbol dataset.

    The HASYv2 dataset contains handwritten symbol images of 369 classes.
    Each image is 32x32 pixels in size.

    The builder downloads the HASYv2 archive and exposes the ``fold-1``
    train/test CSV files of the classification task as the ``train`` and
    ``test`` splits.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: features, homepage and citation."""
        return datasets.DatasetInfo(
            description=(
                "The HASYv2 dataset contains 32x32 black-and-white images of 369 "
                "handwritten symbol classes. It includes over 168,236 samples "
                "categorized into various classes like Latin characters, "
                "numerals, and symbols."
            ),
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=self._labels()),
                }
            ),
            supervised_keys=("image", "label"),
            homepage="https://github.com/MartinThoma/HASY",
            citation=(
                "@article{thoma2017hasyv2, title={The hasyv2 dataset}, "
                "author={Thoma, Martin}, "
                "journal={arXiv preprint arXiv:1701.08380}, year={2017}}"
            ),
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive and declare the train/test splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list with one ``SplitGenerator`` for ``TRAIN`` and one for
            ``TEST``, both pointing at the ``classification-task/fold-1`` CSVs.
        """
        url = "https://zenodo.org/record/259444/files/HASYv2.tar.bz2?download=1"
        archive_path = dl_manager.download_and_extract(url)
        fold_1_dir = os.path.join(archive_path, "classification-task/fold-1")

        split_files = (
            (datasets.Split.TRAIN, "train.csv"),
            (datasets.Split.TEST, "test.csv"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "csv_path": os.path.join(fold_1_dir, csv_name),
                    "base_dir": archive_path,
                },
            )
            for split_name, csv_name in split_files
        ]

    def _generate_examples(self, csv_path, base_dir):
        """Yield ``(key, example)`` pairs for every row of a split CSV.

        Args:
            csv_path: Path to a CSV file with at least the columns ``path``
                and ``symbol_id``.
            base_dir: Directory that the image paths are resolved against
                after their leading ``../`` components are removed.

        Yields:
            Tuples ``(idx, {"image": <grayscale PIL image>, "label": <str>})``
            where ``idx`` is the row's index in the CSV data frame.
        """
        df = pd.read_csv(csv_path)

        # Iterate only over the columns that are needed instead of building a
        # full Series per row with ``iterrows``.
        for idx, relative_path, symbol_id in zip(df.index, df["path"], df["symbol_id"]):
            image_path = os.path.join(base_dir, self._strip_parent_refs(relative_path))

            # ``convert`` returns an independent in-memory image, so the source
            # file handle can be closed before the example is yielded.
            with Image.open(image_path) as source_image:
                grayscale_image = source_image.convert("L")

            # Hand the PIL image straight to the ``Image`` feature; no temporary
            # file is written, so nothing is left behind on disk.
            yield (
                idx,
                {
                    "image": grayscale_image,
                    "label": str(symbol_id),  # ClassLabel names are strings
                },
            )

    @staticmethod
    def _strip_parent_refs(relative_path):
        """Remove leading ``../`` components from *relative_path*.

        Unlike ``str.lstrip("../../")``, which strips any run of ``.`` and
        ``/`` characters, this removes whole ``../`` prefixes only, so a first
        path component that itself begins with ``.`` is left intact.
        """
        parent_ref = "../"
        while relative_path.startswith(parent_ref):
            relative_path = relative_path[len(parent_ref):]
        return relative_path

    @staticmethod
    def _labels():
        """Return the symbol ids, as strings, used as ``ClassLabel`` names."""
        return [
            "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
            "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
            "51", "52", "53", "54", "55", "56", "59", "70", "71", "72",
            "73", "74", "75", "76", "77", "78", "79", "81", "82", "87",
            "88", "89", "90", "91", "92", "93", "94", "95", "96", "97",
            "98", "99", "100", "101", "102", "103", "104", "105", "106", "107",
            "108", "110", "111", "112", "113", "114", "115", "116", "117", "150",
            "151", "152", "153", "154", "155", "156", "157", "158", "159", "160",
            "161", "162", "163", "164", "165", "166", "167", "168", "169", "170",
            "171", "174", "175", "176", "177", "178", "179", "180", "181", "182",
            "183", "184", "185", "186", "187", "188", "189", "190", "191", "192",
            "193", "194", "195", "196", "197", "254", "257", "259", "260", "261",
            "262", "263", "264", "265", "266", "267", "268", "269", "508", "510",
            "511", "512", "513", "514", "517", "520", "521", "523", "524", "526",
            "527", "528", "529", "530", "531", "532", "533", "534", "535", "536",
            "537", "538", "539", "540", "541", "542", "544", "549", "550", "553",
            "555", "562", "564", "574", "577", "582", "583", "584", "591", "595",
            "600", "601", "603", "604", "605", "607", "608", "609", "610", "611",
            "612", "613", "614", "615", "616", "617", "618", "620", "621", "622",
            "630", "631", "634", "635", "636", "639", "640", "644", "647", "650",
            "661", "671", "678", "679", "683", "684", "698", "711", "712", "713",
            "716", "728", "739", "741", "743", "748", "751", "753", "756", "757",
            "758", "759", "761", "762", "763", "764", "765", "767", "768", "770",
            "771", "775", "777", "778", "783", "785", "786", "788", "791", "792",
            "801", "809", "812", "817", "822", "823", "827", "837", "838", "881",
            "882", "884", "885", "886", "887", "888", "889", "890", "891", "892",
            "894", "901", "912", "913", "914", "915", "916", "917", "918", "919",
            "920", "921", "922", "923", "924", "934", "936", "941", "943", "944",
            "945", "946", "947", "948", "949", "950", "951", "953", "956", "957",
            "958", "959", "960", "965", "968", "971", "972", "973", "974", "977",
            "992", "993", "994", "995", "996", "997", "998", "999", "1000", "1004",
            "1005", "1006", "1007", "1008", "1010", "1011", "1012", "1013", "1016", "1018",
            "1019", "1031", "1037", "1042", "1045", "1046", "1051", "1053", "1062", "1064",
            "1065", "1066", "1074", "1075", "1077", "1078", "1079", "1080", "1082", "1086",
            "1090", "1093", "1101", "1102", "1103", "1111", "1112", "1115", "1116", "1117",
            "1168", "1169", "1177", "1184", "1185", "1187", "1314", "1315", "1316", "1317",
            "1369", "1371", "1374", "1382", "1385", "1394", "1395", "1396", "1400",
        ]