import multiprocessing
import os
import time
from collections.abc import Iterable, Mapping
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from types import MappingProxyType
from urllib.parse import urlparse
import datasets
import numpy as np
import pandas as pd
import rich.progress
from datasets import DownloadConfig
from filelock import FileLock
from loguru import logger as logging
from requests_cache import CachedSession
from rich.progress import (
BarColumn,
MofNCompleteColumn,
TextColumn,
TimeElapsedColumn,
TimeRemainingColumn,
)
from tqdm import tqdm
DEFAULT_CACHE_DIR = "~/.stable_datasets/"
def _default_dest_folder() -> Path:
"""Default folder where files are saved."""
return Path(os.path.expanduser(DEFAULT_CACHE_DIR)) / "downloads"
def _default_processed_cache_dir() -> Path:
"""Default folder where processed datasets (Arrow files) are cached."""
return Path(os.path.expanduser(DEFAULT_CACHE_DIR)) / "processed"
class BaseDatasetBuilder(datasets.GeneratorBasedBuilder):
"""
Base class for stable-datasets that enables direct dataset loading.
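
    Subclasses define `VERSION` and `SOURCE` (or override `_source`) and implement
    `_info` / `_generate_examples` as for any `datasets.GeneratorBasedBuilder`.
    A minimal sketch (the class name, URLs, and feature schema below are
    hypothetical)::

        import datasets
        import numpy as np

        class MyToyDataset(BaseDatasetBuilder):
            VERSION = datasets.Version("1.0.0")
            SOURCE = {
                "homepage": "https://example.com/toy",
                "citation": "...",  # BibTeX entry for the dataset
                "assets": {"train": "https://example.com/toy/train.npz"},
            }

            def _info(self):
                return datasets.DatasetInfo(
                    features=datasets.Features({"x": datasets.Value("float32")}),
                )

            def _generate_examples(self, data_path, split):
                for i, x in enumerate(np.load(data_path)["x"]):
                    yield i, {"x": float(x)}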
"""
# Subclasses must define:
# - VERSION: datasets.Version
#
# For dataset provenance / downloads, subclasses can either:
# - define a class attribute SOURCE (static), or
# - override _source(self) to compute it at runtime (e.g. from self.config)
VERSION: datasets.Version
SOURCE: Mapping
@staticmethod
def _freeze(obj):
"""
Recursively freeze basic Python containers to make SOURCE effectively immutable.
- dict / Mapping -> MappingProxyType(dict(...)) (shallowly immutable mapping)
- list / tuple -> tuple(...)
- set -> frozenset(...)
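
        Illustrative sketch (the values are arbitrary)::

            frozen = BaseDatasetBuilder._freeze({"assets": {"train": ["a.npz", "b.npz"]}})
            frozen["assets"]["train"]       # ("a.npz", "b.npz")
            frozen["assets"]["train"] = ()  # raises TypeError (item assignment not supported)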
"""
if isinstance(obj, MappingProxyType):
return obj
if isinstance(obj, Mapping):
# Create a fresh dict so callers can't retain a handle to the mutable original.
return MappingProxyType({k: BaseDatasetBuilder._freeze(v) for k, v in dict(obj).items()})
if isinstance(obj, list | tuple):
return tuple(BaseDatasetBuilder._freeze(v) for v in obj)
if isinstance(obj, set):
return frozenset(BaseDatasetBuilder._freeze(v) for v in obj)
return obj
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
# Don't validate the base class itself
if cls is BaseDatasetBuilder:
return
# Allow tests / internal helpers to opt out if needed
if getattr(cls, "_SKIP_SOURCE_VALIDATION", False):
return
# VERSION must exist and be a datasets.Version
if not hasattr(cls, "VERSION"):
raise TypeError(f"{cls.__name__} must define a class attribute VERSION = datasets.Version('x.y.z').")
if not isinstance(getattr(cls, "VERSION"), datasets.Version):
raise TypeError(f"{cls.__name__}.VERSION must be a datasets.Version instance.")
# Enforce that a source is provided either statically (SOURCE) or dynamically (_source override).
has_static_source = hasattr(cls, "SOURCE")
has_dynamic_source = cls._source is not BaseDatasetBuilder._source # overridden
if not (has_static_source or has_dynamic_source):
raise TypeError(
f"{cls.__name__} must define SOURCE = {{...}} or override _source(self) to compute it at runtime."
)
# Freeze static SOURCE at class creation time.
if has_static_source:
cls.SOURCE = cls._freeze(getattr(cls, "SOURCE"))
# If subclass overrides _source(), wrap it so the returned mapping is frozen/immutable.
if has_dynamic_source and not getattr(cls._source, "_stable_datasets_freezes_source", False):
original = cls._source
def _wrapped_source(self):
source = original(self)
return BaseDatasetBuilder._freeze(source)
_wrapped_source._stable_datasets_freezes_source = True # type: ignore[attr-defined]
cls._source = _wrapped_source # type: ignore[method-assign]
def _source(self) -> Mapping:
"""
Return dataset provenance / download configuration.
Default: uses a class attribute SOURCE (frozen into an immutable Mapping).
Override in subclasses when the source depends on runtime config (e.g. self.config.variant).
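
        A sketch of a runtime override (the URLs are hypothetical)::

            def _source(self):
                base = f"https://example.com/{self.config.name}"
                return {
                    "homepage": "https://example.com",
                    "citation": "...",
                    "assets": {
                        "train": f"{base}/train.npz",
                        "test": f"{base}/test.npz",
                    },
                }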
"""
if not hasattr(self.__class__, "SOURCE"):
raise TypeError(f"{self.__class__.__name__} does not define SOURCE and did not override _source().")
return getattr(self.__class__, "SOURCE")
@staticmethod
def _validate_source(source: Mapping) -> None:
if not isinstance(source, Mapping):
raise TypeError("source must be a mapping.")
# Required for provenance
if "homepage" not in source or source["homepage"] is None or not isinstance(source["homepage"], str):
raise TypeError("SOURCE['homepage'] must be a string and must be present.")
if "citation" not in source or source["citation"] is None or not isinstance(source["citation"], str):
raise TypeError("SOURCE['citation'] must be a string and must be present.")
# Required for downloads (even if a dataset overrides _split_generators).
if "assets" not in source or not isinstance(source["assets"], Mapping):
raise TypeError("SOURCE must contain a mapping-valued 'assets' key.")
def _split_generators(self, dl_manager):
"""
Default split generator implementation.
Most stable-datasets follow the pattern "one downloadable file per split", expressed
via `SOURCE["assets"]`. Datasets with different layouts can override this method.
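
        For example, a SOURCE with the following assets mapping (URLs hypothetical)
        yields one generator per split, each fed the local path of its downloaded
        file via `gen_kwargs`::

            SOURCE = {
                "homepage": "https://example.com/data",
                "citation": "...",
                "assets": {
                    "train": "https://example.com/data/train.zip",
                    "test": "https://example.com/data/test.zip",
                },
            }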
"""
source = self._source()
if not isinstance(source, Mapping):
raise TypeError(f"{self.__class__.__name__}._source() must return a mapping.")
self._validate_source(source)
assets = source["assets"]
if len(assets) == 0:
raise ValueError(f"{self.__class__.__name__}.SOURCE['assets'] is empty; cannot infer splits.")
split_names = list(assets.keys())
ordered_urls = [assets[s] for s in split_names]
# stable-datasets standardizes on our local bulk downloader (not HF dl_manager).
# Deduplicate URLs to avoid redundant downloads for datasets where all splits share a single file.
unique_urls = list(dict.fromkeys(ordered_urls))
download_dir = getattr(self, "_raw_download_dir", None)
if download_dir is None:
download_dir = _default_dest_folder()
unique_paths = bulk_download(unique_urls, dest_folder=download_dir)
url_to_path = dict(zip(unique_urls, unique_paths))
local_paths = [url_to_path[u] for u in ordered_urls]
split_to_path = dict(zip(split_names, local_paths))
name_map = {
"train": datasets.Split.TRAIN,
"test": datasets.Split.TEST,
"val": datasets.Split.VALIDATION,
}
return [
datasets.SplitGenerator(
name=name_map.get(split_name, split_name),
gen_kwargs={"data_path": split_to_path[split_name], "split": split_name},
)
for split_name in split_names
]
def __new__(cls, *args, split=None, processed_cache_dir=None, download_dir=None, **kwargs):
"""
Automatically download, prepare, and return the dataset for the specified split.
Args:
split: Dataset split to load (e.g., "train", "test", "validation"). If None,
loads all available splits and returns a DatasetDict.
processed_cache_dir: Cache directory for processed datasets (Arrow cache). If None,
defaults to ~/.stable_datasets/processed/.
download_dir: Directory for raw downloads (ZIP/NPZ/etc). If None, defaults to
~/.stable_datasets/downloads/.
**kwargs: Additional arguments passed to the dataset builder.
Returns:
Union[datasets.Dataset, datasets.DatasetDict]: The loaded dataset (single split)
or a DatasetDict (all splits).
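
        Example (a sketch; `MyToyDataset` is a hypothetical subclass)::

            train = MyToyDataset(split="train")   # datasets.Dataset
            all_splits = MyToyDataset()           # datasets.DatasetDict
            custom = MyToyDataset(
                split="test",
                processed_cache_dir="/tmp/arrow_cache",
                download_dir="/tmp/raw_downloads",
            )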
"""
instance = super().__new__(cls)
# 1) Decide cache locations
# Processed cache (Arrow)
if processed_cache_dir is None:
processed_cache_dir = str(_default_processed_cache_dir())
instance._processed_cache_dir = Path(processed_cache_dir)
# Raw downloads
if download_dir is None:
download_dir = str(_default_dest_folder())
instance._raw_download_dir = Path(download_dir)
# 2) Initialize builder with our processed cache_dir explicitly
instance.__init__(*args, cache_dir=str(processed_cache_dir), **kwargs)
# 2b) Validate dataset SOURCE contract early.
source = instance._source()
if not isinstance(source, Mapping):
raise TypeError(f"{cls.__name__}._source() must return a mapping.")
cls._validate_source(source)
# 3) Explicitly tell HF to use our processed cache_dir for any dl_manager downloads
download_config = DownloadConfig(cache_dir=str(processed_cache_dir))
instance.download_and_prepare(
download_config=download_config,
)
# 4) Load the split from the same cache_dir
if split is None:
result = instance.as_dataset()
else:
result = instance.as_dataset(split=split)
# Expose cache locations on the returned dataset object for convenience.
# Note: DatasetDict may not allow attribute assignment; ignore if not supported.
try:
setattr(result, "_stable_datasets_processed_cache_dir", instance._processed_cache_dir)
except Exception:
pass
try:
setattr(result, "_stable_datasets_download_dir", instance._raw_download_dir)
except Exception:
pass
return result
def bulk_download(
urls: Iterable[str],
dest_folder: str | Path,
backend: str = "filesystem",
cache_dir: str = DEFAULT_CACHE_DIR,
) -> list[Path]:
"""
Download multiple files concurrently and return their local paths.
Args:
urls: Iterable of URL strings to download.
dest_folder: Destination folder for downloads.
backend: requests_cache backend (e.g. "filesystem").
cache_dir: Cache directory for requests_cache.
Returns:
list[Path]: Local file paths in the same order as the input URLs.
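
    Example (a sketch; the URLs are hypothetical)::

        paths = bulk_download(
            ["https://example.com/a.zip", "https://example.com/b.zip"],
            dest_folder="/tmp/stable_datasets_downloads",
        )
        # paths[0].name == "a.zip", paths[1].name == "b.zip"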
"""
urls = list(urls)
num_workers = len(urls)
if num_workers == 0:
return []
dest_folder = Path(dest_folder)
dest_folder.mkdir(parents=True, exist_ok=True)
filenames = [os.path.basename(urlparse(url).path) for url in urls]
results: list[Path] = []
with rich.progress.Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
MofNCompleteColumn(),
TextColumn("•"),
TimeElapsedColumn(),
TextColumn("•"),
TimeRemainingColumn(),
refresh_per_second=5,
) as progress:
futures = []
with multiprocessing.Manager() as manager:
_progress = manager.dict() # shared between worker processes
with ProcessPoolExecutor(max_workers=num_workers) as executor:
# submit one download task per URL
for i in range(num_workers):
task_id = filenames[i]
future = executor.submit(
download,
urls[i],
dest_folder,
backend,
cache_dir,
False, # disable per-file tqdm; Rich handles progress
_progress,
task_id,
)
futures.append(future)
rich_tasks = {}
# update Rich progress while downloads are running
while not all(f.done() for f in futures):
for task_id in list(_progress.keys()):
prog = _progress[task_id]
if task_id not in rich_tasks:
rich_tasks[task_id] = progress.add_task(
f"[green]{task_id}",
total=prog["total"],
visible=True,
)
progress.update(
rich_tasks[task_id],
completed=prog["progress"],
)
                    time.sleep(0.01)
                # Final sync so finished bars reflect the last reported byte counts.
                for task_id, rich_task in rich_tasks.items():
                    progress.update(rich_task, completed=_progress[task_id]["progress"])
# collect results in the same order as urls
for future in futures:
results.append(future.result())
return results
def download(
url: str,
dest_folder: str | Path | None = None,
backend: str = "filesystem",
cache_dir: str = DEFAULT_CACHE_DIR,
progress_bar: bool = True,
_progress_dict=None,
_task_id=None,
) -> Path:
"""
Download a single file from a URL with caching and optional progress tracking.
Args:
url: URL to download from.
dest_folder: Destination folder for the downloaded file. If None,
defaults to ~/.stable_datasets/downloads/.
backend: requests_cache backend (e.g. "filesystem").
cache_dir: Cache directory for requests_cache.
progress_bar: Whether to show a tqdm progress bar (for standalone use).
_progress_dict: Internal shared dict for bulk_download progress reporting.
_task_id: Internal task ID key for bulk_download progress reporting.
Returns:
Path: Local path to the downloaded file.
Raises:
Exception: Any exception from network/file operations is logged and re-raised.
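
    Example (a sketch; the URL is hypothetical)::

        path = download("https://example.com/data/train.zip", dest_folder="/tmp/downloads")
        # path == Path("/tmp/downloads/train.zip")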
"""
try:
if dest_folder is None:
dest_folder = _default_dest_folder()
dest_folder = Path(dest_folder)
dest_folder.mkdir(parents=True, exist_ok=True)
filename = os.path.basename(urlparse(url).path)
local_filename = dest_folder / filename
lock_filename = dest_folder / f"{filename}.lock"
# prevent concurrent downloads of the same file
with FileLock(lock_filename):
session = CachedSession(cache_dir, backend=backend)
logging.info(f"Downloading: {url}")
head = session.head(url)
total_size = int(head.headers.get("content-length", 0) or 0)
logging.info(f"Total size: {total_size} bytes")
            response = session.get(url, stream=True)
            response.raise_for_status()  # fail fast on HTTP errors instead of writing an error body to disk
            downloaded = 0
with (
open(local_filename, "wb") as f,
tqdm(
desc=local_filename.name,
total=total_size or None, # None if size unknown
unit="B",
unit_scale=True,
unit_divisor=1024,
disable=not progress_bar,
) as bar,
):
for chunk in response.iter_content(chunk_size=8192):
if not chunk:
continue
f.write(chunk)
downloaded += len(chunk)
bar.update(len(chunk))
if _progress_dict is not None and _task_id is not None:
_progress_dict[_task_id] = {
"progress": downloaded,
"total": total_size,
}
if total_size and downloaded != total_size:
logging.error(f"Download incomplete: got {downloaded} of {total_size} bytes for {url}")
else:
logging.info(f"Download finished: {local_filename}")
return local_filename
except Exception as e:
logging.error(f"Error downloading {url}: {e}")
raise e
def load_from_tsfile_to_dataframe(
full_file_path_and_name,
return_separate_X_and_y=True,
replace_missing_vals_with="NaN",
):
"""Load data from a .ts file into a Pandas DataFrame.
Credit to https://github.com/sktime/sktime/blob/7d572796ec519c35d30f482f2020c3e0256dd451/sktime/datasets/_data_io.py#L379
Parameters
----------
full_file_path_and_name: str
The full pathname of the .ts file to read.
    return_separate_X_and_y: bool
        If True, return the time-series values as a DataFrame (X) and the
        class values as a separate numpy array (y); if False, return a single
        DataFrame. This is only relevant for data that has class labels.
replace_missing_vals_with: str
The value that missing values in the text file should be replaced
with prior to parsing.
Returns
-------
    (DataFrame, ndarray)
        If return_separate_X_and_y is True, a tuple containing a DataFrame of
        the time-series and a numpy array of the corresponding class values.
    DataFrame
        If return_separate_X_and_y is False, a single DataFrame containing
        all time-series and (if relevant) a column "class_vals" with the
associated class values.
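
    Examples
    --------
    A sketch for a labelled .ts file (the path is hypothetical)::

        X, y = load_from_tsfile_to_dataframe("GunPoint_TRAIN.ts")
        # X: DataFrame with one "dim_<i>" column per dimension; each cell is a pd.Series
        # y: numpy array with one class value per case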
"""
# Initialize flags and variables used when parsing the file
metadata_started = False
data_started = False
has_problem_name_tag = False
has_timestamps_tag = False
has_univariate_tag = False
has_class_labels_tag = False
has_data_tag = False
previous_timestamp_was_int = None
prev_timestamp_was_timestamp = None
num_dimensions = None
is_first_case = True
instance_list = []
class_val_list = []
line_num = 0
# Parse the file
with open(full_file_path_and_name, encoding="utf-8") as file:
for line in file:
# Strip white space from start/end of line and change to
# lowercase for use below
line = line.strip().lower()
# Empty lines are valid at any point in a file
if line:
# Check if this line contains metadata
# Please note that even though metadata is stored in this
# function it is not currently published externally
if line.startswith("@problemname"):
# Check that the data has not started
if data_started:
raise OSError("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(" ")
token_len = len(tokens)
if token_len == 1:
raise OSError("problemname tag requires an associated value")
# problem_name = line[len("@problemname") + 1:]
has_problem_name_tag = True
metadata_started = True
elif line.startswith("@timestamps"):
# Check that the data has not started
if data_started:
raise OSError("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(" ")
token_len = len(tokens)
if token_len != 2:
raise OSError("timestamps tag requires an associated Boolean value")
elif tokens[1] == "true":
timestamps = True
elif tokens[1] == "false":
timestamps = False
else:
raise OSError("invalid timestamps value")
has_timestamps_tag = True
metadata_started = True
elif line.startswith("@univariate"):
# Check that the data has not started
if data_started:
raise OSError("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(" ")
token_len = len(tokens)
if token_len != 2:
raise OSError("univariate tag requires an associated Boolean value")
elif tokens[1] == "true":
# univariate = True
pass
elif tokens[1] == "false":
# univariate = False
pass
else:
raise OSError("invalid univariate value")
has_univariate_tag = True
metadata_started = True
elif line.startswith("@classlabel"):
# Check that the data has not started
if data_started:
raise OSError("metadata must come before data")
# Check that the associated value is valid
tokens = line.split(" ")
token_len = len(tokens)
if token_len == 1:
raise OSError("classlabel tag requires an associated Boolean value")
if tokens[1] == "true":
class_labels = True
elif tokens[1] == "false":
class_labels = False
else:
raise OSError("invalid classLabel value")
# Check if we have any associated class values
if token_len == 2 and class_labels:
raise OSError("if the classlabel tag is true then class values must be supplied")
has_class_labels_tag = True
class_label_list = [token.strip() for token in tokens[2:]]
metadata_started = True
elif line.startswith("@targetlabel"):
if data_started:
raise OSError("metadata must come before data")
tokens = line.split(" ")
token_len = len(tokens)
if token_len == 1:
raise OSError("targetlabel tag requires an associated Boolean value")
if tokens[1] == "true":
class_labels = True
elif tokens[1] == "false":
class_labels = False
else:
raise OSError("invalid targetlabel value")
if token_len > 2:
raise OSError(
"targetlabel tag should not be accompanied with info "
"apart from true/false, but found "
f"{tokens}"
)
has_class_labels_tag = True
metadata_started = True
# Check if this line contains the start of data
elif line.startswith("@data"):
if line != "@data":
raise OSError("data tag should not have an associated value")
if data_started and not metadata_started:
raise OSError("metadata must come before data")
else:
has_data_tag = True
data_started = True
                # If the @data tag has been found then metadata has been
                # parsed and data can be loaded
elif data_started:
# Check that a full set of metadata has been provided
if (
not has_problem_name_tag
or not has_timestamps_tag
or not has_univariate_tag
or not has_class_labels_tag
or not has_data_tag
):
raise OSError("a full set of metadata has not been provided before the data")
# Replace any missing values with the value specified
line = line.replace("?", replace_missing_vals_with)
# Check if we are dealing with data that has timestamps
if timestamps:
# We're dealing with timestamps so cannot just split
# line on ':' as timestamps may contain one
has_another_value = False
has_another_dimension = False
timestamp_for_dim = []
values_for_dimension = []
this_line_num_dim = 0
line_len = len(line)
char_num = 0
while char_num < line_len:
# Move through any spaces
while char_num < line_len and str.isspace(line[char_num]):
char_num += 1
                            # See if there is any more data to read in or if
                            # we should validate what has been read thus far
if char_num < line_len:
# See if we have an empty dimension (i.e. no
# values)
if line[char_num] == ":":
if len(instance_list) < (this_line_num_dim + 1):
instance_list.append([])
instance_list[this_line_num_dim].append(pd.Series(dtype="object"))
this_line_num_dim += 1
has_another_value = False
has_another_dimension = True
timestamp_for_dim = []
values_for_dimension = []
char_num += 1
else:
# Check if we have reached a class label
if line[char_num] != "(" and class_labels:
class_val = line[char_num:].strip()
if class_val not in class_label_list:
raise OSError(
"the class value '"
+ class_val
+ "' on line "
+ str(line_num + 1)
+ " is not "
"valid"
)
class_val_list.append(class_val)
char_num = line_len
has_another_value = False
has_another_dimension = False
timestamp_for_dim = []
values_for_dimension = []
else:
# Read in the data contained within
# the next tuple
if line[char_num] != "(" and not class_labels:
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " does "
"not "
"start "
"with a "
"'('"
)
char_num += 1
tuple_data = ""
while char_num < line_len and line[char_num] != ")":
tuple_data += line[char_num]
char_num += 1
if char_num >= line_len or line[char_num] != ")":
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " does "
"not end"
" with a "
"')'"
)
# Read in any spaces immediately
# after the current tuple
char_num += 1
while char_num < line_len and str.isspace(line[char_num]):
char_num += 1
# Check if there is another value or
# dimension to process after this tuple
if char_num >= line_len:
has_another_value = False
has_another_dimension = False
elif line[char_num] == ",":
has_another_value = True
has_another_dimension = False
elif line[char_num] == ":":
has_another_value = False
has_another_dimension = True
char_num += 1
# Get the numeric value for the
# tuple by reading from the end of
# the tuple data backwards to the
# last comma
last_comma_index = tuple_data.rfind(",")
if last_comma_index == -1:
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " contains a tuple that has "
"no comma inside of it"
)
try:
value = tuple_data[last_comma_index + 1 :]
value = float(value)
except ValueError:
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " contains a tuple that does "
"not have a valid numeric "
"value"
)
# Check the type of timestamp that
# we have
timestamp = tuple_data[0:last_comma_index]
try:
timestamp = int(timestamp)
timestamp_is_int = True
timestamp_is_timestamp = False
except ValueError:
timestamp_is_int = False
if not timestamp_is_int:
try:
timestamp = timestamp.strip()
timestamp_is_timestamp = True
except ValueError:
timestamp_is_timestamp = False
# Make sure that the timestamps in
# the file (not just this dimension
# or case) are consistent
if not timestamp_is_timestamp and not timestamp_is_int:
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " contains a tuple that "
"has an invalid timestamp '" + timestamp + "'"
)
if (
previous_timestamp_was_int is not None
and previous_timestamp_was_int
and not timestamp_is_int
):
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " contains tuples where the "
"timestamp format is "
"inconsistent"
)
if (
prev_timestamp_was_timestamp is not None
and prev_timestamp_was_timestamp
and not timestamp_is_timestamp
):
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " contains tuples where the "
"timestamp format is "
"inconsistent"
)
# Store the values
timestamp_for_dim += [timestamp]
values_for_dimension += [value]
# If this was our first tuple then
# we store the type of timestamp we
# had
if prev_timestamp_was_timestamp is None and timestamp_is_timestamp:
prev_timestamp_was_timestamp = True
previous_timestamp_was_int = False
if previous_timestamp_was_int is None and timestamp_is_int:
prev_timestamp_was_timestamp = False
previous_timestamp_was_int = True
# See if we should add the data for
# this dimension
if not has_another_value:
if len(instance_list) < (this_line_num_dim + 1):
instance_list.append([])
if timestamp_is_timestamp:
timestamp_for_dim = pd.DatetimeIndex(timestamp_for_dim)
instance_list[this_line_num_dim].append(
pd.Series(
index=timestamp_for_dim,
data=values_for_dimension,
)
)
this_line_num_dim += 1
timestamp_for_dim = []
values_for_dimension = []
elif has_another_value:
raise OSError(
"dimension " + str(this_line_num_dim + 1) + " on "
"line " + str(line_num + 1) + " ends with a ',' that "
"is not followed by "
"another tuple"
)
elif has_another_dimension and class_labels:
raise OSError(
"dimension " + str(this_line_num_dim + 1) + " on "
"line " + str(line_num + 1) + " ends with a ':' while "
"it should list a class "
"value"
)
elif has_another_dimension and not class_labels:
if len(instance_list) < (this_line_num_dim + 1):
instance_list.append([])
instance_list[this_line_num_dim].append(pd.Series(dtype=np.float32))
this_line_num_dim += 1
num_dimensions = this_line_num_dim
# If this is the 1st line of data we have seen
# then note the dimensions
if not has_another_value and not has_another_dimension:
if num_dimensions is None:
num_dimensions = this_line_num_dim
if num_dimensions != this_line_num_dim:
raise OSError(
"line " + str(line_num + 1) + " does not have the "
"same number of "
"dimensions as the "
"previous line of "
"data"
)
                        # Check that we are not expecting any more data and,
                        # if not, store what has been processed above
if has_another_value:
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " ends with a ',' that is "
"not followed by another "
"tuple"
)
elif has_another_dimension and class_labels:
raise OSError(
"dimension "
+ str(this_line_num_dim + 1)
+ " on line "
+ str(line_num + 1)
+ " ends with a ':' while it "
"should list a class value"
)
elif has_another_dimension and not class_labels:
if len(instance_list) < (this_line_num_dim + 1):
instance_list.append([])
instance_list[this_line_num_dim].append(pd.Series(dtype="object"))
this_line_num_dim += 1
num_dimensions = this_line_num_dim
# If this is the 1st line of data we have seen then
# note the dimensions
if not has_another_value and num_dimensions != this_line_num_dim:
raise OSError(
"line " + str(line_num + 1) + " does not have the same "
"number of dimensions as the "
"previous line of data"
)
# Check if we should have class values, and if so
# that they are contained in those listed in the
# metadata
if class_labels and len(class_val_list) == 0:
raise OSError("the cases have no associated class values")
else:
dimensions = line.split(":")
# If first row then note the number of dimensions (
# that must be the same for all cases)
if is_first_case:
num_dimensions = len(dimensions)
if class_labels:
num_dimensions -= 1
for _dim in range(0, num_dimensions):
instance_list.append([])
is_first_case = False
                        # See how many dimensions the case represented in
                        # this line has
this_line_num_dim = len(dimensions)
if class_labels:
this_line_num_dim -= 1
# All dimensions should be included for all series,
# even if they are empty
if this_line_num_dim != num_dimensions:
raise OSError(
"inconsistent number of dimensions. "
"Expecting " + str(num_dimensions) + " but have read " + str(this_line_num_dim)
)
# Process the data for each dimension
for dim in range(0, num_dimensions):
dimension = dimensions[dim].strip()
if dimension:
data_series = dimension.split(",")
data_series = [float(i) for i in data_series]
instance_list[dim].append(pd.Series(data_series))
else:
instance_list[dim].append(pd.Series(dtype="object"))
if class_labels:
class_val_list.append(dimensions[num_dimensions].strip())
line_num += 1
# Check that the file was not empty
if line_num:
# Check that the file contained both metadata and data
if metadata_started and not (
has_problem_name_tag
and has_timestamps_tag
and has_univariate_tag
and has_class_labels_tag
and has_data_tag
):
raise OSError("metadata incomplete")
elif metadata_started and not data_started:
raise OSError("file contained metadata but no data")
elif metadata_started and data_started and len(instance_list) == 0:
raise OSError("file contained metadata but no data")
# Create a DataFrame from the data parsed above
data = pd.DataFrame(dtype=np.float32)
for dim in range(0, num_dimensions):
data["dim_" + str(dim)] = instance_list[dim]
# Check if we should return any associated class labels separately
if class_labels:
if return_separate_X_and_y:
return data, np.asarray(class_val_list)
else:
data["class_vals"] = pd.Series(class_val_list)
return data
else:
return data
else:
raise OSError("empty file")