Source code for darts.utils.data.training_dataset

"""
Training Datasets Base Classes
------------------------------
"""

from abc import ABC, abstractmethod
from typing import Dict, Optional, Tuple

import numpy as np
from torch.utils.data import Dataset

from darts import TimeSeries
from darts.logging import get_logger, raise_if_not

from .utils import CovariateType

logger = get_logger(__name__)
SampleIndexType = Tuple[int, int, int, int, int, int]


class TrainingDataset(ABC, Dataset):
    def __init__(self):
        """
        Super-class for all training datasets for torch models in Darts. These include

        * "PastCovariates" datasets (for PastCovariatesTorchModel): containing (past_target,
          past_covariates, static_covariates, future_target)
        * "FutureCovariates" datasets (for FutureCovariatesTorchModel): containing (past_target,
          future_covariates, static_covariates, future_target)
        * "DualCovariates" datasets (for DualCovariatesTorchModel): containing (past_target,
          historic_future_covariates, future_covariates, static_covariates, future_target)
        * "MixedCovariates" datasets (for MixedCovariatesTorchModel): containing (past_target,
          past_covariates, historic_future_covariates, future_covariates, static_covariates,
          future_target)
        * "SplitCovariates" datasets (for SplitCovariatesTorchModel): containing (past_target,
          past_covariates, future_covariates, static_covariates, future_target)

        The covariates are optional and can be `None`.

        This is meant to be used for training (or validation): all data except `future_target`
        represents model inputs, and `future_target` is the output the model is trained to predict.

        Darts `TorchForecastingModel`s can be fit from instances of `TrainingDataset` of the right
        type using the `fit_from_dataset()` method.

        `TrainingDataset` inherits from torch `Dataset`, meaning that the implementations have to
        provide the `__getitem__()` method.

        The datasets contain `np.ndarray` (and not `TimeSeries`), because training requires only the
        values; this gives big performance gains when slicing, by returning numpy views of the data
        underlying the `TimeSeries`.
        """
        self._index_memory: Dict = {}

    @abstractmethod
    def __len__(self) -> int:
        pass

    @abstractmethod
    def __getitem__(self, idx: int):
        pass

    def _memory_indexer(
        self,
        target_idx: int,
        target_series: TimeSeries,
        shift: int,
        input_chunk_length: int,
        output_chunk_length: int,
        end_of_output_idx: int,
        covariate_series: TimeSeries,
        covariate_type: CovariateType = CovariateType.NONE,
    ) -> SampleIndexType:
        """Returns the (start, end) indices for the past target, future target and covariates (sub sets)
        of the current sample `i` from `target_idx`.

        Works for all `TimeSeries` index types: `pd.DatetimeIndex`, `pd.RangeIndex` (and the deprecated
        Int64Index).

        When `target_idx` is observed for the first time, it stores the position of sample `0` within the
        full target time series and the (start, end) indices of all sub sets. This allows calculating the
        sub set indices of all future samples `i` by simply adjusting for the difference between the
        positions of sample `i` and sample `0`.

        Parameters
        ----------
        target_idx
            index of the current target TimeSeries.
        target_series
            current target TimeSeries.
        shift
            The number of time steps by which to shift the output chunks relative to the input chunks.
        input_chunk_length
            The length of the emitted past series.
        output_chunk_length
            The length of the emitted future output series.
        end_of_output_idx
            the index where the output chunk of the current sample ends in `target_series`.
        covariate_series
            current covariate TimeSeries.
        covariate_type
            the type of covariate to extract. Instance of `CovariateType`: One of (`CovariateType.PAST`,
            `CovariateType.FUTURE`, `CovariateType.NONE`).
        """
        covariate_start, covariate_end = None, None

        # the first time target_idx is observed
        if target_idx not in self._index_memory:
            start_of_output_idx = end_of_output_idx - output_chunk_length
            start_of_input_idx = start_of_output_idx - shift

            # select forecast point and target period, using the previously computed indexes
            future_start, future_end = (
                start_of_output_idx,
                start_of_output_idx + output_chunk_length,
            )

            # select input period; look at the `input_chunk_length` points after start of input
            past_start, past_end = (
                start_of_input_idx,
                start_of_input_idx + input_chunk_length,
            )

            if covariate_type is not CovariateType.NONE:
                # not CovariateType.FUTURE -> both CovariateType.PAST and CovariateType.HISTORIC_FUTURE
                start = (
                    future_start
                    if covariate_type is CovariateType.FUTURE
                    else past_start
                )
                end = future_end if covariate_type is CovariateType.FUTURE else past_end

                # we need to be careful with getting ranges and indexes:
                # to get the entire range: full_range = ts[:len(ts)]; to get the last index: last_idx = ts[len(ts) - 1]

                # extract the actual index value (respects datetime- and integer-based indexes; also from non-zero start)
                start_time = target_series.time_index[start]
                end_time = target_series.time_index[end - 1]

                raise_if_not(
                    start_time in covariate_series.time_index
                    and end_time in covariate_series.time_index,
                    f"Missing covariates; could not find {covariate_type.value} covariates in index value range: "
                    f"{start_time} - {end_time}.",
                )

                # extract the integer index position from the index value
                covariate_start = covariate_series.time_index.get_loc(start_time)
                covariate_end = covariate_series.time_index.get_loc(end_time) + 1

            # store position of initial sample and all relevant sub set indices
            self._index_memory[target_idx] = {
                "end_of_output_idx": end_of_output_idx,
                "past_target": (past_start, past_end),
                "future_target": (future_start, future_end),
                "covariate": (covariate_start, covariate_end),
            }
        else:
            # load position of initial sample and its sub set indices
            end_of_output_idx_last = self._index_memory[target_idx]["end_of_output_idx"]
            past_start, past_end = self._index_memory[target_idx]["past_target"]
            future_start, future_end = self._index_memory[target_idx]["future_target"]
            covariate_start, covariate_end = self._index_memory[target_idx]["covariate"]

            # evaluate how much the new sample needs to be shifted, and shift all indexes
            idx_shift = end_of_output_idx - end_of_output_idx_last
            past_start += idx_shift
            past_end += idx_shift
            future_start += idx_shift
            future_end += idx_shift
            covariate_start = (
                covariate_start + idx_shift if covariate_start is not None else None
            )
            covariate_end = (
                covariate_end + idx_shift if covariate_end is not None else None
            )

        return (
            past_start,
            past_end,
            future_start,
            future_end,
            covariate_start,
            covariate_end,
        )

class PastCovariatesTrainingDataset(TrainingDataset, ABC):
    def __init__(self):
        """
        Abstract class for a PastCovariatesTorchModel training dataset.

        It contains 4-tuples of `(past_target, past_covariate, static_covariates, future_target)`
        `np.ndarray`. The covariates are optional and can be `None`.
        """
        super().__init__()

    @abstractmethod
    def __getitem__(
        self, idx: int
    ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]:
        pass
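
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the darts source: a minimal concrete
# `PastCovariatesTrainingDataset` built on a single target series, with no past
# or static covariates. The class name and the sampling scheme below are
# assumptions made for illustration; darts ships its own concrete datasets.
class _ExamplePastCovariatesDataset(PastCovariatesTrainingDataset):
    def __init__(
        self,
        target_series: TimeSeries,
        input_chunk_length: int = 12,
        output_chunk_length: int = 1,
    ):
        super().__init__()
        self.target_series = target_series
        self.input_chunk_length = input_chunk_length
        self.output_chunk_length = output_chunk_length
        # one sample per admissible end position of the output chunk
        self.n_samples = (
            len(target_series) - input_chunk_length - output_chunk_length + 1
        )

    def __len__(self) -> int:
        return self.n_samples

    def __getitem__(
        self, idx: int
    ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]:
        # slice numpy views of the underlying values rather than TimeSeries objects
        values = self.target_series.values(copy=False)
        end_of_output_idx = len(self.target_series) - idx
        past_start, past_end, future_start, future_end, _, _ = self._memory_indexer(
            target_idx=0,
            target_series=self.target_series,
            shift=self.input_chunk_length,
            input_chunk_length=self.input_chunk_length,
            output_chunk_length=self.output_chunk_length,
            end_of_output_idx=end_of_output_idx,
            covariate_series=None,
            covariate_type=CovariateType.NONE,
        )
        past_target = values[past_start:past_end]
        future_target = values[future_start:future_end]
        # no past covariates and no static covariates in this sketch
        return past_target, None, None, future_target
# Such a dataset could then be passed to a PastCovariatesTorchModel through
# `fit_from_dataset()`, as described in the `TrainingDataset` docstring.
# ---------------------------------------------------------------------------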

class FutureCovariatesTrainingDataset(TrainingDataset, ABC):
    def __init__(self):
        """
        Abstract class for a FutureCovariatesTorchModel training dataset.

        It contains 4-tuples of `(past_target, future_covariate, static_covariates, future_target)`
        `np.ndarray`. The covariates are optional and can be `None`.
        """
        super().__init__()

    @abstractmethod
    def __getitem__(
        self, idx: int
    ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]:
        pass

class DualCovariatesTrainingDataset(TrainingDataset, ABC):
    def __init__(self):
        """
        Abstract class for a DualCovariatesTorchModel training dataset.

        It contains 5-tuples of `(past_target, historic_future_covariates, future_covariates,
        static_covariates, future_target)` `np.ndarray`. The covariates are optional and can be `None`.
        """
        super().__init__()

    @abstractmethod
    def __getitem__(self, idx: int) -> Tuple[
        np.ndarray,
        Optional[np.ndarray],
        Optional[np.ndarray],
        Optional[np.ndarray],
        np.ndarray,
    ]:
        pass

class MixedCovariatesTrainingDataset(TrainingDataset, ABC):
    def __init__(self):
        """
        Abstract class for a MixedCovariatesTorchModel training dataset.

        It contains 6-tuples of `(past_target, past_covariates, historic_future_covariates,
        future_covariates, static_covariates, future_target)` `np.ndarray`. The covariates are
        optional and can be `None`.
        """
        super().__init__()

    @abstractmethod
    def __getitem__(self, idx: int) -> Tuple[
        np.ndarray,
        Optional[np.ndarray],
        Optional[np.ndarray],
        Optional[np.ndarray],
        Optional[np.ndarray],
        np.ndarray,
    ]:
        pass

class SplitCovariatesTrainingDataset(TrainingDataset, ABC):
    def __init__(self):
        """
        Abstract class for a SplitCovariatesTorchModel training dataset.

        It contains 5-tuples of `(past_target, past_covariates, future_covariates, static_covariates,
        future_target)` `np.ndarray`. The covariates are optional and can be `None`.
        """
        super().__init__()

    @abstractmethod
    def __getitem__(self, idx: int) -> Tuple[
        np.ndarray,
        Optional[np.ndarray],
        Optional[np.ndarray],
        Optional[np.ndarray],
        np.ndarray,
    ]:
        pass