# Source code for darts.dataprocessing.transformers.static_covariates_transformer

"""
Static Covariates Transformer
------
"""

from collections import OrderedDict
from typing import Any, Dict, List, Optional, Sequence, Tuple

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

from darts.logging import get_logger, raise_log
from darts.timeseries import TimeSeries

from .fittable_data_transformer import FittableDataTransformer
from .invertible_data_transformer import InvertibleDataTransformer

logger = get_logger(__name__)


class StaticCovariatesTransformer(FittableDataTransformer, InvertibleDataTransformer):
    def __init__(
        self,
        transformer_num=None,
        transformer_cat=None,
        cols_num: Optional[List[str]] = None,
        cols_cat: Optional[List[str]] = None,
        name="StaticCovariatesTransformer",
        n_jobs: int = 1,
        verbose: bool = False,
    ):
        """Generic wrapper class for scalers/encoders/transformers of static covariates. This transformer acts
        only on static covariates of the series passed to ``fit()``, ``transform()``, ``fit_transform()``, and
        ``inverse_transform()``. It can both scale numerical features, as well as encode categorical features.

        The underlying ``transformer_num`` and ``transformer_cat`` have to implement the ``fit()``,
        ``transform()``, and ``inverse_transform()`` methods (typically from scikit-learn).

        By default, numerical and categorical columns/features are inferred and allocated to
        ``transformer_num`` and ``transformer_cat``, respectively. Alternatively, specify which columns to
        scale/transform with ``cols_num`` and ``cols_cat``.

        Both ``transformer_num`` and ``transformer_cat`` are fit globally on static covariate data from all
        series passed to :class:`StaticCovariatesTransformer.fit()`

        Parameters
        ----------
        transformer_num
            The transformer to transform numeric static covariate columns with. It must provide ``fit()``,
            ``transform()`` and ``inverse_transform()`` methods.
            Default: :class:`sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1))`; this will scale all
            values between 0 and 1.
        transformer_cat
            The encoder to transform categorical static covariate columns with. It must provide ``fit()``,
            ``transform()`` and ``inverse_transform()`` methods.
            Default: :class:`sklearn.preprocessing.OrdinalEncoder()`; this will convert categories
            into integer valued arrays where each integer stands for a specific category.
        cols_num
            Optionally, a list of column names for which to apply the numeric transformer ``transformer_num``.
            By default, the transformer will infer all numerical features based on types, and scale them with
            `transformer_num`. If an empty list, no column will be scaled.
        cols_cat
            Optionally, a list of column names for which to apply the categorical transformer
            `transformer_cat`. By default, the transformer will infer all categorical features based on types,
            and transform them with `transformer_cat`. If an empty list, no column will be transformed.
            The order of the names does not matter; columns are always processed in the order in which they
            appear in the static covariates.
        name
            A specific name for the :class:`StaticCovariatesTransformer`.
        n_jobs
            The number of jobs to run in parallel. Parallel jobs are created only when a
            ``Sequence[TimeSeries]`` is passed as input to a method, parallelising operations regarding
            different ``TimeSeries``. Defaults to `1` (sequential). Setting the parameter to `-1` means using
            all the available processors.
            Note: for a small amount of data, the parallelisation overhead could end up increasing the total
            required amount of time.
        verbose
            Optionally, whether to print operations progress

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> from darts import TimeSeries
        >>> from darts.dataprocessing.transformers import StaticCovariatesTransformer
        >>> static_covs = pd.DataFrame(data={"num": [0, 2, 1], "cat": ["a", "c", "b"]})
        >>> series = TimeSeries.from_values(
        >>>     values=np.random.random((10, 3)),
        >>>     columns=["comp1", "comp2", "comp3"],
        >>>     static_covariates=static_covs,
        >>> )
        >>> transformer = StaticCovariatesTransformer()
        >>> series_transformed = transformer.fit_transform(series)
        >>> print(series.static_covariates)
        static_covariates  num cat
        component
        comp1              0.0   a
        comp2              2.0   c
        comp3              1.0   b
        >>> print(series_transformed.static_covariates)
        static_covariates  num  cat
        component
        comp1              0.0  0.0
        comp2              1.0  2.0
        comp3              0.5  1.0
        """
        # Define fixed params (i.e. attributes defined before calling `super().__init__`):
        self.transformer_num = (
            MinMaxScaler() if transformer_num is None else transformer_num
        )
        self.transformer_cat = (
            OrdinalEncoder() if transformer_cat is None else transformer_cat
        )

        # Both wrapped objects must expose the three methods this class dispatches to.
        for transformer, transformer_name in zip(
            [self.transformer_num, self.transformer_cat],
            ["transformer_num", "transformer_cat"],
        ):
            if (
                not callable(getattr(transformer, "fit", None))
                or not callable(getattr(transformer, "transform", None))
                or not callable(getattr(transformer, "inverse_transform", None))
            ):
                raise_log(
                    ValueError(
                        f"The provided `{transformer_name}` object must have fit(), transform() and "
                        f"inverse_transform() methods"
                    ),
                    logger,
                )

        # numeric/categorical cols will be inferred at fitting time, if user did not set them
        self.cols_num, self.cols_cat = cols_num, cols_cat
        super().__init__(
            name=name,
            n_jobs=n_jobs,
            verbose=verbose,
            mask_components=False,
            global_fit=True,
        )

    #
    # Fitting Methods:
    #

    @staticmethod
    def ts_fit(
        series: Sequence[TimeSeries], params: Dict[str, Dict[str, Any]], *args, **kwargs
    ):
        """
        Collates static covariates of all provided `TimeSeries` and fits the following parameters:
            1. `transformer_num`, the fitted numerical static covariate transformer.
            2. `transformer_cat`, the fitted categorical static covariate transformer.
            3. `mask_num`, a dictionary containing two boolean arrays: one that indicates which
            components of the *untransformed* static covariates are numerical, and another that
            indicates which components of the *transformed* static covariates are numerical.
            4. `mask_cat`, a dictionary containing two boolean arrays: one that indicates which
            components of the *untransformed* static covariates are categorical, and another that
            indicates which components of the *transformed* static covariates are categorical.
            5. `col_map_cat`, a dictionary containing the mapping from untransformed to transformed
            categorical column names (key `"transform"`) and the reverse mapping
            (key `"inverse_transform"`).
            6. `n_cat_cols`, a dictionary that stores the number of categorical columns we should
            expect in the untransformed and in the transformed static covariates.
        """
        fixed_params = params["fixed"]
        transformer_num = fixed_params["transformer_num"]
        transformer_cat = fixed_params["transformer_cat"]
        cols_num = fixed_params["cols_num"]
        cols_cat = fixed_params["cols_cat"]

        # Collate static covariates of all `series`:
        stat_covs = pd.concat([s.static_covariates for s in series], axis=0)
        cols_num, cols_cat = StaticCovariatesTransformer._infer_static_cov_dtypes(
            stat_covs, cols_num, cols_cat
        )
        mask_num, mask_cat = StaticCovariatesTransformer._create_component_masks(
            stat_covs, cols_num, cols_cat
        )
        # The categorical transformer is fitted on `stat_covs[:, mask_cat]`, i.e. in the column order
        # of the static covariates. Everything downstream indexes `cols_cat` positionally (against
        # `transformer_cat.categories_` and against the masks), so re-derive it from the mask. This
        # makes user-supplied `cols_cat` insensitive to ordering and to names that are not present.
        cols_cat = stat_covs.columns[mask_cat]

        # Fit numerical and categorical static covariate transformers:
        stat_covs = stat_covs.to_numpy(copy=False)
        if mask_num.any():
            transformer_num = transformer_num.fit(stat_covs[:, mask_num])
        if mask_cat.any():
            transformer_cat = transformer_cat.fit(stat_covs[:, mask_cat])

        (
            cat_mapping,
            inv_cat_mapping,
        ) = StaticCovariatesTransformer._create_category_mappings(
            stat_covs, transformer_cat, mask_cat, cols_cat
        )
        (
            inv_mask_num,
            inv_mask_cat,
        ) = StaticCovariatesTransformer._create_inv_component_masks(
            mask_num, mask_cat, cat_mapping, cols_cat
        )

        # Store masks and category mappings for untransformed and transformed static covariates:
        mask_num_dict = {"transform": mask_num, "inverse_transform": inv_mask_num}
        mask_cat_dict = {"transform": mask_cat, "inverse_transform": inv_mask_cat}
        col_map_cat_dict = {
            "transform": cat_mapping,
            "inverse_transform": inv_cat_mapping,
        }

        # Count number of categorical features in untransformed and transformed static covariates:
        n_cat_cols = {
            method: len(col_map_cat_dict[method])
            for method in ("transform", "inverse_transform")
        }

        return {
            "transformer_num": transformer_num,
            "transformer_cat": transformer_cat,
            "mask_num": mask_num_dict,
            "mask_cat": mask_cat_dict,
            "col_map_cat": col_map_cat_dict,
            "n_cat_cols": n_cat_cols,
        }

    @staticmethod
    def _infer_static_cov_dtypes(
        stat_covs: pd.DataFrame,
        cols_num: Optional[Sequence[str]],
        cols_cat: Optional[Sequence[str]],
    ):
        """
        Returns a list of names of numerical static covariates and a list of names of
        categorical/ordinal static covariates. A list that was explicitly provided (i.e. is not
        `None`) is returned unchanged; otherwise the columns are inferred from the dtypes of
        `stat_covs` (numeric dtypes -> numerical, everything else -> categorical).
        """
        if cols_num is None:
            mask_num = stat_covs.columns.isin(
                stat_covs.select_dtypes(include=np.number).columns
            )
            cols_num = stat_covs.columns[mask_num]
        if cols_cat is None:
            mask_cat = stat_covs.columns.isin(
                stat_covs.select_dtypes(exclude=np.number).columns
            )
            cols_cat = stat_covs.columns[mask_cat]
        return cols_num, cols_cat

    @staticmethod
    def _create_component_masks(
        untransformed_stat_covs: pd.DataFrame,
        cols_num: Sequence[str],
        cols_cat: Sequence[str],
    ):
        """
        Returns a boolean array indicating which components of the UNTRANSFORMED `stat_covs` are
        numerical and a boolean array indicating which components of the UNTRANSFORMED `stat_covs`
        are categorical.

        It's important to recognise that these masks only apply to the UNTRANSFORMED static
        covariates since some transformations can generate multiple new components from a single
        component (e.g. one-hot encoding).
        """
        mask_num = untransformed_stat_covs.columns.isin(cols_num)
        mask_cat = untransformed_stat_covs.columns.isin(cols_cat)
        return mask_num, mask_cat

    @staticmethod
    def _create_category_mappings(
        untransformed_stat_covs: np.ndarray,
        transformer_cat,
        mask_cat: np.ndarray,
        cols_cat: Sequence[str],
    ):
        """
        Returns a mapping from the names of the untransformed categorical static covariates to the
        names of the transformed categorical static covariates (i.e. `col_map_cat`), as well as a
        mapping from the transformed categorical static covariate names to the untransformed ones
        (i.e. `inv_col_map_cat`). These mappings will be many-to-one/one-to-many if a transformation
        that generates multiple components from a single categorical variable is being used
        (e.g. one-hot encoding).

        `cols_cat` must list the categorical columns in the same order as they were passed to
        `transformer_cat.fit()`, since it is paired positionally with `transformer_cat.categories_`.
        """
        # If we don't have any categorical static covariates, don't need to generate mapping:
        if not mask_cat.any():
            return {}, {}

        # check how many features the transformer generates (transforming a single row is enough)
        n_cat_out = transformer_cat.transform(
            np.expand_dims(untransformed_stat_covs[0, mask_cat], 0)
        ).shape[-1]

        # transformer generates same number of features -> make a 1-1 column map
        if n_cat_out == mask_cat.sum():
            col_map_cat = inv_col_map_cat = OrderedDict(
                {col: [col] for col in cols_cat}
            )
            return col_map_cat, inv_col_map_cat

        # transformer generates more features (i.e. OneHotEncoder) -> create a 1-many column map
        col_map_cat = OrderedDict()
        inv_col_map_cat = OrderedDict()
        for col, categories in zip(cols_cat, transformer_cat.categories_):
            for cat in categories:
                if len(categories) > 1:
                    # several outputs for this column: disambiguate with a `<column>_<category>` name
                    cat_col_name = str(col) + "_" + str(cat)
                    inv_col_map_cat[cat_col_name] = [col]
                else:
                    inv_col_map_cat[cat] = [col]
            col_map_cat[col] = list(categories)
        return col_map_cat, inv_col_map_cat

    @staticmethod
    def _create_inv_component_masks(
        mask_num: np.ndarray,
        mask_cat: np.ndarray,
        cat_mapping: Dict[str, List[str]],
        cols_cat: Sequence[str],
    ):
        """
        Returns a boolean array indicating which components of the TRANSFORMED `stat_covs` are
        numerical and a boolean array indicating which components of the TRANSFORMED `stat_covs`
        are categorical.

        It's important to recognise that these masks only apply to the TRANSFORMED static covariates
        since some transformations can generate multiple new components from a single component
        (e.g. one-hot encoding).

        `cols_cat` must list the categorical columns in the same order as they appear in `mask_cat`.
        """
        # check how many categorical features were generated per categorical column after transforming the data
        cat_idx = 0
        inv_mask_num, inv_mask_cat = [], []
        for is_num, is_cat in zip(mask_num, mask_cat):
            if is_num:
                inv_mask_num.append(True)
                inv_mask_cat.append(False)
            elif is_cat:
                # some categorical encoders (OneHotEncoder) generate more features and we need to keep track of that
                cat_name = cols_cat[cat_idx]
                num_cat_outputs = len(cat_mapping[cat_name])
                inv_mask_num += num_cat_outputs * [False]
                inv_mask_cat += num_cat_outputs * [True]
                cat_idx += 1
            else:
                # don't scale this feature/column
                inv_mask_num.append(False)
                inv_mask_cat.append(False)

        inv_mask_num = np.array(inv_mask_num, dtype=bool)
        inv_mask_cat = np.array(inv_mask_cat, dtype=bool)
        return inv_mask_num, inv_mask_cat

    #
    # Transform and Inverse Transform Methods:
    #

    @staticmethod
    def ts_transform(
        series: TimeSeries, params: Dict[str, Any], *args, **kwargs
    ) -> TimeSeries:
        """Transforms the static covariates of `series` using the fitted parameters."""
        return StaticCovariatesTransformer._transform_static_covs(
            series, params["fitted"], method="transform"
        )

    @staticmethod
    def ts_inverse_transform(
        series: TimeSeries, params: Dict[str, Any], *args, **kwargs
    ) -> TimeSeries:
        """Inverse transforms the static covariates of `series` using the fitted parameters."""
        return StaticCovariatesTransformer._transform_static_covs(
            series, params["fitted"], method="inverse_transform"
        )

    @staticmethod
    def _transform_static_covs(
        series: TimeSeries,
        fitted_params: Dict[str, Any],
        method: Literal["transform", "inverse_transform"],
    ):
        """
        Transforms the static covariates of a `series` if `method = 'transform'`, and inverse
        transforms the static covariates of a `series` if `method = 'inverse_transform'`.

        `method` doubles as the key selecting the matching masks/mappings from `fitted_params` and
        as the name of the method invoked on the underlying transformers.
        """
        # Unpack parameters:
        transformer_num = fitted_params["transformer_num"]
        transformer_cat = fitted_params["transformer_cat"]
        mask_num = fitted_params["mask_num"][method]
        mask_cat = fitted_params["mask_cat"][method]
        col_map_cat = fitted_params["col_map_cat"][method]
        n_cat_cols = fitted_params["n_cat_cols"][method]

        vals_num, vals_cat = StaticCovariatesTransformer._extract_static_covs(
            series, mask_num, mask_cat
        )

        # Transform static covs:
        tr_out_num, tr_out_cat = None, None
        if mask_num.any():
            tr_out_num = getattr(transformer_num, method)(vals_num)
        if mask_cat.any():
            tr_out_cat = getattr(transformer_cat, method)(vals_cat)
            # sparse one hot encoding to dense array
            if isinstance(tr_out_cat, csr_matrix):
                tr_out_cat = tr_out_cat.toarray()

        # quick check if everything is in order
        n_vals_cat_cols = 0 if vals_cat is None else vals_cat.shape[1]
        if (method == "inverse_transform") and (n_vals_cat_cols != n_cat_cols):
            raise_log(
                ValueError(
                    f"Expected `{n_cat_cols}` categorical value columns but only encountered `{n_vals_cat_cols}`"
                ),
                logger,
            )

        series = StaticCovariatesTransformer._add_back_static_covs(
            series, tr_out_num, tr_out_cat, mask_num, mask_cat, col_map_cat
        )

        return series

    @staticmethod
    def _extract_static_covs(
        series: TimeSeries, mask_num: np.ndarray, mask_cat: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Extracts all static covariates from a `TimeSeries`, and then extracts the numerical and
        categorical components to transform from these static covariates.
        """
        vals = series.static_covariates_values(copy=False)
        return vals[:, mask_num], vals[:, mask_cat]

    @staticmethod
    def _add_back_static_covs(
        series: TimeSeries,
        vals_num: np.ndarray,
        vals_cat: np.ndarray,
        mask_num: np.ndarray,
        mask_cat: np.ndarray,
        col_map_cat: Dict[str, List[str]],
    ) -> TimeSeries:
        """
        Adds transformed static covariates back to original `TimeSeries` and returns the resulting
        `TimeSeries`. The categorical component mapping is used to correctly name categorical
        components with a one-to-many mapping between their untransformed and transformed versions
        (e.g. components generated using one-hot encoding).
        """
        data = {}
        idx_num, idx_cat = 0, 0
        static_cov_columns = []
        for col, is_num, is_cat in zip(
            series.static_covariates.columns, mask_num, mask_cat
        ):
            if is_num:  # numeric scaled column
                data[col] = vals_num[:, idx_num]
                static_cov_columns.append(col)
                idx_num += 1
            elif is_cat:  # categorical transformed column
                # covers one to one feature map (ordinal/label encoding) and one to multi feature (one hot encoding)
                for col_name in col_map_cat[col]:
                    if len(col_map_cat[col]) > 1:
                        col_name = str(col) + "_" + str(col_name)
                    # on inverse transform several encoded columns map to the same original
                    # column; only the first occurrence consumes a column of `vals_cat`
                    if col_name not in static_cov_columns:
                        data[col_name] = vals_cat[:, idx_cat]
                        static_cov_columns.append(col_name)
                        idx_cat += 1
            else:  # is_num and is_cat are False -> feature is not part of transformer, use original values
                data[col] = series.static_covariates[col]
                static_cov_columns.append(col)

        transformed_static_covs = pd.DataFrame(
            data,
            columns=static_cov_columns,
            index=series.static_covariates.index,
        )
        return series.with_static_covariates(transformed_static_covs)