"""
Random Forest
-------------
A forecasting model using a random forest regression. It uses some of the target series' lags, as well as optionally
some covariate series lags in order to obtain a forecast.
See [1]_ for a reference around random forests.
The implementations is wrapped around `RandomForestRegressor
<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor>`_.
References
----------
.. [1] https://en.wikipedia.org/wiki/Random_forest
"""
from typing import Optional
from sklearn.ensemble import RandomForestRegressor
from darts.logging import get_logger
from darts.models.forecasting.regression_model import (
FUTURE_LAGS_TYPE,
LAGS_TYPE,
RegressionModel,
)
logger = get_logger(__name__)
[docs]class RandomForest(RegressionModel):
def __init__(
self,
lags: Optional[LAGS_TYPE] = None,
lags_past_covariates: Optional[LAGS_TYPE] = None,
lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None,
output_chunk_length: int = 1,
output_chunk_shift: int = 0,
add_encoders: Optional[dict] = None,
n_estimators: Optional[int] = 100,
max_depth: Optional[int] = None,
multi_models: Optional[bool] = True,
use_static_covariates: bool = True,
**kwargs,
):
"""Random Forest Model
Parameters
----------
lags
Lagged target `series` values used to predict the next time step/s.
If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0`
corresponds the first predicted time step of each sample. If `output_chunk_shift > 0`, then
lag `-1` translates to `-1 - output_chunk_shift` steps before the first prediction step.
If a list of integers, each value must be < 0. Uses only the specified values as lags.
If a dictionary, the keys correspond to the `series` component names (of the first series when
using multiple series) and the values correspond to the component lags (integer or list of integers). The
key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some
components are missing and the 'default_lags' key is not provided.
lags_past_covariates
Lagged `past_covariates` values used to predict the next time step/s.
If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`,
where `0` corresponds to the first predicted time step of each sample. If `output_chunk_shift > 0`, then
lag `-1` translates to `-1 - output_chunk_shift` steps before the first prediction step.
If a list of integers, each value must be < 0. Uses only the specified values as lags.
If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when
using multiple series) and the values correspond to the component lags (integer or list of integers). The
key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some
components are missing and the 'default_lags' key is not provided.
lags_future_covariates
Lagged `future_covariates` values used to predict the next time step/s. The lags are always relative to the
first step in the output chunk, even when `output_chunk_shift > 0`.
If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future`
future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` corresponds the first
predicted time step of each sample. If `output_chunk_shift > 0`, the position of negative lags differ from
those of `lags` and `lags_past_covariates`. In this case a future lag `-5` would point at the same
step as a target lag of `-5 + output_chunk_shift`.
If a list of integers, uses only the specified values as lags.
If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when
using multiple series) and the values correspond to the component lags (tuple or list of integers). The key
'default_lags' can be used to provide default lags for un-specified components. Raises and error if some
components are missing and the 'default_lags' key is not provided.
output_chunk_length
Number of time steps predicted at once (per chunk) by the internal model. It is not the same as forecast
horizon `n` used in `predict()`, which is the desired number of prediction points generated using a
one-shot- or autoregressive forecast. Setting `n <= output_chunk_length` prevents auto-regression. This is
useful when the covariates don't extend far enough into the future, or to prohibit the model from using
future values of past and / or future covariates for prediction (depending on the model's covariate
support).
output_chunk_shift
Optionally, the number of steps to shift the start of the output chunk into the future (relative to the
input chunk end). This will create a gap between the input (history of target and past covariates) and
output. If the model supports `future_covariates`, the `lags_future_covariates` are relative to the first
step in the shifted output chunk. Predictions will start `output_chunk_shift` steps after the end of the
target `series`. If `output_chunk_shift` is set, the model cannot generate autoregressive predictions
(`n > output_chunk_length`).
add_encoders
A large number of past and future covariates can be automatically generated with `add_encoders`.
This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
transform the generated covariates. This happens all under one hood and only needs to be specified at
model creation.
Read :meth:`SequentialEncoder <darts.dataprocessing.encoders.SequentialEncoder>` to find out more about
``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:
.. highlight:: python
.. code-block:: python
def encode_year(idx):
return (idx.year - 1950) / 50
add_encoders={
'cyclic': {'future': ['month']},
'datetime_attribute': {'future': ['hour', 'dayofweek']},
'position': {'past': ['relative'], 'future': ['relative']},
'custom': {'past': [encode_year]},
'transformer': Scaler(),
'tz': 'CET'
}
..
n_estimators : int
The number of trees in the forest.
max_depth : int
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all
leaves contain less than min_samples_split samples.
multi_models
If True, a separate model will be trained for each future lag to predict. If False, a single model
is trained to predict all the steps in 'output_chunk_length' (features lags are shifted back by
`output_chunk_length - n` for each step `n`). Default: True.
use_static_covariates
Whether the model should use static covariate information in case the input `series` passed to ``fit()``
contain static covariates. If ``True``, and static covariates are available at fitting time, will enforce
that all target `series` have the same static covariate dimensionality in ``fit()`` and ``predict()``.
**kwargs
Additional keyword arguments passed to `sklearn.ensemble.RandomForest`.
Examples
--------
>>> from darts.datasets import WeatherDataset
>>> from darts.models import RandomForest
>>> series = WeatherDataset().load()
>>> # predicting atmospheric pressure
>>> target = series['p (mbar)'][:100]
>>> # optionally, use past observed rainfall (pretending to be unknown beyond index 100)
>>> past_cov = series['rain (mm)'][:100]
>>> # optionally, use future temperatures (pretending this component is a forecast)
>>> future_cov = series['T (degC)'][:106]
>>> # random forest with 200 trees trained with MAE
>>> model = RandomForest(
>>> lags=12,
>>> lags_past_covariates=12,
>>> lags_future_covariates=[0,1,2,3,4,5],
>>> output_chunk_length=6,
>>> n_estimators=200,
>>> criterion="absolute_error",
>>> )
>>> model.fit(target, past_covariates=past_cov, future_covariates=future_cov)
>>> pred = model.predict(6)
>>> pred.values()
array([[1006.29805],
[1006.23675],
[1006.17325],
[1006.10295],
[1006.06505],
[1006.05465]])
"""
self.n_estimators = n_estimators
self.max_depth = max_depth
self.kwargs = kwargs
self.kwargs["n_estimators"] = self.n_estimators
self.kwargs["max_depth"] = self.max_depth
super().__init__(
lags=lags,
lags_past_covariates=lags_past_covariates,
lags_future_covariates=lags_future_covariates,
output_chunk_length=output_chunk_length,
output_chunk_shift=output_chunk_shift,
add_encoders=add_encoders,
multi_models=multi_models,
model=RandomForestRegressor(**kwargs),
use_static_covariates=use_static_covariates,
)