# Source code for tide.regressors

import datetime as dt

import pandas as pd
import numpy as np
from pandas import DatetimeIndex

from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.utils.validation import check_is_fitted

from tide.base import BaseSTL, TideBaseMixin
from tide.utils import check_and_return_dt_index_df, check_datetime_index

# Default keyword arguments for each supported autoregressive model, keyed by
# model name. SkSTLForecast.fit looks up its ``ar_model`` name here when no
# ``ar_kwargs`` were supplied by the user.
MODEL_DEFAULT_CONF = {"ARIMA": {"order": (1, 1, 0), "trend": "t"}}


def format_prophet_df(
    x: pd.Series | pd.DataFrame | pd.DatetimeIndex, y: pd.Series = None
) -> pd.DataFrame:
    df = pd.DataFrame()
    if y is not None:
        if not x.shape[0] == y.shape[0]:
            raise ValueError("x and y have incompatible shape")
        df["y"] = y.values

    if not isinstance(x, pd.DatetimeIndex):
        x = check_and_return_dt_index_df(x)
        df["ds"] = x.index.tz_localize(None)
        df[x.columns] = x.values
    elif isinstance(x, pd.DatetimeIndex):
        df["ds"] = x.tz_localize(None)
    else:
        raise ValueError(
            f"Invalid x. Was expecting an instance of DateTimeIndex"
            f"DataFrame or Series, got {type(x)}"
        )
    return df


class SkSTLForecast(RegressorMixin, BaseSTL):
    """
    A model designed for time series forecasting or backcasting (predicting
    past values).

    It applies seasonal-trend decomposition (STL) to the training data to
    capture both trend and seasonal patterns. The model then uses ARIMA or a
    custom autoregressive model to predict these components, as well as the
    overall observed variable.

    Parameters
    ----------
    period : int, str, or datetime.timedelta
        The period of the time series (e.g., daily, weekly, monthly, etc.).
        Can be an integer, string, or timedelta. This defines the seasonal
        periodicity for the STL decomposition.
    trend : int, str, or datetime.timedelta
        The length of the trend smoother. If an int is specified, it must be
        odd and larger than season. The statsmodels documentation indicates it
        is usually around 150% of season. Strongly depends on your time
        series.
    ar_model : str, optional
        Name of the autoregressive model used to predict the STL trend and
        periodic components. Must be one of the supported model names
        (currently only ``"ARIMA"``), which is also the default.
    seasonal : int, str, or datetime.timedelta, optional
        The seasonal component's smoothing parameter for STL. It defines how
        much the seasonal component is smoothed. If given as an integer, it
        must be an odd number. If None, a default value will be used.
    stl_kwargs : dict[str, float], optional
        Additional keyword arguments for the STL decomposition. These allow
        fine-tuning of the decomposition process.
        (https://www.statsmodels.org/stable/index.html)
    ar_kwargs : dict, optional
        Keyword arguments to be passed to the autoregressive model (e.g.,
        order for ARIMA). If None, the entry of ``MODEL_DEFAULT_CONF`` for
        ``ar_model`` is used.
    backcast : bool, optional
        If True, the model will be trained to backcast (predict the past),
        otherwise, it will perform standard forward forecasting.

    Attributes
    ----------
    forecaster_ : dict
        Dictionary containing the fitted forecaster for each feature in the
        time series.
    train_dat_end_ : pandas.Timestamp
        Timestamp of the last data point used in training (the earliest one
        when ``backcast`` is True, since the data is reversed before fitting).
    training_freq_ : pandas.tseries.offsets.BaseOffset
        Frequency of the training data, either provided explicitly or
        inferred.
    """

    def __init__(
        self,
        period: int | str | dt.timedelta = "24h",
        trend: int | str | dt.timedelta = "15d",
        ar_model: str = "ARIMA",
        seasonal: int | str | dt.timedelta = None,
        stl_kwargs: dict[str, float] = None,
        ar_kwargs: dict | None = None,
        backcast: bool = False,
    ):
        super().__init__(period, trend, seasonal, stl_kwargs)
        self.backcast = backcast
        self.ar_model = ar_model
        self.ar_kwargs = ar_kwargs

    def fit(
        self,
        X: pd.Index | pd.Series | pd.DataFrame,
        y: pd.Series | pd.DataFrame | None = None,
    ):
        """
        Fit one STL forecaster per column of ``y``.

        Parameters
        ----------
        X : pd.DatetimeIndex, pd.Series or pd.DataFrame
            Only validated; the model is trained on ``y`` alone.
        y : pd.Series or pd.DataFrame
            Time series to learn from, with a DatetimeIndex. Required.

        Returns
        -------
        SkSTLForecast
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        ImportError
            If statsmodels is not installed.
        KeyError
            If ``ar_model`` is not a supported model name.
        """
        # ``y`` keeps a default only for signature compatibility; it used to
        # default to a type expression by mistake ("=" instead of ":").
        if y is None:
            raise ValueError("y is required to fit SkSTLForecast")

        if not isinstance(X, pd.DatetimeIndex):
            X = check_and_return_dt_index_df(X)
        y = check_and_return_dt_index_df(y)

        try:
            from statsmodels.tsa.arima.model import ARIMA
            from statsmodels.tsa.forecasting.stl import STLForecast
        except ImportError as err:
            raise ImportError(
                "statsmodels is required for SkSTLForecast. "
                "Install it with: pip install python-tide[statsmodels]"
            ) from err

        model_map = {"ARIMA": ARIMA}
        try:
            ar_model = model_map[self.ar_model]
        except KeyError:
            # Same exception type as before, with an actionable message.
            raise KeyError(
                f"Unknown ar_model {self.ar_model!r}. "
                f"Available models: {sorted(model_map)}"
            ) from None

        if self.ar_kwargs is None:
            ar_kwargs = MODEL_DEFAULT_CONF[self.ar_model]
        else:
            ar_kwargs = self.ar_kwargs

        self._pre_fit(y)

        # Always store an offset object (or None): ``inferred_freq`` is a
        # plain string, while predict() compares this value with an index
        # frequency and divides a Timedelta by it.
        raw_freq = (
            y.index.freq if y.index.freq is not None else y.index.inferred_freq
        )
        self.training_freq_ = pd.tseries.frequencies.to_offset(raw_freq)

        if self.backcast:
            # Train on the time-reversed series so that "forecasting" walks
            # backwards in time.
            y = y.iloc[::-1]

        self.train_dat_end_ = y.index[-1]

        self.forecaster_ = {}
        for feat in y:
            self.forecaster_[feat] = STLForecast(
                endog=y[feat].to_numpy(),
                model=ar_model,
                model_kwargs=ar_kwargs,
                **self.stl_kwargs,
            ).fit()

        return self

    def predict(self, X: pd.DatetimeIndex | pd.Series | pd.DataFrame):
        """
        Forecast (or backcast) every fitted feature on the timestamps of ``X``.

        Parameters
        ----------
        X : pd.DatetimeIndex, pd.Series or pd.DataFrame
            Timestamps to predict. Only the index is used. It must lie
            strictly after the training data (strictly before when
            ``backcast`` is True) and share the training frequency.

        Returns
        -------
        pd.DataFrame
            One column per fitted feature, indexed by the requested
            timestamps sorted in increasing order.

        Raises
        ------
        ValueError
            If the frequency of ``X`` differs from the training frequency, or
            if ``X`` overlaps the training data or lies on the wrong side of
            it.
        """
        check_is_fitted(
            self,
            attributes=[
                "forecaster_",
                "train_dat_end_",
                "training_freq_",
            ],
        )

        if isinstance(X, pd.DatetimeIndex):
            check_datetime_index(X)
            X = X.to_frame()
        else:
            X = check_and_return_dt_index_df(X)

        # With exactly two timestamps the frequency is set explicitly from
        # their spacing.
        if X.index.shape[0] == 2:
            X.index.freq = pd.tseries.frequencies.to_offset(
                abs(X.index[-1] - X.index[0])
            )

        if X.index.shape[0] > 1 and X.index.freq != self.training_freq_:
            raise ValueError(
                f"Required prediction freq {X.index.freq} "
                f"differs from training_freq_ {self.training_freq_}"
            )

        if (self.backcast and X.index[-1] >= self.train_dat_end_) or (
            not self.backcast and X.index[0] <= self.train_dat_end_
        ):
            direction = "future" if self.backcast else "past"
            raise ValueError(
                f"Cannot forecast on {direction} values or training data. "
                f"{'Backcast' if self.backcast else 'Forecast'} can only happen "
                f"{'before' if self.backcast else 'after'} {self.train_dat_end_}"
            )

        # In backcast mode the model was trained on reversed data, so the
        # requested timestamps are walked from the most recent to the oldest.
        output_index = X.index[::-1] if self.backcast else X.index

        # Number of steps from the end of the training data up to the last
        # requested timestamp; the leading steps that fall in the gap between
        # training data and the request are discarded below.
        casting_steps = int(
            len(output_index)
            + abs(output_index[0] - self.train_dat_end_) / self.training_freq_
            - 1
        )
        steps_to_jump = casting_steps - len(output_index)

        inferred_df = pd.DataFrame(index=output_index)
        for feat, forecaster in self.forecaster_.items():
            cast = forecaster.forecast(casting_steps)
            inferred_df[feat] = cast[steps_to_jump:]

        return inferred_df.sort_index()
class SkProphet(RegressorMixin, BaseEstimator, TideBaseMixin):
    """
    A scikit-learn compatible wrapper for Meta Prophet forecasting model.

    This class combines the functionality of Prophet with scikit-learn's API,
    allowing it to be used in scikit-learn pipelines and model selection
    tools. It supports multi-feature forecasting, with a separate Prophet
    model fitted for each feature.

    Parameters
    ----------
    prophet_kwargs : dict, optional (default=None)
        Additional keyword arguments to be passed to the Prophet model.
        None is treated as an empty dictionary.
    changepoint_prior_scale : float, optional (default=0.05)
        Determines the flexibility of the automatic changepoint selection.
        Large values allow many changepoints, small values allow few
        changepoints.
    seasonality_prior_scale : float, optional (default=10.0)
        Parameter modulating the strength of the seasonality model. Larger
        values allow the model to fit larger seasonal fluctuations.
    return_upper_lower_bounds : bool, optional (default=False)
        If True, return upper and lower prediction bounds along with the
        forecast.
    backcast : bool, optional (default=False)
        No effect, just here for tide FillGapAR compatibility.

    Attributes
    ----------
    forecaster_ : dict
        A dictionary of fitted Prophet models, one for each feature.
    feature_names_in_ : list
        The feature names seen during fit.
    feature_names_out_ : list
        The names of the targets (columns of ``y``) seen during fit.

    Methods
    -------
    fit(X, y=None)
        Fit the Prophet model to the input data.
    predict(X)
        Make predictions using the fitted Prophet model.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> import datetime as dt
    >>> from tide.regressors import SkProphet
    >>> index = pd.date_range(
    ...     "2009-01-01", "2009-12-31 23:00:00", freq="h", tz="UTC"
    ... )
    >>> cumsum_second = np.arange(
    ...     0, (index[-1] - index[0]).total_seconds() + 1, step=3600
    ... )
    >>> annual = 5 * -np.cos(
    ...     2 * np.pi / dt.timedelta(days=360).total_seconds() * cumsum_second
    ... )
    >>> daily = 5 * np.sin(
    ...     2 * np.pi / dt.timedelta(days=1).total_seconds() * cumsum_second
    ... )
    >>> toy_series = pd.Series(annual + daily + 5, index=index)
    >>> exo = 12 + 3 * np.arange(index.shape[0])
    >>> toy_df = pd.DataFrame(
    ...     {
    ...         "Temp_3__°C": toy_series + exo,
    ...         "Exo": exo,
    ...     }
    ... )
    >>> forecaster = SkProphet()
    >>> forecaster.fit(
    ...     X=toy_df.loc["2009-01-24":"2009-07-24", "Exo"],
    ...     y=toy_df.loc["2009-01-24":"2009-07-24", "Temp_3__°C"],
    ... )
    >>> result = forecaster.predict(X=toy_df.loc["2009-07-25":"2009-07-30", "Exo"])
    >>> print(result.head())
                                 Temp_3__°C
    2009-07-25 00:00:00+00:00  14781.715143
    2009-07-25 01:00:00+00:00  14786.009401
    2009-07-25 02:00:00+00:00  14790.215280
    2009-07-25 03:00:00+00:00  14794.250621
    2009-07-25 04:00:00+00:00  14798.044970

    Notes
    -----
    - Additional regressors are passed in X during fitting operation
    - Holidays cannot be configured in this regressor. We recommend to pass it
      as a feature during the fitting process. It will be treated as an
      additional regressor
    """

    def __init__(
        self,
        prophet_kwargs: dict | None = None,
        changepoint_prior_scale: float = 0.05,
        seasonality_prior_scale: float = 10.0,
        return_upper_lower_bounds: bool = False,
        backcast: bool = False,
    ):
        super().__init__()
        self.seasonality_prior_scale = seasonality_prior_scale
        self.changepoint_prior_scale = changepoint_prior_scale
        # Stored as given (None instead of a shared mutable ``{}`` default);
        # it is expanded to an empty dict at fit time.
        self.prophet_kwargs = prophet_kwargs
        self.return_upper_lower_bounds = return_upper_lower_bounds
        self.backcast = backcast

    @staticmethod
    def _bound_column_name(feat: str, bound: str) -> str:
        """Return ``feat`` with ``_<bound>`` appended to its first ``__`` part."""
        parts = feat.split("__")
        parts[0] = f"{parts[0]}_{bound}"
        return "__".join(parts)

    def fit(
        self,
        X: pd.Index | pd.Series | pd.DataFrame,
        y: pd.Series | pd.DataFrame | None = None,
    ):
        """
        Fit one Prophet model per column of ``y``.

        Parameters
        ----------
        X : pd.DatetimeIndex, pd.Series or pd.DataFrame
            Timestamps of the training data. When a Series or DataFrame is
            given, its columns are registered as additional regressors.
        y : pd.Series or pd.DataFrame
            Target(s) to learn, with a DatetimeIndex. Required.

        Returns
        -------
        SkProphet
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        ImportError
            If prophet is not installed.
        """
        # ``y`` keeps a default only for signature compatibility; it used to
        # default to a type expression by mistake ("=" instead of ":").
        if y is None:
            raise ValueError("y is required to fit SkProphet")

        y = check_and_return_dt_index_df(y)
        self.feature_names_out_ = list(y.columns)

        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = check_and_return_dt_index_df(X)
            self.feature_names_in_ = list(X.columns)
        else:
            check_datetime_index(X)
            self.feature_names_in_ = []

        self.forecaster_ = {}

        if self.return_upper_lower_bounds:
            # All upper-bound names first, then all lower-bound names.
            self.added_columns = [
                self._bound_column_name(feat, bound)
                for bound in ["upper", "lower"]
                for feat in self.feature_names_out_
            ]

        try:
            from prophet import Prophet
        except ImportError as err:
            raise ImportError(
                "prophet is required for SkProphet. "
                "Install it with: pip install python-tide[prophet]"
            ) from err

        prophet_kwargs = self.prophet_kwargs or {}

        for target in y:
            prophet_df = format_prophet_df(X, y[target])
            model = Prophet(
                seasonality_prior_scale=self.seasonality_prior_scale,
                changepoint_prior_scale=self.changepoint_prior_scale,
                **prophet_kwargs,
            )
            if isinstance(X, pd.DataFrame):
                for feat in X:
                    model.add_regressor(feat)

            self.forecaster_[target] = model.fit(prophet_df)

        return self

    def predict(self, X: pd.Index | pd.Series | pd.DataFrame):
        """
        Predict every fitted target on the timestamps of ``X``.

        Parameters
        ----------
        X : pd.DatetimeIndex, pd.Series or pd.DataFrame
            Timestamps to predict, optionally with regressor columns. Every
            column must have been seen during fit.

        Returns
        -------
        pd.DataFrame
            A DataFrame with DateTime index. Columns are the y targets,
            followed by their upper and lower bounds when
            ``return_upper_lower_bounds`` is True.

        Raises
        ------
        ValueError
            If ``X`` holds a column that was not present during fitting.
        """
        check_is_fitted(
            self,
            attributes=["forecaster_", "feature_names_in_"],
        )

        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = check_and_return_dt_index_df(X)
            if not all(f in self.feature_names_in_ for f in X.columns):
                raise ValueError(
                    "One of the requested feature was not present during fitting"
                )
        else:
            check_datetime_index(X)

        out_idx = X if isinstance(X, pd.DatetimeIndex) else X.index
        inferred_df = pd.DataFrame(index=out_idx)

        # The Prophet input frame depends on X only, so it is built once and
        # shared by all targets.
        # NOTE(review): assumes Prophet.predict does not modify its input.
        df_prophet = format_prophet_df(X)

        for feat, forecaster in self.forecaster_.items():
            prediction = forecaster.predict(df_prophet)
            inferred_df[feat] = prediction["yhat"].values

            if self.return_upper_lower_bounds:
                for bound in ["upper", "lower"]:
                    bound_feat = self._bound_column_name(feat, bound)
                    inferred_df[bound_feat] = prediction[f"yhat_{bound}"].values

        return inferred_df.sort_index()