Source code for etna.transforms.missing_values.imputation
import warnings
from enum import Enum
from typing import List
from typing import Optional
import numpy as np
import pandas as pd
from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform
class ImputerMode(str, Enum):
    """Names of the imputation strategies understood by the imputer transforms.

    Members are plain strings as well as enum members, so a raw strategy name
    such as ``"mean"`` can be converted with ``ImputerMode("mean")``.
    """

    # Replace gaps with zeros (a deprecation warning is emitted for it at fit time).
    zero = "zero"
    # Replace gaps with the mean computed at the fit stage.
    mean = "mean"
    # Replace gaps with the mean of a subset of the preceding data.
    running_mean = "running_mean"
    # Replace gaps with the last existing value.
    forward_fill = "forward_fill"
    # Replace gaps with a seasonal moving average.
    seasonal = "seasonal"
    # Replace gaps with a user supplied constant.
    constant = "constant"
class _OneSegmentTimeSeriesImputerTransform(Transform):
    """One segment version of transform to fill NaNs in series of a given dataframe.

    - It is assumed that given series begins with first non NaN value.
    - This transform can't fill NaNs in the future, only on train data.
    - This transform can't fill NaNs if all values are NaNs. In this case exception is raised.
    """

    def __init__(
        self,
        in_column: str,
        strategy: str,
        window: int,
        seasonality: int,
        default_value: Optional[float],
        constant_value: float = 0,
    ):
        """
        Create instance of _OneSegmentTimeSeriesImputerTransform.

        Parameters
        ----------
        in_column:
            name of processed column
        strategy:
            filling value in missing timestamps:

            - If "zero", then replace missing dates with zeros
            - If "mean", then replace missing dates using the mean in fit stage.
            - If "running_mean" then replace missing dates using mean of subset of data
            - If "forward_fill" then replace missing dates using last existing value
            - If "seasonal" then replace missing dates using seasonal moving average
            - If "constant" then replace missing dates using constant value.
        window:
            In case of moving average and seasonality.

            * If ``window=-1`` all previous dates are taken in account
            * Otherwise only window previous dates
        seasonality:
            the length of the seasonality
        default_value:
            value which will be used to impute the NaNs left after applying the imputer with the chosen strategy;
            ``None`` disables this extra step, any other value (including ``0``) is used as is
        constant_value:
            value to fill gaps in "constant" strategy

        Raises
        ------
        ValueError:
            if incorrect strategy given
        """
        self.in_column = in_column
        # Enum lookup by value raises ValueError for an unknown strategy name.
        self.strategy = ImputerMode(strategy)
        self.window = window
        self.seasonality = seasonality
        self.default_value = default_value
        self.constant_value = constant_value
        # Both attributes below are populated in ``fit``.
        self.fill_value: Optional[float] = None
        self.nan_timestamps: Optional[pd.Index] = None

    def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform":
        """
        Fit preprocess params.

        Remembers the timestamps of the gaps found after the first valid value of
        ``in_column`` and, for the "zero", "constant" and "mean" strategies,
        the value that will be used to fill them.

        Parameters
        ----------
        df: pd.DataFrame
            dataframe with series to fit preprocess params with

        Returns
        -------
        self: _OneSegmentTimeSeriesImputerTransform
            fitted preprocess

        Raises
        ------
        ValueError:
            if the series contains only NaNs
        """
        raw_series = df[self.in_column]
        if np.all(raw_series.isna()):
            raise ValueError("Series hasn't non NaN values which means it is empty and can't be filled.")
        # Leading NaNs are not gaps: the series is considered to start at its first valid value.
        series = raw_series[raw_series.first_valid_index() :]
        self.nan_timestamps = series[series.isna()].index
        # The strategies are mutually exclusive, hence a single if/elif chain.
        if self.strategy == ImputerMode.zero:
            warnings.warn(
                "zero strategy will be removed in etna 2.0.0. Use constant strategy instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            self.fill_value = 0
        elif self.strategy == ImputerMode.constant:
            self.fill_value = self.constant_value
        elif self.strategy == ImputerMode.mean:
            self.fill_value = series.mean()
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transform given series.

        Only the gaps remembered in ``fit`` stay filled; any other NaN present in
        ``df`` is put back after filling.

        Parameters
        ----------
        df: pd.Dataframe
            transform ``in_column`` series of given dataframe

        Returns
        -------
        result: pd.DataFrame
            dataframe with in_column series with filled gaps

        Raises
        ------
        ValueError:
            if the transform is not fitted
        """
        result_df = df.copy()
        # Remember where the NaNs are before filling to be able to restore the foreign ones.
        cur_nans = result_df[result_df[self.in_column].isna()].index
        result_df[self.in_column] = self._fill(result_df[self.in_column])
        # restore nans not in self.nan_timestamps
        restore_nans = cur_nans.difference(self.nan_timestamps)
        result_df.loc[restore_nans, self.in_column] = np.nan
        return result_df

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Inverse transform dataframe.

        Puts NaNs back at the timestamps remembered in ``fit`` that are present in ``df``.

        Parameters
        ----------
        df: pd.Dataframe
            inverse transform ``in_column`` series of given dataframe

        Returns
        -------
        result: pd.DataFrame
            dataframe with in_column series with initial values
        """
        result_df = df.copy()
        index = result_df.index.intersection(self.nan_timestamps)
        result_df.loc[index, self.in_column] = np.nan
        return result_df

    def _fill(self, df: pd.Series) -> pd.Series:
        """
        Create new Series taking all previous dates and adding missing dates.

        Fills missed values for new dates according to ``self.strategy``

        Parameters
        ----------
        df: pd.Series
            series to fill

        Returns
        -------
        result: pd.Series

        Raises
        ------
        ValueError:
            if the transform is not fitted
        """
        if self.nan_timestamps is None:
            raise ValueError("Trying to apply the unfitted transform! First fit the transform.")

        if (
            self.strategy == ImputerMode.zero
            or self.strategy == ImputerMode.mean
            or self.strategy == ImputerMode.constant
        ):
            # These strategies use a single value prepared in ``fit``.
            df = df.fillna(value=self.fill_value)
        elif self.strategy == ImputerMode.forward_fill:
            # Same result as ``fillna(method="ffill")`` without the deprecated ``method`` argument.
            df = df.ffill()
        elif self.strategy == ImputerMode.running_mean or self.strategy == ImputerMode.seasonal:
            # Number of positions to look back: ``window`` steps of ``seasonality`` each,
            # or the whole series when ``window == -1``.
            history = self.seasonality * self.window if self.window != -1 else len(df)
            timestamps = list(df.index)
            for timestamp in self.nan_timestamps:
                i = timestamps.index(timestamp)
                # Positions i - seasonality, i - 2 * seasonality, ... within the history span.
                indexes = np.arange(i - self.seasonality, i - self.seasonality - history, -self.seasonality)
                # Drop the positions that fall before the start of the series.
                indexes = indexes[indexes >= 0]
                # Values are written one by one, so a value filled here is visible
                # to the means computed for the following timestamps.
                df.iloc[i] = np.nanmean(df.iloc[indexes])

        # Compare with None explicitly: a default value of 0 is falsy but still has to be applied.
        if self.default_value is not None:
            df = df.fillna(value=self.default_value)
        return df
class TimeSeriesImputerTransform(PerSegmentWrapper):
    """Fill NaNs in a column of a dataframe by wrapping the one segment imputer.

    - The series is assumed to begin with its first non NaN value.
    - NaNs in the future can't be filled, only the ones on train data.
    - A series consisting only of NaNs can't be filled, an exception is raised in this case.

    Warning
    -------
    In 'mean' mode this transform can suffer from look-ahead bias: to transform the data
    at some timestamp it uses information from the whole train part.
    """

    def __init__(
        self,
        in_column: str = "target",
        strategy: str = ImputerMode.constant,
        window: int = -1,
        seasonality: int = 1,
        default_value: Optional[float] = None,
        constant_value: float = 0,
    ):
        """
        Create instance of TimeSeriesImputerTransform.

        Parameters
        ----------
        in_column:
            name of the column to process
        strategy:
            the way missing timestamps are filled:

            - "zero": fill with zeros
            - "mean": fill with the mean computed in fit stage
            - "running_mean": fill with the mean of a subset of data
            - "forward_fill": fill with the last existing value
            - "seasonal": fill with a seasonal moving average
            - "constant": fill with a constant value
        window:
            used for moving average and seasonality.

            * ``window=-1`` takes all previous dates in account
            * any other value takes only ``window`` previous dates
        seasonality:
            the length of the seasonality
        default_value:
            value used to impute the NaNs left after the chosen strategy has been applied
        constant_value:
            value used to fill gaps in "constant" strategy

        Raises
        ------
        ValueError:
            if incorrect strategy given
        """
        self.in_column = in_column
        self.strategy = strategy
        self.window = window
        self.seasonality = seasonality
        self.default_value = default_value
        self.constant_value = constant_value

        # Every setting is handed over unchanged to the one segment transform.
        segment_transform_params = {
            "in_column": self.in_column,
            "strategy": self.strategy,
            "window": self.window,
            "seasonality": self.seasonality,
            "default_value": self.default_value,
            "constant_value": self.constant_value,
        }
        super().__init__(transform=_OneSegmentTimeSeriesImputerTransform(**segment_transform_params))
# Names exported by a star-import of this module: only the multi-segment transform is listed.
__all__ = ["TimeSeriesImputerTransform"]