Module timexseries_clustering.data_clustering.transformation

Expand source code
from pandas import Series, DataFrame
import numpy as np
import pandas
from scipy.stats import yeojohnson
import pywt

class Transformation:
    """
    Common base class of every data transformation in this module.

    Concrete transformations override `apply` and `inverse`; the versions defined here are placeholders
    which do nothing and return `None`.
    """

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Transform every value of a Pandas DataFrame and return the transformed DataFrame.

        The dtype of the returned DataFrame is not guaranteed to match the dtype of `data`.

        Parameters
        ----------
        data : DataFrame
            Data to transform.

        Returns
        -------
        DataFrame
            Transformed data. This base implementation returns `None`.
        """
        return None

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Map a Pandas DataFrame of transformed values back to the real world.

        Implementations are expected to always return a DataFrame with the same shape as `data`. When the
        function is not invertible (e.g. Log) the returned values should be approximated: the rest of TIMEX
        assumes that `inverse` does not fail.

        Parameters
        ----------
        data : DataFrame
            Transformed data to bring back to the original scale.

        Returns
        -------
        DataFrame
            Re-transformed data. This base implementation returns `None`.
        """
        return None


class DWT(Transformation):
    """Single-level Discrete Wavelet Transformation (DWT) computed with the Haar wavelet.

    Every column of the input is decomposed with `pywt.dwt(..., 'haar')` and only the approximation
    coefficients are kept, giving a reduced-dimensionality representation of each time series.

    Notes
    -----
    The Haar wavelet decomposition works by averaging two adjacent values of the time series
    at a given resolution to form a smoothed, lower-dimensional signal; the detail
    coefficients describe the differences between the values and their averages.

    `apply` discards the detail coefficients, hence `inverse` can only return an approximation of the
    original data.
    """

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Return the Haar approximation coefficients of each column of `data`.

        Parameters
        ----------
        data : DataFrame
            One time series per column. `data.index` must expose `.date` (e.g. a DatetimeIndex) because
            the first and last dates are used to build the index of the result.

        Returns
        -------
        DataFrame
            Approximation coefficients, one column per input column, indexed by evenly spaced dates
            between the first and the last date of `data`.
        """
        # pywt transforms along the last axis, so each time series has to be a row.
        approximation, _ = pywt.dwt(data.transpose(), 'haar')
        _, n_points = approximation.shape
        new_index = pandas.date_range(
            data.index.date[0], end=data.index.date[-1], periods=n_points, normalize=True
        )
        return pandas.DataFrame(approximation.transpose(), columns=data.columns, index=new_index)

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Reconstruct an approximation of the original series from approximation coefficients only.

        Parameters
        ----------
        data : DataFrame
            Approximation coefficients, one series per column.

        Returns
        -------
        DataFrame
            Reconstructed series, one per column of `data`, with a default integer index.
        """
        # No detail coefficients are available: pass None so that only the approximation is used.
        reconstructed = pywt.idwt(data.transpose(), None, 'haar')
        # idwt returns one row per series; transpose back so that columns match `data.columns`.
        return pandas.DataFrame(reconstructed.transpose(), columns=data.columns)

    def __str__(self):
        return "DWT"

class DFT(Transformation):
    """Transformation registered under the name "DFT".

    NOTE(review): despite the name, the code below does not compute a Fourier transform. It performs a
    single-level Haar wavelet decomposition with `pywt.dwt` and keeps only the approximation
    coefficients, giving a reduced-dimensionality representation of each time series.

    Notes
    -----
    The Haar wavelet decomposition works by averaging two adjacent values of the time series
    at a given resolution to form a smoothed, lower-dimensional signal; the detail
    coefficients describe the differences between the values and their averages.

    `apply` discards the detail coefficients, hence `inverse` can only return an approximation of the
    original data.
    """

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Return the Haar approximation coefficients of each column of `data`.

        Parameters
        ----------
        data : DataFrame
            One time series per column.

        Returns
        -------
        DataFrame
            Approximation coefficients, one column per input column, with a default integer index.
        """
        # pywt transforms along the last axis, so each time series has to be a row.
        approximation, _ = pywt.dwt(data.transpose(), 'haar')
        return pandas.DataFrame(approximation.transpose(), columns=data.columns)

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Reconstruct an approximation of the original series from approximation coefficients only.

        Parameters
        ----------
        data : DataFrame
            Approximation coefficients, one series per column.

        Returns
        -------
        DataFrame
            Reconstructed series, one per column of `data`, with a default integer index.
        """
        # No detail coefficients are available: pass None so that only the approximation is used.
        reconstructed = pywt.idwt(data.transpose(), None, 'haar')
        # idwt returns one row per series; transpose back so that columns match `data.columns`.
        return pandas.DataFrame(reconstructed.transpose(), columns=data.columns)

    def __str__(self):
        # Was "DWT" (copy-paste), which made this class indistinguishable from DWT when printed.
        return "DFT"

class Log(Transformation):
    """Class corresponding to a somewhat classic logarithmic feature transformation.

    Notes
    -----
    The actual function computed by this transformation is:

    .. math::
        f(x) = sign(x) * log(|x|)

    if `|x|` > 1, 0 otherwise.

    Note that every value with `|x|` <= 1 is mapped to 0, so it cannot be recovered by `inverse`.

    The inverse function, indeed, is:

    .. math::
        f^{-1}(x) = sign(x) * e^{|x|}

    LogModified should be preferred.
    """
    def apply(self, data: DataFrame) -> DataFrame:
        """
        Apply the signed logarithm element-wise.

        Works on both Series and DataFrame inputs; the result has the same index (and columns) as `data`.
        """
        magnitude = data.abs()
        above_one = magnitude > 1
        # Values with |x| <= 1 are replaced by 1 before taking the log, which avoids log(0) = -inf.
        logged = np.sign(data) * np.log(magnitude.where(above_one, 1))
        # Force an exact 0 (not -0.0 or NaN) wherever the condition |x| > 1 does not hold.
        return logged.where(above_one, 0)

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Apply `sign(x) * exp(|x|)` element-wise to transformed values.

        Works on both Series and DataFrame inputs; the result has the same index (and columns) as `data`.
        """
        return np.sign(data) * np.exp(data.abs())

    def __str__(self):
        return "Log"


class LogModified(Transformation):
    """Custom variant of the logarithmic feature transformation.

    It is meant to overcome the traditional issues of a logarithmic transformation, i.e. the impossibility
    to work on negative data and the different behaviour on 0 < x < 1.

    Notes
    -----
    The function computed by `apply` is:

    .. math::
        f(x) = sign(x) * log(|x| + 1)

    The function computed by `inverse` is:

    .. math::
        f^{-1}(x) = sign(x) * e^{|x|} - sign(x)
    """
    def apply(self, data: DataFrame) -> DataFrame:
        """Return `sign(x) * log(|x| + 1)` computed through `data.apply`."""
        def _signed_log(x):
            return np.sign(x) * np.log(abs(x) + 1)

        return data.apply(_signed_log)

    def inverse(self, data: DataFrame) -> DataFrame:
        """Return `sign(x) * exp(|x|) - sign(x)` computed through `data.apply`."""
        def _signed_exp(x):
            return np.sign(x) * np.exp(abs(x)) - np.sign(x)

        return data.apply(_signed_exp)

    def __str__(self):
        return "modified Log"


class Identity(Transformation):
    """Identity transformation: the data is returned untouched.

    Having an explicit "no-op" transformation means that code which pre-processes data does not need a
    special case for the absence of a transformation.

    Notes
    -----
    Both the function and its inverse are:

    .. math::
        f(x) = f^{-1}(x) = x
    """
    def apply(self, data: DataFrame) -> DataFrame:
        """Return `data` itself (the same object, not a copy)."""
        unchanged = data
        return unchanged

    def inverse(self, data: DataFrame) -> DataFrame:
        """Return `data` itself (the same object, not a copy)."""
        unchanged = data
        return unchanged

    def __str__(self):
        return "none"


class YeoJohnson(Transformation):
    """Class corresponding to the Yeo-Johnson transformation.

    Notes
    -----
    Introduced in [^1], this transformation tries to make the input data more stable.

    The lambda estimated by `apply` is stored in `self.lmbda` and reused by `inverse`.

    Warnings
    --------
    .. warning:: Yeo-Johnson is basically broken for some series with high values.
                 Follow this issue: https://github.com/scikit-learn/scikit-learn/issues/14959
                 Until this is solved, Yeo-Johnson may not work as expected and create random crashes.

    References
    ----------
    [^1]: Yeo, I. K., & Johnson, R. A. (2000). A new family of power transformations to improve normality or symmetry.
          Biometrika, 87(4), 954-959. https://doi.org/10.1093/biomet/87.4.954
    """

    def __init__(self):
        self.lmbda = 0

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Transform `data` with `scipy.stats.yeojohnson` and remember the estimated lambda.

        The value returned is whatever scipy returns.
        NOTE(review): scipy returns a NumPy array, so the index of `data` is not preserved - confirm that
        callers do not rely on the `DataFrame` annotation.
        """
        res, lmbda = yeojohnson(data)
        self.lmbda = lmbda
        return res

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Invert the Yeo-Johnson transformation using the lambda stored by the last call to `apply`.

        Parameters
        ----------
        data : DataFrame
            Transformed values; a DataFrame, a Series or any array-like of numbers.

        Returns
        -------
        DataFrame
            Values on the original scale. When `data` is a DataFrame its index and columns are kept;
            otherwise the result has default labels.
        """
        lmbda = self.lmbda
        # Work on a plain float array: indexing a DataFrame with a boolean DataFrame masks values instead
        # of selecting them, and an integer output buffer would silently truncate the results.
        values = np.asarray(data, dtype=float)
        x_inv = np.zeros_like(values)
        pos = values >= 0

        # when x >= 0
        if abs(lmbda) < np.spacing(1.):
            x_inv[pos] = np.exp(values[pos]) - 1
        else:  # lmbda != 0
            x_inv[pos] = np.power(values[pos] * lmbda + 1, 1 / lmbda) - 1

        # when x < 0
        if abs(lmbda - 2) > np.spacing(1.):
            x_inv[~pos] = 1 - np.power(-(2 - lmbda) * values[~pos] + 1,
                                       1 / (2 - lmbda))
        else:  # lmbda == 2
            x_inv[~pos] = 1 - np.exp(-values[~pos])

        if isinstance(data, DataFrame):
            return DataFrame(x_inv, index=data.index, columns=data.columns)
        return DataFrame(x_inv)

    def __str__(self):
        return f"Yeo-Johnson (lambda: {round(self.lmbda, 3)})"


class Diff(Transformation):
    """Class corresponding to the differentiate transformation.
    Basically, each value at time `t` is computed as the difference between the current value and the past one.
    Applying this transformation makes the transformed Series have one less value, because the first one can not be
    computed; the value is saved in order to be able to recompute `inverse`.

    Notes
    -----
    Let `X` be the time-series and `X(t)` the value of the time-series at time `t`. This transformation changes X in Y,
    where:

    .. math::
        Y(t) = X(t) - X(t-1)
    """
    def __init__(self):
        self.first_value = 0

    def apply(self, data: Series) -> Series:
        """
        Return the first-order differences of `data`, dropping the undefined first element.

        The first value of `data` is stored in `self.first_value` so that `inverse` can rebuild the series.
        """
        # Positional access: `data[0]` would be a label lookup, which picks the wrong element (or raises)
        # when the index is not the default 0..n-1 range.
        self.first_value = data.iloc[0]
        return data.diff().iloc[1:]

    def inverse(self, data: Series) -> Series:
        """
        Rebuild the original values by cumulatively summing the stored first value and the differences.

        The returned Series has a default integer index and one more element than `data`.
        """
        return Series(np.r_[self.first_value, data].cumsum())

    def __str__(self):
        return "differentiate (1)"


def transformation_factory(tr_class: str) -> Transformation:
    """
    Build the Transformation object identified by a string.

    Parameters
    ----------
    tr_class : str
        Transformation type. Recognised values are "log", "log_modified", "none", "diff", "yeo_johnson",
        "DWT" and "DFT".

    Returns
    -------
    Transformation
        A new instance of the requested transformation, or `None` if `tr_class` is not recognised.

    Examples
    --------
    Create a Pandas Series and apply the logarithmic transformation:

    >>> x = Series([2, 3, 4, 5])
    >>> tr = transformation_factory("log")
    >>> tr_x = tr.apply(x)
    >>> tr_x
    0    0.693147
    1    1.098612
    2    1.386294
    3    1.609438
    dtype: float64

    Now, let's compute the inverse transformation which should return the data to the real world:

    >>> inv_tr_x = tr.inverse(tr_x)
    >>> inv_tr_x
    0    2.0
    1    3.0
    2    4.0
    3    5.0
    dtype: float64
    """
    # (name, class) pairs, checked in order with `==` exactly like an if/elif chain would.
    known_transformations = (
        ("log", Log),
        ("log_modified", LogModified),
        ("none", Identity),
        ("diff", Diff),
        ("yeo_johnson", YeoJohnson),
        ("DWT", DWT),
        ("DFT", DFT),
    )
    for name, transformation_cls in known_transformations:
        if tr_class == name:
            return transformation_cls()
    return None

Functions

def transformation_factory(tr_class: str) ‑> Transformation

Given the type of the transformation, encoded as string, return the Transformation object.

Parameters

tr_class : str
Transformation type.

Returns

Transformation
Transformation object.

Examples

Create a Pandas Series and apply the logarithmic transformation:

>>> x = Series([2, 3, 4, 5])
>>> tr = transformation_factory("log")
>>> tr_x = tr.apply(x)
>>> tr_x
0    0.693147
1    1.098612
2    1.386294
3    1.609438
dtype: float64

Now, let's compute the inverse transformation which should return the data to the real world:

>>> inv_tr_x = tr.inverse(tr_x)
>>> inv_tr_x
0    2.0
1    3.0
2    4.0
3    5.0
dtype: float64
Expand source code
def transformation_factory(tr_class: str) -> Transformation:
    """
    Build the Transformation object identified by a string.

    Parameters
    ----------
    tr_class : str
        Transformation type. Recognised values are "log", "log_modified", "none", "diff", "yeo_johnson",
        "DWT" and "DFT".

    Returns
    -------
    Transformation
        A new instance of the requested transformation, or `None` if `tr_class` is not recognised.

    Examples
    --------
    Create a Pandas Series and apply the logarithmic transformation:

    >>> x = Series([2, 3, 4, 5])
    >>> tr = transformation_factory("log")
    >>> tr_x = tr.apply(x)
    >>> tr_x
    0    0.693147
    1    1.098612
    2    1.386294
    3    1.609438
    dtype: float64

    Now, let's compute the inverse transformation which should return the data to the real world:

    >>> inv_tr_x = tr.inverse(tr_x)
    >>> inv_tr_x
    0    2.0
    1    3.0
    2    4.0
    3    5.0
    dtype: float64
    """
    # (name, class) pairs, checked in order with `==` exactly like an if/elif chain would.
    known_transformations = (
        ("log", Log),
        ("log_modified", LogModified),
        ("none", Identity),
        ("diff", Diff),
        ("yeo_johnson", YeoJohnson),
        ("DWT", DWT),
        ("DFT", DFT),
    )
    for name, transformation_cls in known_transformations:
        if tr_class == name:
            return transformation_cls()
    return None

Classes

class DFT

Class corresponding to a custom variant of the DFT (Discrete Fourier Transformation), specifically using the Haar wavelet representation. In particular, this transformation tries to represent a time series as a linear combination of basis functions. This produces a high-quality, reduced-dimensionality approximation of the time series.

Notes

The Haar Wavelet decomposition works by averaging two adjacent values on the time series function at a given resolution to form a smoothed, lower-dimensional signal, and the resulting coefficients are simply the differences between the values and their averages

Expand source code
class DFT(Transformation):
    """Transformation registered under the name "DFT".

    NOTE(review): despite the name, the code below does not compute a Fourier transform. It performs a
    single-level Haar wavelet decomposition with `pywt.dwt` and keeps only the approximation
    coefficients, giving a reduced-dimensionality representation of each time series.

    Notes
    -----
    The Haar wavelet decomposition works by averaging two adjacent values of the time series
    at a given resolution to form a smoothed, lower-dimensional signal; the detail
    coefficients describe the differences between the values and their averages.

    `apply` discards the detail coefficients, hence `inverse` can only return an approximation of the
    original data.
    """

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Return the Haar approximation coefficients of each column of `data`.

        Parameters
        ----------
        data : DataFrame
            One time series per column.

        Returns
        -------
        DataFrame
            Approximation coefficients, one column per input column, with a default integer index.
        """
        # pywt transforms along the last axis, so each time series has to be a row.
        approximation, _ = pywt.dwt(data.transpose(), 'haar')
        return pandas.DataFrame(approximation.transpose(), columns=data.columns)

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Reconstruct an approximation of the original series from approximation coefficients only.

        Parameters
        ----------
        data : DataFrame
            Approximation coefficients, one series per column.

        Returns
        -------
        DataFrame
            Reconstructed series, one per column of `data`, with a default integer index.
        """
        # No detail coefficients are available: pass None so that only the approximation is used.
        reconstructed = pywt.idwt(data.transpose(), None, 'haar')
        # idwt returns one row per series; transpose back so that columns match `data.columns`.
        return pandas.DataFrame(reconstructed.transpose(), columns=data.columns)

    def __str__(self):
        # Was "DWT" (copy-paste), which made this class indistinguishable from DWT when printed.
        return "DFT"

Ancestors

Inherited members

class DWT

Class corresponding to a custom variant of the DWT (Discrete Wavelet Transformation), specifically using the Haar wavelet representation. In particular, this transformation tries to represent a time series as a linear combination of basis functions. This produces a high-quality, reduced-dimensionality approximation of the time series.

Notes

The Haar Wavelet decomposition works by averaging two adjacent values on the time series function at a given resolution to form a smoothed, lower-dimensional signal, and the resulting coefficients are simply the differences between the values and their averages

Expand source code
class DWT(Transformation):
    """Single-level Discrete Wavelet Transformation (DWT) computed with the Haar wavelet.

    Every column of the input is decomposed with `pywt.dwt(..., 'haar')` and only the approximation
    coefficients are kept, giving a reduced-dimensionality representation of each time series.

    Notes
    -----
    The Haar wavelet decomposition works by averaging two adjacent values of the time series
    at a given resolution to form a smoothed, lower-dimensional signal; the detail
    coefficients describe the differences between the values and their averages.

    `apply` discards the detail coefficients, hence `inverse` can only return an approximation of the
    original data.
    """

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Return the Haar approximation coefficients of each column of `data`.

        Parameters
        ----------
        data : DataFrame
            One time series per column. `data.index` must expose `.date` (e.g. a DatetimeIndex) because
            the first and last dates are used to build the index of the result.

        Returns
        -------
        DataFrame
            Approximation coefficients, one column per input column, indexed by evenly spaced dates
            between the first and the last date of `data`.
        """
        # pywt transforms along the last axis, so each time series has to be a row.
        approximation, _ = pywt.dwt(data.transpose(), 'haar')
        _, n_points = approximation.shape
        new_index = pandas.date_range(
            data.index.date[0], end=data.index.date[-1], periods=n_points, normalize=True
        )
        return pandas.DataFrame(approximation.transpose(), columns=data.columns, index=new_index)

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Reconstruct an approximation of the original series from approximation coefficients only.

        Parameters
        ----------
        data : DataFrame
            Approximation coefficients, one series per column.

        Returns
        -------
        DataFrame
            Reconstructed series, one per column of `data`, with a default integer index.
        """
        # No detail coefficients are available: pass None so that only the approximation is used.
        reconstructed = pywt.idwt(data.transpose(), None, 'haar')
        # idwt returns one row per series; transpose back so that columns match `data.columns`.
        return pandas.DataFrame(reconstructed.transpose(), columns=data.columns)

    def __str__(self):
        return "DWT"

Ancestors

Inherited members

class Diff

Class corresponding to the differentiate transformation. Basically, each value at time t is computed as the difference between the current value and the past one. Applying this transformation makes the transformed Series have one less value, because the first one can not be computed; the value is saved in order to be able to recompute inverse.

Notes

Let X be the time-series and X(t) the value of the time-series at time t. This transformation changes X in Y, where:

[ Y(t) = X(t) - X(t-1) ]

Expand source code
class Diff(Transformation):
    """Class corresponding to the differentiate transformation.
    Basically, each value at time `t` is computed as the difference between the current value and the past one.
    Applying this transformation makes the transformed Series have one less value, because the first one can not be
    computed; the value is saved in order to be able to recompute `inverse`.

    Notes
    -----
    Let `X` be the time-series and `X(t)` the value of the time-series at time `t`. This transformation changes X in Y,
    where:

    .. math::
        Y(t) = X(t) - X(t-1)
    """
    def __init__(self):
        self.first_value = 0

    def apply(self, data: Series) -> Series:
        """
        Return the first-order differences of `data`, dropping the undefined first element.

        The first value of `data` is stored in `self.first_value` so that `inverse` can rebuild the series.
        """
        # Positional access: `data[0]` would be a label lookup, which picks the wrong element (or raises)
        # when the index is not the default 0..n-1 range.
        self.first_value = data.iloc[0]
        return data.diff().iloc[1:]

    def inverse(self, data: Series) -> Series:
        """
        Rebuild the original values by cumulatively summing the stored first value and the differences.

        The returned Series has a default integer index and one more element than `data`.
        """
        return Series(np.r_[self.first_value, data].cumsum())

    def __str__(self):
        return "differentiate (1)"

Ancestors

Inherited members

class Identity

Class corresponding to the identity transformation. This is useful because the absence of a data pre-processing transformation would be a particular case for functions which compute predictions; instead, using this, that case is not special anymore.

Notes

The actual function computed by this transformation is:

[ f(x) = x ] The inverse, instead, is:

[ f^{-1}(x) = x ]

Expand source code
class Identity(Transformation):
    """Identity transformation: the data is returned untouched.

    Having an explicit "no-op" transformation means that code which pre-processes data does not need a
    special case for the absence of a transformation.

    Notes
    -----
    Both the function and its inverse are:

    .. math::
        f(x) = f^{-1}(x) = x
    """
    def apply(self, data: DataFrame) -> DataFrame:
        """Return `data` itself (the same object, not a copy)."""
        unchanged = data
        return unchanged

    def inverse(self, data: DataFrame) -> DataFrame:
        """Return `data` itself (the same object, not a copy)."""
        unchanged = data
        return unchanged

    def __str__(self):
        return "none"

Ancestors

Inherited members

class Log

Class corresponding to a somewhat classic logarithmic feature transformation.

Notes

The actual function computed by this transformation is:

[ f(x) = sign(x) * log(|x|) ] if |x| > 1, 0 otherwise.

Note that this way time-series which contain 0 values will have its values modified, because inverse will return 1 instead of 0 when returning the transformed time-series to the real world.

The inverse function, indeed, is:

[ f^{-1}(x) = sign(x) * e^{abs(x)} ] LogModified should be preferred.

Expand source code
class Log(Transformation):
    """Class corresponding to a somewhat classic logarithmic feature transformation.

    Notes
    -----
    The actual function computed by this transformation is:

    .. math::
        f(x) = sign(x) * log(|x|)

    if `|x|` > 1, 0 otherwise.

    Note that every value with `|x|` <= 1 is mapped to 0, so it cannot be recovered by `inverse`.

    The inverse function, indeed, is:

    .. math::
        f^{-1}(x) = sign(x) * e^{|x|}

    LogModified should be preferred.
    """
    def apply(self, data: DataFrame) -> DataFrame:
        """
        Apply the signed logarithm element-wise.

        Works on both Series and DataFrame inputs; the result has the same index (and columns) as `data`.
        """
        magnitude = data.abs()
        above_one = magnitude > 1
        # Values with |x| <= 1 are replaced by 1 before taking the log, which avoids log(0) = -inf.
        logged = np.sign(data) * np.log(magnitude.where(above_one, 1))
        # Force an exact 0 (not -0.0 or NaN) wherever the condition |x| > 1 does not hold.
        return logged.where(above_one, 0)

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Apply `sign(x) * exp(|x|)` element-wise to transformed values.

        Works on both Series and DataFrame inputs; the result has the same index (and columns) as `data`.
        """
        return np.sign(data) * np.exp(data.abs())

    def __str__(self):
        return "Log"

Ancestors

Inherited members

class LogModified

Class corresponding to a custom variant of the logarithmic feature transformation. In particular, this transformation tries to overcome the traditional issues of a logarithmic transformation, i.e. the impossibility of working on negative data and the different behaviour on 0 < x < 1.

Notes

The actual function computed by this transformation is:

[ f(x) = sign(x) * log(|x| + 1) ] The inverse, instead, is:

[ f^{-1}(x) = sign(x) * (e^{|x|} - 1) ]

Expand source code
class LogModified(Transformation):
    """Custom variant of the logarithmic feature transformation.

    It is meant to overcome the traditional issues of a logarithmic transformation, i.e. the impossibility
    to work on negative data and the different behaviour on 0 < x < 1.

    Notes
    -----
    The function computed by `apply` is:

    .. math::
        f(x) = sign(x) * log(|x| + 1)

    The function computed by `inverse` is:

    .. math::
        f^{-1}(x) = sign(x) * e^{|x|} - sign(x)
    """
    def apply(self, data: DataFrame) -> DataFrame:
        """Return `sign(x) * log(|x| + 1)` computed through `data.apply`."""
        def _signed_log(x):
            return np.sign(x) * np.log(abs(x) + 1)

        return data.apply(_signed_log)

    def inverse(self, data: DataFrame) -> DataFrame:
        """Return `sign(x) * exp(|x|) - sign(x)` computed through `data.apply`."""
        def _signed_exp(x):
            return np.sign(x) * np.exp(abs(x)) - np.sign(x)

        return data.apply(_signed_exp)

    def __str__(self):
        return "modified Log"

Ancestors

Inherited members

class Transformation

Super-class used to represent various types of data transformation.

Expand source code
class Transformation:
    """
    Common base class of every data transformation in this module.

    Concrete transformations override `apply` and `inverse`; the versions defined here are placeholders
    which do nothing and return `None`.
    """

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Transform every value of a Pandas DataFrame and return the transformed DataFrame.

        The dtype of the returned DataFrame is not guaranteed to match the dtype of `data`.

        Parameters
        ----------
        data : DataFrame
            Data to transform.

        Returns
        -------
        DataFrame
            Transformed data. This base implementation returns `None`.
        """
        return None

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Map a Pandas DataFrame of transformed values back to the real world.

        Implementations are expected to always return a DataFrame with the same shape as `data`. When the
        function is not invertible (e.g. Log) the returned values should be approximated: the rest of TIMEX
        assumes that `inverse` does not fail.

        Parameters
        ----------
        data : DataFrame
            Transformed data to bring back to the original scale.

        Returns
        -------
        DataFrame
            Re-transformed data. This base implementation returns `None`.
        """
        return None

Subclasses

Methods

def apply(self, data: pandas.core.frame.DataFrame) ‑> pandas.core.frame.DataFrame

Apply the transformation on each value in a Pandas DataFrame. Returns the transformed DataFrame, i.e. a DataFrame with transformed values.

Note that it is not guaranteed that the dtype of the returned DataFrame is the same of data.

Parameters

data : DataFrame
Data to transform.

Returns

DataFrame
Transformed data.
Expand source code
def apply(self, data: DataFrame) -> DataFrame:
    """
    Transform every value of a Pandas DataFrame and return the transformed DataFrame.

    The dtype of the returned DataFrame is not guaranteed to match the dtype of `data`.

    Parameters
    ----------
    data : DataFrame
        Data to transform.

    Returns
    -------
    DataFrame
        Transformed data. This placeholder implementation returns `None`.
    """
    return None
def inverse(self, data: pandas.core.frame.DataFrame) ‑> pandas.core.frame.DataFrame

Apply the inverse of the transformation on the values of a Pandas DataFrame of transformed values. Returns the data re-transformed back to the real world.

Any class implementing Transformation should make the inverse method always return a DataFrame with the same shape as the one of data. If the function is not invertible (e.g. Log), the returning values should be approximated. It is assumed in the rest of TIMEX that inverse does not fail.

Parameters

data : DataFrame
Data to transform.

Returns

DataFrame
Transformed data.
Expand source code
def inverse(self, data: DataFrame) -> DataFrame:
    """
    Map a Pandas DataFrame of transformed values back to the real world.

    Implementations are expected to always return a DataFrame with the same shape as `data`. When the
    function is not invertible (e.g. Log) the returned values should be approximated: the rest of TIMEX
    assumes that `inverse` does not fail.

    Parameters
    ----------
    data : DataFrame
        Transformed data to bring back to the original scale.

    Returns
    -------
    DataFrame
        Re-transformed data. This placeholder implementation returns `None`.
    """
    return None
class YeoJohnson

Class corresponding to the Yeo-Johnson transformation.

Notes

Introduced in [1], this transformation tries to make the input data more stable.

Warnings

Warning: Yeo-Johnson is basically broken for some series with high values.

Follow this issue: https://github.com/scikit-learn/scikit-learn/issues/14959 Until this is solved, Yeo-Johnson may not work as expected and create random crashes.

References


  1. Yeo, I. K., & Johnson, R. A. (2000). A new family of power transformations to improve normality or symmetry. Biometrika, 87(4), 954-959. https://doi.org/10.1093/biomet/87.4.954 

Expand source code
class YeoJohnson(Transformation):
    """Class corresponding to the Yeo-Johnson transformation.

    Notes
    -----
    Introduced in [^1], this transformation tries to make the input data more stable.

    The lambda estimated by `apply` is stored in `self.lmbda` and reused by `inverse`.

    Warnings
    --------
    .. warning:: Yeo-Johnson is basically broken for some series with high values.
                 Follow this issue: https://github.com/scikit-learn/scikit-learn/issues/14959
                 Until this is solved, Yeo-Johnson may not work as expected and create random crashes.

    References
    ----------
    [^1]: Yeo, I. K., & Johnson, R. A. (2000). A new family of power transformations to improve normality or symmetry.
          Biometrika, 87(4), 954-959. https://doi.org/10.1093/biomet/87.4.954
    """

    def __init__(self):
        self.lmbda = 0

    def apply(self, data: DataFrame) -> DataFrame:
        """
        Transform `data` with `scipy.stats.yeojohnson` and remember the estimated lambda.

        The value returned is whatever scipy returns.
        NOTE(review): scipy returns a NumPy array, so the index of `data` is not preserved - confirm that
        callers do not rely on the `DataFrame` annotation.
        """
        res, lmbda = yeojohnson(data)
        self.lmbda = lmbda
        return res

    def inverse(self, data: DataFrame) -> DataFrame:
        """
        Invert the Yeo-Johnson transformation using the lambda stored by the last call to `apply`.

        Parameters
        ----------
        data : DataFrame
            Transformed values; a DataFrame, a Series or any array-like of numbers.

        Returns
        -------
        DataFrame
            Values on the original scale. When `data` is a DataFrame its index and columns are kept;
            otherwise the result has default labels.
        """
        lmbda = self.lmbda
        # Work on a plain float array: indexing a DataFrame with a boolean DataFrame masks values instead
        # of selecting them, and an integer output buffer would silently truncate the results.
        values = np.asarray(data, dtype=float)
        x_inv = np.zeros_like(values)
        pos = values >= 0

        # when x >= 0
        if abs(lmbda) < np.spacing(1.):
            x_inv[pos] = np.exp(values[pos]) - 1
        else:  # lmbda != 0
            x_inv[pos] = np.power(values[pos] * lmbda + 1, 1 / lmbda) - 1

        # when x < 0
        if abs(lmbda - 2) > np.spacing(1.):
            x_inv[~pos] = 1 - np.power(-(2 - lmbda) * values[~pos] + 1,
                                       1 / (2 - lmbda))
        else:  # lmbda == 2
            x_inv[~pos] = 1 - np.exp(-values[~pos])

        if isinstance(data, DataFrame):
            return DataFrame(x_inv, index=data.index, columns=data.columns)
        return DataFrame(x_inv)

    def __str__(self):
        return f"Yeo-Johnson (lambda: {round(self.lmbda, 3)})"

Ancestors

Inherited members