Module timexseries_clustering.data_clustering.validation_performances
Expand source code
from math import sqrt
from pandas import DataFrame
from tslearn.clustering import silhouette_score
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
class ValidationPerformance:
"""
Class for the summary of various statistical indexes relative to the performance of a clustering model.
Parameters
----------
None
Attributes
----------
silhouette: float
Silhouette score. Default 0
davies_bouldin: float
Davies Bouldin score. Default 0
calinski_harabasz: float
Calinski Harabasz score. Default 0
"""
def __init__(self):
self.silhouette = 0
self.davies_bouldin = 0
self.calinski_harabasz = 0
def set_performance_stats(self, dataset: DataFrame, labels: DataFrame, metric: str = None):
"""
Set all the statistical indexes according to input data.
Parameters
----------
dataset: DataFrame
Time series dataset stored in a Pandas DataFrame.
labels: DataFrame
Predicted labels for each time series.
metric : string, optional, default None
The metric to use when calculating distance between time series, optional because is used only for silhouette score.
Should be one of {‘dtw’, ‘softdtw’, ‘euclidean’} or a callable distance function or None. If ‘softdtw’ is passed, a
normalized version of Soft-DTW is used that is defined as sdtw_(x,y) := sdtw(x,y) - 1/2(sdtw(x,x)+sdtw(y,y)). If X
is the distance array itself, use metric="precomputed". If None, dtw is used.
Examples
--------
>>> import numpy
>>> from tslearn.generators import random_walks
>>> from timexseries_clustering.data_clustering import ValidationPerformance
>>> numpy.random.seed(0)
>>> X = random_walks(n_ts=20, sz=16, d=1)
>>> X = numpy.resize(X,(20,16))
>>> labels = numpy.random.randint(2, size=20)
Calculate the performances.
>>> perf = ValidationPerformance()
>>> perf.set_performance_stats(X, labels, 'euclidean')
>>> print(perf.silhouette)
0.09
>>> print(perf.davies_bouldin)
2.28
"""
self.silhouette = silhouette_score(dataset, labels, metric)
self.davies_bouldin = davies_bouldin_score(dataset, labels)
self.calinski_harabasz = calinski_harabasz_score(dataset, labels)
def get_dict(self) -> dict:
"""
Return all the parameters, in a dict.
Returns
-------
d : dict
All the statistics, in a dict.
Examples
--------
>>> perf = ValidationPerformance()
>>> perf.set_performance_stats(actual_dataframe['a'], predicted_dataframe['yhat'])
>>> perf.get_dict()
{'first_used_index': None, 'MSE': 4.0, 'RMSE': 2.0, 'MAE': 2.0, 'AM': -2.0, 'SD': 0.0}
"""
d = {}
for attribute, value in self.__dict__.items():
d[attribute] = value
return d
Classes
class ValidationPerformance
-
Class for the summary of various statistical indexes relative to the performance of a clustering model.
Parameters
None
Attributes
silhouette
:float
- Silhouette score. Default 0
davies_bouldin
:float
- Davies Bouldin score. Default 0
calinski_harabasz
:float
- Calinski Harabasz score. Default 0
Expand source code
class ValidationPerformance: """ Class for the summary of various statistical indexes relative to the performance of a clustering model. Parameters ---------- None Attributes ---------- silhouette: float Silhouette score. Default 0 davies_bouldin: float Davies Bouldin score. Default 0 calinski_harabasz: float Calinski Harabasz score. Default 0 """ def __init__(self): self.silhouette = 0 self.davies_bouldin = 0 self.calinski_harabasz = 0 def set_performance_stats(self, dataset: DataFrame, labels: DataFrame, metric: str = None): """ Set all the statistical indexes according to input data. Parameters ---------- dataset: DataFrame Time series dataset stored in a Pandas DataFrame. labels: DataFrame Predicted labels for each time series. metric : string, optional, default None The metric to use when calculating distance between time series, optional because is used only for silhouette score. Should be one of {‘dtw’, ‘softdtw’, ‘euclidean’} or a callable distance function or None. If ‘softdtw’ is passed, a normalized version of Soft-DTW is used that is defined as sdtw_(x,y) := sdtw(x,y) - 1/2(sdtw(x,x)+sdtw(y,y)). If X is the distance array itself, use metric="precomputed". If None, dtw is used. Examples -------- >>> import numpy >>> from tslearn.generators import random_walks >>> from timexseries_clustering.data_clustering import ValidationPerformance >>> numpy.random.seed(0) >>> X = random_walks(n_ts=20, sz=16, d=1) >>> X = numpy.resize(X,(20,16)) >>> labels = numpy.random.randint(2, size=20) Calculate the performances. >>> perf = ValidationPerformance() >>> perf.set_performance_stats(X, labels, 'euclidean') >>> print(perf.silhouette) 0.09 >>> print(perf.davies_bouldin) 2.28 """ self.silhouette = silhouette_score(dataset, labels, metric) self.davies_bouldin = davies_bouldin_score(dataset, labels) self.calinski_harabasz = calinski_harabasz_score(dataset, labels) def get_dict(self) -> dict: """ Return all the parameters, in a dict. Returns ------- d : dict All the statistics, in a dict. Examples -------- >>> perf = ValidationPerformance() >>> perf.set_performance_stats(actual_dataframe['a'], predicted_dataframe['yhat']) >>> perf.get_dict() {'first_used_index': None, 'MSE': 4.0, 'RMSE': 2.0, 'MAE': 2.0, 'AM': -2.0, 'SD': 0.0} """ d = {} for attribute, value in self.__dict__.items(): d[attribute] = value return d
Methods
def get_dict(self) ‑> dict
-
Return all the parameters, in a dict.
Returns
d
:dict
- All the statistics, in a dict.
Examples
>>> perf = ValidationPerformance() >>> perf.set_performance_stats(actual_dataframe['a'], predicted_dataframe['yhat']) >>> perf.get_dict() {'first_used_index': None, 'MSE': 4.0, 'RMSE': 2.0, 'MAE': 2.0, 'AM': -2.0, 'SD': 0.0}
Expand source code
def get_dict(self) -> dict: """ Return all the parameters, in a dict. Returns ------- d : dict All the statistics, in a dict. Examples -------- >>> perf = ValidationPerformance() >>> perf.set_performance_stats(actual_dataframe['a'], predicted_dataframe['yhat']) >>> perf.get_dict() {'first_used_index': None, 'MSE': 4.0, 'RMSE': 2.0, 'MAE': 2.0, 'AM': -2.0, 'SD': 0.0} """ d = {} for attribute, value in self.__dict__.items(): d[attribute] = value return d
def set_performance_stats(self, dataset: pandas.core.frame.DataFrame, labels: pandas.core.frame.DataFrame, metric: str = None)
-
Set all the statistical indexes according to input data.
Parameters
dataset
:DataFrame
- Time series dataset stored in a Pandas DataFrame.
labels
:DataFrame
- Predicted labels for each time series.
metric
:string
, optional, defaultNone
- The metric to use when calculating distance between time series, optional because is used only for silhouette score. Should be one of {‘dtw’, ‘softdtw’, ‘euclidean’} or a callable distance function or None. If ‘softdtw’ is passed, a normalized version of Soft-DTW is used that is defined as sdtw_(x,y) := sdtw(x,y) - 1/2(sdtw(x,x)+sdtw(y,y)). If X is the distance array itself, use metric="precomputed". If None, dtw is used.
Examples
>>> import numpy >>> from tslearn.generators import random_walks >>> from timexseries_clustering.data_clustering import ValidationPerformance >>> numpy.random.seed(0) >>> X = random_walks(n_ts=20, sz=16, d=1) >>> X = numpy.resize(X,(20,16)) >>> labels = numpy.random.randint(2, size=20)
Calculate the performances.
>>> perf = ValidationPerformance() >>> perf.set_performance_stats(X, labels, 'euclidean')
>>> print(perf.silhouette) 0.09
>>> print(perf.davies_bouldin) 2.28
Expand source code
def set_performance_stats(self, dataset: DataFrame, labels: DataFrame, metric: str = None): """ Set all the statistical indexes according to input data. Parameters ---------- dataset: DataFrame Time series dataset stored in a Pandas DataFrame. labels: DataFrame Predicted labels for each time series. metric : string, optional, default None The metric to use when calculating distance between time series, optional because is used only for silhouette score. Should be one of {‘dtw’, ‘softdtw’, ‘euclidean’} or a callable distance function or None. If ‘softdtw’ is passed, a normalized version of Soft-DTW is used that is defined as sdtw_(x,y) := sdtw(x,y) - 1/2(sdtw(x,x)+sdtw(y,y)). If X is the distance array itself, use metric="precomputed". If None, dtw is used. Examples -------- >>> import numpy >>> from tslearn.generators import random_walks >>> from timexseries_clustering.data_clustering import ValidationPerformance >>> numpy.random.seed(0) >>> X = random_walks(n_ts=20, sz=16, d=1) >>> X = numpy.resize(X,(20,16)) >>> labels = numpy.random.randint(2, size=20) Calculate the performances. >>> perf = ValidationPerformance() >>> perf.set_performance_stats(X, labels, 'euclidean') >>> print(perf.silhouette) 0.09 >>> print(perf.davies_bouldin) 2.28 """ self.silhouette = silhouette_score(dataset, labels, metric) self.davies_bouldin = davies_bouldin_score(dataset, labels) self.calinski_harabasz = calinski_harabasz_score(dataset, labels)