Module timexseries_clustering.data_clustering.pipeline
Expand source code
import logging
import os
import pickle
from functools import reduce
from typing import Tuple, List
import dateparser
import numpy, pandas
from pandas import DataFrame
from timexseries_clustering.data_clustering.models.kmeans_cluster import KMeansModel
from timexseries_clustering.data_clustering.models.gmm_cluster import GaussianMixtureModel
from timexseries_clustering.data_clustering.xcorr import calc_all_xcorr
from timexseries_clustering.timeseries_container import TimeSeriesContainer
from timexseries_clustering.data_clustering.transformation import transformation_factory
log = logging.getLogger(__name__)
def get_best_univariate_clusters(ingested_data: DataFrame, param_config: dict, total_xcorr: dict = None) -> list:
    """
    Cluster the time-series in `ingested_data` with every configured approach and keep the best results.

    For each clustering approach, every applicable model, distance metric, feature transformation and number of
    clusters is tried. Each run is scored with `main_accuracy_estimator`; the function then records:

    - for every (model, distance metric) pair, the best-scoring result;
    - for the whole approach, a `best_model` summary dictionary describing the overall winner.

    Parameters
    ----------
    ingested_data : DataFrame
        Initial data of the time-series, one column per time-series.
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary. The `model_parameters` sub-dictionary is used, with these keys:

        - `clustering_approach`: comma-separated approaches, without spaces. Recognised values are
          "observation_based", "feature_based" and "model_based"; any other value is logged and skipped.
        - `models`: comma-separated models, e.g. "k_means,gaussian_mixture". "gaussian_mixture" is ignored by the
          observation based and feature based approaches, "k_means" is ignored by the model based approach.
        - `distance_metric`: comma-separated distance measures, e.g. "euclidean,dtw,softdtw". The model based
          approach ignores this list and always uses "Log-likelihood".
        - `feature_transformations`: comma-separated transformations, e.g. "DWT". Only read by the feature based
          approach; the other approaches use "none".
        - `n_clusters`: the numbers of clusters to try, e.g. [3, 4, 5, 6].
        - `main_accuracy_estimator`: name of the attribute read from the performances of each result and used as
          score. "silhouette" is maximised; any other estimator is minimised.
        - `pre_transformation`: optional, default "none". Transformation applied once to a copy of the data
          before any clustering.
    total_xcorr : dict, optional, default None
        Cross-correlation dictionary computed by `calc_all_xcorr`. It is not used for the clustering itself; the
        entry of the last column is stored in every returned container.

    Returns
    -------
    list
        `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects, one for each clustering
        approach that produced at least one result.

    Examples
    --------
    Create some fake data:
    >>> dates = pd.date_range('2000-01-01', periods=30)  # Last index is 2000-01-30
    >>> ds = pd.DatetimeIndex(dates, freq="D")
    >>> a = np.arange(30, 60)
    >>> b = np.arange(60, 90)
    >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)

    And create the model configuration part of the TIMEX-CLUSTERING configuration dictionary:
    >>> param_config = {
    ...     "model_parameters": {
    ...         "clustering_approach": "observation_based,feature_based,model_based",
    ...         "models": "k_means,gaussian_mixture",
    ...         "pre_transformation": "none",
    ...         "distance_metric": "euclidean,dtw,softdtw",
    ...         "feature_transformations": "DWT",
    ...         "n_clusters": [3, 4, 5, 6],
    ...         "gamma": 0.01,
    ...         "main_accuracy_estimator": "silhouette"
    ...     },
    ... }

    Now, get the univariate clusters; one container per clustering approach is returned:
    >>> timeseries_outputs = get_best_univariate_clusters(timeseries_dataframe, param_config)
    >>> len(timeseries_outputs)
    3
    """
    model_parameters = param_config["model_parameters"]
    approaches_to_test = model_parameters["clustering_approach"].split(",")
    configured_models = model_parameters["models"].split(",")
    configured_metrics = model_parameters["distance_metric"].split(",")
    num_clusters_to_test = model_parameters["n_clusters"]
    main_accuracy_estimator = model_parameters["main_accuracy_estimator"]

    # Single source of truth for the ranking direction: only the silhouette score is "higher is better".
    # It drives both the per-metric/per-model ranking and the comparison between models.
    higher_is_better = main_accuracy_estimator == "silhouette"

    # Apply the preprocessing transformation once, on a copy so that the caller's data is left untouched.
    pre_transformation = model_parameters.get("pre_transformation", "none")
    pre_transf = transformation_factory(pre_transformation)
    ingested_data_pre_transform = pre_transf.apply(ingested_data.copy())

    # NOTE(review): only the cross-correlation entry of the LAST column reaches the containers; this mirrors
    # the previous per-column loop, which overwrote `xcorr` on every iteration. Confirm this is intended.
    columns = ingested_data_pre_transform.columns
    xcorr = None
    if total_xcorr is not None and len(columns) > 0:
        xcorr = total_xcorr[columns[-1]]

    timeseries_containers = []

    for clustering_approach in approaches_to_test:
        # Per-approach settings live in locals, so one approach can never alter what the next one tests
        # (the model based approach needs its own metric list).
        if clustering_approach == 'observation_based':
            transformations_to_test = ['none']
            metrics_to_test = configured_metrics
            models = [name for name in configured_models if name != 'gaussian_mixture']
        elif clustering_approach == 'feature_based':
            transformations_to_test = model_parameters["feature_transformations"].split(",")
            metrics_to_test = configured_metrics
            models = [name for name in configured_models if name != 'gaussian_mixture']
        elif clustering_approach == 'model_based':
            transformations_to_test = ['none']
            metrics_to_test = ['Log-likelihood']
            models = [name for name in configured_models if name != 'k_means']
        else:
            log.info(f"Wrong name approach: {clustering_approach}, introduce the approach's name correctly and without spaces, i.e.: 'observation_based,feature_based,model_based'")
            # Nothing sensible can be computed for an unknown approach: skip it instead of running with
            # undefined or stale settings.
            continue

        best_model = {}
        model_results = {}
        characteristics = None  # characteristics of the last computed result; None until a model has run

        for model in models:
            this_model_performances = []
            model_results[model] = {}
            log.info(f"Using approach: {clustering_approach} and using model {model}...")

            for metric in metrics_to_test:
                this_metric_performances = []
                single_results = []

                for transf in transformations_to_test:
                    for n_clus in num_clusters_to_test:
                        log.info(f"Computing univariate clustering using approach: {clustering_approach}, number of clusters: {n_clus}, distance metric: {metric} and transformation: {transf}...")
                        tr = transformation_factory(transf)
                        ingested_data_transform = tr.apply(ingested_data_pre_transform)
                        # ModelResult
                        _result = model_factory(ingested_data_transform, clustering_approach, model, distance_metric=metric, param_config=param_config, transformation=transf, n_clusters=n_clus)
                        model_single_results = _result.results[0]  # SingleResult
                        characteristics = _result.characteristics
                        best_clustering = _result.best_clustering
                        performances = getattr(model_single_results.performances, main_accuracy_estimator)

                        single_results.append(model_single_results)
                        this_metric_performances.append((_result, performances, n_clus, transf))
                        this_model_performances.append((_result, performances, n_clus, metric, transf, best_clustering))

                # Best run for this metric first.
                this_metric_performances.sort(key=lambda x: x[1], reverse=higher_is_better)
                best_result = this_metric_performances[0][0]  # object ModelResult
                best_n_clusters = this_metric_performances[0][2]
                best_n_trans = this_metric_performances[0][3]
                log.info(f"For the metric: {metric} the best clustering is obtained using {best_n_clusters} number of clusters and transformation {best_n_trans}.")

                # The winning ModelResult carries every single result computed for this metric.
                best_result.results = single_results
                model_results[model][metric] = best_result  # object ModelResult

            # Best run for this model, across all metrics, first.
            this_model_performances.sort(key=lambda x: x[1], reverse=higher_is_better)
            best_performance = this_model_performances[0][1]
            best_n_clusters = this_model_performances[0][2]
            best_metric = this_model_performances[0][3]
            best_n_trans = this_model_performances[0][4]
            best_cluster_vector = this_model_performances[0][5]

            cluster_table_df = _build_clusters_table(ingested_data.columns.values, best_cluster_vector)
            log.info(f"For the model: {model} the best clustering is obtained using metric {best_metric}, with {best_n_clusters} number of clusters, and transformation {best_n_trans}.")

            if not best_model:
                is_new_best = True
            elif higher_is_better:
                is_new_best = best_performance > best_model["performance"]
            else:
                # Minimised estimators: a smaller score wins, consistently with the ranking above.
                is_new_best = best_performance < best_model["performance"]

            if is_new_best:
                best_model["clustering_approach"] = clustering_approach
                best_model["model"] = model
                best_model["distance_metric"] = best_metric
                best_model["n_clusters"] = best_n_clusters
                best_model["feature_transformation"] = best_n_trans
                best_model["pre_transformation"] = pre_transformation
                best_model["accuracy_estimator"] = main_accuracy_estimator
                best_model["performance"] = best_performance
                best_model["clusters_table"] = cluster_table_df

        if characteristics is None:
            # Every configured model was filtered out for this approach: there is nothing to store.
            log.warning(f"No applicable model for approach {clustering_approach}: no container created.")
            continue

        log.info(f"Process of {clustering_approach} clustering finished")
        timeseries_containers.append(
            TimeSeriesContainer(ingested_data, str(characteristics['clustering_approach']), model_results, best_model, xcorr))

    return timeseries_containers


def _build_clusters_table(column_names, cluster_vector) -> DataFrame:
    """Return a table with one 'Cluster <id>' column per cluster, listing the names of its member series."""
    cluster_ids = numpy.unique(cluster_vector)
    members = [column_names[cluster_vector == cluster_id] for cluster_id in cluster_ids]
    labels = ['Cluster ' + str(cluster_id) for cluster_id in cluster_ids]
    # Rows are built per cluster and then transposed, so clusters of different sizes are padded by pandas.
    return pandas.DataFrame(data=members, index=labels).transpose()
def get_best_clusters(ingested_data: DataFrame, param_config: dict):
    """
    Compute the best clustering for the time-series in `ingested_data`.

    The cross-correlation is computed first, when requested, and is then handed to
    `get_best_univariate_clusters`, whose result is returned unchanged.

    Parameters
    ----------
    ingested_data : DataFrame
        Initial data of the time-series.
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary. The cross-correlation is computed with `calc_all_xcorr` only
        if the `xcorr_parameters` key is present and `ingested_data` has more than one column. The whole
        dictionary is forwarded to `get_best_univariate_clusters`.

    Returns
    -------
    list
        The list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects built by
        `get_best_univariate_clusters`.

    Examples
    --------
    Create some data:
    >>> dates = pd.date_range('2000-01-01', periods=30)  # Last index is 2000-01-30
    >>> ds = pd.DatetimeIndex(dates, freq="D")
    >>> a = np.arange(30, 60)
    >>> b = np.arange(60, 90)
    >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)

    Simply compute the clustering and get the returned containers:
    >>> timeseries_outputs = get_best_clusters(timeseries_dataframe, param_config)
    """
    total_xcorr = None
    if "xcorr_parameters" in param_config:
        # Cross-correlation needs at least two series to compare.
        if len(ingested_data.columns) > 1:
            log.info(f"Computing the cross-correlation...")
            total_xcorr = calc_all_xcorr(ingested_data=ingested_data, param_config=param_config)

    # Only the univariate step exists for now; a multivariate step is not wired in.
    return get_best_univariate_clusters(ingested_data, param_config, total_xcorr)
def create_timeseries_containers(ingested_data: DataFrame, param_config: dict):
    """
    Entry point of the pipeline: compute the clustering, or only wrap the data in containers.

    The behaviour depends on the content of `param_config`:

    - if `model_parameters` is in `param_config`, the result of `get_best_clusters` is returned;
    - otherwise one `TimeSeriesContainer` is created for every column of `ingested_data`, holding that
      single-column frame and, when available, the cross-correlation entry of the column.

    Parameters
    ----------
    ingested_data : DataFrame
        Initial data of the time-series.
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary. Without `model_parameters`, the cross-correlation is computed
        with `calc_all_xcorr` only if `xcorr_parameters` is present and there is more than one column.

    Returns
    -------
    list
        A list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects.

    Examples
    --------
    If no clustering should be made and only the time-series containers are wanted:
    >>> dates = pd.date_range('2000-01-01', periods=30)  # Last index is 2000-01-30
    >>> ds = pd.DatetimeIndex(dates, freq="D")
    >>> a = np.arange(30, 60)
    >>> b = np.arange(60, 90)
    >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)

    Create the containers, one for each column:
    >>> param_config = {}
    >>> timeseries_outputs = create_timeseries_containers(timeseries_dataframe, param_config)
    >>> len(timeseries_outputs)
    2
    """
    if "model_parameters" in param_config:
        log.debug(f"Computing best clustering.")
        return get_best_clusters(ingested_data, param_config)

    log.debug(f"Creating containers only for data visualization.")

    xcorr_wanted = "xcorr_parameters" in param_config and len(ingested_data.columns) > 1
    total_xcorr = calc_all_xcorr(ingested_data=ingested_data, param_config=param_config) if xcorr_wanted else None

    containers = []
    for series_name in ingested_data.columns:
        single_series = ingested_data[[series_name]]
        series_xcorr = None if total_xcorr is None else total_xcorr[series_name]
        # NOTE(review): four positional arguments are passed here, while get_best_univariate_clusters builds the
        # same class with five - verify against the TimeSeriesContainer constructor.
        containers.append(TimeSeriesContainer(single_series, None, None, series_xcorr))
    return containers
def model_factory(ingested_data: DataFrame, clustering_approach: str, model_class: str, distance_metric: str, param_config: dict, transformation: str = None, n_clusters: int = 3):  # -> ClustersModel:
    """
    Build the clustering model matching `clustering_approach` and `model_class`.

    Supported combinations:

    - "observation_based" + "k_means" -> `KMeansModel` labelled "Observation based";
    - "feature_based" + "k_means" -> `KMeansModel` labelled "Feature based";
    - "model_based" + "gaussian_mixture" -> `GaussianMixtureModel` labelled "Model based".

    "observation_based" + "hierarchical" only prints a notice. That combination, like any other unsupported
    one, returns None.

    Parameters
    ----------
    ingested_data : DataFrame
        Data handed to the created model.
    clustering_approach : str
        Clustering approach, e.g. "observation_based".
    model_class : str
        Model type, e.g. "k_means".
    distance_metric : str
        Distance/similarity measure, e.g. "euclidean", "dtw" or "softdtw".
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary, passed to the created model.
    transformation : str, optional, default None
        `transformation` parameter passed to the created model.
    n_clusters : int, optional, default 3
        Number of clusters passed to the created model.

    Returns
    -------
    object or None
        Whatever `KMeansModel` / `GaussianMixtureModel` returns for a supported combination, None otherwise.

    Examples
    --------
    >>> param_config = {
    ...     "model_parameters": {
    ...         "clustering_approach": "observation_based",
    ...         "models": "k_means",
    ...         "pre_transformation": "none",
    ...         "distance_metric": "euclidean,dtw,softdtw",
    ...         "feature_transformations": "DWT",
    ...         "n_clusters": [3, 4, 5, 6],
    ...         "gamma": 0.01,
    ...         "main_accuracy_estimator": "silhouette"
    ...     },
    ... }
    >>> model = model_factory(timeseriescontainer[0].timeseries_data, clustering_approach='observation_based', model_class='k_means', distance_metric='dtw', param_config=param_config, transformation=None, n_clusters=3)
    """
    # Keyword arguments shared by every model constructor; only the approach label differs.
    shared_arguments = {
        "ingested_data": ingested_data,
        "distance_metric": distance_metric,
        "param_config": param_config,
        "transformation": transformation,
        "n_clusters": n_clusters,
    }
    requested = (clustering_approach, model_class)

    if requested == ("observation_based", "k_means"):
        return KMeansModel(clustering_approach="Observation based", **shared_arguments)
    if requested == ("observation_based", "hierarchical"):
        print('Model in construction...')
        return None
    if requested == ("feature_based", "k_means"):
        return KMeansModel(clustering_approach="Feature based", **shared_arguments)
    if requested == ("model_based", "gaussian_mixture"):
        return GaussianMixtureModel(clustering_approach="Model based", **shared_arguments)
    return None
Functions
def create_timeseries_containers(ingested_data: pandas.core.frame.DataFrame, param_config: dict)
-
Entry point of the pipeline: it computes the univariate clustering (multivariate clustering is planned for release 2.0.0), or only creates the containers with the time-series data, according to the content of `param_config`, with this logic:
- if `model_parameters` is in `param_config`, then `get_best_clusters()` is called;
- else, a list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` is created with only the time-series data and, if `xcorr_parameters` is in `param_config`, also with the cross-correlation.
Parameters
ingested_data
:DataFrame
- Initial data of the time-series.
param_config
:dict
- TIMEX-CLUSTERING configuration dictionary.
Returns
list
- A list of
TimeSeriesContainer
objects, one for each time-series.
Examples
The first example of `get_best_clusters()` also applies here; calling `create_timeseries_containers()` will produce the identical result.
However, if no clustering should be made but we just want the time-series containers:
>>> dates = pd.date_range('2000-01-01', periods=30) # Last index is 2000-01-30 >>> ds = pd.DatetimeIndex(dates, freq="D") >>> a = np.arange(30, 60) >>> b = np.arange(60, 90) >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)
Create the containers:
>>> param_config = {} >>> timeseries_outputs = create_timeseries_containers(timeseries_dataframe, param_config)
Check that no models, no historical predictions and no cross-correlation are present in the containers:
>>> print(timeseries_outputs[0].models) None >>> print(timeseries_outputs[0].historical_prediction) None >>> print(timeseries_outputs[0].xcorr) None
If `xcorr_parameters` was specified, then the last command would not return None.
Check that the time-series data is there:
>>> print(timeseries_outputs[0].timeseries_data)
             a
2000-01-01  30
2000-01-02  31
...
2000-01-29  58
2000-01-30  59
Expand source code
def create_timeseries_containers(ingested_data: DataFrame, param_config: dict):
    """
    Entry point of the pipeline: compute the clustering, or only wrap the data in containers.

    The behaviour depends on the content of `param_config`:

    - if `model_parameters` is in `param_config`, the result of `get_best_clusters` is returned;
    - otherwise one `TimeSeriesContainer` is created for every column of `ingested_data`, holding that
      single-column frame and, when available, the cross-correlation entry of the column.

    Parameters
    ----------
    ingested_data : DataFrame
        Initial data of the time-series.
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary. Without `model_parameters`, the cross-correlation is computed
        with `calc_all_xcorr` only if `xcorr_parameters` is present and there is more than one column.

    Returns
    -------
    list
        A list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects.

    Examples
    --------
    If no clustering should be made and only the time-series containers are wanted:
    >>> dates = pd.date_range('2000-01-01', periods=30)  # Last index is 2000-01-30
    >>> ds = pd.DatetimeIndex(dates, freq="D")
    >>> a = np.arange(30, 60)
    >>> b = np.arange(60, 90)
    >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)

    Create the containers, one for each column:
    >>> param_config = {}
    >>> timeseries_outputs = create_timeseries_containers(timeseries_dataframe, param_config)
    >>> len(timeseries_outputs)
    2
    """
    if "model_parameters" in param_config:
        log.debug(f"Computing best clustering.")
        return get_best_clusters(ingested_data, param_config)

    log.debug(f"Creating containers only for data visualization.")

    xcorr_wanted = "xcorr_parameters" in param_config and len(ingested_data.columns) > 1
    total_xcorr = calc_all_xcorr(ingested_data=ingested_data, param_config=param_config) if xcorr_wanted else None

    containers = []
    for series_name in ingested_data.columns:
        single_series = ingested_data[[series_name]]
        series_xcorr = None if total_xcorr is None else total_xcorr[series_name]
        # NOTE(review): four positional arguments are passed here, while get_best_univariate_clusters builds the
        # same class with five - verify against the TimeSeriesContainer constructor.
        containers.append(TimeSeriesContainer(single_series, None, None, series_xcorr))
    return containers
- if
def get_best_clusters(ingested_data: pandas.core.frame.DataFrame, param_config: dict)
-
Starting from `ingested_data`, using the models/cross-correlation settings set in `param_config`, return the best possible clustering in a `TimeSeriesContainer` for all the time-series in `ingested_data`.
Parameters
ingested_data
:DataFrame
- Initial data of the time-series.
param_config
:dict
- TIMEX configuration dictionary.
`get_best_univariate_clusters()` and `get_best_multivariate_clusters` (multivariate clustering will be released in timexseries_clustering 2.0.0) will use the various settings in `param_config`.
Returns
list
- A list of
TimeSeriesContainer
objects, one for each time-series.
Examples
This is basically the function on top of `get_best_univariate_clusters()` and `get_best_multivariate_predictions`: it will first call the univariate step and then the multivariate one, if the cross-correlation section is present in `param_config`.
Create some data:
>>> dates = pd.date_range('2000-01-01', periods=30) # Last index is 2000-01-30
>>> ds = pd.DatetimeIndex(dates, freq="D")
>>> a = np.arange(30, 60)
>>> b = np.arange(60, 90)
>>>
>>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)
Simply compute the clustering and get the returned `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects:
>>> timeseries_outputs = get_best_clusters(timeseries_dataframe, param_config)
Expand source code
def get_best_clusters(ingested_data: DataFrame, param_config: dict):
    """
    Compute the best clustering for the time-series in `ingested_data`.

    The cross-correlation is computed first, when requested, and is then handed to
    `get_best_univariate_clusters`, whose result is returned unchanged.

    Parameters
    ----------
    ingested_data : DataFrame
        Initial data of the time-series.
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary. The cross-correlation is computed with `calc_all_xcorr` only
        if the `xcorr_parameters` key is present and `ingested_data` has more than one column. The whole
        dictionary is forwarded to `get_best_univariate_clusters`.

    Returns
    -------
    list
        The list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects built by
        `get_best_univariate_clusters`.

    Examples
    --------
    Create some data:
    >>> dates = pd.date_range('2000-01-01', periods=30)  # Last index is 2000-01-30
    >>> ds = pd.DatetimeIndex(dates, freq="D")
    >>> a = np.arange(30, 60)
    >>> b = np.arange(60, 90)
    >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)

    Simply compute the clustering and get the returned containers:
    >>> timeseries_outputs = get_best_clusters(timeseries_dataframe, param_config)
    """
    total_xcorr = None
    if "xcorr_parameters" in param_config:
        # Cross-correlation needs at least two series to compare.
        if len(ingested_data.columns) > 1:
            log.info(f"Computing the cross-correlation...")
            total_xcorr = calc_all_xcorr(ingested_data=ingested_data, param_config=param_config)

    # Only the univariate step exists for now; a multivariate step is not wired in.
    return get_best_univariate_clusters(ingested_data, param_config, total_xcorr)
def get_best_univariate_clusters(ingested_data: pandas.core.frame.DataFrame, param_config: dict, total_xcorr: dict = None) ‑> Tuple[dict, list]
-
Compute, for all the columns in `ingested_data` (every time-series), the best univariate clustering possible. This is done using the clustering approaches specified in `param_config` and testing the effect of the different clustering algorithms, similarity measures and transformations specified in `param_config`. Moreover, the best feature transformation found, across the possible ones, is recorded in the results.
Parameters
ingested_data
:DataFrame
- Initial data of the time-series.
param_config
:dict
-
TIMEX-CLUSTERING configuration dictionary. In particular, the `model_parameters` sub-dictionary will be used. In `model_parameters` the following options have to be specified:
- `clustering_approach`: clustering approach(es) which will be used (options: "observation_based", "feature_based" or "model_based").
- `pre_transformation`: a single data preprocessing transformation to apply, i.e. none, log or log_modified.
- `feature_transformations`: comma-separated list of transformation keywords (e.g. "none,DWT,DFT,SVD"), used by the feature based approach.
- `distance_metric`: comma-separated list of the distance/similarity measures which will be used (e.g. "euclidean,dtw,softdtw").
- `models`: comma-separated list of the models to use, without spaces (e.g. "k_means,gaussian_mixture").
- `main_accuracy_estimator`: metric used as target by the procedure, e.g. "silhouette". The silhouette score is maximised; any other estimator is minimised.
total_xcorr
:dict
, optional, defaultNone
Cross-correlation dictionary computed by `calc_all_xcorr`. The cross-correlation is not actually used in this function; however, if given, it is used to build the returned `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects.
Returns
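list
- A list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects, one for each clustering approach.
Note: the signature still advertises `Tuple[dict, list]` (a dictionary assigning the best transformation to every model, plus the list), but the function body returns only the list of containers.
Examples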
Create some fake data:
>>> dates = pd.date_range('2000-01-01', periods=30) # Last index is 2000-01-30 >>> ds = pd.DatetimeIndex(dates, freq="D") >>> a = np.arange(30, 60) >>> b = np.arange(60, 90) >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)
And create the model configuration part of the TIMEX-CLUSTERING configuration dictionary:
>>> param_config = { ... "model_parameters": { ... "clustering_approach": "observation_based,feature_based,model_based", ... "models": "k_means,gaussian_mixture", ... "pre_transformation": "none", ... "distance_metric": "euclidean,dtw,softdtw", ... "feature_transformations": "DWT", ... "n_clusters": [3, 4, 5, 6], ... "gamma": 0.01, ... "main_accuracy_estimator": "silhouette" ... }, ... }
Now, get the univariate clusters:
>>> timeseries_outputs = get_best_univariate_clusters(timeseries_dataframe, param_config)
It is reasonable with this simple data that no transformation is the best transformation. We have the `timexseries_clustering.timeseries_container.TimeSeriesContainer` list as well:
>>> timeseries_outputs
[<timexseries.timeseries_container.TimeSeriesContainer at 0x7f62f45d1fa0>,
 <timexseries.timeseries_container.TimeSeriesContainer at 0x7f62d45965f0>,
 <timexseries.timeseries_container.TimeSeriesContainer at 0x7f62d4e97cd0>]
These are the `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects, one for each clustering approach. Each one has various fields; in this case the most interesting one is `models`:
>>> timeseries_outputs[0].models
{'k_means': <timexseries.data_prediction.models.predictor.ModelResult at 0x7f62f45d1d90>}
This is the `ModelResult` object for k_means that we have just computed.
Expand source code
def get_best_univariate_clusters(ingested_data: DataFrame, param_config: dict, total_xcorr: dict = None) -> \
        List["TimeSeriesContainer"]:
    """
    Search, for every clustering approach in `param_config`, the best univariate clustering of `ingested_data`.

    The data is first pre-transformed. Then, for each requested clustering approach, every allowed combination of
    model, distance metric, feature transformation and number of clusters is evaluated through `model_factory`,
    and the combination scoring best on `main_accuracy_estimator` is recorded.

    Parameters
    ----------
    ingested_data : DataFrame
        Initial data of the time-series. Its column names are used to label the members of each cluster.
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary. The `model_parameters` sub-dictionary is read here:

        - `clustering_approach`: comma-separated list of approaches to run, without spaces (options:
          "observation_based", "feature_based", "model_based"). Unknown names are logged and skipped.
        - `pre_transformation`: optional; single pre-processing transformation keyword applied to the data
          before clustering. Defaults to "none" when the key is missing.
        - `feature_transformations`: comma-separated list of transformation keywords (e.g. "none,DWT,DFT,SVD").
          Only read for the "feature_based" approach; the other approaches use "none".
        - `distance_metric`: comma-separated list of distance/similarity measures (e.g. "euclidean,dtw,softdtw").
          Ignored for the "model_based" approach, which always uses "Log-likelihood".
        - `models`: comma-separated list of models (e.g. "k_means,gaussian_mixture"). "gaussian_mixture" is
          dropped for the observation/feature based approaches, "k_means" is dropped for "model_based".
        - `n_clusters`: iterable with the numbers of clusters to test (e.g. [3, 4, 5, 6]).
        - `main_accuracy_estimator`: name of the attribute read from each result's `performances` object.
          "silhouette" is maximised; any other estimator is minimised.

        The whole dictionary is also forwarded to `model_factory`.
    total_xcorr : dict, optional, default None
        Cross-correlation dictionary computed by `calc_all_xcorr`, indexed by column name. It is not used for
        the clustering itself; the entry of each column is only stored in the returned containers.

    Returns
    -------
    list
        A list of `timexseries_clustering.timeseries_container.TimeSeriesContainer` objects: one is appended for
        each column of the data and each clustering approach that could be run. Each container receives the
        original data, the approach label, the per-model/per-metric results, the description of the best model
        and the cross-correlation of the column.

    Examples
    --------
    Create some fake data:

    >>> dates = pd.date_range('2000-01-01', periods=30)  # Last index is 2000-01-30
    >>> ds = pd.DatetimeIndex(dates, freq="D")
    >>> a = np.arange(30, 60)
    >>> b = np.arange(60, 90)
    >>> timeseries_dataframe = DataFrame(data={"a": a, "b": b}, index=ds)

    And create the model configuration part of the TIMEX-CLUSTERING configuration dictionary:

    >>> param_config = {
    ...     "model_parameters": {
    ...         "clustering_approach": "observation_based,feature_based,model_based",
    ...         "models": "k_means,gaussian_mixture",
    ...         "pre_transformation": "none",
    ...         "distance_metric": "euclidean,dtw,softdtw",
    ...         "feature_transformations": "DWT",
    ...         "n_clusters": [3, 4, 5, 6],
    ...         "gamma": 0.01,
    ...         "main_accuracy_estimator": "silhouette"
    ...     },
    ... }

    Now, get the univariate clusters:

    >>> timeseries_outputs = get_best_univariate_clusters(timeseries_dataframe, param_config)

    Each element of `timeseries_outputs` is a `TimeSeriesContainer`; its `models` field holds the results
    indexed by model name and then by distance metric:

    >>> timeseries_outputs[0].models
    """
    model_parameters = param_config["model_parameters"]
    approaches_to_test = model_parameters["clustering_approach"].split(",")
    num_clusters_to_test = model_parameters["n_clusters"]
    configured_metrics = model_parameters["distance_metric"].split(",")
    main_accuracy_estimator = model_parameters["main_accuracy_estimator"]

    # Only the silhouette score is treated as "higher is better"; every other estimator is minimised.
    # The same rule is used both inside a model and across models.
    higher_is_better = main_accuracy_estimator == "silhouette"

    # Apply the pre-processing transformation on a copy, so the caller's data is never handed to it directly.
    data_procesing_transformation = model_parameters.get("pre_transformation", "none")
    pre_transf = transformation_factory(data_procesing_transformation)
    ingested_data_pre_transform = pre_transf.apply(ingested_data.copy())

    timeseries_containers = []

    # NOTE(review): the search below does not depend on `col`; only `xcorr` does, so every column repeats the
    # same clustering. Kept as-is because callers receive one container per column and approach -- confirm.
    for col in ingested_data_pre_transform.columns:
        xcorr = total_xcorr[col] if total_xcorr is not None else None

        for clustering_approach in approaches_to_test:
            models = model_parameters["models"].split(",")

            # Per-approach settings are rebuilt on every iteration, so that the "Log-likelihood" metric of the
            # model based approach cannot leak into the approaches (or columns) processed afterwards.
            if clustering_approach == 'observation_based':
                transformations_to_test = ['none']
                dist_measures_to_test = configured_metrics
                excluded_model = 'gaussian_mixture'
            elif clustering_approach == 'feature_based':
                transformations_to_test = model_parameters["feature_transformations"].split(",")
                dist_measures_to_test = configured_metrics
                excluded_model = 'gaussian_mixture'
            elif clustering_approach == 'model_based':
                transformations_to_test = ['none']
                dist_measures_to_test = ['Log-likelihood']
                excluded_model = 'k_means'
            else:
                # Nothing meaningful can be computed for an unknown approach: report it and move on.
                log.warning(f"Wrong name approach: {clustering_approach}, introduce the approach's name "
                            f"correctly and without spaces, i.e.: "
                            f"'observation_based,feature_based,model_based'")
                continue

            if excluded_model in models:
                models.remove(excluded_model)

            if not models:
                # Without this guard the container would be labelled with the characteristics of a previous
                # approach (or the function would fail on an undefined name).
                log.warning(f"No model available for approach: {clustering_approach}, skipping it.")
                continue

            best_model = {}
            model_results = {}
            characteristics = None

            for model in models:
                this_model_performances = []
                model_results[model] = {}
                log.info(f"Using approach: {clustering_approach} and using model {model}...")

                for metric in dist_measures_to_test:
                    this_metric_performances = []
                    single_results = []

                    for transf in transformations_to_test:
                        for n_clus in num_clusters_to_test:
                            log.info(f"Computing univariate clustering using approach: {clustering_approach}, "
                                     f"number of clusters: {n_clus}, distance metric: {metric} and "
                                     f"transformation: {transf}...")
                            tr = transformation_factory(transf)
                            ingested_data_transform = tr.apply(ingested_data_pre_transform)
                            _result = model_factory(ingested_data_transform, clustering_approach, model,
                                                    distance_metric=metric, param_config=param_config,
                                                    transformation=transf, n_clusters=n_clus)
                            model_single_results = _result.results[0]
                            characteristics = _result.characteristics
                            best_clustering = _result.best_clustering
                            performances = getattr(model_single_results.performances, main_accuracy_estimator)

                            single_results.append(model_single_results)
                            this_metric_performances.append((_result, performances, n_clus, transf))
                            this_model_performances.append(
                                (_result, performances, n_clus, metric, transf, best_clustering))

                    # Stable sort: among equal scores the configuration tested first stays in front.
                    this_metric_performances.sort(key=lambda entry: entry[1], reverse=higher_is_better)
                    best_result, _, best_n_clusters, best_n_trans = this_metric_performances[0]
                    log.info(f"For the metric: {metric} the best clustering is obtained using {best_n_clusters} "
                             f"number of clusters and transformation {best_n_trans}.")

                    # The best result of this metric carries the single results of every tested configuration.
                    best_result.results = single_results
                    model_results[model][metric] = best_result

                this_model_performances.sort(key=lambda entry: entry[1], reverse=higher_is_better)
                (_, best_performance, best_n_clusters, best_metric, best_n_trans,
                 best_cluster_vector) = this_model_performances[0]

                # One column per cluster, listing the names of the time-series assigned to it.
                column_names = ingested_data.columns.values
                clusters_indexes = numpy.unique(best_cluster_vector)
                members = [column_names[best_cluster_vector == yi] for yi in clusters_indexes]
                cluster_labels = ['Cluster ' + str(yi) for yi in clusters_indexes]
                cluster_table_df = pandas.DataFrame(data=members, index=cluster_labels).transpose()

                log.info(f"For the model: {model} the best clustering is obtained using metric {best_metric}, "
                         f"with {best_n_clusters} number of clusters, and transformation {best_n_trans}.")

                # Compare models in the same direction used to rank the configurations of a single model.
                if not best_model:
                    improves = True
                elif higher_is_better:
                    improves = best_performance > best_model["performance"]
                else:
                    improves = best_performance < best_model["performance"]

                if improves:
                    best_model = {
                        "clustering_approach": clustering_approach,
                        "model": model,
                        "distance_metric": best_metric,
                        "n_clusters": best_n_clusters,
                        "feature_transformation": best_n_trans,
                        "pre_transformation": data_procesing_transformation,
                        "accuracy_estimator": main_accuracy_estimator,
                        "performance": best_performance,
                        "clusters_table": cluster_table_df,
                    }

            log.info(f"Process of {clustering_approach} clustering finished")
            timeseries_containers.append(
                TimeSeriesContainer(ingested_data, str(characteristics['clustering_approach']), model_results,
                                    best_model, xcorr))

    return timeseries_containers
def model_factory(ingested_data: pandas.core.frame.DataFrame, clustering_approach: str, model_class: str, distance_metric: str, param_config: dict, transformation: str = None, n_clusters: int = 3)
-
Given the clustering_approach and name of the model, return the corresponding ClustersModel.
Parameters
clustering_approach
:str
- Clustering approach, e.g. "observation_based"
model_class
:str
- Model type, e.g. "k_means"
param_config
:dict
- TIMEX-CLUSTERING configuration dictionary, to pass to the just created model.
distance_metric
:str
- Distance/similarity measure type, e.g. "euclidean", "dtw" or "softdtw"
transformation
:str
, optional, default None
- Optional
transformation
parameter to pass to the just created model.
n_clusters
:int
, optional, default 3
- Optional
number of clusters
parameter to pass to the just created model.
Returns
ModelResult
- Model Result of the class specified in
model_class
, it contains the results of the best clustering with the index of the cluster that each time series belongs to. Contains also the model characteristics and the centers of each cluster.
Examples
>>> param_config = { ... "model_parameters": { ... "clustering_approach": "observation_based", ... "models": "k_means", ... "pre_transformation": "none", ... "distance_metric": "euclidean,dtw,softdtw", ... "feature_transformations": "DWT", ... "n_clusters": [3, 4, 5, 6], ... "gamma": 0.01, ... "main_accuracy_estimator": "silhouette" ... }, ...}
>>> model = model_factory(timeseriescontainer[0].timeseries_data, clustering_approach='observation_based', model_class='k_means', distance_metric='dtw', param_config=param_config, transformation=None, n_clusters=3)
Expand source code
def model_factory(ingested_data: DataFrame, clustering_approach: str, model_class: str, distance_metric: str,
                  param_config: dict, transformation: str = None, n_clusters: int = 3):
    """
    Build the clustering model matching the requested clustering approach and model name.

    Parameters
    ----------
    ingested_data : DataFrame
        Data handed to the selected model class as `ingested_data`.
    clustering_approach : str
        Clustering approach: "observation_based", "feature_based" or "model_based".
    model_class : str
        Model type, e.g. "k_means" or "gaussian_mixture".
    distance_metric : str
        Distance/similarity measure forwarded to the model, e.g. "euclidean", "dtw" or "softdtw".
    param_config : dict
        TIMEX-CLUSTERING configuration dictionary, forwarded to the model.
    transformation : str, optional, default None
        Optional `transformation` parameter forwarded to the model.
    n_clusters : int, optional, default 3
        Optional `number of clusters` parameter forwarded to the model.

    Returns
    -------
    object or None
        Whatever calling the selected model class returns:

        - "observation_based" + "k_means": `KMeansModel(...)` labelled "Observation based";
        - "feature_based" + "k_means": `KMeansModel(...)` labelled "Feature based";
        - "model_based" + "gaussian_mixture": `GaussianMixtureModel(...)` labelled "Model based".

        Every other combination yields None. "observation_based" + "hierarchical" additionally prints a
        placeholder message, since that model is not available yet.

    Examples
    --------
    >>> param_config = {
    ...     "model_parameters": {
    ...         "clustering_approach": "observation_based",
    ...         "models": "k_means",
    ...         "pre_transformation": "none",
    ...         "distance_metric": "euclidean,dtw,softdtw",
    ...         "feature_transformations": "DWT",
    ...         "n_clusters": [3, 4, 5, 6],
    ...         "gamma": 0.01,
    ...         "main_accuracy_estimator": "silhouette"
    ...     },
    ... }
    >>> model = model_factory(timeseriescontainer[0].timeseries_data, clustering_approach='observation_based',
    ...                       model_class='k_means', distance_metric='dtw', param_config=param_config,
    ...                       transformation=None, n_clusters=3)
    """
    # Keyword arguments shared by every model class; only the approach label differs between them.
    forwarded = {
        "ingested_data": ingested_data,
        "distance_metric": distance_metric,
        "param_config": param_config,
        "transformation": transformation,
        "n_clusters": n_clusters,
    }
    requested = (clustering_approach, model_class)

    if requested == ("observation_based", "k_means"):
        return KMeansModel(clustering_approach="Observation based", **forwarded)

    if requested == ("observation_based", "hierarchical"):
        # Placeholder: the hierarchical model does not exist yet, so nothing is built.
        print('Model in construction...')
        return None

    if requested == ("feature_based", "k_means"):
        return KMeansModel(clustering_approach="Feature based", **forwarded)

    if requested == ("model_based", "gaussian_mixture"):
        return GaussianMixtureModel(clustering_approach="Model based", **forwarded)

    # Unsupported approach/model combination.
    return None