Source code for mizarlabs.transformers.sampling.average_uniqueness
import pandas as pd
from mizarlabs.model.bootstrapping import get_ind_matrix
from mizarlabs.model.bootstrapping import calc_average_uniqueness
from mizarlabs.static import EVENT_END_TIME
from mizarlabs.transformers.utils import check_missing_columns
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
class AverageUniqueness(BaseEstimator, TransformerMixin):
    """
    Calculates the average uniqueness of samples.

    ``transform`` validates the input frame, builds a matrix from the event
    end time column with ``get_ind_matrix``, converts it to CSC format and
    passes it to ``calc_average_uniqueness``.

    :param event_end_time_column_name: Name of the column of ``X`` that
        holds the end time of each event.
    """

    def __init__(self, event_end_time_column_name: str = EVENT_END_TIME):
        self._event_end_time_column_name = event_end_time_column_name

    @property
    def event_end_time_column_name(self) -> str:
        """
        Public view of the constructor parameter.

        ``BaseEstimator.get_params`` looks parameters up by their
        ``__init__`` name, so without this attribute ``repr``, ``clone`` and
        ``get_params`` fail. The value is still stored on the private
        attribute so existing readers of it keep working.
        """
        return self._event_end_time_column_name

    @event_end_time_column_name.setter
    def event_end_time_column_name(self, value: str) -> None:
        # Lets ``set_params`` update the value used by ``transform``.
        self._event_end_time_column_name = value

    def transform(self, X: pd.DataFrame) -> pd.Series:
        """
        Compute the average uniqueness for every row of ``X``.

        :param X: Frame with a unique index that contains the event end time
            column without missing values.
        :return: Series aligned on ``X.index`` holding the values returned by
            ``calc_average_uniqueness``.
        :raises AssertionError: If the event end time column contains NaNs or
            if the index of ``X`` is not unique.
        """
        column_name = self._event_end_time_column_name
        check_missing_columns(X, [column_name])

        # Explicit raises instead of ``assert`` statements: the checks stay
        # active under ``python -O``. A plain truthiness test replaces the
        # former ``~`` inversion, which is only a logical "not" for numpy
        # booleans and is always truthy for a built-in ``bool``.
        missing_end_time = X[column_name].isna()
        if missing_end_time.any():
            raise AssertionError(
                "The expiration barrier should always have a value but "
                f" the indices {', '.join([str(index) for index in X[missing_end_time].index])} contains NaNs"
            )

        if not X.index.is_unique:
            # Only the first three duplicated labels are reported.
            raise AssertionError(
                "Index should be unique but indices"
                f" {', '.join([str(index) for index in X[X.index.duplicated(keep='last')][:3].index])} are duplicated"
            )

        # Copies are handed over so the helper never receives the caller's
        # own frame (presumably to guard against in-place changes; verify
        # against get_ind_matrix).
        ind_mat_csc = get_ind_matrix(
            X.copy()[column_name],
            X.copy(),
            column_name,
        ).tocsc()
        average_uniqueness_array = calc_average_uniqueness(ind_mat_csc)
        return pd.Series(average_uniqueness_array, index=X.index)