在混合分类与数值数据的 k-prototypes 聚类算法中,轮廓系数(silhouette score)的计算运行缓慢

我将 k-prototypes 库用于混合数值和分类数据类型。根据 https://github.com/nicodv/kmodes/issues/46 中关于在 k-prototypes 中计算轮廓系数的讨论,我对分类数据计算了基于汉明距离的轮廓系数,对数值数据计算了基于欧几里得距离的轮廓系数。但是所写的代码运行很慢:计算 60000 条记录的轮廓系数需要 10 个小时。我的笔记本电脑有 12G 内存和 Core i7 处理器。请帮助提高代码速度?

import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- import data
# NOTE(review): the raw-string path has lost its backslashes
# (presumably 'C:\Users\...\data.csv' was mangled when posted) — fix before running.
df = pd.read_csv(r'C:Usersdata.csv')
# ------------- Normalize the data ---------------
# Min-max scale the two numeric columns 'R' and 'F' into [0, 1] so that the
# Euclidean part of the mixed distance is not dominated by either column.
# print(df.columns) # To get columns name
x_df = df[['R', 'F']]
x_df_norm = x_df.apply(lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)))
# Re-attach the categorical column (left unscaled) for k-prototypes.
x_df_norm['COType'] = df[['COType']]
def calc_euclian_dis(_s1, _s2):
    """Return the Euclidean distance between two numeric attribute vectors."""
    _diff = np.subtract(_s2, _s1)
    # sqrt of the dot product of the difference with itself == L2 norm
    return np.sqrt(np.dot(_diff, _diff))
def calc_simpleMatching_dis(_s1, _s2):
    """Simple-matching (Hamming) dissimilarity for a single categorical value.

    Returns 0 when the two values are equal, 1 otherwise.
    """
    # bool -> int conversion replaces the manual flag-and-if dance
    return int(_s1 != _s2)
# Number of clusters to fit.
k = 3
# calculate silhoutte for one cluster number
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
# categorical=[2] -> column index 2 ('COType') is treated as categorical.
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
# dict.fromkeys de-duplicates while preserving first-seen order.
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Assign clusters lables to the Dataset
x_df_norm['Cluster_label'] = clusters_label
# ------------- calculate _silhouette_Index -------------
# NOTE(review): this is an O(n^2) pairwise double loop in pure Python — that
# is why 60k rows take hours; the answer below vectorizes it with NumPy.
# 1. Calculate ai
_silhouette_Index_arr = []
for i in x_df_norm.itertuples():
    # itertuples yields (Index, R, F, COType, Cluster_label); i[-1] is the label.
    _ai_cluster_label = i[-1]
    # return samples of the same cluster
    _samples_cluster = x_df_norm[x_df_norm['Cluster_label'] == _ai_cluster_label]
    _dist_array_ai = []
    _s1_nume_att = np.array((i[1], i[2]))  # numeric attributes (R, F)
    _s1_cat_att = i[3]  # categorical attribute (COType)
    # NOTE(review): the point itself is not excluded from its own cluster, so
    # its zero self-distance is averaged into a(i) — confirm this is intended.
    for j in _samples_cluster.itertuples():
        _s2_nume_att = np.array((j[1], j[2]))
        _s2_cat_att = j[3]
        _euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
        _cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
        # Mixed distance: Euclidean part + gamma-weighted categorical mismatch,
        # with gamma taken from the fitted KPrototypes model.
        _dist_array_ai.append(_euclian_dis + (kproto.gamma * _cat_dis))
    ai = np.average(_dist_array_ai)
    # 2. Calculate bi
    # 2.1. determine the samples of other clusters
    # Temporarily drop this sample's own label; it is re-appended after the loop.
    _identical_cluster_labels.remove(_ai_cluster_label)
    _dic_cluseter = {}  # NOTE(review): unused — left as-is
    _bi_arr = []
    for ii in _identical_cluster_labels:
        _samples = x_df_norm[x_df_norm['Cluster_label'] == ii]
        # 2.2. calculate bi
        _dist_array_bi = []
        for j in _samples.itertuples():
            _s2_nume_att = np.array((j[1], j[2]))
            _s2_cat_att = j[3]
            _euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
            _cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
            _dist_array_bi.append(_euclian_dis + (kproto.gamma * _cat_dis))
        _bi_arr.append(np.average(_dist_array_bi))
    _identical_cluster_labels.append(_ai_cluster_label)
    # min bi is determined as final bi variable
    bi = min(_bi_arr)
    # 3. calculate silhouette Index
    if ai == bi:
        _silhouette_i = 0
    elif ai < bi:
        _silhouette_i = 1 - (ai / bi)
    # NOTE(review): the standard silhouette for ai > bi is (bi / ai) - 1,
    # a NEGATIVE value; `1 - (bi / ai)` flips the sign — verify intent.
    elif ai > bi:
        _silhouette_i = 1 - (bi / ai)
    _silhouette_Index_arr.append(_silhouette_i)
silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))
Stack Overflow: Slow at running the calculation of silhouette score in k-prototypes clustering algorithm for mixed categorical and numerical data
原文答案

答案:

作者头像

嘿!我通过使用线性代数运算符来计算差异而不是使用大量 for 循环来重新实现您的函数:它更快:-)

def euclidean_dissim(a, b, **_):
    """Euclidean distance dissimilarity function.

    b is the single point (1-D), a is the matrix of vectors (2-D);
    returns one distance per row of a.

    Raises ValueError if any numerical value is missing (NaN).
    """
    # Mixed-type DataFrames produce object-dtype arrays from .to_numpy();
    # cast to float so np.isnan and the norm work on those slices too.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if np.isnan(a).any() or np.isnan(b).any():
        raise ValueError("Missing values detected in numerical columns.")
    return np.linalg.norm(a - b, axis=1)
def matching_dissim(a, b, **_):
    """Simple matching dissimilarity function.

    b is the single point (1-D), a is the matrix of all other vectors (2-D);
    returns, per row of a, the count of positions that differ from b.

    Uses element-wise `!=` instead of subtraction so string-valued
    categorical columns work too (subtracting strings raises TypeError).
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # count mismatches directly; equivalent to len(b) - count of matches
    return np.sum(a != b, axis=1)
def calc_silhouette_proto(dataset, numerical_pos, cat_pos, kproto_model):
    """Compute the mean silhouette score for a k-prototypes clustering.

    The mixed dissimilarity between two samples is
        euclidean(numeric parts) + gamma * matching(categorical parts),
    with gamma read from the fitted model.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus a 'cluster_labels' column.
    numerical_pos : sequence of int
        Positions (among the feature columns) of the numeric columns.
    cat_pos : sequence of int
        Positions of the categorical columns.
    kproto_model : object
        Fitted model exposing a numeric `gamma` attribute.

    Returns
    -------
    float
        Average silhouette index over all samples.
    """
    features = dataset.loc[:, dataset.columns != 'cluster_labels'].to_numpy()
    labels = dataset['cluster_labels'].to_numpy()
    num_pos = np.asarray(numerical_pos)
    c_pos = np.asarray(cat_pos)
    gamma = kproto_model.gamma

    # Mixed frames give object-dtype arrays; cast the numeric slice to float
    # so vectorized arithmetic and the norm behave.
    numeric = features[:, num_pos].astype(float)
    categorical = features[:, c_pos]
    unique_labels = np.unique(labels)
    n_samples = len(dataset)

    silhouette_per_sample = []
    for idx in range(n_samples):
        own_label = labels[idx]
        # Exclude the sample itself: silhouette uses distances to OTHER points.
        others = np.arange(n_samples) != idx

        # Vectorized mixed dissimilarity from sample idx to every other sample.
        eucl = np.linalg.norm(numeric[others] - numeric[idx], axis=1)
        match = np.sum(categorical[others] != categorical[idx], axis=1)
        other_labels = labels[others]

        same = other_labels == own_label
        if not same.any():
            # Singleton cluster: silhouette is defined as 0 by convention.
            silhouette_per_sample.append(0.0)
            continue
        # a(i): mean mixed distance to the rest of the own cluster.
        ai = np.average(eucl[same]) + gamma * np.average(match[same])

        # b(i): smallest mean mixed distance to any OTHER cluster.
        bi_candidates = []
        for lbl in unique_labels:
            if lbl == own_label:
                continue
            in_lbl = other_labels == lbl
            bi_candidates.append(
                np.average(eucl[in_lbl]) + gamma * np.average(match[in_lbl])
            )
        # Degenerate single-cluster case: no neighbor cluster -> score 0.
        if not bi_candidates:
            silhouette_per_sample.append(0.0)
            continue
        bi = min(bi_candidates)

        # s(i) = (b - a) / max(a, b); note the original code used
        # 1 - bi/ai when ai > bi, which flips the sign of negative scores.
        denom = max(ai, bi)
        silhouette_per_sample.append(0.0 if denom == 0 else (bi - ai) / denom)

    return np.average(silhouette_per_sample)