  使用数据: 美国加州大学欧文分校(UCI)机器学习库公开的批发商客户数据(Wholesale customers data)
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Cluster the wholesale customers with k-means on the six product-category
# columns, save the labelled rows to Excel, then print and plot the
# per-cluster column means.

# Feature columns, in the order they are fed to KMeans.
feature_cols = ['Fresh', 'Milk', 'Grocery', 'Frozen',
                'Detergents_Paper', 'Delicassen']

df_read = pd.read_csv('Wholesale_customers_data.csv')  # load the data

# Working copy without the columns that are not used for clustering.
cust_df = df_read.drop(columns=['Channel', 'Region'])

# (n_samples, n_features) integer matrix.
# NOTE(review): features are clustered on their raw scale; no
# standardisation is applied before KMeans.
cust_array = cust_df[feature_cols].to_numpy(dtype=np.int32)

k = 3
# n_init is pinned explicitly: the library default changed from 10 to
# 'auto' in newer scikit-learn releases, which would otherwise change the
# resulting clusters depending on the installed version.
labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(cust_array)

# Append the cluster label to the full table and save it.
df_read['cluster'] = labels
df_read.to_excel('c:/kmeans_result.xlsx', index=False)

cust_df['cluster'] = labels
print('---簇的计数---')
print(cust_df['cluster'].value_counts())

# Column means of each cluster, computed once and reused for both the
# printed summary and the plot.
cluster_means = [cust_df[cust_df['cluster'] == i].mean() for i in range(k)]
for i, means in enumerate(cluster_means):
    print('---簇{}的平均---'.format(i))
    print(means)

# Visualisation: one stacked bar per cluster, one segment per feature.
clusterinf = pd.DataFrame(
    {'cluster' + str(i): means for i, means in enumerate(cluster_means)}
)
# The mean of the label column itself carries no information for the plot.
clusterinf = clusterinf.drop('cluster')

clustersInf = "Mean({} Clusters)".format(k)
clus_plot = clusterinf.T.plot(kind='bar', stacked=True, title=clustersInf,
                              rot=0)  # rot=0 keeps the x tick labels horizontal
plt.show()
1 328
2 59
0 53
Name: cluster, dtype: int64
Fresh 7751.981132
Milk 17910.509434
Grocery 27037.905660
Frozen 1970.943396
Detergents_Paper 12104.867925
Delicassen 2185.735849
cluster 0.000000
dtype: float64
Fresh 8341.612805
Milk 3779.893293
Grocery 5152.173780
Frozen 2577.237805
Detergents_Paper 1720.573171
Delicassen 1136.542683
cluster 1.000000
dtype: float64
Fresh 36156.389831
Milk 6123.644068
Grocery 6366.779661
Frozen 6811.118644
Detergents_Paper 1050.016949
Delicassen 3090.050847
cluster 2.000000
dtype: float64
    从图中可以看到,被分到簇1(cluster1)的顾客(328人,人数最多)整体订购量很低。
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Elbow method: fit k-means for a range of cluster counts and plot the
# resulting inertia against k to help choose the number of clusters.

X = pd.read_csv("Wholesale_customers_data.csv")
# Drop the Channel and Region columns; the remaining columns are the features.
X = X.drop(columns=['Channel', 'Region']).values

# Determine k with k-means.
distortions = []  # inertia (sum of squared errors) of each fit
clusters = range(1, 20)
for k in clusters:
    # random_state and n_init are pinned so the curve is reproducible
    # between runs and across scikit-learn versions (the n_init default
    # differs between releases).
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    distortions.append(km.inertia_)

# Plot the elbow.
plt.plot(clusters, distortions, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()