多种聚类算法实现Mnist分类

用聚类算法实现mnist数据集分类

1、先导入mnist数据集

1
2
3
4
import pandas as pd
import matplotlib.pyplot as plt
# Load the MNIST training set from CSV. There is no header row: column 0 is
# the digit label (0-9) and columns 1-784 are the flattened 28x28 pixel values.
data = pd.read_csv("mnist_train.csv",header=None)
# Preview the first five rows to sanity-check the load.
data.head()
0 1 2 3 4 5 6 7 8 9 ... 775 776 777 778 779 780 781 782 783 784
0 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

1
data.iloc[:,0].value_counts()
1    6742
7    6265
3    6131
2    5958
9    5949
0    5923
6    5918
8    5851
4    5842
5    5421
Name: 0, dtype: int64

2、切分数据集

1
2
# Features are every pixel column; labels come from column 0.
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]
1
2
from sklearn.model_selection import train_test_split

# Hold out one third of the data for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

3、SVM 随机森林 集成学习

1
2
3
4
5
6
7
8
9
10
11
12
13
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Random forest: 200 trees; min_samples_leaf=2 keeps leaves from memorising single samples.
rnd_clf = RandomForestClassifier(min_samples_leaf=2, n_estimators=200)
# SVC wants standardised features; probability=True is required for soft voting below.
svm_clf = make_pipeline(StandardScaler(), SVC(probability=True,gamma='auto'))
# Soft-voting ensemble averages the two models' predicted class probabilities.
vot_clf = VotingClassifier(estimators=[('rf',rnd_clf),('svc',svm_clf)],voting="soft")

# Fit each model and report weighted F1 plus plain accuracy on the held-out split.
# BUGFIX: the loop body was not indented in the original, which is a syntax error.
for clf in (rnd_clf,svm_clf,vot_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,f1_score(y_test, y_pred,average='weighted'),accuracy_score(y_test,y_pred))
RandomForestClassifier 0.9655389271780913 0.9655555555555555
Pipeline 0.961329629605016 0.9613131313131313
VotingClassifier 0.9700324936885113 0.970050505050505

4、用Kmeans聚类算法

1
2
3
4
5
6
7
8
# K-means clustering: fit 10 clusters (one hoped-for cluster per digit class)
# on the training pixels, then assign each test sample to its nearest centroid.
from sklearn.cluster import KMeans
from sklearn import metrics
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)
# Cluster ids are arbitrary labels 0-9, NOT digit values — compare with
# label-invariant metrics (homogeneity/completeness/ARI), not accuracy.
y_predict = kmeans.predict(X_test)
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

1
2
3
4
5
6
7
8
9
10
11
12
13
# Evaluate the clustering against the true digit labels with label-invariant
# metrics (all are insensitive to the arbitrary numbering of cluster ids).
clustering_metrics = [
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.v_measure_score,
    metrics.adjusted_rand_score,
    metrics.adjusted_mutual_info_score,
]
# BUGFIX: the original re-instantiated an unused KMeans here (dead code),
# left the loop body unindented (syntax error), and displayed `results`
# without ever appending to it. Collect (metric name, score) pairs instead.
results = []
for m in clustering_metrics:
    score = m(y_test, y_predict)
    results.append((m.__name__, score))
    print(m.__name__, score)
results
homogeneity_score 0.4912474848781917
completeness_score 0.49862545459901814
v_measure_score 0.4949089740697011
adjusted_rand_score 0.3614689213307473
adjusted_mutual_info_score 0.49445572431055146

5、使用KNN算法学习

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# KNN: search for the best k over [3, 10) and plot test accuracy against k.
from sklearn.neighbors import KNeighborsClassifier

scoreList = []
# BUGFIX: the loop body was not indented in the original (syntax error).
for i in range(3,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    scoreList.append(knn.score(X_test,y_test))

plt.figure(dpi=300)
plt.plot(range(3,10),scoreList)
plt.xlabel("k")
plt.ylabel("score")
plt.grid()
plt.show()

acc = max(scoreList)*100
# BUGFIX: corrected "Maxmum" typo in the user-facing message.
print("Maximum KNN score is {:.2f}%".format(acc))

image-20221230212242132

Maximum KNN score is 97.03%

6、导入测试集datatest

1
2
3
# Load the held-out MNIST test set (same layout as the training CSV:
# column 0 is the true digit label, columns 1-784 are pixels).
test = pd.read_csv("mnist_test.csv",header=None)
true_label=test.iloc[:,0]
# Display the ground-truth labels.
true_label
0       7
1       2
2       1
3       0
4       4
       ..
9995    2
9996    3
9997    4
9998    5
9999    6
Name: 0, Length: 10000, dtype: int64
1
XX=test.iloc[:,1:]
1
XX
1 2 3 4 5 6 7 8 9 10 ... 775 776 777 778 779 780 781 782 783 784
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9996 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9997 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9998 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9999 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

10000 rows × 784 columns

1
2
3
4
# Retrain KNN with the best k found in the search above (k=3),
# then predict labels for the separate test file.
best_knn = KNeighborsClassifier(n_neighbors=3)
best_knn.fit(X_train,y_train)
pre_y=best_knn.predict(XX)
# Display the predictions.
pre_y
array([7, 2, 1, ..., 4, 5, 6], dtype=int64)
1
2
from sklearn.metrics import accuracy_score
# Fraction of test digits predicted correctly by the retrained KNN.
accuracy_score(true_label, pre_y)
0.9671

多种聚类算法实现Mnist分类
http://example.com/2022/12/26/多种聚类算法实现Mnist分类/
作者
Magnesium
发布于
2022年12月26日
许可协议