PCA Algorithm is a very common algorithm in machine learning and deep learning , I saw this algorithm when I read flower books recently , So after writing the theory, I also want to use some examples to help understand PCA.
About PCA The theoretical part of the algorithm can Check the boss' article
The following is based on python Self realized PCA Algorithm and use Kaggle Platform Handwritten digital datasets ,
m strip 2 D data pca Get four results : Compressed data 、 The restored data 、 The proportion of each eigenvalue 、 front i The proportion of the sum of eigenvalues def load_data(path=None, m=10000):
if not path:
# m strip 2 D data
x = np.random.randn(m)
y = np.random.randn(m)
data = np.stack([x, y], axis=1) # m, 2
# print(data.shape)
else:
if os.path.basename(path)[-3:] == 'csv':
df = pd.read_csv(path)
data = df.loc[:, df.columns!= 'label']
return np.array(data)
def pca(data, k=99999):
# data: m, n,m strip n D data
mean = np.mean(data, axis=0) # n,
# De centralization
meadnRemoved = data - mean # m, n
# rowvar = 0, Except for m-1
# rowvar = 1, Except for m
cov = np.cov(meadnRemoved, rowvar=0) # n, n
# j Eigenvalues
# n, j j individual n A matrix of dimensional eigenvectors
eigVals, eigVecs = np.linalg.eig(cov)# Decompose and calculate the covariance matrix ( The eigenvalue Eigenvector )
eigValsArgsort = np.argsort(eigVals,)# Default is from small to large
eigValsArgsort = eigValsArgsort[:(-k+1):-1]# Flashback retrieval , Take one of the biggest k individual
decEigVecs = eigVecs[:, eigValsArgsort] # n, k
decData = np.dot(meadnRemoved, decEigVecs) # m, n n, k --> m, k
dataReverse = np.dot(decData, decEigVecs.T) # m, k k, n -->m, n
meanReverse = np.mean(dataReverse, axis=0)
dataReverse = dataReverse + meanReverse # Restore the original data through the reduced dimension data and transformation matrix
eigValsSort = np.sort(eigVals)[::-1] # All eigenvalues are arranged from large to small
eigVals_rotio = eigValsSort / np.sum(eigVals) * 100 # The proportion of each eigenvalue to the sum of all eigenvalues
# front i The ratio of the sum of eigenvalues to the sum of all eigenvalues
eigVals_sum_rotio = np.array([np.sum(eigValsSort[:i]) / np.sum(eigVals) * 100 for i in range(len(eigValsSort))])
return decData, dataReverse, eigVals_rotio, eigVals_sum_rotio
if __name__ == '__main__':
data = load_data(m=1000) # m, 2
decData, dataReverse, ratio, sum_ratio = pca(data, k=1)
# visualization
fig = plt.figure(figsize=(20,10))
ax1 = fig.add_subplot(131)
ax1.scatter(data[:,0],data[:,1],marker='^',s=90) # The original data
ax1.scatter(decData[:,0],np.zeros(len(decData)),c='black',) # Reduced dimension graph
ax1.scatter(dataReverse[:,0],dataReverse[:,1],marker='^',s=50,c='red') # Restored data
plt.legend(['row data','decomposed data','reversed data'])
ax2 = fig.add_subplot(132)
ax2.plot(np.arange(1, len(ratio) + 1),ratio, marker='^')
plt.xticks(np.arange(1, len(ratio) + 1))
plt.legend(['eigvals ratio'])
ax3 = fig.add_subplot(133)
ax3.plot(np.arange(1, len(sum_ratio) + 1), sum_ratio, marker='^')
plt.xticks(np.arange(1, len(sum_ratio) + 1))
plt.legend(['frst i eigvals sum ratio'])
plt.show()
The results are shown in the following figure .
The data set includes three csv file .
sample_submission.csv Is submitted to kaggle Of csv File format .train.csv It's data Training set , Contains 784 Characteristics and 1 Tag columns . common 42000 Group data .test.csv For test set , Contains 784 Features , The dataset No, Label column , common 28000 Group data .
1. Roughly determine the range of the number of principal components .load_data Functions and pca The function does not change , Change the main function to the following code .
if __name__ == '__main__':
path = r'digit-recognizer\train.csv'
data = load_data(path=path) # m, n
decData, dataReverse, ratio, sum_rotio = pca(data,)
fig = plt.figure(figsize=(10,5))
plt.plot(np.arange(1, len(sum_rotio)+1),sum_rotio, )
plt.xticks(np.arange(0, 800, 50))
plt.show()
You can see about 100 The hour has reached 90% Above .
2. Further determine the number of principal components
From the above analysis, we can know that the principal component is 1-100 Between , In order to further determine the number of principal components , Take a different one k Value usage SVM Model to calculate the accuracy of the model , Determine according to the line chart .
ps:SVM Training is really slow ! You can try to change other algorithms .
if __name__ == '__main__':
path = r'digit-recognizer\train.csv'
data, label = load_data(path=path) # m, n
n_components = range(1, 101, 10)
scores = []
for i in tqdm(n_components):
decData, dataReverse, ratio, sum_rotio = pca(data, k=i)
clf = SVC(C=200, kernel='rbf')
clf.fit(decData, label)
print("done!")
# X_new = PCA(n_components=i).fit_transform(X)
scores.append(cross_val_score(clf, decData, label, cv=5).mean())
fig = plt.figure(figsize=(10, 5))
plt.plot(n_components, scores)
plt.xticks(range(0, 101, 10))
plt.show()
The result is shown in the figure . It can be seen that the approximate number of principal components is (25, 78 The accuracy reaches the saturation value .
Continue to narrow the scope . For the first time, I only set it to (25, 70, Find out the end 70 At that time, it was still an upward trend , So I ran away again (68,78)
if __name__ == '__main__':
path = r'digit-recognizer\train.csv'
data, label = load_data(path=path) # m, n
n_components = range(25, 78)
scores = []
for i in tqdm(n_components):
decData, dataReverse, ratio, sum_rotio = pca(data, k=i)
clf = SVC(C=200, kernel='rbf')
clf.fit(decData, label)
print("done!")
# X_new = PCA(n_components=i).fit_transform(X)
scores.append(cross_val_score(clf, decData, label, cv=5).mean())
fig = plt.figure(figsize=(10, 5))
plt.plot(n_components, scores)
plt.xticks(range(25, 78))
plt.show()


The final choice 70.
if __name__ == '__main__':
path = r'digit-recognizer\train.csv'
data, label = load_data(path=path) # m, n
decData, dataReverse, ratio, sum_rotio = pca(data, k=70)
knn = KNeighborsClassifier()
score = cross_val_score(knn, decData, label, cv=5).mean()
print(score) # 0.9712380952380952
The accuracy can reach 97.12%.
call sklearn Library and its own implementation PCA The calculated results are compared , Found almost the same .
path = r'digit-recognizer\train.csv'
data, label = load_data(path=path) # m, n
n_components = range(25, 77, )
scores = []
for i in tqdm(n_components):
pca_instance = PCA(n_components=i)
decData = pca_instance.fit_transform(data)
ratio = pca_instance.explained_variance_ratio_
sum_rotio = np.cumsum(ratio)
clf = SVC(C=200, kernel='rbf')
clf.fit(decData, label)
print("done!")
# X_new = PCA(n_components=i).fit_transform(X)
scores.append(cross_val_score(clf, decData, label, cv=5).mean())
fig = plt.figure(figsize=(10, 5))
plt.plot(n_components, scores)
plt.xticks(range(25, 77,))
plt.show()
