Dimensionality Reduction Practice with Python (1)
Outstanding Assignment 1: 최혜빈
1. The PCA Process
import numpy as np
import numpy.linalg as lin
import matplotlib.pyplot as plt
import pandas as pd
import random
# Import the basic modules
x1 = [95, 91, 66, 94, 68, 63, 12, 73, 93, 51, 13, 70, 63, 63, 97, 56, 67, 96, 75, 6]
x2 = [56, 27, 25, 1, 9, 80, 92, 69, 6, 25, 83, 82, 54, 97, 66, 93, 76, 59, 94, 9]
x3 = [57, 34, 9, 79, 4, 77, 100, 42, 6, 96, 61, 66, 9, 25, 84, 46, 16, 63, 53, 30]
# These are the values of the explanatory variables x1, x2, and x3
X = np.stack((x1,x2,x3),axis=0)
# Stack the explanatory variables into a single matrix
X = pd.DataFrame(X.T,columns=['x1','x2','x3'])
X
  | x1 | x2 | x3 |
0 | 95 | 56 | 57 |
1 | 91 | 27 | 34 |
2 | 66 | 25 | 9 |
3 | 94 | 1 | 79 |
4 | 68 | 9 | 4 |
5 | 63 | 80 | 77 |
6 | 12 | 92 | 100 |
7 | 73 | 69 | 42 |
8 | 93 | 6 | 6 |
9 | 51 | 25 | 96 |
10 | 13 | 83 | 61 |
11 | 70 | 82 | 66 |
12 | 63 | 54 | 9 |
13 | 63 | 97 | 25 |
14 | 97 | 66 | 84 |
15 | 56 | 93 | 46 |
16 | 67 | 76 | 16 |
17 | 96 | 59 | 63 |
18 | 75 | 94 | 53 |
19 | 6 | 9 | 30 |
Before starting PCA, you always!!! have to scale the data first.
from sklearn.preprocessing import StandardScaler # scaling
scaler = StandardScaler()
X_std = scaler.fit_transform(X) # fit learns the scaling parameters, transform applies them to the actual data
X_std
array([[ 1.08573604, 0.02614175, 0.30684189],
[ 0.93801686, -0.86575334, -0.46445467],
[ 0.01477192, -0.92726334, -1.30282049],
[ 1.04880625, -1.66538341, 1.04460382],
[ 0.08863151, -1.41934339, -1.47049366],
[-0.09601747, 0.76426183, 0.97753455],
[-1.97943714, 1.13332186, 1.74883111],
[ 0.2732805 , 0.42595679, -0.1961776 ],
[ 1.01187645, -1.5116084 , -1.40342439],
[-0.53917504, -0.92726334, 1.61469258],
[-1.94250735, 0.85652683, 0.44098042],
[ 0.16249111, 0.82577183, 0.60865359],
[-0.09601747, -0.03536825, -1.30282049],
[-0.09601747, 1.28709688, -0.76626636],
[ 1.15959564, 0.33369178, 1.21227698],
[-0.35452606, 1.16407687, -0.06203907],
[ 0.05170172, 0.64124181, -1.06807806],
[ 1.12266584, 0.11840676, 0.50804969],
[ 0.3471401 , 1.19483187, 0.17270336],
[-2.20101593, -1.41934339, -0.5985932 ]])
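As a quick sanity check (a small sketch reusing the X and X_std defined above): StandardScaler simply subtracts each column mean and divides by the column standard deviation (population std, ddof=0), so the same numbers can be reproduced with plain pandas/NumPy.
# Reproduce StandardScaler by hand: z = (x - mean) / std with ddof=0
X_manual = (X - X.mean()) / X.std(ddof=0)
print(np.allclose(X_manual.values, X_std))  # expected: True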
features = X_std.T
features
array([[ 1.08573604, 0.93801686, 0.01477192, 1.04880625, 0.08863151,
-0.09601747, -1.97943714, 0.2732805 , 1.01187645, -0.53917504,
-1.94250735, 0.16249111, -0.09601747, -0.09601747, 1.15959564,
-0.35452606, 0.05170172, 1.12266584, 0.3471401 , -2.20101593],
[ 0.02614175, -0.86575334, -0.92726334, -1.66538341, -1.41934339,
0.76426183, 1.13332186, 0.42595679, -1.5116084 , -0.92726334,
0.85652683, 0.82577183, -0.03536825, 1.28709688, 0.33369178,
1.16407687, 0.64124181, 0.11840676, 1.19483187, -1.41934339],
[ 0.30684189, -0.46445467, -1.30282049, 1.04460382, -1.47049366,
0.97753455, 1.74883111, -0.1961776 , -1.40342439, 1.61469258,
0.44098042, 0.60865359, -1.30282049, -0.76626636, 1.21227698,
-0.06203907, -1.06807806, 0.50804969, 0.17270336, -0.5985932 ]])
2. Computing the Covariance Matrix
cov_matrix = np.cov(features) # np.cov estimates the covariance matrix from the given values
cov_matrix
array([[ 1.05263158, -0.2037104 , -0.12079228],
[-0.2037104 , 1.05263158, 0.3125801 ],
[-0.12079228, 0.3125801 , 1.05263158]])
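A small aside on why the diagonal is 1.0526 rather than exactly 1: StandardScaler divides by the population standard deviation (ddof=0), while np.cov uses the sample covariance (ddof=1), so each diagonal entry becomes n/(n-1) = 20/19. A quick check:
# The ddof mismatch makes each diagonal entry equal to n / (n - 1)
n = X_std.shape[0]
print(n / (n - 1))                                    # 1.0526...
print(np.allclose(np.diag(cov_matrix), n / (n - 1)))  # expected: True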
3. Computing Eigenvalues and Eigenvectors
lin.eig(cov_matrix) # compute the eigenvalues and eigenvectors
(array([1.48756162, 0.94435407, 0.72597904]),
array([[ 0.47018528, -0.85137353, -0.23257022],
[-0.64960236, -0.15545725, -0.74421087],
[-0.59744671, -0.50099516, 0.62614797]]))
eigenvalues = lin.eig(cov_matrix)[0]
eigenvectors = lin.eig(cov_matrix)[1]
print(eigenvalues)
print(eigenvectors)
[1.48756162 0.94435407 0.72597904]
[[ 0.47018528 -0.85137353 -0.23257022]
[-0.64960236 -0.15545725 -0.74421087]
[-0.59744671 -0.50099516 0.62614797]]
mat = np.zeros((3,3))
mat
array([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
mat[0][0] = eigenvalues[0]
mat[1][1] = eigenvalues[1]
mat[2][2] = eigenvalues[2]
mat
array([[1.48756162, 0. , 0. ],
[0. , 0.94435407, 0. ],
[0. , 0. , 0.72597904]])
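The same diagonal matrix can also be built in one call with np.diag (equivalent to filling in the entries by hand above):
mat_alt = np.diag(eigenvalues)    # eigenvalues on the diagonal, zeros elsewhere
print(np.allclose(mat, mat_alt))  # expected: True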
4. Reconstructing the Original Covariance Matrix from the Eigendecomposition
Multiplying the factors of the eigendecomposition back together should recover the original covariance matrix.
np.dot(np.dot(eigenvectors,mat),eigenvectors.T) # matrix products with np.dot: V @ diag(eigenvalues) @ V.T reconstructs the covariance matrix
array([[ 1.05263158, -0.2037104 , -0.12079228],
[-0.2037104 , 1.05263158, 0.3125801 ],
[-0.12079228, 0.3125801 , 1.05263158]])
eigenvectors.shape
(3, 3)
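The reconstruction works because the covariance matrix is symmetric, so its eigenvectors form an orthonormal basis: V @ V.T is the identity, and therefore C = V @ diag(eigenvalues) @ V.T. A quick check with the eigenvectors computed above:
# Eigenvectors of a symmetric matrix are orthonormal: V @ V.T should be the identity
print(np.allclose(eigenvectors @ eigenvectors.T, np.eye(3)))  # expected: True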
5. Transforming the Values onto the Eigenvector Axes
def new_coordinates(X, eigenvectors):
    for i in range(eigenvectors.shape[0]):
        if i == 0:
            new = [X.dot(eigenvectors.T[i])]
        else:
            new = np.concatenate((new, [X.dot(eigenvectors.T[i])]), axis=0)
    return new.T
# project the data via the matrix product of X and the eigenvectors
# these are the values of the data projected onto every eigenvector axis
new_coordinates(X_std,eigenvectors)
# the data expressed in the new coordinate axes
array([[ 0.31019368, -1.08215716, -0.07983642],
[ 1.28092404, -0.43132556, 0.13533091],
[ 1.38766381, 0.78428014, -0.12911446],
[ 0.95087515, -1.15737142, 1.6495519 ],
[ 1.84222365, 0.88189889, 0.11493111],
[-1.12563709, -0.52680338, 0.06564012],
[-2.71174416, 0.63290138, 0.71195473],
[-0.03100441, -0.20059783, -0.50339479],
[ 2.29618509, 0.07661447, 0.01087174],
[-0.61585248, -0.205764 , 1.82651199],
[-1.73320252, 1.29971699, 0.09045178],
[-0.82366049, -0.57164535, -0.27123176],
[ 0.75619512, 0.73995175, -0.76710616],
[-0.42344386, 0.26555394, -1.41533681],
[-0.39581307, -1.64646874, 0.24104031],
[-0.88581498, 0.15195119, -0.82271209],
[ 0.24587691, 0.39139878, -1.15801831],
[ 0.14741103, -1.22874561, -0.03110396],
[-0.7161265 , -0.56781471, -0.86180345],
[ 0.24475107, 2.39442622, 1.19337361]])
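Since projecting onto all eigenvector axes at once is just a change of basis, the loop in new_coordinates is equivalent to a single matrix product (a quick check with the objects defined above):
# Projecting onto every eigenvector axis at once is simply X_std @ V
print(np.allclose(new_coordinates(X_std, eigenvectors), X_std @ eigenvectors))  # expected: True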
2. Implementing PCA
from sklearn.preprocessing import StandardScaler
def MYPCA(X, number):
    scaler = StandardScaler()
    x_std = scaler.fit_transform(X)        # scaling
    features = x_std.T
    cov_matrix = np.cov(features)          # covariance matrix
    eigenvalues = lin.eig(cov_matrix)[0]   # eigenvalues
    eigenvectors = lin.eig(cov_matrix)[1]  # eigenvectors
    new_coordinate = new_coordinates(x_std, eigenvectors)
    index = list(eigenvalues.argsort()[::-1])  # component indices sorted by descending eigenvalue
    for i in range(number):
        if i == 0:
            new = np.array([new_coordinate[:, index[i]]])
        else:
            new = np.concatenate((new, [new_coordinate[:, index[i]]]), axis=0)
    return new.T  # data projected onto the top `number` eigenvector axes
MYPCA(X,3)
# Is the data nicely transformed onto the new axes? # Yes!
# The result can differ from the PCA above, because there we did not sort the axes by eigenvalue size
array([[ 0.31019368, -1.08215716, -0.07983642],
[ 1.28092404, -0.43132556, 0.13533091],
[ 1.38766381, 0.78428014, -0.12911446],
[ 0.95087515, -1.15737142, 1.6495519 ],
[ 1.84222365, 0.88189889, 0.11493111],
[-1.12563709, -0.52680338, 0.06564012],
[-2.71174416, 0.63290138, 0.71195473],
[-0.03100441, -0.20059783, -0.50339479],
[ 2.29618509, 0.07661447, 0.01087174],
[-0.61585248, -0.205764 , 1.82651199],
[-1.73320252, 1.29971699, 0.09045178],
[-0.82366049, -0.57164535, -0.27123176],
[ 0.75619512, 0.73995175, -0.76710616],
[-0.42344386, 0.26555394, -1.41533681],
[-0.39581307, -1.64646874, 0.24104031],
[-0.88581498, 0.15195119, -0.82271209],
[ 0.24587691, 0.39139878, -1.15801831],
[ 0.14741103, -1.22874561, -0.03110396],
[-0.7161265 , -0.56781471, -0.86180345],
[ 0.24475107, 2.39442622, 1.19337361]])
3. Comparison with Sklearn
from sklearn.decomposition import PCA
pca = PCA(n_components=3) # use sklearn's PCA!
print(pca.fit_transform(X_std)) # fit and transform; x is the already-scaled X_std
[[-0.31019368 -1.08215716 -0.07983642]
[-1.28092404 -0.43132556 0.13533091]
[-1.38766381 0.78428014 -0.12911446]
[-0.95087515 -1.15737142 1.6495519 ]
[-1.84222365 0.88189889 0.11493111]
[ 1.12563709 -0.52680338 0.06564012]
[ 2.71174416 0.63290138 0.71195473]
[ 0.03100441 -0.20059783 -0.50339479]
[-2.29618509 0.07661447 0.01087174]
[ 0.61585248 -0.205764 1.82651199]
[ 1.73320252 1.29971699 0.09045178]
[ 0.82366049 -0.57164535 -0.27123176]
[-0.75619512 0.73995175 -0.76710616]
[ 0.42344386 0.26555394 -1.41533681]
[ 0.39581307 -1.64646874 0.24104031]
[ 0.88581498 0.15195119 -0.82271209]
[-0.24587691 0.39139878 -1.15801831]
[-0.14741103 -1.22874561 -0.03110396]
[ 0.7161265 -0.56781471 -0.86180345]
[-0.24475107 2.39442622 1.19337361]]
MYPCA(X,3) # compare with the result above: the first components came out with the opposite +/- sign
array([[ 0.31019368, -1.08215716, -0.07983642],
[ 1.28092404, -0.43132556, 0.13533091],
[ 1.38766381, 0.78428014, -0.12911446],
[ 0.95087515, -1.15737142, 1.6495519 ],
[ 1.84222365, 0.88189889, 0.11493111],
[-1.12563709, -0.52680338, 0.06564012],
[-2.71174416, 0.63290138, 0.71195473],
[-0.03100441, -0.20059783, -0.50339479],
[ 2.29618509, 0.07661447, 0.01087174],
[-0.61585248, -0.205764 , 1.82651199],
[-1.73320252, 1.29971699, 0.09045178],
[-0.82366049, -0.57164535, -0.27123176],
[ 0.75619512, 0.73995175, -0.76710616],
[-0.42344386, 0.26555394, -1.41533681],
[-0.39581307, -1.64646874, 0.24104031],
[-0.88581498, 0.15195119, -0.82271209],
[ 0.24587691, 0.39139878, -1.15801831],
[ 0.14741103, -1.22874561, -0.03110396],
[-0.7161265 , -0.56781471, -0.86180345],
[ 0.24475107, 2.39442622, 1.19337361]])
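The sign flips are expected: an eigenvector is only determined up to its sign, so each principal-component axis can come out with either orientation. Up to sign, the two results agree (a quick check reusing MYPCA and sklearn's PCA from above):
# The components match sklearn's up to the sign of each axis
ours = MYPCA(X, 3)
theirs = PCA(n_components=3).fit_transform(X_std)
print(np.allclose(np.abs(ours), np.abs(theirs)))  # expected: True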
4. Applying PCA to the MNIST Data
import numpy as np
import numpy.linalg as lin
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_openml
from scipy import io
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
# load the MNIST handwritten digit data
mnist = io.loadmat('mnist-original.mat')
X = mnist['data'].T
y = mnist['label'].T
# data information
# 70,000 small digit images
# rows and columns are swapped in the file -> transpose
# grayscale 28x28 pixels = 784 features
# each pixel has a value between 0 and 255
# note that there are 10 label classes in total (digits 0-9)
# represent the data by giving each pixel column a name
feat_cols = [ 'pixel'+str(i) for i in range(X.shape[1]) ]
df = pd.DataFrame(X,columns=feat_cols)
df.head()
  | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# attach the label y to df to build the data frame
df['y'] = y
Applying ML Techniques
Split the data into train and test sets with an 80/20 ratio using train_test_split
Train a model on MNIST after reducing its dimensionality with PCA
# first, perform the train/test data split!
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
# scale the data in advance so PCA can be applied
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train) # the train set is fit and transformed
X_test_std = scaler.transform(X_test) # the test set is only transformed
First, to decide on the number of principal components, check these three criteria:
Elbow point: the point where the slope of the curve drops sharply
Kaiser's Rule: the principal components with eigenvalues of 1 or more
Cumulative explained variance ratio: whether it reaches at least 70%~80%
# first, check criterion 2, Kaiser's Rule!
# start by computing the covariance matrix of the train set
cov_mat = np.cov(X_train.T)
cov_mat.shape
explain_values_raw, components_raw = lin.eig(cov_mat) # compute the eigenvalues
pca_1 = len(explain_values_raw[explain_values_raw > 1]) # number of principal components with eigenvalues greater than 1 (Kaiser's Rule)
from sklearn.decomposition import PCA
pca = PCA(pca_1).fit(X_train_std)
pca_X_train = pca.transform(X_train_std)
pca_X_test = pca.transform(X_test_std)
components = pca.components_
pca_X_train.shape # by criterion 2 we would have to reduce to 655 principal components, but that is still too many,
# so we decided to check the other criteria as well
# next, we decided to check criterion 1, the elbow point!
sing_vals = range(pca.n_components_)
sing_vals
# using explained_variance_ratio_ over the 651 components kept above
# here, explained variance means
# the proportion of variance of the data projected onto each principal axis, i.e., the ratio of each eigenvalue to the total
range(0, 652)
eigvals = pca.explained_variance_ratio_
plt.plot(sing_vals, eigvals, 'ro-', linewidth=1)
plt.xlim(0,40)
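explained_variance_ratio_ can be checked directly against the eigenvalues: each entry is just that component's variance divided by the total variance of the scaled training data. A minimal check, reusing the pca object fit on X_train_std above:
# explained_variance_ratio_ equals explained_variance_ divided by the total variance
total_var = np.var(X_train_std, axis=0, ddof=1).sum()
print(np.allclose(pca.explained_variance_ratio_, pca.explained_variance_ / total_var))  # expected: True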
# after narrowing it down, we judged that around 30-40 components is appropriate
# finally, we checked the cumulative explained variance ratio
pca = PCA(n_components=0.8)
pca.fit(X_train_std)
pca.n_components_
148
pca = PCA(n_components=0.75)
pca.fit(X_train_std)
pca.n_components_
# for the scaled data, we judged that around 120 components is appropriate,
120
# so we also wanted to check the unscaled data,
# and judged that 43 or 33 principal components is appropriate for the unscaled data
pca = PCA(n_components=0.8)
pca.fit(X_train)
pca.n_components_
43
pca = PCA(n_components=0.75)
pca.fit(X_train)
pca.n_components_
33
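Instead of refitting PCA for each variance target, the same component counts can be read off the cumulative explained-variance curve in one pass. A minimal sketch (reusing X_train_std from above; it should reproduce the 148 and 120 counts reported for the scaled data):
# Fit once with all components, then find the smallest k whose cumulative
# explained variance reaches each target ratio
pca_full = PCA().fit(X_train_std)
cum_ratio = np.cumsum(pca_full.explained_variance_ratio_)
for target in (0.75, 0.80):
    k = int(np.argmax(cum_ratio >= target)) + 1   # first index reaching the target
    print(target, k)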
First, we used a random forest model.
from sklearn.decomposition import PCA
pca = PCA(n_components=120)
pca.fit(X_train_std)
PCA(copy=True, iterated_power='auto', n_components=120, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
new_X_train = pca.transform(X_train_std)
new_X_test = pca.transform(X_test_std)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(new_X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
clf.score(new_X_test,y_test)
# scaled data with 120 principal components: 0.945
0.9451428571428572
# next we tried the unscaled data with 43 principal components
pca = PCA(n_components=43)
pca.fit(X_train)
PCA(copy=True, iterated_power='auto', n_components=43, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
new_X_train = pca.transform(X_train)
new_X_test = pca.transform(X_test)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(new_X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
clf.score(new_X_test,y_test)
# unscaled data / random forest / 43 principal components
#accuracy: 0.955
0.955
from sklearn.decomposition import PCA
pca = PCA(n_components=33)
pca.fit(X_train)
PCA(copy=True, iterated_power='auto', n_components=33, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
new_X_train = pca.transform(X_train)
new_X_test = pca.transform(X_test)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(new_X_train, y_train)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
clf.score(new_X_test,y_test)
# unscaled data / random forest / 33 principal components
#accuracy: 0.953
0.9532857142857143
SVM Model
# first, the scaled data
from sklearn.decomposition import PCA
pca = PCA(n_components=120)
pca.fit(X_train_std)
PCA(copy=True, iterated_power='auto', n_components=120, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
new_X_train = pca.transform(X_train_std)
new_X_test = pca.transform(X_test_std)
from sklearn import svm
svc = svm.SVC(kernel = 'rbf') # we only set the rbf kernel parameter and ran it
svc.fit(new_X_train, y_train)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
svc.score(new_X_test,y_test)
# scaled data, 120 principal components
#0.969
0.9690714285714286
# we also tried the unscaled data with 43 and 33 principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=43)
pca.fit(X_train)
PCA(copy=True, iterated_power='auto', n_components=43, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
new_X_train = pca.transform(X_train)
new_X_test = pca.transform(X_test)
from sklearn import svm
svc = svm.SVC(kernel = 'rbf') # we only set the rbf kernel parameter and ran it
svc.fit(new_X_train, y_train)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
svc.score(new_X_test,y_test)
# unscaled data / SVM / 43 principal components
#accuracy: 0.9822!!!!!
0.9822142857142857
from sklearn.decomposition import PCA
pca = PCA(n_components=33)
pca.fit(X_train)
PCA(copy=True, iterated_power='auto', n_components=33, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
new_X_train = pca.transform(X_train)
new_X_test = pca.transform(X_test)
from sklearn import svm
svc = svm.SVC(kernel = 'rbf')
svc.fit(new_X_train, y_train)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
svc.score(new_X_test,y_test)
# unscaled data / SVM / 33 principal components
#accuracy 0.98
0.9800714285714286
# with unscaled data, 43 principal components seems the most appropriate, and SVM proved to be a good model!
# the best accuracy was 0.9822!
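As a possible wrap-up (a minimal sketch, not something the notebook ran): the best configuration found here (unscaled data, 43 principal components, RBF-kernel SVM) can be packaged as a single sklearn Pipeline, so the PCA projection and the classifier are fit and applied together.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import svm

# Chain the PCA projection and the SVM so transform + predict happen in one object
best_model = Pipeline([
    ('pca', PCA(n_components=43)),
    ('svc', svm.SVC(kernel='rbf')),
])
best_model.fit(X_train, y_train.ravel())   # ravel() flattens the (n, 1) label array
print(best_model.score(X_test, y_test))    # should land near the 0.98 reported above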