In [2]:
# Jupyter Notebook environment setup (display styling only)
from IPython.display import display, HTML
display(HTML(
"""<style>
div.container { width:100% !important; }
div.CodeMirror { font-family: Consolas; font-size: 16pt;}
div.output {font-size: 16pt; font-weight: bold; }
div.input {font-family: Consolas; font-size: 16pt; }
div.prompt { min-width: 100px;}
</style>
"""))
Cross-Validation and Grid Search
In [3]:
# Validation set
# Cross-validation: k-fold cross-validation
# Grid search
# Random search
In [4]:
# To estimate generalization performance honestly, use the test set as little as possible.
# The test set should be used exactly once, at the very end, after the model is finalized.
# Then how do we tune hyperparameters such as max_depth?
# Split the training set once more to carve out a validation set.
In [5]:
import pandas as pd
In [9]:
wine = pd.read_csv('wine.csv')
In [12]:
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()
In [13]:
from sklearn.model_selection import train_test_split
In [14]:
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)
In [16]:
sub_input, val_input, sub_target, val_target = train_test_split(train_input, train_target, test_size=0.2, random_state=42)
In [17]:
print(sub_input.shape, val_input.shape)
(4157, 3) (1040, 3)
In [19]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))
0.9971133028626413
0.864423076923077
In [20]:
# Cross-validation
from sklearn.model_selection import cross_validate
In [22]:
scores = cross_validate(dt, train_input, train_target)
print(scores)
{'fit_time': array([0.00598264, 0.00698113, 0.00701261, 0.00598025, 0.00594735]), 'score_time': array([0. , 0. , 0. , 0.00100493, 0.00103641]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}
In [23]:
import numpy as np
In [24]:
print(np.mean(scores['test_score']))
0.855300214703487
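The spread across folds is worth a look as well. A minimal follow-up on the same scores object (not in the original notebook):

# Mean and standard deviation of the five fold scores;
# a large standard deviation would make the estimate less trustworthy.
print(np.mean(scores['test_score']), np.std(scores['test_score']))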
In [26]:
# Specifying the splitter: for a classification model, use StratifiedKFold.
# (cross_validate already defaults to StratifiedKFold for classifiers, so the score below is identical.)
from sklearn.model_selection import StratifiedKFold
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))
0.855300214703487
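For intuition, here is a minimal sketch of roughly what cross_validate does internally, written against the splitter's split() API (an illustration, not the library's actual code):

from sklearn.base import clone

fold_scores = []
for train_idx, val_idx in StratifiedKFold().split(train_input, train_target):
    model = clone(dt)  # fresh, unfitted copy of the estimator
    model.fit(train_input[train_idx], train_target[train_idx])
    fold_scores.append(model.score(train_input[val_idx], train_target[val_idx]))
print(np.mean(fold_scores))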
In [27]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))
0.8574181117533719
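When only the fold scores are needed, cross_val_score is a lighter-weight alternative that returns just the score array:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt, train_input, train_target, cv=splitter)
print(np.mean(scores))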
In [28]:
# Hyperparameter tuning
# 1. Train a model with the default parameter values.
# 2. Change the parameters a little at a time, judging by the validation-set score or cross-validation.
#    Tuning one parameter at a time (e.g., fixing max_depth first, then searching
#    min_samples_split) does not work well, because the parameters interact.
# GridSearchCV: performs the hyperparameter search and cross-validation in a single pass.
from sklearn.model_selection import GridSearchCV
In [29]:
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}
In [30]:
# GridSearchCV's cv parameter defaults to 5 folds; with 5 candidate values,
# this trains 5 * 5 = 25 models in total.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
In [31]:
gs.fit(train_input, train_target)
Out[31]:
GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1, param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]})
In [32]:
dt = gs.best_estimator_
print(dt.score(train_input, train_target))
0.9615162593804117
In [33]:
print(gs.best_params_)
{'min_impurity_decrease': 0.0001}
In [34]:
print(gs.cv_results_['mean_test_score'])
[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]
In [35]:
best_index = np.argmax(gs.cv_results_['mean_test_score'])
In [37]:
print(gs.cv_results_['params'][best_index])
{'min_impurity_decrease': 0.0001}
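The full search results can also be browsed as a table; a short sketch using pandas (cv_results_ is built to load cleanly into a DataFrame):

results = pd.DataFrame(gs.cv_results_)
print(results[['params', 'mean_test_score', 'rank_test_score']].head())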
In [43]:
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001), # 9 values
          'max_depth': range(5, 20, 1),                              # 15 values
          'min_samples_split': range(2, 100, 10)}                    # 10 values
# Number of parameter combinations: 9 * 15 * 10 = 1,350
# With the default 5-fold cross-validation: 1,350 * 5 = 6,750 model fits
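As a sanity check on that combination count, sklearn's ParameterGrid enumerates exactly the grid GridSearchCV will search (a quick sketch, not in the original notebook):

from sklearn.model_selection import ParameterGrid

print(len(ParameterGrid(params)))  # 1350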
In [44]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)
Out[44]:
GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1, param_grid={'max_depth': range(5, 20), 'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009]), 'min_samples_split': range(2, 100, 10)})
In [45]:
# Best parameter combination
print(gs.best_params_)
{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}
In [46]:
# Best cross-validation score
print(np.max(gs.cv_results_['mean_test_score']))
0.8683865773302731
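The same number is exposed directly on the fitted search object:

print(gs.best_score_)  # mean cross-validation score of the best parameter combination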
In [48]:
# Random search
# Useful when it is hard to decide the range or step of parameter values in advance,
# or when there are so many parameter combinations that a grid search would take too long.
# Instead of a list of candidate values, pass probability distribution objects to sample parameters from.
from scipy.stats import uniform, randint
In [51]:
rgen = randint(0, 10)  # uniform integers in [0, 10)
rgen.rvs(10)           # rvs: draw 10 random variates
Out[51]:
array([1, 8, 2, 7, 1, 9, 9, 4, 7, 2])
In [52]:
np.unique(rgen.rvs(1000), return_counts=True)
Out[52]:
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([107, 117, 92, 94, 102, 97, 98, 91, 107, 95], dtype=int64))
In [54]:
ugen = uniform(0, 1)  # uniform reals in [0, 1)
ugen.rvs(10)
Out[54]:
array([0.37272721, 0.81919781, 0.69698489, 0.23815517, 0.56647575, 0.26393255, 0.54156375, 0.20003519, 0.86102 , 0.20571526])
In [67]:
params = {'min_impurity_decrease': uniform(0.0001, 0.001), # reals in [loc, loc + scale) = [0.0001, 0.0011)
          'max_depth': randint(20, 50),                    # integers in [20, 50)
          'min_samples_split': randint(2, 25),             # integers in [2, 25)
          'min_samples_leaf': randint(1, 25)}              # integers in [1, 25)
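Each entry in params is now a frozen scipy distribution, so RandomizedSearchCV can call rvs() on it at every iteration. A quick sketch of what single draws look like (output varies with the random state):

# uniform(loc, scale) samples reals from [loc, loc + scale), i.e. [0.0001, 0.0011) here
print(params['min_impurity_decrease'].rvs(3))
print(params['max_depth'].rvs(3))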
In [68]:
from sklearn.model_selection import RandomizedSearchCV
In [69]:
gs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params, n_iter=100, n_jobs=-1, random_state=42)
gs.fit(train_input, train_target)
Out[69]:
RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_iter=100, n_jobs=-1, param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020EA02C1DF0>, 'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020EA065E430>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020EA02C12E0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000020EA065EDC0>}, random_state=42)
In [70]:
print(gs.best_params_)
{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}
In [71]:
# Best cross-validation score
print(np.max(gs.cv_results_['mean_test_score']))
0.8695428296438884
In [73]:
dt = gs.best_estimator_
print(dt.score(test_input, test_target))  # final, one-time evaluation on the test set
0.86