In [1]:
# Jupyter Notebook display settings
from IPython.core.display import display, HTML
display(HTML(
"""<style>
div.container { width:100% !important; }
div.CodeMirror { font-family: Consolas; font-size: 16pt;}
div.output {font-size: 16pt; font-weight: bold; }
div.input {font-family: Consolas; font-size: 16pt; }
div.prompt { min-width: 100px;}
</style>
"""))
Multiple Regression¶
In [3]:
# Multiple regression: linear regression that uses several features
# Feature engineering
# Ridge
# Lasso
# Hyperparameter
Feature Engineering and Regularization¶
In [1]:
# Linear regression generally becomes more powerful as more features are available.
In [2]:
# Multiple regression
# Linear regression that uses several features.
# With 1 feature, a linear regression model learns a straight line.
# With 2 features, linear regression learns a plane.
# With 2 features, the target forms a 3-D space together with them (feature 1, feature 2, target).
# Linear regression equation: target = a * feature1 + b * feature2 + intercept ==> a plane
# In high dimensions with many features, linear regression can express very complex models.
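As a minimal sketch (not part of the original notebook), fitting LinearRegression on two made-up features shows the learned plane directly: coef_ holds a and b, and intercept_ holds the intercept. The feature and target values below are invented purely for illustration.
# Hypothetical two-feature example (made-up numbers) to make the "plane" concrete
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 4.0], [4.0, 3.0]])  # feature1, feature2
y = np.array([5.0, 4.0, 11.0, 10.0])                            # target

plane = LinearRegression().fit(X, y)
print(plane.coef_, plane.intercept_)  # a, b of target = a*feature1 + b*feature2 + intercept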
In [ ]:
# Perch length, height, width (3 features)
# Add the square of each of the 3 features
# Multiply features together to create yet more features
#   e.g., add (perch length * perch height) as a new feature
# Deriving new features from the existing ones: feature engineering
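As a rough sketch of what this feature engineering looks like when done by hand (the PolynomialFeatures transformer used below automates exactly this), assuming a single sample with columns length, height, width:
# Manual feature engineering: squares and pairwise products of the 3 features
import numpy as np

sample = np.array([[8.4, 2.11, 1.41]])   # [length, height, width]
length, height, width = sample[:, 0], sample[:, 1], sample[:, 2]

engineered = np.column_stack([
    length, height, width,                        # original features
    length**2, height**2, width**2,               # squared features
    length*height, length*width, height*width,    # pairwise products
])
print(engineered.shape)  # (1, 9): the same 9 features PolynomialFeatures(degree=2, include_bias=False) produces, in a different order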
In [5]:
# Use a pandas DataFrame
In [4]:
import pandas as pd
In [5]:
df = pd.read_csv("perch_full.csv")
In [6]:
print(df)
    length  height  width
0      8.4    2.11   1.41
1     13.7    3.53   2.00
2     15.0    3.82   2.43
3     16.2    4.59   2.63
4     17.4    4.59   2.94
5     18.0    5.22   3.32
6     18.7    5.20   3.12
7     19.0    5.64   3.05
8     19.6    5.14   3.04
9     20.0    5.08   2.77
10    21.0    5.69   3.56
11    21.0    5.92   3.31
12    21.0    5.69   3.67
13    21.3    6.38   3.53
14    22.0    6.11   3.41
15    22.0    5.64   3.52
16    22.0    6.11   3.52
17    22.0    5.88   3.52
18    22.0    5.52   4.00
19    22.5    5.86   3.62
20    22.5    6.79   3.62
21    22.7    5.95   3.63
22    23.0    5.22   3.63
23    23.5    6.28   3.72
24    24.0    7.29   3.72
25    24.0    6.38   3.82
26    24.6    6.73   4.17
27    25.0    6.44   3.68
28    25.6    6.56   4.24
29    26.5    7.17   4.14
30    27.3    8.32   5.14
31    27.5    7.17   4.34
32    27.5    7.05   4.34
33    27.5    7.28   4.57
34    28.0    7.82   4.20
35    28.7    7.59   4.64
36    30.0    7.62   4.77
37    32.8   10.03   6.02
38    34.5   10.26   6.39
39    35.0   11.49   7.80
40    36.5   10.88   6.86
41    36.0   10.61   6.74
42    37.0   10.84   6.26
43    37.0   10.57   6.37
44    39.0   11.14   7.49
45    39.0   11.14   6.00
46    39.0   12.43   7.35
47    40.0   11.93   7.11
48    40.0   11.73   7.22
49    40.0   12.38   7.46
50    40.0   11.14   6.63
51    42.0   12.80   6.87
52    43.0   11.93   7.28
53    43.0   12.51   7.42
54    43.5   12.60   8.14
55    44.0   12.49   7.60
In [7]:
perch_full = df.to_numpy()
In [10]:
print(perch_full)
[[ 8.4   2.11  1.41]
 [13.7   3.53  2.  ]
 [15.    3.82  2.43]
 [16.2   4.59  2.63]
 [17.4   4.59  2.94]
 [18.    5.22  3.32]
 [18.7   5.2   3.12]
 [19.    5.64  3.05]
 [19.6   5.14  3.04]
 [20.    5.08  2.77]
 [21.    5.69  3.56]
 [21.    5.92  3.31]
 [21.    5.69  3.67]
 [21.3   6.38  3.53]
 [22.    6.11  3.41]
 [22.    5.64  3.52]
 [22.    6.11  3.52]
 [22.    5.88  3.52]
 [22.    5.52  4.  ]
 [22.5   5.86  3.62]
 [22.5   6.79  3.62]
 [22.7   5.95  3.63]
 [23.    5.22  3.63]
 [23.5   6.28  3.72]
 [24.    7.29  3.72]
 [24.    6.38  3.82]
 [24.6   6.73  4.17]
 [25.    6.44  3.68]
 [25.6   6.56  4.24]
 [26.5   7.17  4.14]
 [27.3   8.32  5.14]
 [27.5   7.17  4.34]
 [27.5   7.05  4.34]
 [27.5   7.28  4.57]
 [28.    7.82  4.2 ]
 [28.7   7.59  4.64]
 [30.    7.62  4.77]
 [32.8  10.03  6.02]
 [34.5  10.26  6.39]
 [35.   11.49  7.8 ]
 [36.5  10.88  6.86]
 [36.   10.61  6.74]
 [37.   10.84  6.26]
 [37.   10.57  6.37]
 [39.   11.14  7.49]
 [39.   11.14  6.  ]
 [39.   12.43  7.35]
 [40.   11.93  7.11]
 [40.   11.73  7.22]
 [40.   12.38  7.46]
 [40.   11.14  6.63]
 [42.   12.8   6.87]
 [43.   11.93  7.28]
 [43.   12.51  7.42]
 [43.5  12.6   8.14]
 [44.   12.49  7.6 ]]
In [11]:
import numpy as np
In [12]:
perch_weight = np.array(
[5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,
110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0,
130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0,
197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0,
514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0,
820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0,
1000.0, 1000.0]
)
In [13]:
from sklearn.model_selection import train_test_split
In [14]:
train_input, test_input, train_target, test_target = train_test_split(perch_full, perch_weight, random_state=42)
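train_test_split holds out 25% of the samples by default, so the 56 perch records end up as 42 training and 14 test samples. A quick sanity check (not in the original notebook):
# Default 75/25 split: 56 samples -> 42 train / 14 test
print(train_input.shape, test_input.shape)    # (42, 3) (14, 3)
print(train_target.shape, test_target.shape)  # (42,) (14,)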
In [15]:
# Create new features from the existing data
In [16]:
# scikit-learn transformer
from sklearn.preprocessing import PolynomialFeatures
In [22]:
poly = PolynomialFeatures(include_bias=False)
poly.fit([[2,3]])
Out[22]:
PolynomialFeatures(include_bias=False)
In [23]:
print(poly.transform([[2,3]]))
[[2. 3. 4. 6. 9.]]
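For comparison (not shown in the original notebook): degree defaults to 2, and with the default include_bias=True PolynomialFeatures also prepends a constant column of 1s for the intercept term, which is why include_bias=False is passed above.
# With the default include_bias=True, a bias column of 1s is added in front
poly_bias = PolynomialFeatures()       # degree=2, include_bias=True by default
poly_bias.fit([[2, 3]])
print(poly_bias.transform([[2, 3]]))   # [[1. 2. 3. 4. 6. 9.]]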
In [24]:
# weight = a * length + b * height + c * width + d * 1
In [25]:
poly = PolynomialFeatures(include_bias=False)
In [26]:
poly.fit(train_input)
train_poly = poly.transform(train_input)
print(train_poly.shape)
(42, 9)
In [27]:
poly.get_feature_names()
Out[27]:
['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2', 'x2^2']
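The names read as: x0, x1, x2 are the original features, x0^2 is the square of the first one, x0 x1 is the product of the first two, and so on. A version note: get_feature_names() was removed in scikit-learn 1.2, where the equivalent call is get_feature_names_out():
# On scikit-learn >= 1.0, use get_feature_names_out() instead
poly.get_feature_names_out()
# array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], dtype=object)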
In [28]:
test_poly = poly.transform(test_input)
In [29]:
print(test_poly.shape)
(14, 9)
In [30]:
from sklearn.linear_model import LinearRegression
In [31]:
lr = LinearRegression()
lr.fit(train_poly, train_target)
print(lr.score(train_poly, train_target))
0.9903183436982125
In [32]:
print(lr.score(test_poly, test_target))
0.9714559911594155
In [33]:
poly = PolynomialFeatures(degree=5, include_bias=False)
In [34]:
poly.fit(train_input)
train_poly = poly.transform(train_input)
test_poly = poly.transform(test_input)
print(train_poly.shape)
(42, 55)
In [35]:
lr.fit(train_poly, train_target)
print(lr.score(train_poly, train_target))
0.9999999999938143
In [36]:
print(lr.score(test_poly, test_target))
-144.40744532797535
In [37]:
# The test R^2 is a large negative number, i.e. the degree-5 model is badly overfit.
# One remedy is to reduce the number of features.
In [38]:
# Regularization: reining the model in
# Shrink the coefficients (slopes) that are multiplied onto each feature
In [39]:
from sklearn.preprocessing import StandardScaler
In [40]:
ss = StandardScaler()
ss.fit(train_poly)                      # fit the scaler on the training set only
train_scaled = ss.transform(train_poly) # train_scaled: data converted to standard scores (z-scores)
test_scaled = ss.transform(test_poly)   # apply the same training-set statistics to the test set
In [41]:
# Ridge: applies regularization based on the squared coefficients
# Lasso: applies regularization based on the absolute values of the coefficients
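As an illustrative sketch (not from the original notebook) of what the two penalties actually measure, given a hypothetical coefficient vector:
# Hypothetical coefficients, just to show what each penalty sums up
import numpy as np

coef = np.array([1.5, -0.3, 0.0, 2.0])

l2_penalty = np.sum(coef ** 2)      # ridge penalizes the sum of squared coefficients
l1_penalty = np.sum(np.abs(coef))   # lasso penalizes the sum of absolute coefficient values
print(l2_penalty, l1_penalty)       # 6.34 3.8
# alpha controls how strongly the penalty is added to the loss;
# the L1 penalty tends to drive some coefficients exactly to 0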
Ridge Regression¶
In [42]:
from sklearn.linear_model import Ridge
In [43]:
ridge = Ridge()
In [44]:
ridge.fit(train_scaled, train_target)
print(ridge.score(train_scaled, train_target))
0.9896101671037343
In [45]:
print(ridge.score(test_scaled, test_target))
0.9790693977615386
In [46]:
# Practice: finding an appropriate alpha value
In [47]:
import matplotlib.pyplot as plt
In [48]:
train_score = []
test_score = []
In [50]:
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    # create a ridge model
    ridge = Ridge(alpha=alpha)
    # train the ridge model
    ridge.fit(train_scaled, train_target)
    train_score.append(ridge.score(train_scaled, train_target))
    test_score.append(ridge.score(test_scaled, test_target))
In [53]:
plt.plot(np.log10(alpha_list), train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.show()
In [54]:
ridge = Ridge(alpha = 0.1)
ridge.fit(train_scaled, train_target)
print(ridge.score(train_scaled, train_target))
print(ridge.score(test_scaled, test_target))
0.9903815817570368
0.9827976465386896
Lasso Regression¶
In [56]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(train_scaled, train_target)
print(lasso.score(train_scaled, train_target))
0.989789897208096
In [57]:
ridge.score(test_scaled, test_target)
Out[57]:
0.9827976465386896
In [59]:
train_score = []
test_score = []
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    # create a lasso model
    lasso = Lasso(alpha=alpha, max_iter=10000)
    # train the lasso model
    lasso.fit(train_scaled, train_target)
    train_score.append(lasso.score(train_scaled, train_target))
    test_score.append(lasso.score(test_scaled, test_target))
C:\Users\BIT\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 18778.697957792032, tolerance: 518.2793833333334
  model = cd_fast.enet_coordinate_descent(
C:\Users\BIT\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12972.821345401393, tolerance: 518.2793833333334
  model = cd_fast.enet_coordinate_descent(
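These ConvergenceWarnings come from the coordinate-descent solver when alpha is very small; for this exercise they can be ignored, worked around with an even larger max_iter, or silenced explicitly, for example:
# Optional: silence the convergence warnings raised for the smallest alpha values
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.simplefilter("ignore", category=ConvergenceWarning)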
In [60]:
plt.plot(np.log10(alpha_list), train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.show()
In [61]:
lasso = Lasso(alpha=10)
lasso.fit(train_scaled, train_target)
print(lasso.score(train_scaled, train_target))
print(lasso.score(test_scaled, test_target))
0.9888067471131867
0.9824470598706695
In [62]:
print(np.sum(lasso.coef_==0))
40
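The degree-5 feature set has 55 columns, and 40 coefficients are exactly 0, so the lasso model at alpha=10 uses only 15 of the features. A small sketch (not in the original notebook) to list the surviving polynomial features:
# Which of the 55 polynomial features did lasso keep? (55 - 40 = 15 non-zero coefficients)
feature_names = np.array(poly.get_feature_names())  # get_feature_names_out() on newer scikit-learn
print(feature_names[lasso.coef_ != 0])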
In [ ]: