Course Outline
1. Setting up the KT AIDU environment variables for this exercise
In [1]:
# Suppress warning messages during code execution
import warnings
warnings.filterwarnings(action='ignore')
Import the AIDU libraries
In [2]:
from aicentro.session import Session
from aicentro.framework.keras import Keras as AiduFrm
aidu_session = Session(verify=False)
aidu_framework = AiduFrm(session=aidu_session)
AIDU directory environment variable
In [3]:
# Check aidu_framework.config.data_dir (the data directory location)
aidu_framework.config.data_dir
Out[3]:
'/aihub/data'
2. Machine Learning Model Process
① Import libraries
② Loading the data
③ Exploratory Data Analysis
④ Data preprocessing: data type conversion, handling null and missing data, creating dummy features, feature engineering, etc.
⑤ Splitting into train and test datasets
⑥ Normalizing the data
⑦ Creating the model
⑧ Evaluating model performance
① Import libraries
Import the required libraries
In [4]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
② Load the data
Column names of the cust_data.csv file
In [5]:
# Read the cust_data.csv file
df = pd.read_csv('cust_data.csv')
③ Data analysis
In [6]:
# 12 columns, 7,814 rows
df.info()
In [8]:
df.tail()
Out[8]: (last five rows of the DataFrame)
In [9]:
# The termination label is imbalanced
df['termination'].value_counts().plot(kind='bar') # Visualize the class distribution
Out[9]:
<AxesSubplot:>
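To put a number on the imbalance rather than eyeballing the bar chart, a minimal sketch (reusing the df loaded above) prints each class's share:
# Share of each termination class; shares far from 50/50 indicate imbalance
print(df['termination'].value_counts(normalize=True))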
④ Data preprocessing
In [10]:
cal_cols = ['class', 'sex', 'stop', 'npay', 'termination', 'bill_rating'] # Define the categorical columns
In [11]:
df1 = pd.get_dummies(data=df, columns=cal_cols, drop_first=True)
In [12]:
# 19 columns, 7,814 rows
df1.info()
<class 'pandas.core.frame.DataFrame'>
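As a quick illustration of what get_dummies with drop_first=True does, here is a self-contained toy example (the column values are made up):
import pandas as pd

toy = pd.DataFrame({'sex': ['M', 'F', 'F', 'M']})
# drop_first=True keeps only sex_M; 'F' is represented implicitly by sex_M == 0,
# which avoids a redundant (perfectly collinear) dummy column
print(pd.get_dummies(toy, columns=['sex'], drop_first=True))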
⑤ Splitting into train and test datasets
In [13]:
from sklearn.model_selection import train_test_split
In [14]:
X = df1.drop('termination_Y', axis=1).values # Everything except the target column
y = df1['termination_Y'].values # The target column only
In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
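A quick sanity check, assuming the 0/1 target built above: with stratify=y the positive-class share should be nearly identical in both splits.
# Positive-class ratio in each split; stratify=y keeps these aligned
print(y_train.mean(), y_test.mean())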
In [16]:
X_train.shape
Out[16]:
(5469, 18)
In [17]:
y_train.shape
Out[17]:
(5469,)
⑥ Normalizing/Scaling the data
In [18]:
# Check which columns hold numeric distributions
df1.tail()
Out[18]: (last five rows of df1)
In [19]:
from sklearn.preprocessing import MinMaxScaler # Rescales each feature to the 0-1 range
In [20]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train) # Fit on the training data, then overwrite X_train with the scaled values
X_test = scaler.transform(X_test) # Apply the same train-fitted scaling to the test data
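Under the hood MinMaxScaler applies x' = (x - min) / (max - min), with min and max learned from the training data only. A self-contained toy example (made-up values):
from sklearn.preprocessing import MinMaxScaler
import numpy as np

toy_train = np.array([[0.0], [5.0], [10.0]])
sc = MinMaxScaler().fit(toy_train)      # learns min=0, max=10
print(sc.transform(np.array([[7.5]])))  # (7.5 - 0) / (10 - 0) = 0.75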
In [21]:
X_train[:2], y_train[:2]
Out[21]: (first two scaled feature rows and their labels)
Check the number of model inputs and outputs
In [22]:
X_train.shape
Out[22]:
(5469, 18)
In [23]:
y_train.shape
Out[23]:
(5469,)
⑦ Model development
Helper function that records each model's accuracy and draws a bar chart for comparing performance
----------------------------
In [24]:
# Store each model's accuracy score
# Draw a bar chart sorted by accuracy so model performance can be compared
from sklearn.metrics import accuracy_score
my_predictions = {}
colors = ['r', 'c', 'm', 'y', 'k', 'khaki', 'teal', 'orchid', 'sandybrown',
          'greenyellow', 'dodgerblue', 'deepskyblue', 'rosybrown', 'firebrick',
          'deeppink', 'crimson', 'salmon', 'darkred', 'olivedrab', 'olive',
          'forestgreen', 'royalblue', 'indigo', 'navy', 'mediumpurple', 'chocolate',
          'gold', 'darkorange', 'seagreen', 'turquoise', 'steelblue', 'slategray',
          'peru', 'midnightblue', 'slateblue', 'dimgray', 'cadetblue', 'tomato'
          ]
# Given a model name, predictions, and ground-truth values, record the model's
# accuracy and draw a horizontal bar chart of every model's accuracy so far
def accuracy_eval(name_, pred, actual):
    global my_predictions
    global colors

    acc = accuracy_score(actual, pred)
    my_predictions[name_] = acc * 100

    y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(y_value, columns=['model', 'accuracy'])
    print(df)

    length = len(df)
    plt.figure(figsize=(10, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=15)
    bars = ax.barh(np.arange(len(df)), df['accuracy'])

    for i, v in enumerate(df['accuracy']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=15, fontweight='bold')

    plt.title('accuracy', fontsize=18)
    plt.xlim(0, 100)
    plt.show()
---------------------
1) Logistic Regression (LogisticRegression, classification)
In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
In [26]:
lg = LogisticRegression(C=1.0, max_iter=2000)
lg.fit(X_train, y_train) # Fit on the training data
Out[26]:
LogisticRegression(max_iter=2000)
In [27]:
# Evaluate the classifier (mean accuracy)
lg.score(X_test, y_test) # Check performance on the test data
Out[27]:
0.929637526652452
Roughly 93% of the test samples are classified correctly.
Classifier evaluation metrics
In [28]:
lg_pred = lg.predict(X_test)
In [29]:
# Confusion matrix
# TN FP
# FN TP
confusion_matrix(y_test, lg_pred) # Pass the ground truth together with the predictions
Out[29]: (2×2 confusion matrix)
In [30]:
# Accuracy: very high
accuracy_score(y_test, lg_pred)
Out[30]:
0.929637526652452
In [31]:
# Precision
precision_score(y_test, lg_pred)
Out[31]:
0.8817204301075269
In [32]:
# Recall: very low
recall_score(y_test, lg_pred)
Out[32]:
0.3474576271186441
In [33]:
# F1 score: harmonic mean of precision and recall
f1_score(y_test, lg_pred)
Out[33]:
0.4984802431610942
In [34]:
print(classification_report(y_test, lg_pred)) # One report combining all of the metrics above
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      2109
           1       0.88      0.35      0.50       236

    accuracy                           0.93      2345
   macro avg       0.91      0.67      0.73      2345
weighted avg       0.93      0.93      0.92      2345
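Since recall is the weak spot here, one common follow-up is to lower the decision threshold and trade precision for recall. A sketch using the fitted lg model; the 0.3 threshold is an arbitrary illustrative choice, not a tuned value:
# Predicted probability of class 1, then a custom threshold instead of the default 0.5
lg_proba = lg.predict_proba(X_test)[:, 1]
lg_pred_low = (lg_proba > 0.3).astype(int)  # 0.3 is illustrative only
print(precision_score(y_test, lg_pred_low), recall_score(y_test, lg_pred_low))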
In [35]:
accuracy_eval('LogisticRegression', lg_pred, y_test) # The helper function defined earlier
model accuracy
0 LogisticRegression 92.963753
2) KNN (K-Nearest Neighbor)
In [36]:
from sklearn.neighbors import KNeighborsClassifier # 1. Import
In [37]:
knn = KNeighborsClassifier(n_neighbors=5) # 2. Define the model with 5 neighbors
knn.fit(X_train, y_train) # Fit on the training data
Out[37]:
KNeighborsClassifier()
In [38]:
knn_pred = knn.predict(X_test) # 3. Predict on the test data
In [39]:
accuracy_eval('K-Nearest Neighbor', knn_pred, y_test) # Check accuracy
model accuracy
0 K-Nearest Neighbor 94.712154
1 LogisticRegression 92.963753
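n_neighbors=5 is just the default; a small sketch (the k values are illustrative) scans a few settings and compares test accuracy:
# Compare test accuracy for several values of k
for k in [3, 5, 7, 9, 11]:
    acc = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    print(k, round(acc, 4))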
3) Decision Tree
In [40]:
from sklearn.tree import DecisionTreeClassifier
In [41]:
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
dt.fit(X_train, y_train)
Out[41]:
DecisionTreeClassifier(max_depth=10, random_state=42)
In [42]:
dt_pred = dt.predict(X_test)
In [43]:
accuracy_eval('DecisionTree', dt_pred, y_test)
model accuracy
0 DecisionTree 97.313433
1 K-Nearest Neighbor 94.712154
2 LogisticRegression 92.963753
Types of ensemble methods
4) Random Forest (RandomForest)
Key hyperparameters
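The notes originally under this heading are not preserved; as a hedged reference, these are the commonly tuned RandomForestClassifier parameters in scikit-learn (the values shown are illustrative defaults, not recommendations):
from sklearn.ensemble import RandomForestClassifier

RandomForestClassifier(
    n_estimators=100,     # number of trees in the forest
    max_depth=None,       # maximum depth of each tree
    min_samples_split=2,  # minimum samples required to split an internal node
    min_samples_leaf=1,   # minimum samples required at a leaf node
    max_features='sqrt',  # number of features considered at each split
    random_state=42,      # reproducibility
)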
In [44]:
from sklearn.ensemble import RandomForestClassifier # 1. Import
In [45]:
rfc = RandomForestClassifier(n_estimators=3, random_state=42) # 2. Define the model
rfc.fit(X_train, y_train) # Fit: an ensemble of 3 decision trees
Out[45]:
RandomForestClassifier(n_estimators=3, random_state=42)
In [46]:
rfc_pred = rfc.predict(X_test) # 3. Predict
In [47]:
accuracy_eval('RandomForest Ensemble', rfc_pred, y_test) # 4. Evaluate
model accuracy
0 RandomForest Ensemble 97.611940
1 DecisionTree 97.313433
2 K-Nearest Neighbor 94.712154
3 LogisticRegression 92.963753
Note: for regression models, report RMSE and R² instead:
print("RMSE on Test set : {0:.5f}".format(mean_squared_error(test_y,pred_y)**0.5))
print("R-squared Score on Test set : {0:.5f}".format(r2_score(test_y,pred_y)))
5) XGBoost
Key characteristics
Key hyperparameters
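The original notes here are likewise missing; a hedged reference to commonly tuned XGBClassifier parameters (illustrative values):
from xgboost import XGBClassifier

XGBClassifier(
    n_estimators=100,      # number of boosting rounds
    learning_rate=0.1,     # shrinkage applied to each round
    max_depth=3,           # depth of each boosted tree
    subsample=1.0,         # fraction of rows sampled per tree
    colsample_bytree=1.0,  # fraction of columns sampled per tree
)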
In [48]:
!pip install xgboost
Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages (0.90)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.19.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.5.4)
In [49]:
from xgboost import XGBClassifier
In [50]:
xgb = XGBClassifier(n_estimators=3, random_state=42) # Takes about 10 seconds to fit
xgb.fit(X_train, y_train)
Out[50]:
XGBClassifier(n_estimators=3, random_state=42)
In [51]:
xgb_pred = xgb.predict(X_test)
In [52]:
accuracy_eval('XGBoost', xgb_pred, y_test)
model accuracy
0 RandomForest Ensemble 97.611940
1 XGBoost 97.611940
2 DecisionTree 97.313433
3 K-Nearest Neighbor 94.712154
4 LogisticRegression 92.963753
6) LightGBM
Key characteristics
Key hyperparameters
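Again the original notes are not preserved; a hedged reference to commonly tuned LGBMClassifier parameters (illustrative values):
from lightgbm import LGBMClassifier

LGBMClassifier(
    n_estimators=100,   # number of boosting rounds
    learning_rate=0.1,  # shrinkage applied to each round
    num_leaves=31,      # leaves per tree (LightGBM grows trees leaf-wise)
    max_depth=-1,       # -1 means no depth limit
)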
In [53]:
!pip install lightgbm
Requirement already satisfied: lightgbm in /usr/local/lib/python3.6/dist-packages (2.3.0)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from lightgbm) (0.24.2)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.5.4)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.19.5)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->lightgbm) (3.1.0)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->lightgbm) (1.1.0)
In [54]:
from lightgbm import LGBMClassifier
In [55]:
lgbm = LGBMClassifier(n_estimators=3, random_state=42) # Takes about 1 minute to fit
lgbm.fit(X_train, y_train)
Out[55]:
LGBMClassifier(n_estimators=3, random_state=42)
If the score is poor, raising n_estimators usually helps.
In [56]:
lgbm_pred = lgbm.predict(X_test)
In [57]:
accuracy_eval('LGBM', lgbm_pred, y_test)
model accuracy
0 RandomForest Ensemble 97.611940
1 XGBoost 97.611940
2 DecisionTree 97.313433
3 K-Nearest Neighbor 94.712154
4 LogisticRegression 92.963753
5 LGBM 89.936034
7) Stacking (the "final boss" of ensemble methods)
The final_estimator aggregates the predictions made by the individual models and performs the final prediction.
In [58]:
from sklearn.ensemble import StackingRegressor, StackingClassifier
In [59]:
stack_models = [
    ('LogisticRegression', lg),
    ('KNN', knn),
    ('DecisionTree', dt),
]
In [60]:
# Feed the predictions of the models declared in stack_models (LogisticRegression, KNN, DecisionTree)
# into the final meta-model (final_estimator), here the RandomForest rfc, to classify
stacking = StackingClassifier(stack_models, final_estimator=rfc, n_jobs=-1) # rfc = the RandomForestClassifier defined above
In [61]:
stacking.fit(X_train, y_train) # Takes about 1 minute 20 seconds
Out[61]:
StackingClassifier(estimators=[('LogisticRegression',
LogisticRegression(max_iter=2000)),
('KNN', KNeighborsClassifier()),
('DecisionTree',
DecisionTreeClassifier(max_depth=10,
random_state=42))],
final_estimator=RandomForestClassifier(n_estimators=3,
random_state=42),
n_jobs=-1)
In [62]:
stacking_pred = stacking.predict(X_test)
In [63]:
accuracy_eval('Stacking Ensemble', stacking_pred, y_test)
model accuracy
0 RandomForest Ensemble 97.611940
1 XGBoost 97.611940
2 DecisionTree 97.313433
3 Stacking Ensemble 96.247335
4 K-Nearest Neighbor 94.712154
5 LogisticRegression 92.963753
6 LGBM 89.936034
8) Weighted Blending (weighting each model's predictions)
Multiply each model's predictions by a weight and sum the results to compute the final output.
In [64]:
final_outputs = {
    'DecisionTree': dt_pred,   # dt_pred was already computed above
    'randomforest': rfc_pred,  # likewise for the remaining models
    'xgb': xgb_pred,
    'lgbm': lgbm_pred,
    'stacking': stacking_pred,
}
In [65]:
# Weighted sum of each model's predictions (the weights sum to 1.0)
final_prediction = (
    final_outputs['DecisionTree'] * 0.1
    + final_outputs['randomforest'] * 0.2
    + final_outputs['xgb'] * 0.25
    + final_outputs['lgbm'] * 0.15
    + final_outputs['stacking'] * 0.3
)
In [66]:
# If the weighted value exceeds 0.5 predict 1, otherwise 0
final_prediction = np.where(final_prediction > 0.5, 1, 0)
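To see why the 0.5 threshold behaves like a weighted vote on 0/1 predictions, here is a toy worked example with made-up votes:
import numpy as np

# Five hypothetical models voting 1, 1, 0, 0, 1 on a single sample
toy_blend = 1 * 0.1 + 1 * 0.2 + 0 * 0.25 + 0 * 0.15 + 1 * 0.3  # = 0.6
print(np.where(toy_blend > 0.5, 1, 0))  # 0.6 > 0.5, so the blended prediction is 1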
In [67]:
accuracy_eval('Weighted Blending', final_prediction, y_test)
model accuracy
0 RandomForest Ensemble 97.611940
1 XGBoost 97.611940
2 Weighted Blending 97.569296
3 DecisionTree 97.313433
4 Stacking Ensemble 96.247335
5 K-Nearest Neighbor 94.712154
6 LogisticRegression 92.963753
7 LGBM 89.936034
Summary of what we learned
# LinearRegression with the statistical approach (statsmodels)
!pip install statsmodels
import statsmodels.api as sm
results = sm.OLS(train_y, train_x).fit()
results.summary()
# *** p<0.001, ** p<0.01, * p<0.05
# https://stats.stackovernet.xyz/ko/q/37406
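One caveat worth flagging (not in the original notes): sm.OLS does not add an intercept automatically, so if an intercept is wanted the design matrix needs an explicit constant column:
import statsmodels.api as sm

# sm.OLS fits without an intercept unless a constant column is added
train_x_const = sm.add_constant(train_x)  # assumes the train_x from the snippet above
results = sm.OLS(train_y, train_x_const).fit()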
# LinearRegression with the machine learning approach (scikit-learn)
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error, r2_score
model = lr()
model.fit(train_x, train_y)
print("모델의 회귀계수는 : ", model.coef_, "이고 모델의 절편은 : ",model.intercept_)
pred_y = model.predict(test_x)
print("RMSE on Test set : {0:.5f}".format(mean_squared_error(test_y,pred_y)**0.5))
print("R-squared Score on Test set : {0:.5f}".format(r2_score(test_y,pred_y)))
Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor as grb
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error, r2_score
# Flatten the multi-dimensional array into 1-D
train_y = np.ravel(train_y, order='C')
model = grb(n_estimators=100, learning_rate=0.1, max_depth=5, min_samples_split=30, min_samples_leaf=15)
model.fit(train_x, train_y)
pred_y = model.predict(test_x)
print("RMSE on Test set : {0:.5f}".format(mean_squared_error(test_y,pred_y)**0.5))
print("R-squared Score on Test set : {0:.5f}".format(r2_score(test_y,pred_y)))
RMSE on Test set : 276.47308
R-squared Score on Test set : 0.71681
# Check feature importances
import matplotlib.pyplot as plt
import seaborn as sns
grb_importances_values = model.feature_importances_
grb_importances = pd.Series(grb_importances_values, index=train_x.columns)
grb_top10 = grb_importances.sort_values(ascending=False)[:10]
plt.rcParams["font.family"] = 'NanumGothicCoding'
plt.figure(figsize=(8,6))
plt.title('Top 10 Feature Importances')
sns.barplot(x=grb_top10, y=grb_top10.index, palette="RdBu")
plt.show()