# 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the customer data set and inspect its schema and first rows.
df = pd.read_csv('cust_data.csv')
df.info()
df.head()
# Bar chart of the relative class frequencies of the label column 'termination'.
termination_ratio = df['termination'].value_counts(normalize=True)
termination_ratio.plot(kind='bar')
# Label encoding of the label column.
# The previous statement read and wrote `df1['target']`, but `df1` is only
# created by the one-hot step further down, so running the file top to bottom
# failed here with a NameError; the label column used everywhere else in this
# file is 'termination'.
# The encoded labels are stored in a separate array instead of overwriting
# df['termination']: the later one-hot step appears to rely on that column
# still being string-typed in order to produce 'termination_Y'.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
termination_encoded = le.fit_transform(df['termination'])
# Names of every object-dtype (string) column in the raw frame.
cols = list(df.select_dtypes(include='object').columns)
cols
# One-hot encode those columns. drop_first=True keeps k-1 indicator columns
# for a column with k distinct levels.
df1 = pd.get_dummies(df, columns=cols, drop_first=True)
df1.info()
# Separate the label (y) from the features (X) as NumPy arrays.
# 'termination_Y' is the indicator column used as the prediction target.
y = df1['termination_Y'].values
X = df1.drop(columns=['termination_Y']).values
X.shape, y.shape
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
from sklearn.model_selection import train_test_split
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape, y_train.shape
# Rescale every feature to the 0-1 range.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test split does not influence the learned min/max.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train[:2], X_test[:2]
# ------------------------- Per-model training, prediction and evaluation -------------------------
# 1. Logistic Regression (LogisticRegression, classification)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
# Build the model
lg = LogisticRegression()
# Train on the scaled training split
lg.fit(X_train, y_train)
# Predict labels for the held-out split
y_pred = lg.predict(X_test)
# Mean accuracy on the test split
lg.score(X_test, y_test)
# Confusion matrix
confusion_matrix(y_test, y_pred)
# Accuracy
accuracy_score(y_test, y_pred)
# Precision
precision_score(y_test, y_pred)
# Recall
recall_score(y_test, y_pred)
# F1 score (harmonic mean of precision and recall)
f1_score(y_test, y_pred)
# Per-class report. classification_report returns the report as text, so it
# has to be printed: as a bare expression it is either discarded or shown as
# one escaped string with literal "\n" sequences.
print(classification_report(y_test, y_pred))
# 2. KNN (K-Nearest Neighbors)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Build the model (vote among the 5 nearest training samples)
knn = KNeighborsClassifier(n_neighbors=5)
# Train on the scaled training split
knn.fit(X_train, y_train)
# Predict labels for the held-out split
y_pred = knn.predict(X_test)
# Mean accuracy on the test split
knn.score(X_test, y_test)
# Confusion matrix
confusion_matrix(y_test, y_pred)
# Accuracy
accuracy_score(y_test, y_pred)
# Precision
precision_score(y_test, y_pred)
# Recall
recall_score(y_test, y_pred)
# F1 score (harmonic mean of precision and recall)
f1_score(y_test, y_pred)
# Per-class report. classification_report returns the report as text, so it
# has to be printed: as a bare expression it is either discarded or shown as
# one escaped string with literal "\n" sequences.
print(classification_report(y_test, y_pred))
# 3. Decision Tree (DecisionTree)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Build the model (depth capped at 10, fixed seed for reproducibility)
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
# Train on the scaled training split
dt.fit(X_train, y_train)
# Predict labels for the held-out split
y_pred = dt.predict(X_test)
# Mean accuracy on the training split. On its own this says nothing about
# generalisation; compare it with the test score below to spot overfitting.
dt.score(X_train, y_train)
# Mean accuracy on the test split, the same measure the logistic-regression
# and KNN sections report as their performance score.
dt.score(X_test, y_test)
# Confusion matrix
confusion_matrix(y_test, y_pred)
# Accuracy
accuracy_score(y_test, y_pred)
# Precision
precision_score(y_test, y_pred)
# Recall
recall_score(y_test, y_pred)
# F1 score (harmonic mean of precision and recall)
f1_score(y_test, y_pred)
# Per-class report. classification_report returns the report as text, so it
# has to be printed: as a bare expression it is either discarded or shown as
# one escaped string with literal "\n" sequences.
print(classification_report(y_test, y_pred))
# 4. Random Forest (RandomForest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Build the model (3 trees, fixed seed for reproducibility)
rfc = RandomForestClassifier(n_estimators=3, random_state=42)
# Train on the scaled training split
rfc.fit(X_train, y_train)
# Predict labels for the held-out split
y_pred = rfc.predict(X_test)
# Mean accuracy on the training split. On its own this says nothing about
# generalisation; compare it with the test score below to spot overfitting.
rfc.score(X_train, y_train)
# Mean accuracy on the test split, the same measure the logistic-regression
# and KNN sections report as their performance score.
rfc.score(X_test, y_test)
# Confusion matrix
confusion_matrix(y_test, y_pred)
# Accuracy
accuracy_score(y_test, y_pred)
# Precision
precision_score(y_test, y_pred)
# Recall
recall_score(y_test, y_pred)
# F1 score (harmonic mean of precision and recall)
f1_score(y_test, y_pred)
# Per-class report. classification_report returns the report as text, so it
# has to be printed: as a bare expression it is either discarded or shown as
# one escaped string with literal "\n" sequences.
print(classification_report(y_test, y_pred))
# 5. XGBoost
# xgboost is a third-party package that may not be installed yet.
# `!pip install ...` is IPython shell-escape syntax: it is not valid Python and
# may target a different environment than the running interpreter. Install
# through this interpreter's own pip instead, and only when the import fails.
try:
    from xgboost import XGBClassifier
except ImportError:
    import importlib
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost'])
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# Build the model (3 boosting rounds, fixed seed for reproducibility)
xgb = XGBClassifier(n_estimators=3, random_state=42)
# Train on the scaled training split
xgb.fit(X_train, y_train)
# Predict labels for the held-out split
y_pred = xgb.predict(X_test)
# Mean accuracy on the training split. On its own this says nothing about
# generalisation; compare it with the test score below to spot overfitting.
xgb.score(X_train, y_train)
# Mean accuracy on the test split, the same measure the logistic-regression
# and KNN sections report as their performance score.
xgb.score(X_test, y_test)
# Confusion matrix
confusion_matrix(y_test, y_pred)
# Accuracy
accuracy_score(y_test, y_pred)
# Precision
precision_score(y_test, y_pred)
# Recall
recall_score(y_test, y_pred)
# F1 score (harmonic mean of precision and recall)
f1_score(y_test, y_pred)
# Per-class report. classification_report returns the report as text, so it
# has to be printed: as a bare expression it is either discarded or shown as
# one escaped string with literal "\n" sequences.
print(classification_report(y_test, y_pred))
# 6. LightGBM
# lightgbm is a third-party package that may not be installed yet.
# `!pip install ...` is IPython shell-escape syntax: it is not valid Python and
# may target a different environment than the running interpreter. Install
# through this interpreter's own pip instead, and only when the import fails.
try:
    from lightgbm import LGBMClassifier
except ImportError:
    import importlib
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'lightgbm'])
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
# Build the model (3 boosting rounds, fixed seed for reproducibility)
lgbm = LGBMClassifier(n_estimators=3, random_state=42)
# Train on the scaled training split
lgbm.fit(X_train, y_train)
# Predict labels for the held-out split
y_pred = lgbm.predict(X_test)
# Mean accuracy on the training split. On its own this says nothing about
# generalisation; compare it with the test score below to spot overfitting.
lgbm.score(X_train, y_train)
# Mean accuracy on the test split, the same measure the logistic-regression
# and KNN sections report as their performance score.
lgbm.score(X_test, y_test)
# Confusion matrix
confusion_matrix(y_test, y_pred)
# Accuracy
accuracy_score(y_test, y_pred)
# Precision
precision_score(y_test, y_pred)
# Recall
recall_score(y_test, y_pred)
# F1 score (harmonic mean of precision and recall)
f1_score(y_test, y_pred)
# Per-class report. classification_report returns the report as text, so it
# has to be printed: as a bare expression it is either discarded or shown as
# one escaped string with literal "\n" sequences.
print(classification_report(y_test, y_pred))