import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the customer data set from the current working directory.
df = pd.read_csv('cust_data.csv')

# Quick look at the schema (column dtypes / non-null counts) and the last rows.
df.info()
df.tail()

# Check the class imbalance of the 'termination' label with a bar chart
# of how often each label value occurs.
termination_counts = df['termination'].value_counts()
termination_counts.plot(kind='bar')
# Object-dtype (categorical) columns to encode. The label column
# 'termination' is part of this list, so it is encoded as well.
cal_cols = ['class', 'sex', 'stop', 'npay', 'termination', 'bill_rating']

# One-hot encode the categorical columns with pandas.get_dummies.
# drop_first=True drops the first level of every encoded column; the code
# below expects the remaining label indicator to be named 'termination_Y'.
df1 = pd.get_dummies(df, columns=cal_cols, drop_first=True)
df1.info()

# Separate the label indicator from the feature columns and convert both
# to NumPy arrays.
target_col = 'termination_Y'
y = df1[target_col].values
X = df1.drop(columns=target_col).values
X.shape, y.shape
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. stratify=y keeps the label
# proportions the same in both splits, and random_state fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train.shape, y_train.shape
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply
# that same scaling to both splits so no test-set statistics are used.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Peek at the first two scaled training rows and their labels.
X_train[:2], y_train[:2]
from sklearn.metrics import accuracy_score
# 1. Logistic Regression (LogisticRegression, classification)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Baseline classifier: logistic regression with scikit-learn's default
# settings, fitted on the scaled training split.
lg = LogisticRegression()
lg.fit(X_train, y_train)

# Mean accuracy on the held-out split.
lg.score(X_test, y_test)

# Class predictions for the held-out split; every metric below compares
# them against y_test.
lg_pred = lg.predict(X_test)
confusion_matrix(y_test, lg_pred)
accuracy_score(y_test, lg_pred)
precision_score(y_test, lg_pred)
recall_score(y_test, lg_pred)
f1_score(y_test, lg_pred)

# classification_report returns a single multi-line string. Print it so the
# table is rendered line by line instead of being shown as an escaped
# string repr (or silently discarded when this is not the last cell line).
print(classification_report(y_test, lg_pred))
# 2. KNN (K-Nearest Neighbors)
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows,
# fitted on the scaled training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Mean accuracy on the held-out split.
knn.score(X_test, y_test)

# Class predictions for the held-out split; every metric below compares
# them against y_test.
knn_pred = knn.predict(X_test)
confusion_matrix(y_test, knn_pred)
accuracy_score(y_test, knn_pred)
precision_score(y_test, knn_pred)
recall_score(y_test, knn_pred)
f1_score(y_test, knn_pred)

# classification_report returns a single multi-line string. Print it so the
# table is rendered line by line instead of being shown as an escaped
# string repr (or silently discarded when this is not the last cell line).
print(classification_report(y_test, knn_pred))
# 3. Decision Tree (DecisionTree)
from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 10; random_state makes the fit reproducible.
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
dt.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on. This is an optimistic number
# and is only useful next to the held-out score as an overfitting check.
dt.score(X_train, y_train)
# Accuracy on the held-out split, matching how the logistic-regression and
# KNN models are scored.
dt.score(X_test, y_test)

# Class predictions for the held-out split; every metric below compares
# them against y_test.
dt_pred = dt.predict(X_test)
confusion_matrix(y_test, dt_pred)
accuracy_score(y_test, dt_pred)
precision_score(y_test, dt_pred)
recall_score(y_test, dt_pred)
f1_score(y_test, dt_pred)

# classification_report returns a single multi-line string. Print it so the
# table is rendered line by line instead of being shown as an escaped
# string repr (or silently discarded when this is not the last cell line).
print(classification_report(y_test, dt_pred))
# 4. Random Forest (RandomForest)
from sklearn.ensemble import RandomForestClassifier

# Random forest built from 3 trees; random_state makes the fit reproducible.
rfc = RandomForestClassifier(n_estimators=3, random_state=42)
rfc.fit(X_train, y_train)

# Accuracy on the data the forest was fitted on. This is an optimistic
# number and is only useful next to the held-out score as an overfitting
# check.
rfc.score(X_train, y_train)
# Accuracy on the held-out split, matching how the logistic-regression and
# KNN models are scored.
rfc.score(X_test, y_test)

# Class predictions for the held-out split; every metric below compares
# them against y_test.
rfc_pred = rfc.predict(X_test)
confusion_matrix(y_test, rfc_pred)
accuracy_score(y_test, rfc_pred)
precision_score(y_test, rfc_pred)
recall_score(y_test, rfc_pred)
f1_score(y_test, rfc_pred)

# classification_report returns a single multi-line string. Print it so the
# table is rendered line by line instead of being shown as an escaped
# string repr (or silently discarded when this is not the last cell line).
print(classification_report(y_test, rfc_pred))
# 5. XGBoost
!pip install xgboost
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=3, random_state=42)
xgb.fit(X_train, y_train)
xgb.score(X_train, y_train)
xgb_pred = xgb.predict(X_test)
confusion_matrix(y_test, xgb_pred)
accuracy_score(y_test, xgb_pred)
precision_score(y_test, xgb_pred)
recall_score(y_test, xgb_pred)
f1_score(y_test, xgb_pred)
classification_report(y_test, xgb_pred)