# Naive Bayes on the mushrooms (poisonous-mushroom) dataset
import numpy as np
import pandas as pd  # data preprocessing
import seaborn as sns  # visualization
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

df = pd.read_csv('c:\\data\\mushrooms.csv')
# One-hot encode every categorical column of the frame.
df = pd.get_dummies(df)

# Features are every encoded column from index 2 onward; the label is
# encoded column 1.
# NOTE(review): presumably columns 0 and 1 are the two dummy columns of the
# class label, so column 0 is dropped as the complement of y -- confirm
# against the CSV's column order.
X = df.iloc[:, 2:].to_numpy()
y = df.iloc[:, 1].to_numpy()

# Hold out 25% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=10)

# Candidate var_smoothing values. np.arange excludes the stop value, so the
# grid is meant to run 0.001, 0.002, ..., 0.009 (a float step can make the
# exact length fragile; np.linspace is the safer alternative).
smoothing_grid = np.arange(0.001, 0.01, 0.001)

# Misclassification rate on the test split for each candidate.
# NOTE(review): the error is measured on X_test/y_test, so choosing
# var_smoothing from these numbers makes any later score on the same test
# split optimistic; consider cross-validating on the training split instead.
errors = []
for i in smoothing_grid:
    nb = GaussianNB(var_smoothing=i)
    nb.fit(X_train, y_train)
    pred_i = nb.predict(X_test)
    errors.append(np.mean(pred_i != y_test))
print(errors)

# One "value ---> error" line per candidate; round() hides float noise in
# the grid value.
for k, i in zip(smoothing_grid, errors):
    print(round(k, 3), '--->', i)  # 0.004 ---> 0.008370260955194485
# Result:
# 0.001 ---> 0.009354997538158542
# 0.002 ---> 0.008862629246676515
# 0.003 ---> 0.008862629246676515
# 0.004 ---> 0.008370260955194485
# 0.005 ---> 0.008370260955194485
# 0.006 ---> 0.008370260955194485
# 0.007 ---> 0.008370260955194485
# 0.008 ---> 0.008370260955194485
# 0.009 ---> 0.008370260955194485
import matplotlib.pyplot as plt

# Plot the mean misclassification rate against each candidate value.
# The x values must match the grid that was used when `errors` was filled.
# The swept parameter is GaussianNB's var_smoothing (not a K value and not
# Laplace smoothing), so the title and axis label name it directly.
plt.plot(np.arange(0.001, 0.01, 0.001), errors, marker='o')
plt.title('Mean error by var_smoothing')
plt.xlabel('var_smoothing')
plt.ylabel('mean error')
plt.show()
#%%
# Training / prediction
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Build the Gaussian naive Bayes classifier and fit it on the training
# split in one step (fit returns the estimator itself).
classifier = GaussianNB(var_smoothing=0.004).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = classifier.predict(X_test)

# Small two-way cross table of true labels versus predicted labels.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)
# Result:
# [[1055   17]
#  [   0  959]]
# Check precision, recall and F1 score per class.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)
# Result:
#               precision    recall  f1-score   support
#
#            0       1.00      0.98      0.99      1072
#            1       0.98      1.00      0.99       959
#
#     accuracy                           0.99      2031
#    macro avg       0.99      0.99      0.99      2031
# weighted avg       0.99      0.99      0.99      2031
# Compute the overall accuracy on the test split.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)
# Accuracy: 0.9916297390448056 (var_smoothing = 0.004)