import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve
import numpy as np
# Step 1: load the data
# Read the Breast Cancer Wisconsin (wdbc.data) file straight from the UCI
# repository. header=None makes pandas treat the first line as data and
# label the columns 0, 1, 2, ...
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/'
                       'machine-learning-databases'
                       '/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
# Step 2: preprocess the data
# Class labels come from column 1; every column from label 2 onward forms
# the feature matrix.
y = df.loc[:, 1].values
X = df.loc[:, 2:].values

# Encode the string class labels as integers.
le = LabelEncoder()
y = le.fit_transform(y)
# Look up the integer codes assigned to 'M' and 'B'. The result is not
# stored, so this only displays something in an interactive session.
le.transform(['M', 'B'])

# Split the dataset 80:20 into training and test parts. stratify=y keeps the
# class proportions the same in both parts; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
    test_size=0.20,
)
# # Step 3: pipeline
# pipe_lr = make_pipeline(StandardScaler(), # transformer: standardization
# PCA(n_components=2), # transformer: dimensionality reduction via principal component analysis (PCA)
# LogisticRegression()) # estimator
# pipe_lr.fit(X_train, y_train) # fit
# y_pred = pipe_lr.predict(X_test) # predict
# # test_acc = pipe_lr.score(X_test, y_test) # test accuracy
# Evaluate the model with k-fold cross-validation (run internally by
# validation_curve) to collect the data needed to draw a validation curve.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=10000, penalty='l2'),
)
# make_pipeline names each step after its lowercased class name, so the
# logistic regression's C parameter is addressed as 'logisticregression__C'.
train_scores, test_scores = validation_curve(
    pipe_lr,
    X_train,
    y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10,  # 10-fold cross-validation
)
# Each score row belongs to one C value and each column to one fold, so
# reducing over axis=1 summarises the folds for every C value.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)
# Training accuracy: mean across the folds, with a shaded band of
# +/- one standard deviation around it.
plt.plot(
    param_range, train_mean,
    label='Training accuracy',
    color='blue', marker='o', markersize=5,
)
plt.fill_between(
    param_range,
    train_mean + train_std,
    train_mean - train_std,
    color='blue', alpha=0.15,
)

# Validation accuracy, drawn the same way as a dashed green line.
plt.plot(
    param_range, test_mean,
    label='Validation accuracy',
    color='green', linestyle='--', marker='s', markersize=5,
)
plt.fill_between(
    param_range,
    test_mean + test_std,
    test_mean - test_std,
    color='green', alpha=0.15,
)

plt.grid()
# The C values span several orders of magnitude, so use a log-scaled x-axis.
plt.xscale('log')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.legend(loc='lower right')
plt.tight_layout()
# plt.savefig('figures/06_06.png', dpi=300)
plt.show()