# 1. 데이터 불러오기 (Load the data)
import pandas as pd

# Read the survey data from the CSV file in the current working directory.
csv_path = "vote.csv"
data = pd.read_csv(csv_path)
# 2. 특성(X)과 레이블(y)로 나누기 (Split into features X and label y)
# Feature matrix: every column from 'gender' through 'score_intention'.
# Label-based .loc slices include BOTH endpoints, so 'score_intention' is kept.
# NOTE(review): presumably this is the same set as the first eight columns
# (gender, region, edu, income, age, score_gov, score_progress,
# score_intention) -- confirm against the column order of vote.csv.
X = data.loc[:, "gender":"score_intention"]

# Label: selected with double brackets, so y1 is a one-column DataFrame
# (not a Series); later steps index it as y_train['vote'].
y1 = data[["vote"]]
# 3. train-test 데이터셋 나누기 (Split into train and test sets)
from sklearn.model_selection import train_test_split

# stratify=y1 samples both partitions in proportion to the label values;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y1,
    stratify=y1,
    random_state=42,
)

# Inspect the label distribution of the training partition.
aa = pd.DataFrame(y_train)
aa["vote"].value_counts()
# 4. 모델 적용 (Fit the model)
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier; n_neighbors is the hyperparameter to vary
# when tuning the model.
knn = KNeighborsClassifier(n_neighbors=4)

# y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning ("A column-vector y was passed when a 1d
# array was expected").  The fitted model is the same either way.
knn.fit(X_train, y_train.to_numpy().ravel())
# 5. 모델 평가 (Evaluate the model)
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Predictions come first: the cross-tabulation and the confusion matrices
# below all read them.  They are produced by `knn`, the classifier fitted in
# the previous step.
pred_train = knn.predict(X_train)
pred_test = knn.predict(X_test)

# Accuracy (mean fraction of correct predictions) on each partition.
knn.score(X_train, y_train)
# Actual labels (rows) against predicted labels (columns) on the training set.
pd.crosstab(y_train['vote'], pred_train)
knn.score(X_test, y_test)

# Confusion matrices: rows are true labels, columns are predicted labels.
confusion_train = confusion_matrix(y_train, pred_train)
confusion_test = confusion_matrix(y_test, pred_test)

# F1 on the test set.  `average` must be given explicitly for multiclass
# targets: 'micro' pools all decisions, 'macro' is the unweighted mean of the
# per-class scores.
f1 = f1_score(y_test, pred_test, average='micro')
f2 = f1_score(y_test, pred_test, average="macro")

# Per-class precision / recall / F1 / support as a text report.
cfreport = classification_report(y_test, pred_test)