# Load the wine data set; character columns are converted to factors on read.
wine <- read.csv(file = "wine2.csv", stringsAsFactors = TRUE)

# Quick look at the data before modelling.
head(wine)                # first rows
str(wine)                 # column types; Type is the dependent (target) variable
unique(wine[["Type"]])    # distinct values of the target
colSums(is.na(wine))      # number of missing values in each column
dim(wine)                 # 177 14 (as recorded by the original author)
# Split the data 80/20 into training and test sets, sampling on wine$Type.
# The seed is fixed so the partition is reproducible.
set.seed(1)
# caret:: is spelled out because no library(caret) call is visible in this
# script; list = FALSE (not the reassignable alias F) asks for row indices
# that can be used directly for subsetting.
train_num <- caret::createDataPartition(wine$Type, p = 0.8, list = FALSE)
train_data <- wine[train_num, ]
test_data <- wine[-train_num, ]
nrow(train_data)  # 143 (as recorded by the original author)
nrow(test_data)   # 34 (as recorded by the original author)
# Fit a baseline C5.0 decision tree: column 1 is used as the class label and
# every other column as a predictor.
# NOTE(review): presumably column 1 is Type -- confirm against wine2.csv.
# C50:: is spelled out because no library(C50) call is visible in this script;
# loading the namespace also registers the print/summary/predict methods.
wine_model <- C50::C5.0(train_data[, -1], train_data[, 1])
wine_model
summary(wine_model)

# Predict on both partitions.
train_result <- predict(wine_model, train_data[, -1])
test_result <- predict(wine_model, test_data[, -1])

# Accuracy in percent. mean() of the logical match vector divides by the
# actual number of rows, so the figures stay correct if the split size changes
# (the previous version divided by the hard-coded counts 143 and 34).
mean(train_result == train_data[, 1]) * 100
mean(test_result == test_data[, 1]) * 100
# Boosting sweep: refit C5.0 with an increasing number of boosting trials and
# print the test/training accuracy (in percent) for each setting.
# y is the trials value for the current iteration; jumpby is its step size.
y <- 0
jumpby <- 1
for (i in 1:10) {
  y <- y + jumpby
  # C50:: is spelled out because no library(C50) call is visible in this script.
  credit_model2 <- C50::C5.0(train_data[, -1], train_data[, 1], trials = y)
  test_result2 <- predict(credit_model2, test_data[, -1])
  train_result2 <- predict(credit_model2, train_data[, -1])
  # mean() divides by the actual row counts instead of the hard-coded 34/143.
  a <- mean(test_result2 == test_data[, 1]) * 100
  b <- mean(train_result2 == train_data[, 1]) * 100
  # Label each line with y, the trials value actually used, so the output
  # stays correct if jumpby is changed (identical to i while jumpby is 1).
  print(paste(y, '일때 테스트데이터', a, '/훈련데이터', b))
}
# Observation: accuracy is highest when trials is 4.