필요한 Package를 불러오기
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.3에서 작성되었습니다
library(MASS)
## Warning: 패키지 'MASS'는 R 버전 4.2.3에서 작성되었습니다
library(corrplot)
## Warning: 패키지 'corrplot'는 R 버전 4.2.3에서 작성되었습니다
library(nnet)
## Warning: 패키지 'nnet'는 R 버전 4.2.3에서 작성되었습니다
library(olsrr)
## Warning: 패키지 'olsrr'는 R 버전 4.2.3에서 작성되었습니다
library(car)
## Warning: 패키지 'car'는 R 버전 4.2.3에서 작성되었습니다
## Warning: 패키지 'carData'는 R 버전 4.2.3에서 작성되었습니다
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.2.3에서 작성되었습니다
## Warning: 패키지 'ggplot2'는 R 버전 4.2.3에서 작성되었습니다
와인 정보 데이터를 읽어오기
file_path <- "C:/Users/admin/Desktop/wine.csv"
data = read.csv(file_path, h=T)
data = as.data.frame(data)
데이터 서머리
head(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free_sulfur_dioxide total_sulfur_dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
summary(data)
## fixed_acidity volatile_acidity citric_acid residual_sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free_sulfur_dioxide total_sulfur_dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# 종속변수의 분포
hist(data[,12], main = "Distribution of Dependent Variable", xlab = "Dependent Variable", ylab = "Frequency", col = "skyblue")
데이터에서 독립변수들 사이의 상관관계 확인(다중 공선성 점검)
cor_mat = cor(data[,-12])
corrplot(cor_mat, method = "number", type = "upper",
tl.cex = 0.8, tl.col = "black",
addCoef.col = "black", number.cex = 0.8)
종속변수 범주형 등급(1등급~9등급)을 데이터에 반영
#3등급(min) ~ 8등급(max)을 등급을 나타내는 factor로 바꿈. 등급이 높을 수록 좋은 것이라는 순서를 부여
data$quality <- factor(data$quality, ordered = TRUE)
# 모델링에 필요한 변수를 선택
selected_vars <- c("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality")
data_selected <- data[, selected_vars]
선형모델은 와인의 품질이 이산점수로 나타나 있어 순서형 로지스틱
회귀분석을 실시
모델의 정확성 측정을 위해 학습 데이터와 테스트
데이터를 분리
index = 2:1200
train_set = data[index, ]
test_set = data[-index, ]
dim(train_set)
## [1] 1199 12
dim(test_set)
## [1] 400 12
train_set = as.data.frame(train_set)
test_set = as.data.frame(test_set)
quality_numeric <- as.numeric(test_set$quality)
#Ordinal regression model (순서형 로지스틱 모델) method에 logistic, probit, loglog, cauchit, cloglog 방법이 있다. 그 중 probit 모델을 선정함.
ordinal_model = polr(quality ~ ., method = "probit", data = train_set, Hess=TRUE)
ordinal_model = stepAIC(ordinal_model, direction="both", trace = FALSE)
summary(ordinal_model)
## Call:
## polr(formula = quality ~ volatile_acidity + citric_acid + chlorides +
## free_sulfur_dioxide + total_sulfur_dioxide + pH + sulphates +
## alcohol, data = train_set, Hess = TRUE, method = "probit")
##
## Coefficients:
## Value Std. Error t value
## volatile_acidity -1.907193 0.236181 -8.075
## citric_acid -0.364365 0.243650 -1.495
## chlorides -3.157698 0.786822 -4.013
## free_sulfur_dioxide 0.007553 0.004527 1.668
## total_sulfur_dioxide -0.007229 0.001404 -5.149
## pH -0.876357 0.267399 -3.277
## sulphates 1.265030 0.214074 5.909
## alcohol 0.515123 0.035483 14.518
##
## Intercepts:
## Value Std. Error t value
## 3|4 -1.4718 0.9578 -1.5366
## 4|5 -0.5977 0.9416 -0.6348
## 5|6 1.5215 0.9360 1.6256
## 6|7 3.0838 0.9408 3.2778
## 7|8 4.7650 0.9534 4.9976
##
## Residual Deviance: 2291.159
## AIC: 2317.159
Anova(ordinal_model, type = "II")
## Analysis of Deviance Table (Type II tests)
##
## Response: quality
## LR Chisq Df Pr(>Chisq)
## volatile_acidity 49.326 1 2.168e-12 ***
## citric_acid -8.651 1 1.0000
## chlorides 2.589 1 0.1076
## free_sulfur_dioxide -6.938 1 1.0000
## total_sulfur_dioxide 20.081 1 7.422e-06 ***
## pH -2.926 1 1.0000
## sulphates 27.980 1 1.225e-07 ***
## alcohol 202.595 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ordinal_model_revised = polr(quality ~ volatile_acidity + total_sulfur_dioxide + sulphates + alcohol, method = "probit", data = train_set, Hess=TRUE)
result2 = predict(ordinal_model, test_set)
result3 = predict(ordinal_model_revised, test_set)
# 예측된 클래스와 실제 클래스를 비교하여 정확도를 계산하고 출력
print(paste("Ordinal Model Accuracy:", mean(as.numeric(result2) == quality_numeric)))
## [1] "Ordinal Model Accuracy: 0.61"
print(paste("Revised Ordinal Model Accuracy:", mean(as.numeric(result3) == quality_numeric)))
## [1] "Revised Ordinal Model Accuracy: 0.61"
summary(ordinal_model_revised)
## Call:
## polr(formula = quality ~ volatile_acidity + total_sulfur_dioxide +
## sulphates + alcohol, data = train_set, Hess = TRUE, method = "probit")
##
## Coefficients:
## Value Std. Error t value
## volatile_acidity -1.975186 0.198315 -9.960
## total_sulfur_dioxide -0.005427 0.001003 -5.412
## sulphates 0.964964 0.191601 5.036
## alcohol 0.514164 0.033313 15.434
##
## Intercepts:
## Value Std. Error t value
## 3|4 1.5789 0.4288 3.6820
## 4|5 2.4379 0.4021 6.0625
## 5|6 4.5282 0.4006 11.3022
## 6|7 6.0654 0.4181 14.5059
## 7|8 7.7308 0.4504 17.1638
##
## Residual Deviance: 2318.809
## AIC: 2336.809
Anova(ordinal_model_revised, type = "II")
## Analysis of Deviance Table (Type II tests)
##
## Response: quality
## LR Chisq Df Pr(>Chisq)
## volatile_acidity 74.697 1 < 2.2e-16 ***
## total_sulfur_dioxide 19.411 1 1.054e-05 ***
## sulphates 15.059 1 0.0001042 ***
## alcohol 230.217 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(vif(ordinal_model))
## volatile_acidity citric_acid chlorides
## 1.473020 2.080466 1.316724
## free_sulfur_dioxide total_sulfur_dioxide pH
## 1.980707 1.988613 1.642988
## sulphates alcohol
## 1.301695 1.143917
print(vif(ordinal_model_revised))
## volatile_acidity total_sulfur_dioxide sulphates
## 1.045860 1.020877 1.048296
## alcohol
## 1.017230
혼동행렬(Confusion matrix)와 AUC 확인하기
# 혼동행렬을 통한 정확도 확인
create_confusion_matrix <- function(actual, predicted) {
all_classes <- sort(unique(c(actual, predicted)))
tab <- table(factor(actual, levels = all_classes), factor(predicted, levels = all_classes))
tab <- as.matrix(tab)
colnames(tab) <- paste("Predicted", colnames(tab))
rownames(tab) <- paste("Actual", rownames(tab))
return(tab)
}
# Confusion Matrix 출력
conf_matrix <- create_confusion_matrix(quality_numeric, as.numeric(result3))
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
##
## Predicted 1 Predicted 2 Predicted 3 Predicted 4 Predicted 5
## Actual 1 0 0 5 0 0
## Actual 2 0 0 11 7 0
## Actual 3 0 0 123 46 0
## Actual 4 0 0 50 115 9
## Actual 5 0 0 3 20 6
## Actual 6 0 0 0 4 1
##
## Predicted 6
## Actual 1 0
## Actual 2 0
## Actual 3 0
## Actual 4 0
## Actual 5 0
## Actual 6 0
# Accuracy 계산
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.61"
# 다중 클래스 AUC
library(pROC)
## Warning: 패키지 'pROC'는 R 버전 4.2.3에서 작성되었습니다
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_curve <- multiclass.roc(as.factor(quality_numeric), as.numeric(result3))
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
auc_values <- auc(roc_curve)
print(paste("Multiclass AUC:", auc_values))
## [1] "Multiclass AUC: 0.747697184586876"
# 클래스 3 vs 나머지 클래스
class_combined_3 <- ifelse(quality_numeric == 3, "Class 3", "Other")
roc_curve_3 <- roc(class_combined_3, as.numeric(result3), levels = c("Class 3", "Other"))
## Setting direction: controls < cases
auc_value_3 <- auc(roc_curve_3)
# 클래스 4 vs 나머지 클래스
class_combined_4 <- ifelse(quality_numeric == 4, "Class 4", "Other")
roc_curve_4 <- roc(class_combined_4, as.numeric(result3), levels = c("Class 4", "Other"))
## Setting direction: controls > cases
auc_value_4 <- auc(roc_curve_4)
print(paste("AUC for Class 3 vs. Other:", auc_value_3))
## [1] "AUC for Class 3 vs. Other: 0.72398114705807"
print(paste("AUC for Class 4 vs. Other:", auc_value_4))
## [1] "AUC for Class 4 vs. Other: 0.66905706438816"
# ROC 곡선 그리기
plot(roc_curve_3, main = "ROC Curve for Class 3 vs. Other", col = "blue")
plot(roc_curve_4, add = TRUE, col = "red")
legend("bottomright", legend = c("Class 3 vs. Other", "Class 4 vs. Other"), col = c("blue", "red"), lty = 1)