Kevin Donovan
April 23, 2021
Steps of process:
What data to use for what process?
Need sufficient sample to learn patterns
Train and test on same data \(\rightarrow\) bias
What do we mean by bias?
\[ \begin{align} &Y_i = \text{outcome for person }i \\ &X_i = \text{predictor value for person }i \\ &\text{Model: }\hat{Y}_i=\beta_0+\beta_1X_i \\ & \\ &\text{Loss function: }L(\beta_0, \beta_1,Y,X)=\sum_{i=1}^{n}[Y_i-(\beta_0+\beta_1X_i)]^2\\ &\{\hat{\beta}_0, \hat{\beta}_1\} = \{\beta_0, \beta_1\} \text{ which minimizes } L(\beta_0, \beta_1,Y,X) \end{align} \]
\[ CV_{(K)}=\sum_{k=1}^{K}\frac{n_k}{n}MSE_k \]
where \(MSE_k=\sum_{i\in C_k}(y_i-\hat{y_i})^2/n_k\) where \(y_i\) is outcome and \(\hat{y_i}\) is predicted outcome from training on \(C_k\) only
\[ CV_K=\sum_{k=1}^{K}\frac{n_k}{n}\text{Error}_k=\frac{1}{K}\sum_{k=1}^{K}\text{Error}_k \]
where \(\text{Error}_k=\sum_{i \in C_k}I(y_i \neq \hat{y_i})/n_k\)
\[ \hat{\text{SE}}(\text{CV}_K)=\sqrt{\sum_{k=1}^{K}(\text{Error}_k-\overline{\text{Error}_k})^2/(K-1)} \]
train
and createFolds
from caret
package
train
can be used to tune and train many types of modelspostResample
and confusionMatrix
to see metrics# Create folds
tt_indices <- createFolds(y=ibis_brain_data_complete$RiskGroup, k=10)
# Do train and test for each fold
for(i in 1:length(tt_indices)){
# Create train and test sets
train_data <- ibis_brain_data_complete[-tt_indices[[i]],]
test_data <- ibis_brain_data_complete[-tt_indices[[i]],]
knn_fit <-
train(RiskGroup~EACSF_V12+TBV_V12, data=train_data,
tuneLength=10, method="knn", preProcess = c("scale", "center")
trControl=trainControl(method="cv"))
}
# Create folds
tt_indices <- createFolds(y=ibis_brain_data_complete$RiskGroup, k=10)
# Do train and test for each fold
cv_results <- list()
for(i in 1:length(tt_indices)){
# Create train and test sets
train_data <- ibis_brain_data_complete[-tt_indices[[i]],]
test_data <- ibis_brain_data_complete[-tt_indices[[i]],]
knn_fit <-
train(RiskGroup~EACSF_V12+TBV_V12, data=train_data,
tuneLength=10, method="knn", preProcess = c("scale", "center"),
trControl=trainControl(method="cv"))
# Save confusion matrix for each fold
cv_results[[i]] <- confusionMatrix(data=predict(knn_fit, newdata=test_data),
reference=test_data$RiskGroup,
positive="HR-ASD")
}
# Print confusion matrix for a fold
cv_results[[1]]
## Confusion Matrix and Statistics
##
## Reference
## Prediction HR-ASD HR-Neg
## HR-ASD 4 3
## HR-Neg 20 116
##
## Accuracy : 0.8392
## 95% CI : (0.7685, 0.8952)
## No Information Rate : 0.8322
## P-Value [Acc > NIR] : 0.4652809
##
## Kappa : 0.1972
##
## Mcnemar's Test P-Value : 0.0008492
##
## Sensitivity : 0.16667
## Specificity : 0.97479
## Pos Pred Value : 0.57143
## Neg Pred Value : 0.85294
## Prevalence : 0.16783
## Detection Rate : 0.02797
## Detection Prevalence : 0.04895
## Balanced Accuracy : 0.57073
##
## 'Positive' Class : HR-ASD
##
By default, each obs gets same weight \(\rightarrow\) contribute to error the same amount
But, error in predicting one type may have higher cost then for other type
Can implement this using chosen weights
Ex. linear regression
\[ \hat{y} = \underset{\beta}{\mathrm{argmin}}\sum_{i=1}^n(y_i-\beta x_i)^2 \]
\[ \begin{align} & \hat{y}_w = \underset{\beta}{\mathrm{argmin}}\sum_{i=1}^nw_i(y_i-\beta x_i)^2 \\ & \text{where } \{w_i\}_{i=1}^n \text{ are subject-specific weights} \end{align} \]
\[ \begin{align} & w_i = 1/\hat{\pi}_k \text{ for } k=1,\ldots, K \\ & \text{where } \hat{\pi}_k=\sum_{i=1}^{n} I(i \text{ in group } K)/n \end{align} \]
caret
can implement in train
function (weights
argument)DmWR
package
smotefamily
and bimba
Inverse probability weighting
# Create training set weights per person
train_weights <- ifelse(ibis_brain_data_complete$RiskGroup=="HR-ASD",
1/prop.table(table(ibis_brain_data_complete$RiskGroup))["HR-ASD"],
1/prop.table(table(ibis_brain_data_complete$RiskGroup))["HR-Neg"])
knn_fit <-
train(RiskGroup~EACSF_V12+TBV_V12, data=ibis_brain_data_complete,
tuneLength=10, method="knn", preProcess = c("scale", "center")
trControl=trainControl(method="cv"))
SMOTE
##
## HR-ASD HR-Neg
## 27 132
# Run SMOTE
ibis_brain_data_complete_smote <- SMOTE(RiskGroup~EACSF_V12+TBV_V12,
data = ibis_brain_data_complete,
perc.under = 150)
# Now look at frequencies
table(ibis_brain_data_complete_smote$RiskGroup)
##
## HR-ASD HR-Neg
## 81 81
knnflex
)# Create folds
tt_indices <- createFolds(y=ibis_brain_data_complete$RiskGroup, k=10)
# Do train and test for each fold
cv_results <- list()
for(i in 1:length(tt_indices)){
# Create train and test sets
train_data <- ibis_brain_data_complete[-tt_indices[[i]],]
test_data <- ibis_brain_data_complete[-tt_indices[[i]],]
# Run logistic regression model
logreg_fit <-
glm(RiskGroup~EACSF_V12+TBV_V12, data=train_data,
family=binomial)
# Save confusion matrix for each fold, use 50% probability threshold
test_preds <- factor(ifelse(predict(logreg_fit, newdata=test_data, type="response")>0.5,
"HR-Neg", "HR-ASD"))
cv_results[[i]] <- confusionMatrix(data=predict(logreg_fit, newdata=test_data, type="response"),
reference=test_data$RiskGroup,
positive="HR-ASD")
}
cv_results[[1]]
pROC
and ggroc
# Run logistic regression model
logreg_fit <-
glm(RiskGroup~EACSF_V12+TBV_V12, data=ibis_brain_data_complete,
family=binomial)
# Get estimated probabilities of HR-ASD
est_probs <- 1-predict(logreg_fit, newdata=ibis_brain_data_complete, type="response")
# Compute ROC curve with AUC
roc_curve <- roc(response=ibis_brain_data_complete$RiskGroup,
predictor=est_probs,
levels=c("HR-Neg", "HR-ASD"))
# Plot ROC curve
ggroc(roc_curve)+
labs(title = paste0("AUC = ", round(auc(roc_curve),2)))+
theme_bw()
ggroc
to customize plot more# Return max Youden's index, with specificity and sensitivity
best_thres_data <-
data.frame(coords(roc_curve, x="best", best.method = c("youden", "closest.topleft")))
# View performance at "best threshold"
best_thres_data
## threshold specificity sensitivity
## 1 0.2284176 0.8257576 0.5185185
data_add_line <-
data.frame("sensitvity"=c(1-best_thres_data$specificity,
best_thres_data$sensitivity),
"specificity"=c(best_thres_data$specificity,
best_thres_data$specificity))
ggroc(roc_curve)+
geom_point(
data = best_thres_data,
mapping = aes(x=specificity, y=sensitivity), size=2, color="red")+
geom_point(mapping=aes(x=best_thres_data$specificity,
y=1-best_thres_data$specificity),
size=2, color="red")+
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
color="darkgrey", linetype="dashed")+
geom_text(data = best_thres_data,
mapping=aes(x=specificity, y=0.90,
label=paste0("Threshold = ", round(threshold,2),
"\nSensitivity = ", round(sensitivity,2),
"\nSpecificity = ", round(specificity,2),
"\nAUC = ", round(auc(roc_curve),2))))+
geom_line(data=data_add_line,
mapping=aes(x=specificity, y=sensitvity),
linetype="dashed")+
theme_classic()