Kevin Donovan
May 14, 2021
Packages used: caret, e1071, randomForest, and tidymodels.

Linear regression can be fit with caret's train function or the usual lm.
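A setup chunk along these lines is assumed by the code below (a sketch: caret, e1071, randomForest, and tidymodels are named above; the remaining packages are implied by the functions used):
# Assumed setup: packages used throughout
library(caret)         # train(), postResample(), varImp()
library(e1071)         # dependency for several caret methods
library(randomForest)  # random forests, method="rf"
library(tidymodels)    # broom::tidy(), tibble::rownames_to_column(), ggplot2
library(tidyr)         # drop_na()
library(splines)       # bs() and ns()
library(glmnet)        # penalized regression, method="glmnet"
library(rpart)         # CART, method="rpart"
library(flextable)     # formatted coefficient tables
library(ggpubr)        # ggarrange()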
# LM
lm_fit <- lm(V24_VDQ~EACSF_V12+TCV_V12+LeftAmygdala_V12+RightAmygdala_V12+
CCSplenium_V12+Uncinate_L_V12+Uncinate_R_V12,
data=ibis_brain_data)
tidy(lm_fit) %>% flextable()
| term              | estimate | std.error | statistic | p.value |
|-------------------|----------|-----------|-----------|---------|
| (Intercept)       |  1.0e+02 |   3.5e+01 |    2.9490 |  0.0036 |
| EACSF_V12         | -1.7e-05 |   7.4e-05 |   -0.2240 |  0.8230 |
| TCV_V12           |  1.4e-07 |   3.3e-05 |    0.0044 |  0.9965 |
| LeftAmygdala_V12  | -4.9e-02 |   3.5e-02 |   -1.4308 |  0.1541 |
| RightAmygdala_V12 |  2.1e-02 |   3.8e-02 |    0.5477 |  0.5846 |
| CCSplenium_V12    | -1.2e+01 |   5.0e+01 |   -0.2511 |  0.8020 |
| Uncinate_L_V12    |  3.4e+01 |   1.2e+02 |    0.2894 |  0.7726 |
| Uncinate_R_V12    |  4.3e+01 |   1.1e+02 |    0.3735 |  0.7092 |
## Get predicted values: predicting on the full data returns NA for
## rows that lm dropped under its default na.action
v24dq_predict <- predict(lm_fit, newdata=ibis_brain_data)
## Instead, predict on the data actually used in the fit
train_data <- model.frame(lm_fit)
v24dq_predict <- predict(lm_fit, newdata=train_data)
# equivalently: lm_fit$fitted.values
## Look at accuracy
## Easiest way: postResample
postResample(pred=v24dq_predict, obs=train_data$V24_VDQ)
##        RMSE    Rsquared         MAE
## 20.02582369  0.02862786 15.13710249
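For intuition, the same three numbers can be computed by hand (a sketch; caret computes Rsquared as the squared correlation between observed and predicted values):
# Hand-rolled versions of postResample's metrics
rmse <- sqrt(mean((train_data$V24_VDQ - v24dq_predict)^2))
mae  <- mean(abs(train_data$V24_VDQ - v24dq_predict))
r2   <- cor(train_data$V24_VDQ, v24dq_predict)^2  # squared correlation
c(RMSE=rmse, Rsquared=r2, MAE=mae)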
# train
lm_fit <- train(V24_VDQ~EACSF_V12+TCV_V12+LeftAmygdala_V12+RightAmygdala_V12+
CCSplenium_V12+Uncinate_L_V12+Uncinate_R_V12,
data=ibis_brain_data,
method="lm",
na.action=na.omit)
tidy(lm_fit$finalModel) %>% flextable()
| term              | estimate | std.error | statistic | p.value |
|-------------------|----------|-----------|-----------|---------|
| (Intercept)       |  1.0e+02 |   3.5e+01 |    2.9490 |  0.0036 |
| EACSF_V12         | -1.7e-05 |   7.4e-05 |   -0.2240 |  0.8230 |
| TCV_V12           |  1.4e-07 |   3.3e-05 |    0.0044 |  0.9965 |
| LeftAmygdala_V12  | -4.9e-02 |   3.5e-02 |   -1.4308 |  0.1541 |
| RightAmygdala_V12 |  2.1e-02 |   3.8e-02 |    0.5477 |  0.5846 |
| CCSplenium_V12    | -1.2e+01 |   5.0e+01 |   -0.2511 |  0.8020 |
| Uncinate_L_V12    |  3.4e+01 |   1.2e+02 |    0.2894 |  0.7726 |
| Uncinate_R_V12    |  4.3e+01 |   1.1e+02 |    0.3735 |  0.7092 |
## training data is stored in lm_fit$trainingData (outcome saved as .outcome)
## get predicted outcomes
v24dq_predict <- predict(lm_fit, newdata=drop_na(lm_fit$trainingData))
## get resampling-based accuracy estimates (caret defaults to the bootstrap)
lm_fit$results
##   intercept     RMSE   Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 21.89869 0.01490892 16.64219 2.106743 0.01642689 1.442991
## compare: apparent (resubstitution) accuracy from postResample above
##        RMSE    Rsquared         MAE
## 20.02582369  0.02862786 15.13710249
## can also get prediction intervals
pred_intervals <- predict(lm_fit$finalModel, newdata=drop_na(lm_fit$trainingData),
                          interval="prediction") %>% as.data.frame() %>%
  rownames_to_column(var="id")
ggplot(data=pred_intervals[1:100,], mapping = aes(x=id, y=fit))+
geom_point()+
geom_errorbar(mapping=aes(ymin=lwr, ymax=upr))+
geom_hline(yintercept=mean(pred_intervals$fit[1:100]), color="red")+
coord_flip()+
theme_bw()+
theme(axis.text.y = element_blank())
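Swapping in interval="confidence" gives (narrower) intervals for the mean response rather than for a new observation; a sketch, reusing the same plotting approach:
## confidence intervals for the fitted mean
conf_intervals <- predict(lm_fit$finalModel, newdata=drop_na(lm_fit$trainingData),
                          interval="confidence") %>% as.data.frame() %>%
  rownames_to_column(var="id")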
The gam function (GAM models) extends these ideas to GLMs. In the splines package, bs() is used for standard splines and ns() is used for natural splines.
# LM-cubic spline
lm_fit_cubic <- lm(V24_VDQ~bs(EACSF_V12, degree=3, df=6),
data=ibis_brain_data)
#tidy(lm_fit_cubic) %>% flextable()
## Knots
cubic_knots <- attr(bs(ibis_brain_data$EACSF_V12, degree=3, df=6), "knots")
# LM-linear spline
lm_fit_linear <- lm(V24_VDQ~bs(EACSF_V12, degree=1, df=6),
data=ibis_brain_data)
#tidy(lm_fit_linear) %>% flextable()
## Knots
linear_knots <- attr(bs(ibis_brain_data$EACSF_V12, degree=1, df=6), "knots")
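ns(), mentioned above but not demonstrated further, fits natural splines, which are constrained to be linear beyond the boundary knots; an analogous sketch:
# LM-natural spline
lm_fit_natural <- lm(V24_VDQ~ns(EACSF_V12, df=6),
                     data=ibis_brain_data)
natural_knots <- attr(ns(ibis_brain_data$EACSF_V12, df=6), "knots")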
## Get predicted values
ibis_brain_data_plot <- cbind(ibis_brain_data,
"pred_val_cubic"=predict(lm_fit_cubic, newdata=ibis_brain_data),
"pred_val_linear"=predict(lm_fit_linear, newdata=ibis_brain_data))
ggarrange(plotlist=list(
ggplot(data=ibis_brain_data_plot, mapping=aes(x=EACSF_V12, y=V24_VDQ))+
geom_point()+
geom_line(mapping=aes(y=pred_val_linear), color="red", size=1.5)+
geom_vline(xintercept = linear_knots, linetype="dashed")+
labs(x="12 month EACSF", y="24 month VDQ", title="Linear Spline")+
theme_bw(),
ggplot(data=ibis_brain_data_plot, mapping=aes(x=EACSF_V12, y=V24_VDQ))+
geom_point()+
geom_line(mapping=aes(y=pred_val_cubic), color="red", size=1.5)+
geom_vline(xintercept = cubic_knots, linetype="dashed")+
labs(x="12 month EACSF", y="24 month VDQ", title="Cubic Spline")+
theme_bw(),
ggplot(data=ibis_brain_data_plot, mapping=aes(x=EACSF_V12, y=V24_VDQ))+
geom_point()+
geom_smooth(se=FALSE, method="loess", color="red", size=1.5)+
labs(x="12 month EACSF", y="24 month VDQ", title="LOESS (Local Regress.)")+
theme_bw(),
ggplot(data=ibis_brain_data_plot, mapping=aes(x=EACSF_V12, y=V24_VDQ))+
geom_point()+
geom_smooth(se=FALSE, method="lm", color="red", size=1.5)+
labs(x="12 month EACSF", y="24 month VDQ", title="Line of Best Fit")+
theme_bw()),
nrow=1, ncol=4)
# GLM-cubic spline
# Make sure the outcome is a factor or coded 0/1, otherwise glm() will error!
glm_fit_cubic <- glm(RiskGroup~bs(EACSF_V12, degree=3, df=6), family=binomial,
data=ibis_brain_data)
#tidy(glm_fit_cubic) %>% flextable()
## Knots
cubic_knots <- attr(bs(ibis_brain_data$EACSF_V12, degree=3, df=6), "knots")
## Get predicted probabilities
ibis_brain_data_plot <- cbind(ibis_brain_data,
"pred_val_cubic"=predict(glm_fit_cubic, newdata=ibis_brain_data,
type="response"))
ggplot(data=ibis_brain_data_plot, mapping=aes(x=EACSF_V12, y=as.numeric(RiskGroup)-1))+
geom_point()+
geom_line(mapping=aes(y=pred_val_cubic), color="red", size=1.5)+
geom_vline(xintercept = cubic_knots, linetype="dashed")+
labs(x="12 month EACSF", y="Diagnosis", title="Cubic Spline")+
theme_bw()
\[ \begin{align} &Y = \beta_0+\beta_1X_1+\ldots+\beta_pX_p+\epsilon \\ &\hat{\beta} = \underset{\beta}{\text{argmin}} \sum_{i=1}^{n} [Y_i-(\beta_0+\beta_1X_{i1}+\ldots+\beta_pX_{ip})]^2 \end{align} \]
Penalized regression adds a penalty on the coefficient sizes (\(q=1\) gives the lasso, \(q=2\) ridge regression):
\[ \begin{align} &Y = \beta_0+\beta_1X_1+\ldots+\beta_pX_p+\epsilon \\ &\hat{\beta} = \underset{\beta}{\text{argmin}} \sum_{i=1}^{n} [Y_i-(\beta_0+\beta_1X_{i1}+\ldots+\beta_pX_{ip})]^2+\lambda\sum_{j=1}^{p}|\beta_j|^q \\ &\text{where } \lambda >0 \end{align} \]
These are fit with the glmnet package, or with train in combination with glmnet.
# Lasso via train (alpha=1 selects the lasso penalty in glmnet)
all_image_vars_v12 <- names(ibis_brain_data)[grepl("_V12", names(ibis_brain_data))]
penalized_fit <-
train(as.formula(paste0("V24_VDQ", "~", paste0(all_image_vars_v12, collapse="+"))),
data=ibis_brain_data %>% drop_na(),
method="glmnet",
tuneGrid=expand.grid(alpha=1, lambda=seq(0.1, 10, 0.1)),
trControl=trainControl(method="cv"))
# Look at results for each lambda
ggplot(data=penalized_fit$results, mapping=aes(x=lambda, y=RMSE))+
geom_point()+
geom_vline(xintercept = penalized_fit$bestTune$lambda)+
theme_bw()
## Get coefficients
lasso_betas <- coef(penalized_fit$finalModel, s=penalized_fit$bestTune$lambda) %>%
as.matrix() %>%
data.frame() %>%
rownames_to_column(var="predictor")
ggplot(data=lasso_betas, mapping=aes(x=predictor, y=X1))+
geom_point()+
labs(y="Beta estimate", x="Predictor")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
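The same train call with alpha=0 fits ridge regression (\(q=2\) in the penalty above); a sketch:
# Ridge: identical setup, alpha=0 selects the ridge penalty
ridge_fit <-
  train(as.formula(paste0("V24_VDQ", "~", paste0(all_image_vars_v12, collapse="+"))),
        data=ibis_brain_data %>% drop_na(),
        method="glmnet",
        tuneGrid=expand.grid(alpha=0, lambda=seq(0.1, 10, 0.1)),
        trControl=trainControl(method="cv"))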
\[ \sum_{m=1}^{|T|}\sum_{i:x_i \in R_m}(y_i-\hat{y}_{R_m})^2+\alpha|T| \]
Trees are implemented in the rpart and party packages in R; train works with rpart for the caret implementation. In train, cp denotes \(\alpha\) and is the tuning parameter.
# CART using train
cart_fit <- train(as.formula(paste0("V24_VDQ", "~", paste0(all_image_vars_v12, collapse="+"))),
data=ibis_brain_data,
na.action=na.omit,
method='rpart',
tuneGrid=expand.grid(cp=seq(0.01, 0.05, length.out = 20)))
ggplot(data=cart_fit$results, mapping=aes(x=cp, y=RMSE))+
geom_point()+
theme_bw()
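To inspect the tree selected at the best cp, the final rpart object can be plotted (a sketch, assuming the rpart.plot package, which is not loaded above):
# Plot the pruned tree stored in the final model
library(rpart.plot)
rpart.plot(cart_fit$finalModel)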
party can also be used with train via method="ctree" or method="ctree2" (not covered here); party has some additional functionality over rpart.

Random forests are implemented in the randomForest package in R; train interacts with this package as well.
## Regression results:
##   mtry     RMSE   Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1    1 20.40686 0.03801072 15.17725 2.347924 0.02858749 1.0526173
## 2    2 20.41269 0.03897567 15.30056 2.237690 0.05109106 0.9426112
## 3    3 20.40645 0.05045851 15.28892 2.361476 0.05314886 1.0486974
## 4    4 20.59696 0.04308042 15.48535 2.155838 0.03874349 0.8438994
## 5    5 20.43453 0.05275723 15.30286 2.263919 0.07982596 0.9336384
## Classification results:
##   mtry  Accuracy      Kappa AccuracySD   KappaSD
## 1    1 0.8392308 0.00000000 0.01308540 0.0000000
## 2    2 0.8392308 0.00000000 0.01308540 0.0000000
## 3    3 0.8442308 0.05074627 0.02075676 0.1134721
## 4    4 0.8391026 0.04155087 0.02318795 0.1202717
## 5    5 0.8391026 0.04155087 0.02318795 0.1202717
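The tables above are consistent with train calls along these lines (a sketch: the object name rf_fit and the tuning grid are assumptions, and swapping in a factor outcome such as RiskGroup gives the classification metrics):
# Random forest via train; mtry is the tuning parameter
rf_fit <- train(as.formula(paste0("V24_VDQ", "~", paste0(all_image_vars_v12, collapse="+"))),
                data=ibis_brain_data,
                na.action=na.omit,
                method="rf",
                tuneGrid=expand.grid(mtry=1:5))
rf_fit$results
Variable importance is a natural next step; caret exposes it via varImp:
plot(varImp(rf_fit))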