library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
library(flextable)
##
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
##
## compose
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Read in data and convert from wide to long
ibis_data <- read_csv("../Data/Cross-sec_full.csv", na=c(".", "", " ")) %>%
mutate(SSM_ASD_v24_num = ifelse(SSM_ASD_v24=="YES_ASD", 1,
ifelse(SSM_ASD_v24=="NO_ASD", 0, NA)))
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## SSM_ASD_v24 = col_character(),
## `V24 demographics,Risk` = col_character(),
## GROUP = col_character(),
## Study_Site = col_character(),
## Gender = col_character()
## )
## i Use `spec()` for the full column specifications.
names(ibis_data) <- gsub(" |,",".",names(ibis_data))
# Fit model, 24 month MSEL composite standard score as predictor
logistic_fit <- glm(SSM_ASD_v24~V24.mullen.composite_standard_score,
family=binomial(),
data=ibis_data)
## Error in eval(family$initialize): y values must be 0 <= y <= 1
## Return error? ASD variable is character, must be factor or numeric
logistic_fit <- glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score,
family=binomial(),
data=ibis_data)
logistic_fit
##
## Call: glm(formula = factor(SSM_ASD_v24) ~ V24.mullen.composite_standard_score,
## family = binomial(), data = ibis_data)
##
## Coefficients:
## (Intercept) V24.mullen.composite_standard_score
## 6.67117 -0.08888
##
## Degrees of Freedom: 572 Total (i.e. Null); 571 Residual
## (14 observations deleted due to missingness)
## Null Deviance: 514.7
## Residual Deviance: 368.9 AIC: 372.9
# Note, category order with factors automatically alphabetical. First set to "reference", ok in this case (as NO comes before YES)
logistic_fit <- glm(SSM_ASD_v24_num~V24.mullen.composite_standard_score,
family=binomial(),
data=ibis_data)
logistic_fit
##
## Call: glm(formula = SSM_ASD_v24_num ~ V24.mullen.composite_standard_score,
## family = binomial(), data = ibis_data)
##
## Coefficients:
## (Intercept) V24.mullen.composite_standard_score
## 6.67117 -0.08888
##
## Degrees of Freedom: 572 Total (i.e. Null); 571 Residual
## (14 observations deleted due to missingness)
## Null Deviance: 514.7
## Residual Deviance: 368.9 AIC: 372.9
# Same results
# Raw output
summary(logistic_fit)
##
## Call:
## glm(formula = SSM_ASD_v24_num ~ V24.mullen.composite_standard_score,
## family = binomial(), data = ibis_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1954 -0.5375 -0.3091 -0.1466 3.2387
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.671167 0.827694 8.06 7.63e-16 ***
## V24.mullen.composite_standard_score -0.088884 0.009268 -9.59 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 514.73 on 572 degrees of freedom
## Residual deviance: 368.89 on 571 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 372.89
##
## Number of Fisher Scoring iterations: 6
# Format output
tidy(logistic_fit) %>%
mutate(p.value=ifelse(p.value<0.005, "<0.005",
as.character(round(p.value, 3))),
term=fct_recode(factor(term),
"Intercept"="(Intercept)",
"24 Month MSEL Composite Standard Score"=
"V24.mullen.composite_standard_score")) %>%
flextable() %>%
set_header_labels("term"="Variable",
"estimate"="Estimate",
"std.error"="Std. Error",
"statistic"="Z Statistic",
"p.value"="P-value") %>%
autofit()
Variable | Estimate | Std. Error | Z Statistic | P-value |
Intercept | 6.671 | 0.8277 | 8.1 | <0.005 |
24 Month MSEL Composite Standard Score | -0.089 | 0.0093 | -9.6 | <0.005 |
# Fit model, 24 month MSEL composite standard score, gender, and site as predictors
logistic_fit <-
glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score+Gender+Study_Site,
family=binomial(),
data=ibis_data)
# Raw output
summary(logistic_fit)
##
## Call:
## glm(formula = factor(SSM_ASD_v24) ~ V24.mullen.composite_standard_score +
## Gender + Study_Site, family = binomial(), data = ibis_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0168 -0.5062 -0.2806 -0.1158 3.0678
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.279258 0.918950 6.833 8.31e-12 ***
## V24.mullen.composite_standard_score -0.090224 0.009579 -9.419 < 2e-16 ***
## GenderMale 0.794041 0.313635 2.532 0.0113 *
## Study_SiteSEA 0.320019 0.362354 0.883 0.3771
## Study_SiteSTL -0.634443 0.404098 -1.570 0.1164
## Study_SiteUNC 0.035305 0.396514 0.089 0.9291
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 514.73 on 572 degrees of freedom
## Residual deviance: 355.49 on 567 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 367.49
##
## Number of Fisher Scoring iterations: 6
# Format output
tidy(logistic_fit) %>%
mutate(p.value=ifelse(p.value<0.005, "<0.005***",
as.character(round(p.value, 3))),
term=fct_recode(factor(term),
"Intercept"="(Intercept)",
"24 Month MSEL Composite Standard Score"=
"V24.mullen.composite_standard_score",
"Male Gender"="GenderMale",
"Seattle Site"="Study_SiteSEA",
"St. Louis Site"="Study_SiteSTL",
"Chapel Hill Site"="Study_SiteUNC")) %>%
flextable() %>%
set_header_labels("term"="Variable",
"estimate"="Estimate",
"std.error"="Std. Error",
"statistic"="Z Statistic",
"p.value"="P-value") %>%
autofit()
Variable | Estimate | Std. Error | Z Statistic | P-value |
Intercept | 6.279 | 0.9190 | 6.833 | <0.005*** |
24 Month MSEL Composite Standard Score | -0.090 | 0.0096 | -9.419 | <0.005*** |
Male Gender | 0.794 | 0.3136 | 2.532 | 0.011 |
Seattle Site | 0.320 | 0.3624 | 0.883 | 0.377 |
St. Louis Site | -0.634 | 0.4041 | -1.570 | 0.116 |
Chapel Hill Site | 0.035 | 0.3965 | 0.089 | 0.929 |
# Fit model, 24 month MSEL composite standard score as predictor
logistic_fit <- glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score,
family=binomial(),
data=ibis_data)
# Output estimated probabilities of ASD from model, can use predict() fn
ibis_data$estimated_asd_probabilities <-
predict(logistic_fit, type="response")
## Error: Assigned data `predict(logistic_fit, type = "response")` must be compatible with existing data.
## x Existing data has 587 rows.
## x Assigned data has 573 rows.
## i Only vectors of size 1 are recycled.
# Provides error about different lengths. Why?
# If one has a missing MSEL, can't compute probability from model
sum(is.na(ibis_data$V24.mullen.composite_standard_score)) # have 14 missing values
## [1] 14
# Thus, need to compute and add estimated probabilities to dataset WITHOUT missing MSEL
# Easy way, use model.frame()
ibis_data_glm <- model.frame(logistic_fit)
ibis_data_glm$estimated_asd_probabilities <-
predict(logistic_fit, type="response", newdata=ibis_data_glm)
# newdata argument specifies dataset you want to compute est. probabilities based on model
# Now, can plot these probabilities
ggplot(data = ibis_data_glm %>%
mutate(SSM_ASD_v24_num = ifelse(`factor(SSM_ASD_v24)`=="YES_ASD", 1,
ifelse(`factor(SSM_ASD_v24)`=="NO_ASD", 0, NA))),
mapping = aes(x=V24.mullen.composite_standard_score, y=SSM_ASD_v24_num))+
geom_point(aes(color=factor(SSM_ASD_v24_num)))+
geom_line(mapping=aes(y=estimated_asd_probabilities))+
scale_y_continuous(breaks=seq(0, 1.2, 0.2))+
labs(x="24 Month MSEL Composite Score", y="ASD Diagnosis (1=YES)",
color="ASD Diagnosis")+
theme_classic()+
theme(legend.position = "none")
# Fit model, 24 month MSEL composite standard score, gender, and site as predictors
logistic_fit <-
glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score+Gender+Study_Site,
family=binomial(),
data=ibis_data)
# Output estimated probabilities of ASD from model, can use predict() fn
ibis_data_glm <- model.frame(logistic_fit)
ibis_data_glm$estimated_asd_probabilities <-
predict(logistic_fit, type="response", newdata=ibis_data_glm)
# Look at estimated probabilities by diagnosis
ggplot(data=ibis_data_glm,
mapping=aes(x=`factor(SSM_ASD_v24)`, y=estimated_asd_probabilities,
fill=`factor(SSM_ASD_v24)`))+
geom_boxplot()+
labs(y="Estimated probability\nof ASD diagnosis",
x="True diagnosis")+
theme_classic()+
theme(legend.position = "none",
text=element_text(size=20))
Two options: - 1. Threshold estimated probabilities (ex. using 0.5 as cut-off) - 2. Examine performance across many thresholds using Receiver Operating Characteristic (ROC) Curve
# Fit model, 24 month MSEL composite standard score, gender, and site as predictors
logistic_fit <-
glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score+Gender+Study_Site,
family=binomial(),
data=ibis_data)
# Output estimated probabilities of ASD from model, can use predict() fn
ibis_data_glm <- model.frame(logistic_fit)
ibis_data_glm$estimated_asd_probabilities <-
predict(logistic_fit, type="response", newdata=ibis_data_glm)
# Add in predictions using thresholding
ibis_data_glm <-
ibis_data_glm %>%
mutate(pred_asd =
relevel(factor(ifelse(estimated_asd_probabilities>0.5, "YES_ASD", "NO_ASD")),
ref = "NO_ASD"))
# Compute confusion matrix
confusionMatrix(data = ibis_data_glm$pred_asd,
reference = ibis_data_glm$`factor(SSM_ASD_v24)`,
positive = "YES_ASD")
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO_ASD YES_ASD
## NO_ASD 463 57
## YES_ASD 15 38
##
## Accuracy : 0.8743
## 95% CI : (0.8444, 0.9004)
## No Information Rate : 0.8342
## P-Value [Acc > NIR] : 0.004607
##
## Kappa : 0.448
##
## Mcnemar's Test P-Value : 1.352e-06
##
## Sensitivity : 0.40000
## Specificity : 0.96862
## Pos Pred Value : 0.71698
## Neg Pred Value : 0.89038
## Prevalence : 0.16579
## Detection Rate : 0.06632
## Detection Prevalence : 0.09250
## Balanced Accuracy : 0.68431
##
## 'Positive' Class : YES_ASD
##
# Now, examine ROC curve
# Using pROC, add ROC curve using estimated probabilities of heart disease in test set
roc_obj <-
roc(response = ibis_data_glm$`factor(SSM_ASD_v24)`,
predictor = ibis_data_glm$estimated_asd_probabilities)
## Setting levels: control = NO_ASD, case = YES_ASD
## Setting direction: controls < cases
# Print obj
roc_obj
##
## Call:
## roc.default(response = ibis_data_glm$`factor(SSM_ASD_v24)`, predictor = ibis_data_glm$estimated_asd_probabilities)
##
## Data: ibis_data_glm$estimated_asd_probabilities in 478 controls (ibis_data_glm$`factor(SSM_ASD_v24)` NO_ASD) < 95 cases (ibis_data_glm$`factor(SSM_ASD_v24)` YES_ASD).
## Area under the curve: 0.858
# Return max Youden's index, with specificity and sensitivity
best_thres_data <-
data.frame(coords(roc_obj, x="best", best.method = c("youden")))
best_thres_data
## threshold specificity sensitivity
## 1 0.1335091 0.7259414 0.8526316
# Plot curve, add in line at elbow point
data_add_line <-
data.frame("sensitvity"=c(1-best_thres_data$specificity,
best_thres_data$sensitivity),
"specificity"=c(best_thres_data$specificity,
best_thres_data$specificity))
ggroc(roc_obj)+
geom_point(
data = best_thres_data,
mapping = aes(x=specificity, y=sensitivity), size=2, color="red")+
geom_point(mapping=aes(x=best_thres_data$specificity,
y=1-best_thres_data$specificity),
size=2, color="red")+
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
color="darkgrey", linetype="dashed")+
geom_text(data = best_thres_data,
mapping=aes(x=specificity, y=1,
label=paste0("Threshold = ", round(threshold,2),
"\nSensitivity = ", round(sensitivity,2),
"\nSpecificity = ", round(specificity,2),
"\nAUC = ", round(auc(roc_obj),2))))+
geom_line(data=data_add_line,
mapping=aes(x=specificity, y=sensitvity),
linetype="dashed")+
ylim(c(0, 1.075))+
theme_classic()