library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)
library(flextable)
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
## 
##     compose
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Read in data and convert from wide to long
ibis_data <- read_csv("../Data/Cross-sec_full.csv", na=c(".", "", " ")) %>%
  mutate(SSM_ASD_v24_num = ifelse(SSM_ASD_v24=="YES_ASD", 1,
                                  ifelse(SSM_ASD_v24=="NO_ASD", 0, NA)))
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   SSM_ASD_v24 = col_character(),
##   `V24 demographics,Risk` = col_character(),
##   GROUP = col_character(),
##   Study_Site = col_character(),
##   Gender = col_character()
## )
## i Use `spec()` for the full column specifications.
names(ibis_data) <- gsub(" |,",".",names(ibis_data))

Example 1: Logistic regression with single predictor

# Fit model, 24 month MSEL composite standard score as predictor
logistic_fit <- glm(SSM_ASD_v24~V24.mullen.composite_standard_score, 
              family=binomial(),
             data=ibis_data)
## Error in eval(family$initialize): y values must be 0 <= y <= 1
## Return error? ASD variable is character, must be factor or numeric

logistic_fit <- glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score, 
              family=binomial(),
             data=ibis_data)
logistic_fit
## 
## Call:  glm(formula = factor(SSM_ASD_v24) ~ V24.mullen.composite_standard_score, 
##     family = binomial(), data = ibis_data)
## 
## Coefficients:
##                         (Intercept)  V24.mullen.composite_standard_score  
##                             6.67117                             -0.08888  
## 
## Degrees of Freedom: 572 Total (i.e. Null);  571 Residual
##   (14 observations deleted due to missingness)
## Null Deviance:       514.7 
## Residual Deviance: 368.9     AIC: 372.9
# Note, category order with factors automatically alphabetical.  First set to "reference", ok in this case (as NO comes before YES)

logistic_fit <- glm(SSM_ASD_v24_num~V24.mullen.composite_standard_score, 
              family=binomial(),
             data=ibis_data)
logistic_fit
## 
## Call:  glm(formula = SSM_ASD_v24_num ~ V24.mullen.composite_standard_score, 
##     family = binomial(), data = ibis_data)
## 
## Coefficients:
##                         (Intercept)  V24.mullen.composite_standard_score  
##                             6.67117                             -0.08888  
## 
## Degrees of Freedom: 572 Total (i.e. Null);  571 Residual
##   (14 observations deleted due to missingness)
## Null Deviance:       514.7 
## Residual Deviance: 368.9     AIC: 372.9
# Same results

# Raw output
summary(logistic_fit)
## 
## Call:
## glm(formula = SSM_ASD_v24_num ~ V24.mullen.composite_standard_score, 
##     family = binomial(), data = ibis_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1954  -0.5375  -0.3091  -0.1466   3.2387  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          6.671167   0.827694    8.06 7.63e-16 ***
## V24.mullen.composite_standard_score -0.088884   0.009268   -9.59  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 514.73  on 572  degrees of freedom
## Residual deviance: 368.89  on 571  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 372.89
## 
## Number of Fisher Scoring iterations: 6
# Format output
tidy(logistic_fit) %>%
  mutate(p.value=ifelse(p.value<0.005, "<0.005", 
                        as.character(round(p.value, 3))),
         term=fct_recode(factor(term),
                         "Intercept"="(Intercept)",
                         "24 Month MSEL Composite Standard Score"=
                         "V24.mullen.composite_standard_score")) %>%
  flextable() %>%
  set_header_labels("term"="Variable",
                    "estimate"="Estimate",
                    "std.error"="Std. Error",
                    "statistic"="Z Statistic",
                    "p.value"="P-value") %>%
  autofit()

Example 2: Logistic regression with many predictors

# Fit model, 24 month MSEL composite standard score, gender, and site as predictors
logistic_fit <- 
  glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score+Gender+Study_Site, 
              family=binomial(),
             data=ibis_data)

# Raw output
summary(logistic_fit)
## 
## Call:
## glm(formula = factor(SSM_ASD_v24) ~ V24.mullen.composite_standard_score + 
##     Gender + Study_Site, family = binomial(), data = ibis_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0168  -0.5062  -0.2806  -0.1158   3.0678  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          6.279258   0.918950   6.833 8.31e-12 ***
## V24.mullen.composite_standard_score -0.090224   0.009579  -9.419  < 2e-16 ***
## GenderMale                           0.794041   0.313635   2.532   0.0113 *  
## Study_SiteSEA                        0.320019   0.362354   0.883   0.3771    
## Study_SiteSTL                       -0.634443   0.404098  -1.570   0.1164    
## Study_SiteUNC                        0.035305   0.396514   0.089   0.9291    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 514.73  on 572  degrees of freedom
## Residual deviance: 355.49  on 567  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 367.49
## 
## Number of Fisher Scoring iterations: 6
# Format output
tidy(logistic_fit) %>%
  mutate(p.value=ifelse(p.value<0.005, "<0.005***", 
                        as.character(round(p.value, 3))),
         term=fct_recode(factor(term),
                         "Intercept"="(Intercept)",
                         "24 Month MSEL Composite Standard Score"=
                         "V24.mullen.composite_standard_score",
                         "Male Gender"="GenderMale",
                         "Seattle Site"="Study_SiteSEA",
                         "St. Louis Site"="Study_SiteSTL",
                         "Chapel Hill Site"="Study_SiteUNC")) %>%
  flextable() %>%
  set_header_labels("term"="Variable",
                    "estimate"="Estimate",
                    "std.error"="Std. Error",
                    "statistic"="Z Statistic",
                    "p.value"="P-value") %>%
  autofit()

Extracting estimated probabilities (in Example 1)

# Fit model, 24 month MSEL composite standard score as predictor
logistic_fit <- glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score, 
              family=binomial(),
             data=ibis_data)

# Output estimated probabilities of ASD from model, can use predict() fn
ibis_data$estimated_asd_probabilities <- 
  predict(logistic_fit, type="response")
## Error: Assigned data `predict(logistic_fit, type = "response")` must be compatible with existing data.
## x Existing data has 587 rows.
## x Assigned data has 573 rows.
## i Only vectors of size 1 are recycled.
# Provides error about different lengths.  Why?
# If one has a missing MSEL, can't compute probability from model
sum(is.na(ibis_data$V24.mullen.composite_standard_score)) # have 14 missing values
## [1] 14
# Thus, need to compute and add estimated probabilities to dataset WITHOUT missing MSEL
# Easy way, use model.frame()

ibis_data_glm <- model.frame(logistic_fit)
ibis_data_glm$estimated_asd_probabilities <- 
  predict(logistic_fit, type="response", newdata=ibis_data_glm)
# newdata argument specifies dataset you want to compute est. probabilities based on model

# Now, can plot these probabilities
ggplot(data = ibis_data_glm %>% 
         mutate(SSM_ASD_v24_num = ifelse(`factor(SSM_ASD_v24)`=="YES_ASD", 1,
                                  ifelse(`factor(SSM_ASD_v24)`=="NO_ASD", 0, NA))), 
       mapping = aes(x=V24.mullen.composite_standard_score, y=SSM_ASD_v24_num))+
  geom_point(aes(color=factor(SSM_ASD_v24_num)))+
  geom_line(mapping=aes(y=estimated_asd_probabilities))+
  scale_y_continuous(breaks=seq(0, 1.2, 0.2))+
  labs(x="24 Month MSEL Composite Score", y="ASD Diagnosis (1=YES)",
       color="ASD Diagnosis")+
  theme_classic()+
  theme(legend.position = "none")

Extracting estimated probabilities (in Example 2)

# Fit model, 24 month MSEL composite standard score, gender, and site as predictors
logistic_fit <- 
  glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score+Gender+Study_Site, 
              family=binomial(),
             data=ibis_data)

# Output estimated probabilities of ASD from model, can use predict() fn
ibis_data_glm <- model.frame(logistic_fit)
ibis_data_glm$estimated_asd_probabilities <- 
  predict(logistic_fit, type="response", newdata=ibis_data_glm)

# Look at estimated probabilities by diagnosis
ggplot(data=ibis_data_glm,
       mapping=aes(x=`factor(SSM_ASD_v24)`, y=estimated_asd_probabilities,
                   fill=`factor(SSM_ASD_v24)`))+
  geom_boxplot()+
  labs(y="Estimated probability\nof ASD diagnosis",
       x="True diagnosis")+
  theme_classic()+
  theme(legend.position = "none",
        text=element_text(size=20))

Predicting diagnosis (in Example 2)

Two options: - 1. Threshold estimated probabilities (ex. using 0.5 as cut-off) - 2. Examine performance across many thresholds using Receiver Operating Characteristic (ROC) Curve

# Fit model, 24 month MSEL composite standard score, gender, and site as predictors
logistic_fit <- 
  glm(factor(SSM_ASD_v24)~V24.mullen.composite_standard_score+Gender+Study_Site, 
              family=binomial(),
             data=ibis_data)

# Output estimated probabilities of ASD from model, can use predict() fn
ibis_data_glm <- model.frame(logistic_fit)
ibis_data_glm$estimated_asd_probabilities <- 
  predict(logistic_fit, type="response", newdata=ibis_data_glm)

# Add in predictions using thresholding
ibis_data_glm <-
  ibis_data_glm %>%
  mutate(pred_asd = 
           relevel(factor(ifelse(estimated_asd_probabilities>0.5, "YES_ASD", "NO_ASD")),
                   ref = "NO_ASD"))

# Compute confusion matrix
confusionMatrix(data = ibis_data_glm$pred_asd,
                reference = ibis_data_glm$`factor(SSM_ASD_v24)`,
                positive = "YES_ASD")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO_ASD YES_ASD
##    NO_ASD     463      57
##    YES_ASD     15      38
##                                           
##                Accuracy : 0.8743          
##                  95% CI : (0.8444, 0.9004)
##     No Information Rate : 0.8342          
##     P-Value [Acc > NIR] : 0.004607        
##                                           
##                   Kappa : 0.448           
##                                           
##  Mcnemar's Test P-Value : 1.352e-06       
##                                           
##             Sensitivity : 0.40000         
##             Specificity : 0.96862         
##          Pos Pred Value : 0.71698         
##          Neg Pred Value : 0.89038         
##              Prevalence : 0.16579         
##          Detection Rate : 0.06632         
##    Detection Prevalence : 0.09250         
##       Balanced Accuracy : 0.68431         
##                                           
##        'Positive' Class : YES_ASD         
## 
# Now, examine ROC curve
# Using pROC, add ROC curve using estimated probabilities of heart disease in test set
roc_obj <- 
  roc(response = ibis_data_glm$`factor(SSM_ASD_v24)`, 
    predictor = ibis_data_glm$estimated_asd_probabilities)
## Setting levels: control = NO_ASD, case = YES_ASD
## Setting direction: controls < cases
# Print obj
roc_obj
## 
## Call:
## roc.default(response = ibis_data_glm$`factor(SSM_ASD_v24)`, predictor = ibis_data_glm$estimated_asd_probabilities)
## 
## Data: ibis_data_glm$estimated_asd_probabilities in 478 controls (ibis_data_glm$`factor(SSM_ASD_v24)` NO_ASD) < 95 cases (ibis_data_glm$`factor(SSM_ASD_v24)` YES_ASD).
## Area under the curve: 0.858
# Return max Youden's index, with specificity and sensitivity
best_thres_data <- 
  data.frame(coords(roc_obj, x="best", best.method = c("youden")))

best_thres_data
##   threshold specificity sensitivity
## 1 0.1335091   0.7259414   0.8526316
# Plot curve, add in line at elbow point
data_add_line <-
  data.frame("sensitvity"=c(1-best_thres_data$specificity,
                            best_thres_data$sensitivity),
             "specificity"=c(best_thres_data$specificity,
                            best_thres_data$specificity))
  
ggroc(roc_obj)+
    geom_point(
    data = best_thres_data,
    mapping = aes(x=specificity, y=sensitivity), size=2, color="red")+
    geom_point(mapping=aes(x=best_thres_data$specificity, 
               y=1-best_thres_data$specificity), 
               size=2, color="red")+
    geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
                 color="darkgrey", linetype="dashed")+
    geom_text(data = best_thres_data,
              mapping=aes(x=specificity, y=1,
                          label=paste0("Threshold = ", round(threshold,2),
                                       "\nSensitivity = ", round(sensitivity,2),
                                       "\nSpecificity = ", round(specificity,2),
                                       "\nAUC = ", round(auc(roc_obj),2))))+
    geom_line(data=data_add_line,
              mapping=aes(x=specificity, y=sensitvity),
              linetype="dashed")+
  ylim(c(0, 1.075))+
  theme_classic()