Verbal Autopsy Analysis Using openVA

Zehang Richard Li

11 January, 2017

Plans for today

What we hope to achieve

Things we want people to take away

Things helpful for us

R

Why R?

Get started

How R packages work

As a package author

Structure of openVA

OpenVA consist of

Advantages

Implementation of openVA

What algorithms are implemented

How implementation differ

Summary of current implementations

Get started with openVA

Install package

install.packages("openVA")
library(openVA)

Making sure packages are up to date

packageVersion("openVA")  # should be at least 1.0.3
packageVersion("InterVA4") # should be at least 1.7.3
packageVersion("Tariff") # should beat least  1.0.1
packageVersion("nbc4va")  # should beat least  1.0
packageVersion("InSilicoVA")  # should be at least 1.1.3

Again, R codes used for this presentation can be obtained at this link

Data preparation

Processing raw VA records

‘InterVA’ format

data(RandomVA1)
dim(RandomVA1)
## [1] 1000  246
head(RandomVA1[, 1:10])
##   ID elder midage adult child under5 infant neonate male female
## 1 d1     Y                                             Y       
## 2 d2     Y                                                    Y
## 3 d3            Y                                      Y       
## 4 d4                  Y                                       Y
## 5 d5                  Y                                Y       
## 6 d6                  Y                                       Y

‘PHMRC’ format

Data preparation: PHMRC

Read data directly into R

PHMRC_first1000 <- read.csv(getPHMRC_url("adult"), nrows = 1000)

Dichotomize data with

  1. the default cut-off values
  2. cut-off values derived as described in the paper
convert.default <- ConvertData.phmrc(PHMRC_first1000, phmrc.type = "adult", 
                                     cutoff = "default", cause = "va34")
## The first column is site, assign IDs to each row by default
## 1000 deaths in input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        21027
## Number of No         124732
## Number of Not known  31241
convert.adapt <- ConvertData.phmrc(PHMRC_first1000, phmrc.type = "adult", 
                                   cutoff = "adapt", cause = "va34")
## The first column is site, assign IDs to each row by default
## 1000 deaths in input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        21715
## Number of No         124044
## Number of Not known  31241

Note: re-derived adaptive cut-offs are also useful when new data or only subsets of PHMRC data are used

Data preparation: Non-standard

# make up a fake 2 by 3 dataset with 2 deaths and 3 symptoms
toyid <- c("d1", "d2")
toycause <- c("A", "B")
toyData <- matrix(c("Yes", "No", "Don't know", 
                "Yes", "Refused to answer", "No"), 
                byrow = TRUE, nrow = 2, ncol = 3)
toyData <- cbind(toyid, toycause, toyData)
colnames(toyData) <- c("ID", "Cause", "S1", "S2", "S3")
toyData
##      ID   Cause S1    S2                  S3          
## [1,] "d1" "A"   "Yes" "No"                "Don't know"
## [2,] "d2" "B"   "Yes" "Refused to answer" "No"
toyDataNew <- ConvertData(toyData, yesLabel = "Yes", noLabel = "No", 
                  missLabel = c("Don't know", "Refused to answer"))
toyDataNew
##   ID Cause S1 S2 S3
## 1 d1     A  Y     .
## 2 d2     B  Y  .

Case study I: Overview

What we will do

What to take away

Case study I: two datasets

data(RandomVA1)
dim(RandomVA1)
## [1] 1000  246
head(RandomVA1[, 1:10])
##   ID elder midage adult child under5 infant neonate male female
## 1 d1     Y                                             Y       
## 2 d2     Y                                                    Y
## 3 d3            Y                                      Y       
## 4 d4                  Y                                       Y
## 5 d5                  Y                                Y       
## 6 d6                  Y                                       Y
PHMRC_all <- read.csv(getPHMRC_url("adult"))
AP <- which(PHMRC_all$site == "AP")
test <- PHMRC_all[AP, ]
train <- PHMRC_all[-AP, ]
dim(test)
## [1] 1554  946
dim(train)
## [1] 6287  946

Case study I: Tariff

tariff2 <- codeVA(data = test, data.type = "PHMRC", model = "Tariff",
                  data.train = train, causes.train = "gs_text34", 
                  phmrc.type = "adult")
## The first column is site, assign IDs to each row by default
## 6287 deaths in input. Format: adult
## 1554 deaths in test input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        162507
## Number of No         978272
## Number of Not known  247078
## 
## Start re-sampling for significant Tariff cells
## Calculating ranks
tariff2_a <- codeVA(data = test, data.type = "PHMRC", model = "Tariff",
                  data.train = train, causes.train = "gs_text34", 
                  phmrc.type = "adult", cutoff = "adapt")
## The first column is site, assign IDs to each row by default
## 6287 deaths in input. Format: adult
## 1554 deaths in test input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        162972
## Number of No         977807
## Number of Not known  247078
## 
## Start re-sampling for significant Tariff cells
## Calculating ranks
csmf_defaut <- getCSMF(tariff2)
csmf_adapt <- getCSMF(tariff2_a)
plot(as.numeric(csmf_defaut), as.numeric(csmf_adapt), 
     xlab = "CSMF using default cut-off from Murray et al., 2011", 
     ylab = "CSMF using reproduced cut-off")
abline(a=0,b=1,col = "red")

- For the purpose of this presentation, using the default cut-offs from this point onwards

Case study I: InterVA

interva1_4_02 <- codeVA(data = RandomVA1, data.type = "WHO", 
                        model = "InterVA", version = "4.02")
## ..........10% completed
## ..........20% completed
## ..........30% completed
## ..........40% completed
## ..........50% completed
## ..........60% completed
## ..........70% completed
## ..........80% completed
## ..........90% completed
## ..........100% completed
interva1_4_03 <- codeVA(data = RandomVA1, data.type = "WHO", 
                        model = "InterVA", version = "4.03")
## ..........10% completed
## ..........20% completed
## ..........30% completed
## ..........40% completed
## ..........50% completed
## ..........60% completed
## ..........70% completed
## ..........80% completed
## ..........90% completed
## ..........100% completed
csmf_v4_02 <- getCSMF(interva1_4_02)
csmf_v4_03 <- getCSMF(interva1_4_03)
plot(as.numeric(csmf_v4_02), as.numeric(csmf_v4_03), 
     xlab = "CSMF InterVA4.02", 
     ylab = "CSMF InterVA4.03")
abline(a=0,b=1,col = "red")

- Remember the original InterVA uses the \(P_{s|c}\) that looks like

interva2 <- codeVA(data = test, data.type = "PHMRC", model = "InterVA", 
                   data.train = train, causes.train = "gs_text34", 
                   phmrc.type = "adult")
## The first column is site, assign IDs to each row by default
## 6287 deaths in input. Format: adult
## 1554 deaths in test input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        162507
## Number of No         978272
## Number of Not known  247078
## 
##  20 missing symptom-cause combinations in training data, imputed using average symptom prevalence
## .........10% completed
## ..........20% completed
## ..........30% completed
## .........40% completed
## ..........50% completed
## ..........60% completed
## .........70% completed
## ..........80% completed
## ..........90% completed
## .........100% completed
## .

Case study I: NBC

nbc2 <- codeVA(data = test, data.type = "PHMRC", model = "NBC", 
                   data.train = train, causes.train = "gs_text34", 
                   phmrc.type = "adult")
## The first column is site, assign IDs to each row by default
## 6287 deaths in input. Format: adult
## 1554 deaths in test input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        162507
## Number of No         978272
## Number of Not known  247078

Case study I: InSilicoVA

insilico1 <- codeVA(RandomVA1, data.type = "WHO", model = "InSilicoVA",
                    Nsim=10000, auto.length = FALSE)
## Performing data consistency check...
## Data check finished.
## Not all causes with CSMF > 0.02 are convergent.
##  Please check using csmf.diag() for more information.
insilico2 <- codeVA(data = test, data.type = "PHMRC", model = "InSilicoVA",
                    data.train = train, causes.train = "gs_text34", 
                    phmrc.type = "adult", 
                    jump.scale = 0.05, convert.type = "fixed",
                    Nsim=10000, auto.length = FALSE)
## The first column is site, assign IDs to each row by default
## 6287 deaths in input. Format: adult
## 1554 deaths in test input. Format: adult
## 177 binary symptoms generated
## 
## Number of Yes        162507
## Number of No         978272
## Number of Not known  247078
## 
##  20 missing symptom-cause combinations in training data, imputed using average symptom prevalence
## Not all causes with CSMF > 0.02 are convergent.
##  Please check using csmf.diag() for more information.

Case study I: Summary of CSMF

summary(tariff2)
## Tariff fitted on 1554 deaths
## 
## Top 5 CSMFs:
##                               CSMF
## Falls                   0.13191763
## Esophageal Cancer       0.07979408
## Malaria                 0.07850708
## Bite of Venomous Animal 0.06821107
## AIDS                    0.05598456
summary(interva2)
## InterVA-4 fitted on 1554 deaths
## CSMF calculated using reported causes by InterVA-4 only
## The remaining probabilities are assigned to 'Undetermined'
## 
## Top 5 CSMFs:
##  cause        likelihood
##  Undetermined 0.4916    
##  Cirrhosis    0.0876    
##  Stroke       0.0648    
##  Lung Cancer  0.0575    
##  COPD         0.0514
summary(nbc2)
## Naive Bayes Classifier (NBC) fitted on 6287 deaths
## 
##  Top 5 causes by predicted CSMF:
##                             Predicted.CSMF
## Stroke                          0.10810811
## Pneumonia                       0.10038610
## Acute Myocardial Infarction     0.09781210
## Maternal                        0.06628057
## TB                              0.05469755
summary(insilico2)
## InSilicoVA Call: 
## 1554 death processed
## 10000 iterations performed, with first 5000 iterations discarded
##  500 iterations saved after thinning
## Fitted with re-estimated conditional probability level table
## 
## Top 10 CSMFs:
##                               Mean Std.Error  Lower Median  Upper
## Acute Myocardial Infarction 0.0980    0.0094 0.0794 0.0981 0.1166
## Diarrhea/Dysentery          0.0978    0.0101 0.0779 0.0983 0.1165
## Stroke                      0.0913    0.0082 0.0758 0.0909 0.1088
## TB                          0.0850    0.0086 0.0682 0.0848 0.1014
## Maternal                    0.0602    0.0064 0.0486 0.0598 0.0734
## Renal Failure               0.0591    0.0077 0.0457 0.0589 0.0766
## Malaria                     0.0575    0.0072 0.0434 0.0578 0.0724
## Poisonings                  0.0460    0.0067 0.0339 0.0460 0.0585
## Road Traffic                0.0425    0.0061 0.0312 0.0418 0.0546
## Lung Cancer                 0.0330    0.0057 0.0232 0.0328 0.0457
csmf.tariff2 <- getCSMF(tariff2)
csmf.interva2 <- getCSMF(interva2)
csmf.nbc2 <- getCSMF(nbc2)
csmf.insilico2 <- getCSMF(insilico2)
cbind(Tariff = csmf.tariff2, InterVA = csmf.interva2[1:34], 
      NBC = csmf.nbc2, InSilicoVA = csmf.insilico2[, "Mean"])
##                                      Tariff      InterVA         NBC   InSilicoVA
## Cirrhosis                       0.028314028 0.0876395713 0.027027027 0.0053221800
## COPD                            0.049549550 0.0514381419 0.017374517 0.0247806144
## Acute Myocardial Infarction     0.023809524 0.0136978882 0.097812098 0.0979887346
## Fires                           0.019305019 0.0114111550 0.030244530 0.0327888217
## Renal Failure                   0.004504505 0.0437879661 0.034105534 0.0590883791
## AIDS                            0.055984556 0.0359746875 0.046332046 0.0284315781
## Lung Cancer                     0.007722008 0.0574764649 0.007722008 0.0330033500
## Maternal                        0.013513514 0.0233331426 0.066280566 0.0601668992
## Drowning                        0.019305019 0.0029847174 0.016087516 0.0179709313
## Other Non-communicable Diseases 0.025740026 0.0035726895 0.023166023 0.0287200547
## Falls                           0.131917632 0.0001650913 0.024453024 0.0081121436
## Stroke                          0.009009009 0.0648488618 0.108108108 0.0913093802
## Road Traffic                    0.018661519 0.0016762920 0.026383526 0.0425124326
## Diabetes                        0.013513514 0.0284158712 0.029601030 0.0326770982
## Other Cardiovascular Diseases   0.038610039 0.0033062419 0.012226512 0.0069693226
## Other Infectious Diseases       0.050193050 0.0087631326 0.039253539 0.0293151507
## Pneumonia                       0.036679537 0.0023638562 0.100386100 0.0247179570
## Suicide                         0.004504505 0.0034581476 0.018661519 0.0116495395
## Cervical Cancer                 0.019305019 0.0000000000 0.003217503 0.0017893181
## TB                              0.046332046 0.0073436208 0.054697555 0.0849897295
## Malaria                         0.078507079 0.0014871028 0.036036036 0.0575295239
## Asthma                          0.001287001 0.0000000000 0.010939511 0.0083476027
## Diarrhea/Dysentery              0.020592021 0.0001359887 0.047619048 0.0978265753
## Colorectal Cancer               0.016087516 0.0024292257 0.002574003 0.0006274046
## Homicide                        0.001287001 0.0030117147 0.023166023 0.0131287613
## Breast Cancer                   0.007722008 0.0019317716 0.001930502 0.0007880810
## Leukemia/Lymphomas              0.009652510 0.0047328126 0.001930502 0.0014840168
## Other Injuries                  0.012226512 0.0000000000 0.034749035 0.0237273406
## Poisonings                      0.008365508 0.0069091069 0.023809524 0.0459890081
## Epilepsy                        0.047619048 0.0259540874 0.012226512 0.0192280704
## Prostate Cancer                 0.014157014 0.0000000000 0.005148005 0.0031377158
## Esophageal Cancer               0.079794080 0.0002684373 0.004504505 0.0011989444
## Stomach Cancer                  0.018018018 0.0093219473 0.001930502 0.0011466973
## Bite of Venomous Animal         0.068211068 0.0005951462 0.010296010 0.0035366427
csmf.true <- table(test$gs_text34)
csmf.true <- csmf.true[names(csmf.tariff2)]
csmf.true <- as.numeric(csmf.true / sum(csmf.true))
cod_names <- names(csmf.tariff2)

csmf.all <- list(Tariff = csmf.tariff2, InterVA = csmf.interva2, 
                 NBC = csmf.nbc2, InSilico = csmf.insilico2[, "Mean"])

par(mfrow = c(2, 2))
for(i in 1:length(csmf.all)){
  plot(csmf.true, csmf.all[[i]][1:34], xlim = c(0, 0.15), ylim = c(0, 0.15),
      xlab = "True CSMF", ylab = "Fitted CSMF", 
      main = names(csmf.all)[i]) 
  abline(a=0, b=1, col="red")
}

- Calculate CSMF accuracy: \[CSMF_{acc} = 1 - \frac{\sum_i^C CSMF_i - CSMF_i^{(true)}}{2(1 - \min CSMF^{(true)})}\]

getCSMF_accuracy(csmf.tariff2, csmf.true)
## [1] 0.5370251
getCSMF_accuracy(csmf.interva2, csmf.true, "Undetermined")
## [1] 0.5965441
getCSMF_accuracy(csmf.nbc2, csmf.true)
## [1] 0.7681906
getCSMF_accuracy(csmf.insilico2[, "Mean"], csmf.true)
## [1] 0.7272282
csmf_accuarcy_insilico <- getCSMF_accuracy(insilico2, csmf.true)
hist(csmf_accuarcy_insilico)
boxplot(csmf_accuarcy_insilico)

- Comparing the results for ALPHA data is similar

csmf.interva1_4_02 <- getCSMF(interva1_4_02)
csmf.interva1_4_03 <- getCSMF(interva1_4_03)
csmf.insilico1 <- getCSMF(insilico1)

Case study I: Most likely COD assignment

cod.tariff2 <- getTopCOD(tariff2)
cod.interva2 <- getTopCOD(interva2)
cod.nbc2 <- getTopCOD(nbc2)
cod.insilico2 <- getTopCOD(insilico2)
# install.packages(c("lattice", "gplots"))
library(lattice)
library(gplots) 
cod_names <- c(levels(test[, "gs_text34"]), "Undetermined")
cod.true <- factor(test[, "gs_text34"], levels = cod_names)
cod.all <- list(Tariff = cod.tariff2, InterVA = cod.interva2,
                NBC = cod.nbc2, InSilico = cod.insilico2)
for(i in 1:length(cod.all)){
  cod.fit <- factor(cod.all[[i]][,"cause"], 
              levels = cod_names)
  tab <- table(cod.true, cod.fit)
  acc <-  round(sum(diag(tab)) / sum(tab), 4) * 100
  print(
  levelplot(tab, 
        scales=list(tck=0, x=list(rot=90)), 
        col.regions=colorpanel(11, "white", "grey10"), 
        at=seq(0, 100, len = 11), 
        main=paste0(names(cod.all)[i], " - Accuracy: ", acc, "%"), 
        xlab="True Causes", ylab="Predicted Causes")  
  )
}