###########################################
# Final Revised Versions                  #
# Statistical Analysis of Vulnerabilities #
# For BlackHat'2013 presentation          #
#                                         #
# Copyright Luca Allodi, Fabio Massacci   #
# Universita' degli Studi di Trento       #
#                                         #
# Revised 21/Aug/2013                     #
###########################################

######################################################################
# Load Data - set the home directory via the variable HOMEDIR        #
# You should make sure that there is an 'Outcome' subdirectory       #
######################################################################  

#   setwd('/Users/stewie/Documents/PhD/Vulnerabilities Repo/Vulnerability-Metrics/Papers/TISSEC-2013/Analysis_No_Unix/')
#   source('TISSEC-analysis-14.R')

# All input is read from, and all output written below, HOMEDIR
HOMEDIR <- './'

# General vulnerability table, restricted to the rows flagged as non-UNIX
gen_tab <- read.csv(paste0(HOMEDIR, 'general_table.csv'),
                    sep = ',', header = TRUE, stringsAsFactors = FALSE)
gen_tab <- gen_tab[which(gen_tab$UNIX == FALSE), , drop = FALSE]

# Normalise column types: numeric score and broken-down publication date
gen_tab$cvss <- as.numeric(as.character(gen_tab$cvss))
gen_tab$pub_date <- as.POSIXlt(gen_tab$pub_date, format = "%d/%m/%Y")

# Extract Symantec Vulnerabilities and the software names they cover
sym <- subset(gen_tab, gen_tab$symantec == 1)
softInSym <- unique(sym$software)

# POSIXlt years are counted from 1900, so this is the 2009 cut-off in that scale
minYearSym <- as.POSIXlt("01/01/2009", format = "%d/%m/%Y")$year

# Distinct (conf, integ, avail, year, software) combinations seen in Symantec
symCIA_YR_SW <- subset(sym,
                       select = c('conf', 'integ', 'avail', 'pub_date', 'software'))
symCIA_YR_SW$pub_date <- symCIA_YR_SW$pub_date$year
names(symCIA_YR_SW)[names(symCIA_YR_SW) == 'pub_date'] <- 'year'
symCIA_YR_SW <- unique(symCIA_YR_SW)

############################
#  Experimental Parameters #
############################
# Row of controlVector at which the run starts (the main loop reads rows
# startControl, startControl+1, ..., startControl+controlsToRun-1)
startControl <- 1
# Number of consecutive control configurations processed by the main loop
controlsToRun <- 800 
iterations <- 400  # Number of iterations for bootstrapping

####################################################
# Create and fill controls data structure          #
# censorYr (only among the same years in SYM),     #
# censorSw (only among the software in SYM),       #
# checkYr (exactly the year as in SYM),            #
# checkSW (exactly the software as in SYM)         #
# The latter may introduce significant bias        #
# as the sw field is often brittle                 #
# and the same software may have different field values  #
####################################################

# Full factorial grid of control configurations, one row per combination of
# CVSS threshold, source database and the four censor/check switches.
# expand.grid() varies its first argument fastest and its last slowest, so
# listing the factors from cvss (fastest) to checkSw (slowest) yields the
# row order cvss -> db -> censorYr -> censorSw -> checkYr -> checkSw, with
# columns in the order cvss, db, censorYr, censorSw, checkYr, checkSw.
controlVector <- expand.grid(
                        cvss=1:10,
                        db=c('nvd','edb','ekits','edbNOekits','nvdNOekitsNOedb'),
                        censorYr=c(FALSE,TRUE),
                        censorSw=c(FALSE,TRUE),
                        checkYr=c(FALSE,TRUE),
                        checkSw=c(FALSE,TRUE),
                        KEEP.OUT.ATTRS=FALSE,
                        stringsAsFactors=FALSE)

# Number of control configurations (10 * 5 * 2^4 = 800); derived from the grid
# so it cannot drift from the factor levels listed above.
controls <- nrow(controlVector)

####################################################
# Pre-allocate the control values data structure   #
####################################################
# Pre-allocate one slot per Symantec vulnerability. nrow() is required here:
# length() on a data frame returns its number of COLUMNS, which would size
# this structure by the width of the table instead of its number of entries.
# NOTE(review): controlValues is not referenced again in the visible part of
# this script - confirm whether it is still needed.
numControlValues <- nrow(sym)
controlValues <- data.frame(db=character(numControlValues),
                        conf=character(numControlValues),
                        integ=character(numControlValues),
                        avail=character(numControlValues),
                        year=integer(numControlValues),
                        software=character(numControlValues),
                        stringsAsFactors=FALSE)


# These functions do the assessment of each vulnerability 
# Number of rows of `tab` whose cve appears in `vulns`, whose cvss is at or
# above `threshold`, and that are flagged symantec == 1.
# A cve repeated in `vulns` is counted once per matching row of `tab`;
# rows where a comparison yields NA are not counted.
countHigh_Sym <- function(tab,vulns, threshold) {
  drawn <- tab$cve %in% vulns
  scored_high <- tab$cvss >= threshold
  in_symantec <- tab$symantec == 1
  sum(drawn & scored_high & in_symantec, na.rm = TRUE)
}
# Number of rows of `tab` whose cve appears in `vulns`, whose cvss is strictly
# below `threshold`, and that are flagged symantec == 1.
# Rows where a comparison yields NA are not counted.
countLow_Sym <- function(tab,vulns, threshold) {
  drawn <- tab$cve %in% vulns
  scored_low <- tab$cvss < threshold
  in_symantec <- tab$symantec == 1
  sum(drawn & scored_low & in_symantec, na.rm = TRUE)
}
# Number of rows of `tab` whose cve appears in `vulns`, whose cvss is at or
# above `threshold`, and that are flagged symantec == 0.
# Rows where a comparison yields NA are not counted.
countHigh_NotSym <- function(tab,vulns, threshold) {
  drawn <- tab$cve %in% vulns
  scored_high <- tab$cvss >= threshold
  outside_symantec <- tab$symantec == 0
  sum(drawn & scored_high & outside_symantec, na.rm = TRUE)
}
# Number of rows of `tab` whose cve appears in `vulns`, whose cvss is strictly
# below `threshold`, and that are flagged symantec == 0.
# Rows where a comparison yields NA are not counted.
countLow_NotSym <- function(tab,vulns, threshold) {
  drawn <- tab$cve %in% vulns
  scored_low <- tab$cvss < threshold
  outside_symantec <- tab$symantec == 0
  sum(drawn & scored_low & outside_symantec, na.rm = TRUE)
}


# Pre-create data structures for holding results
# and datastructure for holding sampling
# Summary table with `controls` pre-filled rows: the configuration of each
# control, then median / geometric-mean statistics, then the lower (inf*) and
# upper (sup*) quantile bounds of the bootstrap distribution.
results <- local({
    # Column names in output order (the order is part of the file format)
    column_order <- c(
        'db', 'censorYr', 'censorSw', 'checkYr', 'checkSw', 'cvss', 'iterations',
        'medianHigh_Sym', 'medianLow_Sym', 'medianHigh_NotSym', 'medianLow_NotSym',
        'medianSensi', 'medianSpeci', 'invalidSensiSpeci', 'geomSensi', 'geomSpeci',
        'medianDiffRisk', 'medianFactRisk', 'medianOddsRatio', 'invalidRisk',
        'geomFactRisk', 'medianPValue',
        'infHigh_Sym', 'infLow_Sym', 'infHigh_NotSym', 'infLow_NotSym',
        'infSensi', 'infSpeci', 'infDiffRisk', 'infOddsRatio', 'infFactRisk',
        'infPValue',
        'supHigh_Sym', 'supLow_Sym', 'supHigh_NotSym', 'supLow_NotSym',
        'supSensi', 'supSpeci', 'supDiffRisk', 'supFactRisk', 'supOddsRatio',
        'supPValue')

    # Every column is double unless listed below
    storage <- rep('double', length(column_order))
    names(storage) <- column_order
    storage['db'] <- 'character'
    storage[c('censorYr', 'censorSw', 'checkYr', 'checkSw')] <- 'logical'
    storage[c('cvss', 'iterations', 'invalidSensiSpeci', 'invalidRisk')] <- 'integer'

    # One zero/empty/FALSE-filled vector of length `controls` per column
    columns <- lapply(storage, function(mode) vector(mode, controls))
    do.call(data.frame, c(columns, list(stringsAsFactors = FALSE)))
})

# Per-iteration bootstrap outcomes: four integer cell counts of the 2x2 table
# followed by six double-valued statistics, `iterations` rows in total.
outcomes <- local({
    storage <- c(cvssHigh_Sym    = 'integer',
                 cvssLow_Sym     = 'integer',
                 cvssHigh_NotSym = 'integer',
                 cvssLow_NotSym  = 'integer',
                 sensi           = 'double',
                 speci           = 'double',
                 diffRisk        = 'double',
                 factRisk        = 'double',
                 oddsRatio       = 'double',
                 pValue          = 'double')
    columns <- lapply(storage, function(mode) vector(mode, iterations))
    do.call(data.frame, columns)
})

# Run a subset of the controls we are interested in for the paper
# For each requested control configuration (rows startControl ... of
# controlVector) this loop:
#   1. selects the population to draw from (gen_sample) according to `db`
#      and the censoring switches;
#   2. builds the table of distinct Symantec control values (sym_sample) and
#      the number of Symantec vulnerabilities matching each of them (freq);
#   3. runs `iterations` bootstrap draws, filling `outcomes` with the 2x2
#      counts (CVSS high/low x in/not in Symantec) and derived statistics;
#   4. writes the control values, the per-iteration distribution, a histogram
#      PDF and the cumulative `results` table below HOMEDIR/Outcome/.
# Relies on gen_tab, sym, symCIA_YR_SW, softInSym, minYearSym, controlVector,
# results, outcomes and the count*_ helpers defined earlier in this script.
for (k in seq_len(controlsToRun)) {
    # Configuration of the k-th requested control, read once
    control  <- controlVector[startControl+k-1,]
    db       <- as.character(control$db)
    censorYr <- as.logical(control$censorYr)
    censorSw <- as.logical(control$censorSw)
    checkYr  <- as.logical(control$checkYr)
    checkSw  <- as.logical(control$checkSw)
    cvss     <- as.numeric(control$cvss)

    # Population the bootstrap draws from. An unrecognised db now stops the
    # run instead of silently reusing gen_sample from the previous control.
    gen_sample <- switch(db,
        nvd = subset(gen_tab, gen_tab$nvd==1),
        edb = subset(gen_tab, gen_tab$edb==1),
        ekits = subset(gen_tab, gen_tab$ekits==1),
        edbNOekits = subset(gen_tab, gen_tab$edb==1 & (!gen_tab$ekits==1 | gen_tab$cvss<6)),
        nvdNOekitsNOedb = subset(gen_tab, gen_tab$nvd==1 & (!(gen_tab$edb==1 | gen_tab$ekits==1) | gen_tab$cvss<6)),
        stop("Unknown control database '", db, "'", call.=FALSE))

    sym_sample <- symCIA_YR_SW

    if (censorYr) {
        gen_sample <- subset(gen_sample,gen_sample$pub_date$year>=minYearSym)
        sym_sample <- subset(sym_sample,sym_sample$year>=minYearSym)
    }
    if (censorSw) {
        gen_sample <- subset(gen_sample,gen_sample$software %in% softInSym)
        # vacuously true sym_sample <- subset(sym_sample,sym_sample$software %in% softInSym)
    }

    # Controls that are not checked are blanked so that unique() collapses
    # the rows differing only in that field
    if (!checkYr) {
        sym_sample$year <- rep(NA,nrow(sym_sample))
    }
    if (!checkSw) {
        sym_sample$software <- rep(NA,nrow(sym_sample))
    }
    sym_sample <- unique(sym_sample)
    sym_sample[,'freq'] <- NA

    # For every distinct control value compute
    #  - freq:   how many Symantec vulnerabilities carry that value
    #  - strata: the rows of gen_sample carrying that value
    # The strata depend only on j, so they are built once here rather than
    # once per bootstrap iteration.
    # NOTE(review): freq is counted over all of sym, not over a year-censored
    # subset - confirm this is intended when censorYr is set.
    strata <- vector('list', nrow(sym_sample))
    for (j in seq_len(nrow(sym_sample))) {
        C <- as.character(sym_sample[j,]$conf)
        I <- as.character(sym_sample[j,]$integ)
        A <- as.character(sym_sample[j,]$avail)
        YR <- as.integer(sym_sample[j,]$year)
        SW <- as.character(sym_sample[j,]$software)
        sym_sample[j,]$freq <- nrow(subset(sym,sym$conf==C &
                                                sym$integ==I &
                                                sym$avail==A &
                                                (!checkYr | sym$pub_date$year==YR) &
                                                (!checkSw | sym$software==SW)))
        strata[[j]] <- subset(gen_sample,gen_sample$conf==C &
                                         gen_sample$integ==I &
                                         gen_sample$avail==A &
                                         (!checkYr | gen_sample$pub_date$year==YR) &
                                         (!checkSw | gen_sample$software==SW))
    }

    dataTitle <- paste0(control$db,'_',control$cvss,
                          if(control$censorYr) '_censorYr' else '',
                          if(control$censorSw) '_censorSw' else '',
                          if(control$checkYr)  '_checkYr'  else '',
                          if(control$checkSw)  '_checkSw'  else '')

    write.csv(sym_sample,file=paste0(HOMEDIR,'Outcome/controlValues_',dataTitle,'.csv'),row.names=FALSE)

    # Run the bootstrapping procedure
    for (i in seq_len(iterations)) {
        # Cells of the 2x2 table for this iteration
        highSym <- 0
        lowSym <- 0
        highNotSym <- 0
        lowNotSym <- 0

        for (j in seq_len(nrow(sym_sample))) {
            controlled_sample <- strata[[j]]
            FREQ <- as.integer(sym_sample$freq[j])

            # Strata with no candidate in gen_sample contribute nothing
            if(nrow(controlled_sample)>0) {
                # Draw (with replacement) as many vulnerabilities as Symantec
                # has for this control value
                vulns <- sample(controlled_sample$cve,FREQ,replace=TRUE)

                highSym    <- highSym    + countHigh_Sym(controlled_sample,vulns,cvss)
                lowSym     <- lowSym     + countLow_Sym(controlled_sample,vulns,cvss)
                highNotSym <- highNotSym + countHigh_NotSym(controlled_sample,vulns,cvss)
                lowNotSym  <- lowNotSym  + countLow_NotSym(controlled_sample,vulns,cvss)
            }
        }

        # Risk of being in Symantec among high- and low-score vulnerabilities.
        # Empty cells yield NaN/Inf here; they are tallied or dropped when the
        # statistics are summarised below.
        highRisk <- highSym/(highSym+highNotSym)
        lowRisk <- lowSym/(lowSym+lowNotSym)

        # Column-major 2x2 table: rows = CVSS high/low, columns = Sym/NotSym
        fisher_mat <- matrix(c(highSym,lowSym,highNotSym,lowNotSym),nrow=2)

        outcomes$cvssHigh_Sym[i]    <- highSym
        outcomes$cvssLow_Sym[i]     <- lowSym
        outcomes$cvssHigh_NotSym[i] <- highNotSym
        outcomes$cvssLow_NotSym[i]  <- lowNotSym
        outcomes$sensi[i]           <- highSym/(highSym + lowSym)
        outcomes$speci[i]           <- lowNotSym/(lowNotSym + highNotSym)
        outcomes$diffRisk[i]        <- highRisk - lowRisk
        outcomes$factRisk[i]        <- highRisk/lowRisk
        outcomes$oddsRatio[i]       <- (highSym/highNotSym)/(lowSym/lowNotSym)
        # Full component name: `$p` only worked through partial matching
        outcomes$pValue[i]          <- fisher.test(fisher_mat)$p.value
    }

    ##########################################################
    # Save Results of the iterations                         #
    ##########################################################
    write.csv(outcomes,file=paste0(HOMEDIR,'Outcome/distr_',dataTitle,
                          '_iteration_', as.character(iterations),'.csv'),row.names=FALSE)
    largestX <- round(max(outcomes[,1:4])*1.1)
    largestY <- iterations

    # Each graphic file has a different name.
    # The device is closed even if plotting fails, so a failed control does
    # not leave an open PDF device behind.
    pdf(file=paste0(HOMEDIR,'Outcome/',dataTitle,'.pdf'))
    tryCatch({
        hist(outcomes$cvssHigh_Sym,col='red',
                xlim=c(0,largestX),ylim=c(0,largestY),
                xlab=NULL,ylab=NULL,main=NULL)
        hist(outcomes$cvssLow_Sym,col='yellow',
                xlim=c(0,largestX),ylim=c(0,largestY),
                xlab=NULL,ylab=NULL,main=NULL,add=TRUE)
        hist(outcomes$cvssHigh_NotSym,col='blue',
                xlim=c(0,largestX),ylim=c(0,largestY),
                xlab=NULL,ylab=NULL,main=NULL,add=TRUE)
        hist(outcomes$cvssLow_NotSym,col='green',
                xlim=c(0,largestX),ylim=c(0,largestY),
                xlab=NULL,ylab=NULL,main=NULL,add=TRUE)
        title(main= dataTitle, xlab = '#Vulns with Characteristics', ylab = 'Frequencies')
    }, finally = dev.off())

    ###### DESCRIPTIVE STATISTICS for the k-th Control ############

    # NOTE(review): an earlier comment here said the medians are rounded, but
    # no rounding is applied below - confirm which is intended.

    results[k,]$db <- control$db
    results[k,]$censorYr <- control$censorYr
    results[k,]$censorSw <- control$censorSw
    results[k,]$checkYr <- control$checkYr
    results[k,]$checkSw <- control$checkSw
    results[k,]$iterations <- iterations
    results[k,]$cvss <- cvss

    # Medians drop NA/NaN iterations. The geometric means do not: a single
    # NA/NaN iteration makes them NA/NaN, and they always divide by
    # `iterations`. The invalid* columns count the NA/NaN entries.
    results[k,]$medianHigh_Sym     <- median(outcomes$cvssHigh_Sym)
    results[k,]$medianLow_Sym      <- median(outcomes$cvssLow_Sym)
    results[k,]$medianHigh_NotSym  <- median(outcomes$cvssHigh_NotSym)
    results[k,]$medianLow_NotSym   <- median(outcomes$cvssLow_NotSym)
    results[k,]$medianSensi        <- median(outcomes$sensi,na.rm=TRUE)
    results[k,]$medianSpeci        <- median(outcomes$speci,na.rm=TRUE)
    results[k,]$geomSensi          <- exp(sum(log(outcomes$sensi))/iterations)
    results[k,]$geomSpeci          <- exp(sum(log(outcomes$speci))/iterations)
    results[k,]$invalidSensiSpeci  <- length(c(outcomes$sensi,outcomes$speci))-length(na.omit(c(outcomes$sensi,outcomes$speci)))
    results[k,]$medianDiffRisk     <- median(outcomes$diffRisk,na.rm=TRUE)
    results[k,]$medianFactRisk     <- median(outcomes$factRisk,na.rm=TRUE)
    results[k,]$medianOddsRatio    <- median(outcomes$oddsRatio,na.rm=TRUE)
    results[k,]$invalidRisk        <- length(c(outcomes$diffRisk,outcomes$factRisk))-length(na.omit(c(outcomes$diffRisk,outcomes$factRisk)))
    results[k,]$geomFactRisk       <- exp(sum(log(outcomes$factRisk))/iterations)
    results[k,]$medianPValue       <- median(outcomes$pValue,na.rm=TRUE)

    # Lower bound: 2.5% quantile of each bootstrap distribution
    results[k,]$infHigh_Sym        <- quantile(outcomes$cvssHigh_Sym,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infLow_Sym         <- quantile(outcomes$cvssLow_Sym,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infHigh_NotSym     <- quantile(outcomes$cvssHigh_NotSym,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infLow_NotSym      <- quantile(outcomes$cvssLow_NotSym,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infSensi           <- quantile(outcomes$sensi,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infSpeci           <- quantile(outcomes$speci,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infDiffRisk        <- quantile(outcomes$diffRisk,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infOddsRatio       <- quantile(outcomes$oddsRatio,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infFactRisk        <- quantile(outcomes$factRisk,probs=0.025,names=FALSE,na.rm=TRUE)
    results[k,]$infPValue          <- quantile(outcomes$pValue,probs=0.025,names=FALSE,na.rm=TRUE)

    # Upper bound: 97.5% quantile of each bootstrap distribution
    results[k,]$supHigh_Sym        <- quantile(outcomes$cvssHigh_Sym,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supLow_Sym         <- quantile(outcomes$cvssLow_Sym,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supHigh_NotSym     <- quantile(outcomes$cvssHigh_NotSym,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supLow_NotSym      <- quantile(outcomes$cvssLow_NotSym,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supSensi           <- quantile(outcomes$sensi,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supSpeci           <- quantile(outcomes$speci,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supDiffRisk        <- quantile(outcomes$diffRisk,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supFactRisk        <- quantile(outcomes$factRisk,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supOddsRatio       <- quantile(outcomes$oddsRatio,probs=0.975,names=FALSE,na.rm=TRUE)
    results[k,]$supPValue          <- quantile(outcomes$pValue,probs=0.975,names=FALSE,na.rm=TRUE)

    # Write results in a different file: in case k crashes at least
    # the first k-1 files are saved
    write.csv(results[1:k,],file=paste0(HOMEDIR,'Outcome/results_Controls_',as.character(startControl),'_',as.character(startControl+k-1),'.csv'),
        row.names=FALSE)
}

# Use results[1:k,c(1,7:10,15)] to visualize the main control without censoring
# results[1:k,c(1,28:32,34)] to visualize the upper bound
