Missing Value Imputation

Step 1

Load libraries.

libraries <- c('missForest')
lapply(libraries, require, character.only = TRUE)

## [[1]]
## [1] TRUE

Step 2

discharge.na = read.csv("D:/OneDrive - University of Vermont/Curriculum/19_ Personal webpage/TropicalFreshwaterEcology/_pages/Lectures/data/StreamEcology_MissingValue.csv")
head(discharge.na)

##   Day Rain  Discharge
## 1   1 2.03 0.00910000
## 2   2 2.03 0.00904000
## 3   3 2.03 0.00903000
## 4   4 2.03 0.00903000
## 5   5 2.03 0.01058932
## 6   6 7.49 0.01070414

summary(discharge.na)

##       Day           Rain          Discharge      
##  Min.   :  1   Min.   : 0.000   Min.   :0.00841  
##  1st Qu.: 92   1st Qu.: 0.340   1st Qu.:0.00962  
##  Median :183   Median : 2.790   Median :0.01233  
##  Mean   :183   Mean   : 5.578   Mean   :0.01561  
##  3rd Qu.:274   3rd Qu.: 8.740   3rd Qu.:0.01814  
##  Max.   :365   Max.   :47.500   Max.   :0.11147  
##                                 NA's   :110

Step 3

Examine data.

summary(discharge.na)

##       Day           Rain          Discharge      
##  Min.   :  1   Min.   : 0.000   Min.   :0.00841  
##  1st Qu.: 92   1st Qu.: 0.340   1st Qu.:0.00962  
##  Median :183   Median : 2.790   Median :0.01233  
##  Mean   :183   Mean   : 5.578   Mean   :0.01561  
##  3rd Qu.:274   3rd Qu.: 8.740   3rd Qu.:0.01814  
##  Max.   :365   Max.   :47.500   Max.   :0.11147  
##                                 NA's   :110

Step 4

Plot data.

plot(discharge.na$Discharge,type="l", lwd=2.0)

Step 5

Imputate missing data.

# ntree number of trees to grow in each forest.
# maxiter maximum number of iterations to be performed given the stopping criterion is
# not met beforehand.
discharge.imp <- missForest(discharge.na, maxiter = 4, ntree = 100,
                           variablewise = FALSE, decreasing = FALSE, verbose = F, replace = TRUE,
                           classwt = NULL, cutoff = NULL, strata = NULL,
                           sampsize = NULL, nodesize = NULL, maxnodes = NULL,
                           xtrue = NA, parallelize = "no")

discharge.missf <- discharge.imp$ximp

Step 6

Examine data imputation.

summary(discharge.missf)

##       Day           Rain          Discharge      
##  Min.   :  1   Min.   : 0.000   Min.   :0.00841  
##  1st Qu.: 92   1st Qu.: 0.340   1st Qu.:0.00994  
##  Median :183   Median : 2.790   Median :0.01163  
##  Mean   :183   Mean   : 5.578   Mean   :0.01454  
##  3rd Qu.:274   3rd Qu.: 8.740   3rd Qu.:0.01622  
##  Max.   :365   Max.   :47.500   Max.   :0.11147

Step 7

Plot data imputation.

plot(discharge.missf$Discharge,type="l", lwd=2.0)

Step 8

Comparison of imputed data.

par(mfrow=c(1,3))
plot(discharge.na$Discharge,type="l",main="Original Discharge", lwd=2.0)
abline(h=0.0154, lty=2, lwd=4, col='green')
plot(discharge.missf$Discharge,type="l",main="Imputated Discharge", lwd=2.0)
abline(h=0.0154, lty=2, lwd=4, col='red')
plot(discharge.na$Rain,type="l",main="Rain", lwd=2.0)
abline(h=10.1, lty=2, lwd=4, col='blue')

Step 9

How big are the errors of the imputation?

# Out Of Bag (OOBerror)= imputation error estimate (normalized root mean squared error)
# The OOB observations can be used for example for estimating the prediction error of RF.

# The performance is assessed using the normalized root mean squared error (NRMSE)
# Good performance leads to a value close to 0 and bad performance to a value around 1.
NRMSE <- discharge.imp$OOBerror
NRMSE

##        NRMSE 
## 6.541362e-05

# How big are the errors of the imputation?
# Not too bad, around >1%. NRMSE is The normalized root mean squared error, it is defined as:
# The Normalized Root Mean Square Error (NRMSE) the RMSE facilitates the comparison between models with different scales.

Hmisc

Harrell Miscellaneous

Step 1

# rm(list=ls(all=TRUE)) #give R a blank slate

Step 2

library("Hmisc")

Step 3

discharge.na = read.csv("D:/OneDrive - University of Vermont/Curriculum/19_ Personal webpage/TropicalFreshwaterEcology/_pages/Lectures/data/StreamEcology_MissingValue.csv")
head(discharge.na)

##   Day Rain  Discharge
## 1   1 2.03 0.00910000
## 2   2 2.03 0.00904000
## 3   3 2.03 0.00903000
## 4   4 2.03 0.00903000
## 5   5 2.03 0.01058932
## 6   6 7.49 0.01070414

Step 4

summary(discharge.na)

##       Day           Rain          Discharge      
##  Min.   :  1   Min.   : 0.000   Min.   :0.00841  
##  1st Qu.: 92   1st Qu.: 0.340   1st Qu.:0.00962  
##  Median :183   Median : 2.790   Median :0.01233  
##  Mean   :183   Mean   : 5.578   Mean   :0.01561  
##  3rd Qu.:274   3rd Qu.: 8.740   3rd Qu.:0.01814  
##  Max.   :365   Max.   :47.500   Max.   :0.11147  
##                                 NA's   :110

plot(discharge.na$Discharge,type="l")

Step 5

paste('How many missing values?', ' ', length(which(is.na(discharge.na))))

## [1] "How many missing values?   110"

Step 6

# formula: The ~ Sepal.Length + . argument indicates what formula to use. In this case, we want all variables' missing data to be imputed, so we add each one.
# data: the data frame with missing data.
# n.impute: number of multiple imputations. 5 is frequently used, but 10 or more doesn't hurt
set.seed(1)
impute_arg <- aregImpute(Discharge ~ Rain , data = discharge.na, n.impute=100)

## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 
Iteration 9 
Iteration 10 
Iteration 11 
Iteration 12 
Iteration 13 
Iteration 14 
Iteration 15 
Iteration 16 
Iteration 17 
Iteration 18 
Iteration 19 
Iteration 20 
Iteration 21 
Iteration 22 
Iteration 23 
Iteration 24 
Iteration 25 
Iteration 26 
Iteration 27 
Iteration 28 
Iteration 29 
Iteration 30 
Iteration 31 
Iteration 32 
Iteration 33 
Iteration 34 
Iteration 35 
Iteration 36 
Iteration 37 
Iteration 38 
Iteration 39 
Iteration 40 
Iteration 41 
Iteration 42 
Iteration 43 
Iteration 44 
Iteration 45 
Iteration 46 
Iteration 47 
Iteration 48 
Iteration 49 
Iteration 50 
Iteration 51 
Iteration 52 
Iteration 53 
Iteration 54 
Iteration 55 
Iteration 56 
Iteration 57 
Iteration 58 
Iteration 59 
Iteration 60 
Iteration 61 
Iteration 62 
Iteration 63 
Iteration 64 
Iteration 65 
Iteration 66 
Iteration 67 
Iteration 68 
Iteration 69 
Iteration 70 
Iteration 71 
Iteration 72 
Iteration 73 
Iteration 74 
Iteration 75 
Iteration 76 
Iteration 77 
Iteration 78 
Iteration 79 
Iteration 80 
Iteration 81 
Iteration 82 
Iteration 83 
Iteration 84 
Iteration 85 
Iteration 86 
Iteration 87 
Iteration 88 
Iteration 89 
Iteration 90 
Iteration 91 
Iteration 92 
Iteration 93 
Iteration 94 
Iteration 95 
Iteration 96 
Iteration 97 
Iteration 98 
Iteration 99 
Iteration 100

# print(impute_arg, digits=3)
# impute_arg$imputed$Discharge

Step 7

# The R square values indicate the liklihood that the predicted missing values in the 'irt' 
# dataframe match what we would actually observe if participants had answered all questions. 
# Higher values are better.

# To use one of the iterations in our original data set, we can use the transcend function:
completetable <- impute.transcan(impute_arg, imputation=4, data=discharge.na, 
                                 list.out=TRUE,pr=FALSE, check=FALSE)

# head(completetable)

Step 8

plot(completetable$Discharge,type="l")

Compare methods

Step 1

# Compare methods 
windows()
par(mfrow=c(1,2))
plot(completetable$Discharge,type="l",main="HMisc")
abline(h=0.0154, lty=2)
plot(discharge.missf$Discharge,type="l",main="Missforest")
abline(h=0.0154, lty=2)

Missing Value Imputation

Pablo E. Gutiérrez-Fonseca

26 February, 2023

Step 1

Step 2

Step 3

Step 4

Step 5

Step 6

Step 7

Step 8

Step 9

Hmisc

Harrell Miscellaneous

Step 1

Step 2

Step 3

Step 4

Step 5

Step 6

Step 7

Step 8

Compare methods

Step 1