Load libraries.
libraries <- c('missForest')
lapply(libraries, require, character.only = TRUE)
## [[1]]
## [1] TRUE
discharge.na = read.csv("D:/OneDrive - University of Vermont/Curriculum/19_ Personal webpage/TropicalFreshwaterEcology/_pages/Lectures/data/StreamEcology_MissingValue.csv")
head(discharge.na)
## Day Rain Discharge
## 1 1 2.03 0.00910000
## 2 2 2.03 0.00904000
## 3 3 2.03 0.00903000
## 4 4 2.03 0.00903000
## 5 5 2.03 0.01058932
## 6 6 7.49 0.01070414
summary(discharge.na)
## Day Rain Discharge
## Min. : 1 Min. : 0.000 Min. :0.00841
## 1st Qu.: 92 1st Qu.: 0.340 1st Qu.:0.00962
## Median :183 Median : 2.790 Median :0.01233
## Mean :183 Mean : 5.578 Mean :0.01561
## 3rd Qu.:274 3rd Qu.: 8.740 3rd Qu.:0.01814
## Max. :365 Max. :47.500 Max. :0.11147
## NA's :110
Examine data.
summary(discharge.na)
## Day Rain Discharge
## Min. : 1 Min. : 0.000 Min. :0.00841
## 1st Qu.: 92 1st Qu.: 0.340 1st Qu.:0.00962
## Median :183 Median : 2.790 Median :0.01233
## Mean :183 Mean : 5.578 Mean :0.01561
## 3rd Qu.:274 3rd Qu.: 8.740 3rd Qu.:0.01814
## Max. :365 Max. :47.500 Max. :0.11147
## NA's :110
Plot data.
plot(discharge.na$Discharge,type="l", lwd=2.0)
Imputate missing data.
# ntree number of trees to grow in each forest.
# maxiter maximum number of iterations to be performed given the stopping criterion is
# not met beforehand.
discharge.imp <- missForest(discharge.na, maxiter = 4, ntree = 100,
variablewise = FALSE, decreasing = FALSE, verbose = F, replace = TRUE,
classwt = NULL, cutoff = NULL, strata = NULL,
sampsize = NULL, nodesize = NULL, maxnodes = NULL,
xtrue = NA, parallelize = "no")
discharge.missf <- discharge.imp$ximp
Examine data imputation.
summary(discharge.missf)
## Day Rain Discharge
## Min. : 1 Min. : 0.000 Min. :0.00841
## 1st Qu.: 92 1st Qu.: 0.340 1st Qu.:0.00994
## Median :183 Median : 2.790 Median :0.01163
## Mean :183 Mean : 5.578 Mean :0.01454
## 3rd Qu.:274 3rd Qu.: 8.740 3rd Qu.:0.01622
## Max. :365 Max. :47.500 Max. :0.11147
Plot data imputation.
plot(discharge.missf$Discharge,type="l", lwd=2.0)
Comparison of imputed data.
par(mfrow=c(1,3))
plot(discharge.na$Discharge,type="l",main="Original Discharge", lwd=2.0)
abline(h=0.0154, lty=2, lwd=4, col='green')
plot(discharge.missf$Discharge,type="l",main="Imputated Discharge", lwd=2.0)
abline(h=0.0154, lty=2, lwd=4, col='red')
plot(discharge.na$Rain,type="l",main="Rain", lwd=2.0)
abline(h=10.1, lty=2, lwd=4, col='blue')
How big are the errors of the imputation?
# Out Of Bag (OOBerror)= imputation error estimate (normalized root mean squared error)
# The OOB observations can be used for example for estimating the prediction error of RF.
# The performance is assessed using the normalized root mean squared error (NRMSE)
# Good performance leads to a value close to 0 and bad performance to a value around 1.
NRMSE <- discharge.imp$OOBerror
NRMSE
## NRMSE
## 6.541362e-05
# How big are the errors of the imputation?
# Not too bad, around >1%. NRMSE is The normalized root mean squared error, it is defined as:
# The Normalized Root Mean Square Error (NRMSE) the RMSE facilitates the comparison between models with different scales.
# rm(list=ls(all=TRUE)) #give R a blank slate
library("Hmisc")
discharge.na = read.csv("D:/OneDrive - University of Vermont/Curriculum/19_ Personal webpage/TropicalFreshwaterEcology/_pages/Lectures/data/StreamEcology_MissingValue.csv")
head(discharge.na)
## Day Rain Discharge
## 1 1 2.03 0.00910000
## 2 2 2.03 0.00904000
## 3 3 2.03 0.00903000
## 4 4 2.03 0.00903000
## 5 5 2.03 0.01058932
## 6 6 7.49 0.01070414
summary(discharge.na)
## Day Rain Discharge
## Min. : 1 Min. : 0.000 Min. :0.00841
## 1st Qu.: 92 1st Qu.: 0.340 1st Qu.:0.00962
## Median :183 Median : 2.790 Median :0.01233
## Mean :183 Mean : 5.578 Mean :0.01561
## 3rd Qu.:274 3rd Qu.: 8.740 3rd Qu.:0.01814
## Max. :365 Max. :47.500 Max. :0.11147
## NA's :110
plot(discharge.na$Discharge,type="l")
paste('How many missing values?', ' ', length(which(is.na(discharge.na))))
## [1] "How many missing values? 110"
# formula: The ~ Sepal.Length + . argument indicates what formula to use. In this case, we want all variables' missing data to be imputed, so we add each one.
# data: the data frame with missing data.
# n.impute: number of multiple imputations. 5 is frequently used, but 10 or more doesn't hurt
set.seed(1)
impute_arg <- aregImpute(Discharge ~ Rain , data = discharge.na, n.impute=100)
## Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration 77
Iteration 78
Iteration 79
Iteration 80
Iteration 81
Iteration 82
Iteration 83
Iteration 84
Iteration 85
Iteration 86
Iteration 87
Iteration 88
Iteration 89
Iteration 90
Iteration 91
Iteration 92
Iteration 93
Iteration 94
Iteration 95
Iteration 96
Iteration 97
Iteration 98
Iteration 99
Iteration 100
# print(impute_arg, digits=3)
# impute_arg$imputed$Discharge
# The R square values indicate the liklihood that the predicted missing values in the 'irt'
# dataframe match what we would actually observe if participants had answered all questions.
# Higher values are better.
# To use one of the iterations in our original data set, we can use the transcend function:
completetable <- impute.transcan(impute_arg, imputation=4, data=discharge.na,
list.out=TRUE,pr=FALSE, check=FALSE)
# head(completetable)
plot(completetable$Discharge,type="l")
# Compare methods
windows()
par(mfrow=c(1,2))
plot(completetable$Discharge,type="l",main="HMisc")
abline(h=0.0154, lty=2)
plot(discharge.missf$Discharge,type="l",main="Missforest")
abline(h=0.0154, lty=2)