diff --git a/2012.Ugulino.WearableComputing.HAR.Data.PDF b/2012.Ugulino.WearableComputing.HAR.Data.PDF
new file mode 100644
index 0000000..baf62d9
Binary files /dev/null and b/2012.Ugulino.WearableComputing.HAR.Data.PDF differ
diff --git a/QuantifiedSelfAlgorithmTesting.R b/QuantifiedSelfAlgorithmTesting.R
new file mode 100644
index 0000000..e623ae2
--- /dev/null
+++ b/QuantifiedSelfAlgorithmTesting.R
@@ -0,0 +1,120 @@
+#
+# quantified self analysis: R version for testing alternate
+# machine learning algorithms
+
+
+# data download
+
+theFiles <- c("pml-testing.csv","pml-training.csv")
+theDirectory <- "./data/"
+dlMethod <- "curl"
+if(substr(Sys.getenv("OS"),1,7) == "Windows") dlMethod <- "wininet"
+if(!dir.exists(theDirectory)) dir.create(theDirectory)
+for (i in 1:length(theFiles)) {
+ aFile <- paste(theDirectory,theFiles[i],sep="")
+ if (!file.exists(aFile)) {
+ url <- paste("https://d396qusza40orc.cloudfront.net/predmachlearn/",
+ theFiles[i],
+ sep="")
+ download.file(url,destfile=aFile,
+ method=dlMethod,
+ mode="w") # use mode "w" for text
+ }
+}
+
+# read and clean data
+
+pkgs <- c("lattice","MASS","ggplot2","grid","readr","knitr","caret","YaleToolkit",
+ "iterators","parallel","foreach","doParallel")
+notInstalled <- pkgs[!(pkgs %in% installed.packages())]
+if(sum(!(pkgs %in% installed.packages())) > 0) {
+ for(i in notInstalled) install.packages(i)
+}
+
+for(pkg in pkgs) {
+ library(pkg,character.only = TRUE)
+}
+
+string40 <- "ncnnccnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
+string80 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
+string120 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
+string160 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnc"
+colString <- paste(string40,string80,string120,string160,sep="")
+
+validation <- readr::read_csv("./data/pml-testing.csv",
+ col_names=TRUE,
+ col_types=colString)
+originalData <- readr::read_csv("./data/pml-training.csv",
+ col_names=TRUE,
+ col_types=colString)
+# fix missing column name for "observation / row number"
+theColNames <- colnames(originalData)
+theColNames[1] <- "obs"
+colnames(originalData) <- theColNames
+
+originalData$classe <- as.factor(originalData$classe)
+valResult <- whatis(originalData)
+# retain all columns with fewer than 50 missing values
+theNames <- as.character(valResult[valResult$missing < 50 & valResult$variable.name != "obs",1])
+originalSubset <- originalData[,theNames]
+# remove date variables and binary window
+originalSubset <- originalSubset[c(-2,-3,-4,-5)]
+# valSubset <- whatis(originalSubset)
+set.seed(102134)
+trainIndex <- createDataPartition(originalSubset$classe,p=.60,list=FALSE)
+training <- originalSubset[trainIndex,]
+testing <- originalSubset[-trainIndex,]
+
+# initiate parallel processing
+
+cluster <- makeCluster(detectCores() - 1)
+registerDoParallel(cluster)
+
+# build baseline model: linear discriminant analysis
+
+yvars <- training[,55]
+xvars <- training[,-55]
+intervalStart <- Sys.time()
+mod1Control <- trainControl(method="cv",number=5,allowParallel=TRUE)
+# modFit1 <- train(x=xvars,y=yvars,method="rpart",trControl=mod1Control)
+modFit1 <- train(classe ~ .,data=training,method="lda",trControl=mod1Control)
+# Model 1
+intervalEnd <- Sys.time()
+paste("Train model1 took: ",intervalEnd - intervalStart,attr(intervalEnd - intervalStart,"units"))
+pred1 <- predict(modFit1,training)
+# confusionMatrix(pred1,training$classe)
+predicted_test <- predict(modFit1,testing)
+confusionMatrix(predicted_test,testing$classe)
+# predicted_validation <- predict(modFit1,validation)
+
+# build alternate model: random forest
+
+library(randomForest)
+intervalStart <- Sys.time()
+mod2Control <- trainControl(method="boot",number=25,allowParallel=TRUE)
+modFit2 <- train(classe ~ .,data=training,method="rf",trControl=mod2Control)
+intervalEnd <- Sys.time()
+print(modFit2)
+paste("Train model2 took: ",intervalEnd - intervalStart,attr(intervalEnd - intervalStart,"units"))
+pred2 <- predict(modFit2,training)
+confusionMatrix(pred2,training$classe)
+predicted_test <- predict(modFit2,testing)
+confusionMatrix(predicted_test,testing$classe)
+
+# build an alternate model: neural network
+library(nnet)
+intervalStart <- Sys.time()
+mod3Control <- trainControl(method="boot",number=25,allowParallel=TRUE)
+modFit3 <- train(classe ~ .,data=training,method="nnet",trControl=mod3Control)
+intervalEnd <- Sys.time()
+print(modFit3)
+paste("Train model3 took: ",intervalEnd - intervalStart,attr(intervalEnd - intervalStart,"units"))
+pred3 <- predict(modFit3,training)
+confusionMatrix(pred3,training$classe)
+predicted_test <- predict(modFit3,testing)
+confusionMatrix(predicted_test,testing$classe)
+
+
+# turn off parallel processing
+stopCluster(cluster)
+registerDoSEQ()
diff --git a/QuantifiedSelfAnalysis.Rmd b/QuantifiedSelfAnalysis.Rmd
index ad3287d..3b9ea02 100644
--- a/QuantifiedSelfAnalysis.Rmd
+++ b/QuantifiedSelfAnalysis.Rmd
@@ -90,15 +90,19 @@ The model has an overall accuracy of 77%, with the highest sensitivity being .84
The random forest technique generates multiple predictive models, and aggregates them to create a final result. Random forests have a high degree of predictive power, and can be tuned according to a variety of parameters, including a range of choices from k-fold cross validation to leave one out bootstrapping. As we did with the linear discriminant analysis, we use k-fold cross validation with five folds.
```{r ref.label="useParallel", echo=FALSE}
-# run LDA model
+# turn on parallel processing
```
```{r ref.label="buildModel2", echo=FALSE}
-# run LDA model
+# run randomForest model
```
-The random forest model is extremely powerful, correctly classifying all cases in our training data set. The algorithm produces optimal results with 30 predictors, reaching a maximum accuracy of 9.993 as illustrated by the following chart.
+```{r ref.label = "termParallel", echo = FALSE}
+ # stop parallel processing
+```
+
+The random forest model is extremely powerful, correctly classifying all cases in our training data set. The algorithm produces optimal results with 30 predictors, reaching a maximum accuracy of 0.994 as illustrated by the following chart.
```{r plotRFAccuracy, echo=FALSE}
plot(modFit2,
@@ -133,7 +137,7 @@ Finally, our accuracy at predicting the 20 cases in the validation data set was
## Appendix
-```{r dataDownload, echo=FALSE}
+```{r dataDownload, echo=FALSE,eval = FALSE}
theFiles <- c("pml-testing.csv","pml-training.csv")
theDirectory <- "./data/"
dlMethod <- "curl"
@@ -153,7 +157,12 @@ for (i in 1:length(theFiles)) {
```
-```{r readData, echo=FALSE}
+```{r readData, echo=TRUE, eval = FALSE}
+pkgs <- c("lattice","MASS","ggplot2","grid","readr","knitr","caret","YaleToolkit")
+notInstalled <- pkgs[!(pkgs %in% installed.packages())]
+if(sum(!(pkgs %in% installed.packages())) > 0) {
+ for(i in notInstalled) install.packages(i)
+}
library(lattice)
library(MASS)
library(ggplot2)
@@ -174,7 +183,7 @@ validation <- readr::read_csv("./data/pml-testing.csv",
originalData <- readr::read_csv("./data/pml-training.csv",
col_names=TRUE,
col_types=colString)
-# fix missing colunm name for "observation / row number"
+# fix missing column name for "observation / row number"
theColNames <- colnames(originalData)
theColNames[1] <- "obs"
colnames(originalData) <- theColNames
@@ -194,17 +203,17 @@ testing <- originalSubset[-trainIndex,]
```
-```{r useParallel, echo=FALSE}
+```{r useParallel, echo=TRUE,eval = FALSE}
library(iterators)
library(parallel)
library(foreach)
library(doParallel)
-cluster <- makeCluster(detectCores())
+cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)
```
-```{r buildModel1, echo=FALSE, cache=TRUE}
+```{r buildModel1, echo=TRUE, cache=TRUE,eval = FALSE}
yvars <- training[,55]
xvars <- training[,-55]
intervalStart <- Sys.time()
@@ -221,10 +230,10 @@ confusionMatrix(pred1,training$classe)
# predicted_validation <- predict(modFit,validation)
```
-```{r buildModel2, echo=FALSE, cache=TRUE}
+```{r buildModel2, echo=TRUE, cache=TRUE,eval = FALSE}
library(randomForest)
intervalStart <- Sys.time()
-mod2Control <- trainControl(method="cv",number=5,allowParallel=TRUE)
+mod2Control <- trainControl(method="boot",number=25,allowParallel=TRUE)
modFit2 <- train(classe ~ .,data=training,method="rf",trControl=mod2Control)
intervalEnd <- Sys.time()
print(modFit2)
@@ -242,7 +251,7 @@ confusionMatrix(predicted_test,testing$classe)
-```{r writeFiles, echo=TRUE}
+```{r writeFiles, echo=TRUE,eval = FALSE}
# generate predictions on validation data set
predicted_validation <- predict(modFit2,validation)
# compare to correct answers as validated by submitting the individual files to Coursera for
@@ -265,6 +274,16 @@ pml_write_files(predicted_chars)
```
+```{r termParallel,echo = FALSE,eval = FALSE}
+ stopCluster(cluster)
+ registerDoSEQ()
+
+```
+
+```{r sessionData,echo = FALSE, eval = TRUE}
+ sessionInfo()
+```
+
# References
1. Dinsdale, L. and Edwards, R. -- [Random Forests Webpage](https://dinsdalelab.sdsu.edu/metag.stats/code/randomforest.html), retrieved from the _Metagenomics. Statistics._ website on December 19, 2015.
diff --git a/index.Rmd b/index.Rmd
index eba508f..ecbb11b 100644
--- a/index.Rmd
+++ b/index.Rmd
@@ -18,11 +18,11 @@ output:
## Executive Summary
-Classification of data from the [Qualitative Activity Recognition of Weight Lifting Exercises](http://groupware.les.inf.puc-rio.br/work.jsf?p1=11201) study to predict exercise quality for unknown observations from the study resulted in a 100% accuracy rate with a random forest technique. Key findings included:
+Classification of data from the [Qualitative Activity Recognition of Weight Lifting Exercises](https://github.com/lgreski/practicalmachinelearning/blob/gh-pages/2013.Velloso.QAR-WLE.pdf) study to predict exercise quality for unknown observations from the study resulted in a 100% accuracy rate with a random forest technique. Key findings included:
* Fully 62.5% of the data in the dataset was unusable, due to the high rates of missing values,
* Of the remaining 60 variables, 54 were used to predict the values of the quality variable, `classe`, and
-* A random forest model with 30 variables achieved 99.45% accuracy, correctly identifying 20 out of 20 unknown test cases.
+* A random forest model with 30 variables achieved 99.71% accuracy, correctly identifying 20 out of 20 unknown test cases.
## Online Versions
@@ -105,7 +105,7 @@ The random forest technique generates multiple predictive models, and aggregates
# run RF model
```
-The random forest model is extremely powerful, correctly classifying all cases in our training data set. When applied to the 40% holdout from the training data, the accuracy is .9967, very close to the 1.0 accuracy that was obtained with the 5 fold cross validation against the 60% sample of the training data. The algorithm produces optimal results with 30 predictors, reaching a maximum accuracy of approximately 0.995 as illustrated by the following chart.
+The random forest model is extremely powerful, correctly classifying all cases in our training data set. When applied to the 40% holdout from the training data, the accuracy is .9968, very close to the 1.0 accuracy that was obtained with the 5 fold cross validation against the 60% sample of the training data. The algorithm produces optimal results with 30 predictors, reaching a maximum accuracy of approximately 0.995 as illustrated by the following chart.
```{r plotRFAccuracy, echo=FALSE, warning=FALSE}
plot(modFit2,
@@ -133,7 +133,7 @@ Given the accuracy level achieved via cross-validation of the model against mult
## Results
-The results from our random forest model were excellent. Applying the model to the test data set that we held out of of the model building steps, we find that the model accurately predicts 99.67% of the test cases, incorrectly classifying only 26 of the 7,846 observations. The error rate for the test data set is only 0.33%, giving us a .936 probability that the model would correctly classify all 20 validation cases.
+The results from our random forest model were excellent. Applying the model to the test data set that we held out of the model building steps, we find that the model accurately predicts 99.68% of the test cases, incorrectly classifying only 23 of the 7,846 observations. The error rate for the test data set is only 0.32%, giving us a .938 probability that the model would correctly classify all 20 validation cases.
Finally, our accuracy at predicting the 20 cases in the validation data set was 100%. All in all, a good effort for our first attempt at a random forest.
@@ -162,14 +162,14 @@ for (i in 1:length(theFiles)) {
```{r readData, echo=TRUE, warning=FALSE, eval=FALSE}
-library(lattice)
-library(MASS)
-library(ggplot2)
-library(grid)
-library(readr)
-library(knitr)
-library(caret)
-library(YaleToolkit)
+library(lattice,quietly=TRUE)
+library(MASS,quietly=TRUE)
+library(ggplot2,quietly=TRUE)
+library(grid,quietly=TRUE)
+library(readr,quietly=TRUE)
+library(knitr,quietly=TRUE)
+library(caret,quietly=TRUE)
+library(YaleToolkit,quietly=TRUE)
string40 <- "ncnnccnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
string80 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
string120 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
@@ -203,11 +203,11 @@ testing <- originalSubset[-trainIndex,]
```
```{r useParallel, echo=TRUE, warning=FALSE, eval=FALSE}
-library(iterators)
-library(parallel)
-library(foreach)
-library(doParallel)
-cluster <- makeCluster(detectCores())
+library(iterators,quietly=TRUE)
+library(parallel,quietly=TRUE)
+library(foreach,quietly=TRUE)
+library(doParallel,quietly=TRUE)
+cluster <- makeCluster(detectCores()-1)
registerDoParallel(cluster)
```
@@ -230,10 +230,10 @@ confusionMatrix(pred1,training$classe)
```
```{r buildModel2, echo=TRUE, warning=FALSE, eval=FALSE}
-library(randomForest)
+suppressPackageStartupMessages(library(randomForest,quietly=TRUE))
intervalStart <- Sys.time()
mod2Control <- trainControl(method="cv",number=5,allowParallel=TRUE)
-modFit2 <- train(classe ~ .,data=training,method="rf",trControl=mod2Control)
+system.time(modFit2 <- train(classe ~ .,data=training,method="rf",trControl=mod2Control))
intervalEnd <- Sys.time()
print(modFit2)
paste("Train model2 took: ",intervalEnd - intervalStart,attr(intervalEnd - intervalStart,"units"))
@@ -273,6 +273,10 @@ pml_write_files(predicted_chars)
```
+```{r sessionInfo, echo=TRUE, eval=TRUE}
+sessionInfo()
+```
+
# References
1. Dinsdale, L. and Edwards, R. (2015) -- [Random Forests Webpage](https://dinsdalelab.sdsu.edu/metag.stats/code/randomforest.html), retrieved from the _Metagenomics. Statistics._ website on December 19, 2015.
diff --git a/index.html b/index.html
index 3767b82..ebbdafc 100644
--- a/index.html
+++ b/index.html
@@ -4,7 +4,7 @@
-
+
@@ -21,10 +21,11 @@
+
+
+
-
-
+
-
@@ -476,7 +523,7 @@
References
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
- script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+ script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
document.getElementsByTagName("head")[0].appendChild(script);
})();
diff --git a/index.md b/index.md
index f3c23b9..08fca7b 100644
--- a/index.md
+++ b/index.md
@@ -8,11 +8,11 @@ January 31, 2016
## Executive Summary
-Classification of data from the [Qualitative Activity Recognition of Weight Lifting Exercises](http://groupware.les.inf.puc-rio.br/work.jsf?p1=11201) study to predict exercise quality for unknown observations from the study resulted in a 100% accuracy rate with a random forest technique. Key findings included:
+Classification of data from the [Qualitative Activity Recognition of Weight Lifting Exercises](https://github.com/lgreski/practicalmachinelearning/blob/gh-pages/2013.Velloso.QAR-WLE.pdf) study to predict exercise quality for unknown observations from the study resulted in a 100% accuracy rate with a random forest technique. Key findings included:
* Fully 62.5% of the data in the dataset was unusable, due to the high rates of missing values,
* Of the remaining 60 variables, 54 were used to predict the values of the quality variable, `classe`, and
-* A random forest model with 30 variables achieved 99.45% accuracy, correctly identifying 20 out of 20 unknown test cases.
+* A random forest model with 30 variables achieved 99.71% accuracy, correctly identifying 20 out of 20 unknown test cases.
## Online Versions
@@ -68,7 +68,7 @@ We begin the predictive modeling exercise with a simple classification model bas
```
-## [1] "Train model1 took: 2.43909597396851 secs"
+## [1] "Train model1 took: 3.18626594543457 secs"
```
```
@@ -76,33 +76,33 @@ We begin the predictive modeling exercise with a simple classification model bas
##
## Reference
## Prediction A B C D E
-## A 2826 186 40 26 75
-## B 162 1612 191 73 180
-## C 186 301 1678 338 153
-## D 174 148 134 1389 153
-## E 0 32 11 104 1604
+## A 2857 313 182 99 73
+## B 90 1526 189 78 248
+## C 179 297 1424 253 142
+## D 220 71 228 1479 187
+## E 2 72 31 21 1515
##
## Overall Statistics
##
-## Accuracy : 0.7735
-## 95% CI : (0.7659, 0.7811)
+## Accuracy : 0.7474
+## 95% CI : (0.7394, 0.7552)
## No Information Rate : 0.2843
## P-Value [Acc > NIR] : < 2.2e-16
##
-## Kappa : 0.7144
+## Kappa : 0.6802
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
-## Sensitivity 0.8441 0.7073 0.8169 0.7197 0.7409
-## Specificity 0.9612 0.9362 0.8994 0.9381 0.9847
-## Pos Pred Value 0.8963 0.7268 0.6318 0.6952 0.9160
-## Neg Pred Value 0.9395 0.9302 0.9588 0.9447 0.9440
+## Sensitivity 0.8533 0.6696 0.6933 0.7663 0.6998
+## Specificity 0.9209 0.9363 0.9104 0.9283 0.9869
+## Pos Pred Value 0.8107 0.7161 0.6205 0.6769 0.9232
+## Neg Pred Value 0.9405 0.9219 0.9336 0.9530 0.9359
## Prevalence 0.2843 0.1935 0.1744 0.1639 0.1838
-## Detection Rate 0.2400 0.1369 0.1425 0.1180 0.1362
-## Detection Prevalence 0.2677 0.1883 0.2255 0.1697 0.1487
-## Balanced Accuracy 0.9026 0.8218 0.8582 0.8289 0.8628
+## Detection Rate 0.2426 0.1296 0.1209 0.1256 0.1287
+## Detection Prevalence 0.2993 0.1810 0.1949 0.1855 0.1394
+## Balanced Accuracy 0.8871 0.8029 0.8018 0.8473 0.8433
```
The model has an overall accuracy of 75%, with the highest sensitivity being .85 for classifying an exercise as class A when it is indeed A. The model performs worst on class B, with only 67% sensitivity. The confusion matrix illustrates that a classification model based on linear discriminant analysis does not have sufficient accuracy for us to expect perfect or near-perfect classification of our unknown validation cases.
@@ -116,22 +116,8 @@ The random forest technique generates multiple predictive models, and aggregates
```
-## randomForest 4.6-12
-```
-
-```
-## Type rfNews() to see new features/changes/bug fixes.
-```
-
-```
-##
-## Attaching package: 'randomForest'
-```
-
-```
-## The following object is masked from 'package:ggplot2':
-##
-## margin
+## user system elapsed
+## 46.21 0.33 377.27
```
```
@@ -143,20 +129,20 @@ The random forest technique generates multiple predictive models, and aggregates
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
-## Summary of sample sizes: 9420, 9420, 9420, 9422, 9422
+## Summary of sample sizes: 9420, 9421, 9420, 9423, 9420
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
-## 2 0.9878570 0.9846366
-## 30 0.9932913 0.9915143
-## 58 0.9880262 0.9848539
+## 2 0.9910838 0.9887205
+## 30 0.9953294 0.9940922
+## 58 0.9923575 0.9903324
##
-## Accuracy was used to select the optimal model using the largest value.
+## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 30.
```
```
-## [1] "Train model2 took: 5.8085341334343 mins"
+## [1] "Train model2 took: 6.29260436693827 mins"
```
```
@@ -202,36 +188,36 @@ The random forest technique generates multiple predictive models, and aggregates
##
## Reference
## Prediction A B C D E
-## A 2228 7 0 2 0
-## B 1 1502 5 0 3
-## C 2 8 1362 11 0
-## D 1 1 1 1272 0
-## E 0 0 0 1 1439
+## A 2231 8 0 0 0
+## B 0 1509 6 0 0
+## C 0 1 1362 3 0
+## D 0 0 0 1283 6
+## E 1 0 0 0 1436
##
## Overall Statistics
-##
-## Accuracy : 0.9945
-## 95% CI : (0.9926, 0.996)
-## No Information Rate : 0.2845
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.9931
-## Mcnemar's Test P-Value : NA
+##
+## Accuracy : 0.9968
+## 95% CI : (0.9953, 0.9979)
+## No Information Rate : 0.2845
+## P-Value [Acc > NIR] : < 2.2e-16
+##
+## Kappa : 0.996
+## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
-## Sensitivity 0.9982 0.9895 0.9956 0.9891 0.9979
-## Specificity 0.9984 0.9986 0.9968 0.9995 0.9998
-## Pos Pred Value 0.9960 0.9940 0.9848 0.9976 0.9993
-## Neg Pred Value 0.9993 0.9975 0.9991 0.9979 0.9995
+## Sensitivity 0.9996 0.9941 0.9956 0.9977 0.9958
+## Specificity 0.9986 0.9991 0.9994 0.9991 0.9998
+## Pos Pred Value 0.9964 0.9960 0.9971 0.9953 0.9993
+## Neg Pred Value 0.9998 0.9986 0.9991 0.9995 0.9991
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
-## Detection Rate 0.2840 0.1914 0.1736 0.1621 0.1834
-## Detection Prevalence 0.2851 0.1926 0.1763 0.1625 0.1835
-## Balanced Accuracy 0.9983 0.9940 0.9962 0.9943 0.9989
+## Detection Rate 0.2843 0.1923 0.1736 0.1635 0.1830
+## Detection Prevalence 0.2854 0.1931 0.1741 0.1643 0.1832
+## Balanced Accuracy 0.9991 0.9966 0.9975 0.9984 0.9978
```
-The random forest model is extremely powerful, correctly classifying all cases in our training data set. When applied to the 40% holdout from the training data, the accuracy is .9967, very close to the 1.0 accuracy that was obtained with the 5 fold cross validation against the 60% sample of the training data. The algorithm produces optimal results with 30 predictors, reaching a maximum accuracy of approximately 0.995 as illustrated by the following chart.
+The random forest model is extremely powerful, correctly classifying all cases in our training data set. When applied to the 40% holdout from the training data, the accuracy is .9968, very close to the 1.0 accuracy that was obtained with the 5 fold cross validation against the 60% sample of the training data. The algorithm produces optimal results with 30 predictors, reaching a maximum accuracy of approximately 0.995 as illustrated by the following chart.

@@ -249,7 +235,7 @@ Given the accuracy level achieved via cross-validation of the model against mult
## Results
-The results from our random forest model were excellent. Applying the model to the test data set that we held out of of the model building steps, we find that the model accurately predicts 99.67% of the test cases, incorrectly classifying only 26 of the 7,846 observations. The error rate for the test data set is only 0.33%, giving us a .936 probability that the model would correctly classify all 20 validation cases.
+The results from our random forest model were excellent. Applying the model to the test data set that we held out of the model building steps, we find that the model accurately predicts 99.68% of the test cases, incorrectly classifying only 23 of the 7,846 observations. The error rate for the test data set is only 0.32%, giving us a .938 probability that the model would correctly classify all 20 validation cases.
Finally, our accuracy at predicting the 20 cases in the validation data set was 100%. All in all, a good effort for our first attempt at a random forest.
@@ -280,14 +266,14 @@ for (i in 1:length(theFiles)) {
```r
-library(lattice)
-library(MASS)
-library(ggplot2)
-library(grid)
-library(readr)
-library(knitr)
-library(caret)
-library(YaleToolkit)
+library(lattice,quietly=TRUE)
+library(MASS,quietly=TRUE)
+library(ggplot2,quietly=TRUE)
+library(grid,quietly=TRUE)
+library(readr,quietly=TRUE)
+library(knitr,quietly=TRUE)
+library(caret,quietly=TRUE)
+library(YaleToolkit,quietly=TRUE)
string40 <- "ncnnccnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
string80 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
string120 <- "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"
@@ -321,11 +307,11 @@ testing <- originalSubset[-trainIndex,]
```r
-library(iterators)
-library(parallel)
-library(foreach)
-library(doParallel)
-cluster <- makeCluster(detectCores())
+library(iterators,quietly=TRUE)
+library(parallel,quietly=TRUE)
+library(foreach,quietly=TRUE)
+library(doParallel,quietly=TRUE)
+cluster <- makeCluster(detectCores()-1)
registerDoParallel(cluster)
```
@@ -349,10 +335,10 @@ confusionMatrix(pred1,training$classe)
```r
-library(randomForest)
+suppressPackageStartupMessages(library(randomForest,quietly=TRUE))
intervalStart <- Sys.time()
mod2Control <- trainControl(method="cv",number=5,allowParallel=TRUE)
-modFit2 <- train(classe ~ .,data=training,method="rf",trControl=mod2Control)
+system.time(modFit2 <- train(classe ~ .,data=training,method="rf",trControl=mod2Control))
intervalEnd <- Sys.time()
print(modFit2)
paste("Train model2 took: ",intervalEnd - intervalStart,attr(intervalEnd - intervalStart,"units"))
@@ -391,6 +377,60 @@ predicted_chars <- as.character(predicted_validation)
pml_write_files(predicted_chars)
```
+
+```r
+sessionInfo()
+```
+
+```
+## R version 3.4.2 (2017-09-28)
+## Platform: x86_64-w64-mingw32/x64 (64-bit)
+## Running under: Windows 10 x64 (build 15063)
+##
+## Matrix products: default
+##
+## locale:
+## [1] LC_COLLATE=English_United States.1252
+## [2] LC_CTYPE=English_United States.1252
+## [3] LC_MONETARY=English_United States.1252
+## [4] LC_NUMERIC=C
+## [5] LC_TIME=English_United States.1252
+##
+## attached base packages:
+## [1] parallel grid stats graphics grDevices utils datasets
+## [8] methods base
+##
+## other attached packages:
+## [1] randomForest_4.6-12 doParallel_1.0.11 foreach_1.4.3
+## [4] iterators_1.0.8 YaleToolkit_4.2.2 caret_6.0-77.9000
+## [7] knitr_1.17 readr_1.1.1 ggplot2_2.2.1
+## [10] MASS_7.3-47 lattice_0.20-35
+##
+## loaded via a namespace (and not attached):
+## [1] Rcpp_0.12.13 lubridate_1.6.0 tidyr_0.7.2
+## [4] class_7.3-14 assertthat_0.2.0 rprojroot_1.2
+## [7] digest_0.6.12 ipred_0.9-6 psych_1.7.8
+## [10] R6_2.2.2 plyr_1.8.4 backports_1.1.1
+## [13] stats4_3.4.2 e1071_1.6-8 evaluate_0.10.1
+## [16] highr_0.6 rlang_0.1.2 lazyeval_0.2.0
+## [19] kernlab_0.9-25 rpart_4.1-11 Matrix_1.2-11
+## [22] rmarkdown_1.6 splines_3.4.2 CVST_0.2-1
+## [25] ddalpha_1.3.1 gower_0.1.2 stringr_1.2.0
+## [28] foreign_0.8-69 munsell_0.4.3 broom_0.4.2
+## [31] compiler_3.4.2 pkgconfig_2.0.1 mnormt_1.5-5
+## [34] dimRed_0.1.0 htmltools_0.3.6 nnet_7.3-12
+## [37] tidyselect_0.2.2 tibble_1.3.4 prodlim_1.6.1
+## [40] DRR_0.0.2 codetools_0.2-15 RcppRoll_0.2.2
+## [43] withr_2.0.0 dplyr_0.7.4 recipes_0.1.0.9000
+## [46] ModelMetrics_1.1.0 nlme_3.1-131 gtable_0.2.0
+## [49] magrittr_1.5 scales_0.5.0 stringi_1.1.5
+## [52] reshape2_1.4.2 bindrcpp_0.2 timeDate_3012.100
+## [55] robustbase_0.92-7 lava_1.5.1 tools_3.4.2
+## [58] glue_1.1.1 DEoptimR_1.0-8 purrr_0.2.4
+## [61] sfsmisc_1.1-1 hms_0.3 survival_2.41-3
+## [64] yaml_2.1.14 colorspace_1.3-2 bindr_0.1
+```
+
# References
1. Dinsdale, L. and Edwards, R. (2015) -- [Random Forests Webpage](https://dinsdalelab.sdsu.edu/metag.stats/code/randomforest.html), retrieved from the _Metagenomics. Statistics._ website on December 19, 2015.
diff --git a/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.RData b/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.RData
new file mode 100644
index 0000000..5e18728
Binary files /dev/null and b/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.RData differ
diff --git a/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.rdb b/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.rdb
new file mode 100644
index 0000000..4c9d9fb
Binary files /dev/null and b/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.rdb differ
diff --git a/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.rdx b/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.rdx
new file mode 100644
index 0000000..7fdc660
Binary files /dev/null and b/index_cache/html/buildModel1_621b764493a8671721064dccabd5b1d6.rdx differ
diff --git a/index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.RData b/index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.RData
similarity index 92%
rename from index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.RData
rename to index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.RData
index 7ba854e..67f76dc 100644
Binary files a/index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.RData and b/index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.RData differ
diff --git a/index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.rdb b/index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.rdb
similarity index 100%
rename from index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.rdb
rename to index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.rdb
diff --git a/index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.rdx b/index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.rdx
new file mode 100644
index 0000000..466af01
Binary files /dev/null and b/index_cache/html/buildModel2_64874c78b65edd43477b7d115f63fb2e.rdx differ
diff --git a/index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.rdx b/index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.rdx
deleted file mode 100644
index 53971d7..0000000
Binary files a/index_cache/html/buildModel2_e1ae907b96aadc18be7bb22578793787.rdx and /dev/null differ
diff --git a/index_files/figure-html/plotErr-1.png b/index_files/figure-html/plotErr-1.png
index 5623789..a4c709a 100644
Binary files a/index_files/figure-html/plotErr-1.png and b/index_files/figure-html/plotErr-1.png differ
diff --git a/index_files/figure-html/plotRFAccuracy-1.png b/index_files/figure-html/plotRFAccuracy-1.png
index 6cf7ab6..65cfb9c 100644
Binary files a/index_files/figure-html/plotRFAccuracy-1.png and b/index_files/figure-html/plotRFAccuracy-1.png differ
diff --git a/index_files/figure-html/unnamed-chunk-2-1.png b/index_files/figure-html/unnamed-chunk-2-1.png
deleted file mode 100644
index ab8e1e3..0000000
Binary files a/index_files/figure-html/unnamed-chunk-2-1.png and /dev/null differ
diff --git a/index_files/figure-html/unnamed-chunk-3-1.png b/index_files/figure-html/unnamed-chunk-3-1.png
index ff4776b..d417764 100644
Binary files a/index_files/figure-html/unnamed-chunk-3-1.png and b/index_files/figure-html/unnamed-chunk-3-1.png differ
diff --git a/index_files/figure-html/varImp-1.png b/index_files/figure-html/varImp-1.png
index 4fa2f21..8325bd3 100644
Binary files a/index_files/figure-html/varImp-1.png and b/index_files/figure-html/varImp-1.png differ