Data-PreProcessing(R)

Data PreProcessing

Hui Lin

http://scientistcafe.com

Package Installation

# Install any required CRAN packages that are missing, then attach them all.
p_needed <- c(
  "dplyr", "caret", "e1071", "imputeMissings",
  "nnet", "car", "reshape2", "psych", "tidyr", "RANN",
  "ggplot2", "readr", "corrplot"
)

# Only fetch the packages that are not already in the library.
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}

# require() returns FALSE (with only a warning) when a package cannot be
# attached, so check the results and stop instead of failing later with
# "could not find function" errors.
loaded <- lapply(p_needed, require, character.only = TRUE)
failed <- p_needed[!unlist(loaded)]
if (length(failed) > 0) {
  stop("Failed to load package(s): ", paste(failed, collapse = ", "),
       call. = FALSE)
}
# Print the per-package load status (one TRUE per package).
loaded
Installing packages into ‘/databricks/spark/R/lib’
(as ‘lib’ is unspecified)
also installing the dependency ‘hms’

trying URL 'https://cloud.r-project.org/src/contrib/hms_0.4.2.tar.gz'
Content type 'application/x-gzip' length 12773 bytes (12 KB)
==================================================
downloaded 12 KB

trying URL 'https://cloud.r-project.org/src/contrib/e1071_1.6-8.tar.gz'
Content type 'application/x-gzip' length 581513 bytes (567 KB)
==================================================
downloaded 567 KB

trying URL 'https://cloud.r-project.org/src/contrib/imputeMissings_0.0.3.tar.gz'
Content type 'application/x-gzip' length 3415 bytes
==================================================
downloaded 3415 bytes

trying URL 'https://cloud.r-project.org/src/contrib/RANN_2.5.1.tar.gz'
Content type 'application/x-gzip' length 58102 bytes (56 KB)
==================================================
downloaded 56 KB

trying URL 'https://cloud.r-project.org/src/contrib/readr_1.1.1.tar.gz'
Content type 'application/x-gzip' length 233793 bytes (228 KB)
==================================================
downloaded 228 KB

trying URL 'https://cloud.r-project.org/src/contrib/corrplot_0.84.tar.gz'
Content type 'application/x-gzip' length 5385275 bytes (5.1 MB)
==================================================
downloaded 5.1 MB

* installing *source* package ‘hms’ ...
** package ‘hms’ successfully unpacked and MD5 sums checked
** R
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (hms)
* installing *source* package ‘e1071’ ...
** package ‘e1071’ successfully unpacked and MD5 sums checked
checking for C++ compiler default output file name... a.out
checking whether the C++ compiler works... yes
checking whether we are cross compiling... no
checking for suffix of executables... 
checking for suffix of object files... o
checking whether we are using the GNU C++ compiler... yes
checking whether g++ accepts -g... yes
** libs
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c Rsvm.c -o Rsvm.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c cmeans.c -o cmeans.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c cshell.c -o cshell.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c floyd.c -o floyd.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c init.c -o init.o
g++  -I/usr/share/R/include -DNDEBUG      -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c svm.cpp -o svm.o
svm.cpp: In function ‘svm_model* svm_load_model(const char*)’:
svm.cpp:2788:24: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
   fscanf(fp,"%80s",cmd);
                        ^
svm.cpp:2792:25: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%80s",cmd);
                         ^
svm.cpp:2817:25: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%80s",cmd);
                         ^
svm.cpp:2841:33: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%d",&param.degree);
                                 ^
svm.cpp:2843:33: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%lf",&param.gamma);
                                 ^
svm.cpp:2845:33: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%lf",&param.coef0);
                                 ^
svm.cpp:2847:36: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%d",&model->nr_class);
                                    ^
svm.cpp:2849:29: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
    fscanf(fp,"%d",&model->l);
                             ^
svm.cpp:2855:36: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
     fscanf(fp,"%lf",&model->rho[i]);
                                    ^
svm.cpp:2862:37: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
     fscanf(fp,"%d",&model->label[i]);
                                     ^
svm.cpp:2869:38: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
     fscanf(fp,"%lf",&model->probA[i]);
                                      ^
svm.cpp:2876:38: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
     fscanf(fp,"%lf",&model->probB[i]);
                                      ^
svm.cpp:2883:35: warning: ignoring return value of ‘int fscanf(FILE*, const char*, ...)’, declared with attribute warn_unused_result [-Wunused-result]
     fscanf(fp,"%d",&model->nSV[i]);
                                   ^
g++ -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -Wl,-z,relro -o e1071.so Rsvm.o cmeans.o cshell.o floyd.o init.o svm.o -L/usr/lib/R/lib -lR
installing to /databricks/spark/R/lib/e1071/libs
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded
* DONE (e1071)
* installing *source* package ‘imputeMissings’ ...
** package ‘imputeMissings’ successfully unpacked and MD5 sums checked
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (imputeMissings)
* installing *source* package ‘RANN’ ...
** package ‘RANN’ successfully unpacked and MD5 sums checked
** libs
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c ANN.cpp -o ANN.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c NN.cc -o NN.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c bd_fix_rad_search.cpp -o bd_fix_rad_search.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c bd_pr_search.cpp -o bd_pr_search.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c bd_search.cpp -o bd_search.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c bd_tree.cpp -o bd_tree.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c brute.cpp -o brute.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c init.c -o init.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_dump.cpp -o kd_dump.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_fix_rad_search.cpp -o kd_fix_rad_search.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_pr_search.cpp -o kd_pr_search.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_search.cpp -o kd_search.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_split.cpp -o kd_split.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_tree.cpp -o kd_tree.o
g++  -I/usr/share/R/include -DNDEBUG -I. -IANN -DRANN     -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c kd_util.cpp -o kd_util.o
g++ -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -Wl,-z,relro -o RANN.so ANN.o NN.o bd_fix_rad_search.o bd_pr_search.o bd_search.o bd_tree.o brute.o init.o kd_dump.o kd_fix_rad_search.o kd_pr_search.o kd_search.o kd_split.o kd_tree.o kd_util.o -L/usr/lib/R/lib -lR
installing to /databricks/spark/R/lib/RANN/libs
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (RANN)
* installing *source* package ‘corrplot’ ...
** package ‘corrplot’ successfully unpacked and MD5 sums checked
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded
* DONE (corrplot)
* installing *source* package ‘readr’ ...
** package ‘readr’ successfully unpacked and MD5 sums checked
** libs
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c Collector.cpp -o Collector.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c CollectorGuess.cpp -o CollectorGuess.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c Iconv.cpp -o Iconv.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c LocaleInfo.cpp -o LocaleInfo.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c RcppExports.cpp -o RcppExports.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c Reader.cpp -o Reader.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c Source.cpp -o Source.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c Tokenizer.cpp -o Tokenizer.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c TokenizerDelim.cpp -o TokenizerDelim.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c TokenizerFwf.cpp -o TokenizerFwf.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c TokenizerWs.cpp -o TokenizerWs.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c connection.cpp -o connection.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c datetime.cpp -o datetime.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c grisu3.c -o grisu3.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c init.c -o init.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c localtime.c -o localtime.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c parse.cpp -o parse.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c read.cpp -o read.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c type_convert.cpp -o type_convert.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c write.cpp -o write.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c write_connection.cpp -o write_connection.o
g++  -I/usr/share/R/include -DNDEBUG  -I"/usr/local/lib/R/site-library/Rcpp/include" -I"/usr/local/lib/R/site-library/BH/include"    -fpic  -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g  -c write_delim.cpp -o write_delim.o
g++ -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -Wl,-z,relro -o readr.so Collector.o CollectorGuess.o Iconv.o LocaleInfo.o RcppExports.o Reader.o Source.o Tokenizer.o TokenizerDelim.o TokenizerFwf.o TokenizerWs.o connection.o datetime.o grisu3.o init.o localtime.o parse.o read.o type_convert.o write.o write_connection.o write_delim.o -L/usr/lib/R/lib -lR
installing to /databricks/spark/R/lib/readr/libs
** R
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded
* DONE (readr)

The downloaded source packages are in
	‘/tmp/RtmplIPfrQ/downloaded_packages’
Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: caret
Loading required package: lattice
Loading required package: ggplot2
Loading required package: e1071
Loading required package: imputeMissings

Attaching package: ‘imputeMissings’

The following object is masked from ‘package:e1071’:

    impute

The following object is masked from ‘package:dplyr’:

    compute

Loading required package: nnet
Loading required package: car

Attaching package: ‘car’

The following object is masked from ‘package:dplyr’:

    recode

Loading required package: reshape2
Loading required package: psych

Attaching package: ‘psych’

The following object is masked from ‘package:car’:

    logit

The following objects are masked from ‘package:ggplot2’:

    %+%, alpha

Loading required package: tidyr

Attaching package: ‘tidyr’

The following object is masked _by_ ‘.GlobalEnv’:

    complete

The following object is masked from ‘package:reshape2’:

    smiths

Loading required package: RANN
Loading required package: readr
Loading required package: corrplot
corrplot 0.84 loaded
[[1]]
[1] TRUE

[[2]]
[1] TRUE

[[3]]
[1] TRUE

[[4]]
[1] TRUE

[[5]]
[1] TRUE

[[6]]
[1] TRUE

[[7]]
[1] TRUE

[[8]]
[1] TRUE

[[9]]
[1] TRUE

[[10]]
[1] TRUE

[[11]]
[1] TRUE

[[12]]
[1] TRUE

[[13]]
[1] TRUE

Data Cleaning

## Do you see any problems?
# Read the simulated customer data and print a per-column summary.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# strings as character by default, but the later steps expect gender, house
# and segment to be factors (level counts in summary(), factor-column lookup
# before KNN imputation).
# The URL is written without trailing whitespace so it is requested verbatim.
sim.dat <- read.csv(
  "https://raw.githubusercontent.com/happyrabbit/DataScientistR/master/Data/SegData.csv",
  stringsAsFactors = TRUE
)
summary(sim.dat)
      age            gender        income       house       store_exp      
 Min.   : 16.00   Female:554   Min.   : 41776   No :432   Min.   : -500.0  
 1st Qu.: 25.00   Male  :446   1st Qu.: 85832   Yes:568   1st Qu.:  205.0  
 Median : 36.00                Median : 93869             Median :  329.0  
 Mean   : 38.84                Mean   :113543             Mean   : 1356.8  
 3rd Qu.: 53.00                3rd Qu.:124572             3rd Qu.:  597.3  
 Max.   :300.00                Max.   :319704             Max.   :50000.0  
                               NA's   :184                                 
   online_exp       store_trans     online_trans         Q1       
 Min.   :  68.82   Min.   : 1.00   Min.   : 1.00   Min.   :1.000  
 1st Qu.: 420.34   1st Qu.: 3.00   1st Qu.: 6.00   1st Qu.:2.000  
 Median :1941.86   Median : 4.00   Median :14.00   Median :3.000  
 Mean   :2120.18   Mean   : 5.35   Mean   :13.55   Mean   :3.101  
 3rd Qu.:2440.78   3rd Qu.: 7.00   3rd Qu.:20.00   3rd Qu.:4.000  
 Max.   :9479.44   Max.   :20.00   Max.   :36.00   Max.   :5.000  
                                                                  
       Q2              Q3              Q4              Q5       
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.750  
 Median :1.000   Median :1.000   Median :3.000   Median :4.000  
 Mean   :1.823   Mean   :1.992   Mean   :2.763   Mean   :2.945  
 3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
                                                                
       Q6              Q7              Q8              Q9             Q10      
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
 1st Qu.:1.000   1st Qu.:2.500   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.00  
 Median :2.000   Median :4.000   Median :2.000   Median :4.000   Median :2.00  
 Mean   :2.448   Mean   :3.434   Mean   :2.396   Mean   :3.085   Mean   :2.32  
 3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:3.00  
 Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00  
                                                                               
        segment   
 Conspicuous:200  
 Price      :250  
 Quality    :200  
 Style      :350  
                  
                  
                  
# Treat impossible values as missing:
#   - ages above 100
#   - negative in-store spending
is.na(sim.dat$age) <- which(sim.dat$age > 100)
is.na(sim.dat$store_exp) <- which(sim.dat$store_exp < 0)

# Inspect age and income after the clean-up
summary(sim.dat[, c("age", "income")])
      age            income      
 Min.   :16.00   Min.   : 41776  
 1st Qu.:25.00   1st Qu.: 85832  
 Median :36.00   Median : 93869  
 Mean   :38.58   Mean   :113543  
 3rd Qu.:53.00   3rd Qu.:124572  
 Max.   :69.00   Max.   :319704  
 NA's   :1       NA's   :184     

Missing Values

1 Using Median Impute

# Impute missing values with the column median (numeric) or mode (factor)
# and save the result as another object.
# The call is namespace-qualified because both e1071 and imputeMissings
# export impute(); which one a bare impute() reaches depends on load order.
demo_imp <- imputeMissings::impute(sim.dat, method = "median/mode")

# Check the first 5 columns; there are no missing values in the other columns
summary(demo_imp[, 1:5])
      age           gender        income       house       store_exp      
 Min.   :16.00   Female:554   Min.   : 41776   No :432   Min.   :  155.8  
 1st Qu.:25.00   Male  :446   1st Qu.: 87896   Yes:568   1st Qu.:  205.1  
 Median :36.00                Median : 93869             Median :  329.8  
 Mean   :38.58                Mean   :109923             Mean   : 1357.7  
 3rd Qu.:53.00                3rd Qu.:119456             3rd Qu.:  597.3  
 Max.   :69.00                Max.   :319704             Max.   :50000.0  
# Median imputation through caret: fit the pre-processing model first,
# then apply it to the data with predict().
imp <- preProcess(sim.dat, method = "medianImpute")
demo_imp2 <- predict(imp, newdata = sim.dat)
summary(demo_imp2[, 1:5])
      age           gender        income       house       store_exp      
 Min.   :16.00   Female:554   Min.   : 41776   No :432   Min.   :  155.8  
 1st Qu.:25.00   Male  :446   1st Qu.: 87896   Yes:568   1st Qu.:  205.1  
 Median :36.00                Median : 93869             Median :  329.8  
 Mean   :38.58                Mean   :109923             Mean   : 1357.7  
 3rd Qu.:53.00                3rd Qu.:119456             3rd Qu.:  597.3  
 Max.   :69.00                Max.   :319704             Max.   :50000.0  

2 Using K-nearest neighbors

# Fit a K-nearest-neighbours imputation model using 5 neighbours
imp <- preProcess(sim.dat, method = "knnImpute", k = 5)
# preProcess() only fits the model; predict() produces the imputed data
# NOTE(review): sim.dat still contains factor columns here, and the following
# "Solve the problem" step removes them before predicting - presumably this
# call fails as written; confirm.
demo_imp <- predict(imp, newdata = sim.dat)

## Solve the problem
# Fit the KNN imputation model (5 neighbours)
imp <- preProcess(sim.dat, method = "knnImpute", k = 5)
# Find factor columns. is.factor() is used instead of comparing class() to a
# string so that columns with more than one class (e.g. ordered factors) are
# detected as well.
is_fac <- vapply(sim.dat, is.factor, logical(1))
idx <- which(is_fac)
# Drop the factor columns with a logical mask: sim.dat[, -idx] would remove
# every column if idx were empty. drop = FALSE keeps a data frame even when
# a single column remains.
demo_imp <- predict(imp, sim.dat[, !is_fac, drop = FALSE])
summary(demo_imp[, 1:3])
      age                 income           store_exp       
 Min.   :-1.5910972   Min.   :-1.43989   Min.   :-0.43345  
 1st Qu.:-0.9568733   1st Qu.:-0.53732   1st Qu.:-0.41574  
 Median :-0.1817107   Median :-0.37606   Median :-0.37105  
 Mean   : 0.0000156   Mean   : 0.02389   Mean   :-0.00042  
 3rd Qu.: 1.0162678   3rd Qu.: 0.21540   3rd Qu.:-0.27437  
 Max.   : 2.1437770   Max.   : 4.13627   Max.   :17.52734  

3 Using Bagging Tree

# Bagged-tree imputation through caret: fit the model, apply it with
# predict(), then inspect the first five columns.
imp <- preProcess(sim.dat, method = "bagImpute")
demo_imp <- predict(imp, newdata = sim.dat)
summary(demo_imp[, 1:5])
      age           gender        income       house       store_exp      
 Min.   :16.00   Female:554   Min.   : 41776   No :432   Min.   :  155.8  
 1st Qu.:25.00   Male  :446   1st Qu.: 86762   Yes:568   1st Qu.:  205.1  
 Median :36.00                Median : 94739             Median :  329.0  
 Mean   :38.58                Mean   :114631             Mean   : 1357.7  
 3rd Qu.:53.00                3rd Qu.:123726             3rd Qu.:  597.3  
 Max.   :69.00                Max.   :319704             Max.   :50000.0  

Centering and Scaling

1 Do-It-Yourself Approach

# Standardise income by hand: subtract the mean, divide by the standard
# deviation. Missing values are ignored when computing both statistics and
# stay NA in the result.
income <- sim.dat$income
# calculate the mean of income (TRUE rather than T, which can be reassigned)
mux <- mean(income, na.rm = TRUE)
# calculate the standard deviation of income
sdx <- sd(income, na.rm = TRUE)
# centering
tr1 <- income - mux
# scaling
tr2 <- tr1 / sdx

head(tr2)
[1]  0.148876309  0.169836502  0.013226321  0.001470074  0.214867498
[6] -0.118004398

2 Using preProcess()

# Keep only the two numeric columns to be standardised
sdat <- sim.dat[, c("age", "income")]
# Request centering and scaling through the "method" option
trans <- preProcess(sdat, method = c("center", "scale"))
# preProcess() only estimates the parameters; predict() applies them
transformed <- predict(trans, newdata = sdat)

head(transformed)
        age       income
1 1.2981451  0.148876309
2 1.7209611  0.169836502
3 1.4390838  0.013226321
4 1.5095531  0.001470074
5 0.8753292  0.214867498
6 1.4390838 -0.118004398

Resolve Skewness

# Descriptive statistics for every column
describe(sim.dat)
# select the two columns and save them as dat_bc
dat_bc <- subset(sim.dat, select = c("store_trans", "online_trans"))
# Fit the Box-Cox transformation and print the fitted object
(trans <- preProcess(dat_bc, method = "BoxCox"))

# Use predict() to get the transformed result
transformed <- predict(trans, dat_bc)

# Check before and after, side by side.
# par() returns the previous settings; they are restored afterwards so the
# 1 x 2 layout does not leak into plots drawn later on the same device.
old_par <- par(mfrow = c(1, 2), oma = c(2, 2, 2, 2))
hist(dat_bc$store_trans, main = "Before Transformation", xlab = "store_trans")
hist(transformed$store_trans, main = "After Transformation", xlab = "store_trans")
par(old_par)

Resolve Outliers

## Z-score and modified Z-score
# Median absolute deviation of income, ignoring missing values
ymad <- mad(sdat$income, na.rm = TRUE)
# Score each income: distance from the mean, scaled by the MAD
# NOTE(review): this centers on the mean; the usual modified Z-score centers
# on the median - confirm which is intended before changing it.
zs <- (sdat$income - mean(sdat$income, na.rm = TRUE)) / ymad
# Count the observations whose score exceeds 3.5 (missing scores are skipped)
sum(zs > 3.5, na.rm = TRUE)
[1] 59

Collinearity

# corrplot()
# select non-survey numerical variables
sdat <- subset(
  sim.dat,
  select = c("age", "income", "store_exp", "online_exp",
             "store_trans", "online_trans")
)
# use bagging imputation here (cor() below needs complete data)
imp <- preProcess(sdat, method = "bagImpute")
sdat <- predict(imp, sdat)
# get the correlation matrix
correlation <- cor(sdat)
# plot; keep the previous graphics settings so they can be restored at the end
old_par <- par(oma = c(2, 2, 2, 2))
corrplot.mixed(correlation, order = "hclust", tl.pos = "lt", upper = "ellipse")
# findCorrelation() from caret returns the indices of the columns to remove
# so that the remaining pairwise correlations stay below the cutoff.
# The correlation matrix computed above is reused.
(highCorr <- findCorrelation(correlation, cutoff = 0.75))

# delete highly correlated columns.
# Guard against an empty result: sdat[-integer(0)] would drop every column.
if (length(highCorr) > 0) {
  sdat <- sdat[-highCorr]
}

# check the new correlation matrix
corrplot.mixed(cor(sdat), order = "hclust", tl.pos = "lt", upper = "ellipse")
par(old_par)

Sparse Variables

# Work on a copy so sim.dat itself is left untouched
zero_demo <- sim.dat
n_obs <- nrow(zero_demo)
# Append two sparse columns:
#   zero1 - a constant column (every value is 1)
#   zero2 - 1 in the first row, 0 everywhere else
zero_demo$zero1 <- rep(1, n_obs)
zero_demo$zero2 <- c(1, rep(0, n_obs - 1))
# Report the column indices flagged as (near-)zero variance
nearZeroVar(zero_demo, freqCut = 95 / 5)
[1] 20 21

Re-encode Dummy Variables

1 Using class.ind() from nnet package

# Build one 0/1 indicator column per gender level and show the first rows
dumVar <- class.ind(sim.dat[["gender"]])
head(dumVar)
     Female Male
[1,]      1    0
[2,]      1    0
[3,]      0    1
[4,]      0    1
[5,]      0    1
[6,]      0    1

2 Using dummyVars() from caret

# use "original variable name + level" as the new column name
# (levelsOnly = FALSE; written as FALSE rather than F, which can be reassigned)

dumMod <- dummyVars(~ gender + house + income,
                    data = sim.dat, levelsOnly = FALSE)
# predict() applies the encoding and returns the dummy-coded matrix
head(predict(dumMod, sim.dat))
  gender.Female gender.Male house.No house.Yes   income
1             1           0        0         1 120963.4
2             1           0        0         1 122008.1
3             0           1        0         1 114202.3
4             0           1        0         1 113616.3
5             0           1        0         1 124252.6
6             0           1        0         1 107661.5
## the function can create interaction terms
# income:gender adds one income column per gender level.
# levelsOnly is written as FALSE rather than F, which can be reassigned.
dumMod <- dummyVars(~ gender + house + income + income:gender,
                    data = sim.dat,
                    levelsOnly = FALSE)
head(predict(dumMod, sim.dat))
  gender.Female gender.Male house.No house.Yes   income gender.Female:income
1             1           0        0         1 120963.4             120963.4
2             1           0        0         1 122008.1             122008.1
3             0           1        0         1 114202.3                  0.0
4             0           1        0         1 113616.3                  0.0
5             0           1        0         1 124252.6                  0.0
6             0           1        0         1 107661.5                  0.0
  gender.Male:income
1                0.0
2                0.0
3           114202.3
4           113616.3
5           124252.6
6           107661.5