6. Pre-processing procedures

Note

Well begun is half done – old proverb

In my opinion, pre-processing is crucial for data mining algorithms: if you do good pre-processing, you will almost certainly get a better result. In this section, we will learn how to do proper pre-processing in R and Python.

6.1. Rough Pre-processing

6.1.1. Dealing with missing data

Usually, there are two popular ways to deal with missing data: replacing it with 0, or replacing it with the column mean.

Python

  • dealing with missing data in Python

:: Example:

import pandas as pd

# Toy data frame: column A and column C each contain one missing value.
raw = {'A': [1, 0, None, 3],
       'B': [1, 0, 0, 0],
       'C': [4, None, 7, 8]}

df = pd.DataFrame(raw)
print(df)

# Strategy 1: replace every missing entry with 0.
print(df.fillna(0))

# Strategy 2: replace every missing entry with its column mean.
df = df.fillna(df.mean())
print(df)

:: Output:

     A  B    C
0  1.0  1  4.0
1  0.0  0  NaN
2  NaN  0  7.0
3  3.0  0  8.0
     A  B    C
0  1.0  1  4.0
1  0.0  0  0.0
2  0.0  0  7.0
3  3.0  0  8.0
          A  B         C
0  1.000000  1  4.000000
1  0.000000  0  6.333333
2  1.333333  0  7.000000
3  3.000000  0  8.000000

R

  • dealing with missing data in R

:: Example:

library(dplyr)

# Toy data frame: column A and column C each contain one NA.
# (Use <- for assignment, per the tidyverse style guide.)
df <- data.frame(A = c(1, 0, NA, 3),
                 B = c(1, 0, 0, 0),
                 C = c(4, NA, 7, 8))
df

#' Replace every NA in every column of `data` with 0.
#'
#' @param data A data frame.
#' @return A data frame of the same shape with all NAs replaced by 0.
na2zero <- function(data) {
  # mutate_all() is superseded in dplyr >= 1.0; across() is the
  # recommended replacement and produces the identical result.
  data %>% mutate(across(everything(), ~replace(., is.na(.), 0)))
}

na2zero(df)

#' Replace NAs in each column of `data` with that column's mean.
#'
#' @param data A data frame whose NA-containing columns are numeric.
#' @return A data frame of the same shape; each NA is replaced by the
#'   mean of the non-missing values in its column.
na2mean <- function(data) {
  # seq_len() is safe when ncol(data) == 0, unlike 1:ncol(data).
  for (i in seq_len(ncol(data))) {
    data[is.na(data[, i]), i] <- mean(data[, i], na.rm = TRUE)
  }
  data
}

na2mean(df)

:: Output:

> df
   A B  C
1  1 1  4
2  0 0 NA
3 NA 0  7
4  3 0  8

> na2zero(df)
  A B C
1 1 1 4
2 0 0 0
3 0 0 7
4 3 0 8

> na2mean(df)
         A B        C
1 1.000000 1 4.000000
2 0.000000 0 6.333333
3 1.333333 0 7.000000
4 3.000000 0 8.000000

6.2. Source Code for This Section

The source code for this section is available for download in both R and Python:
  • R Source code
# Exploratory data analysis of the Heart data set: summaries, correlation
# plots, per-column boxplots and histograms, and a faceted ggplot demo.
# NOTE(review): rm(list = ls()) in a script silently wipes the user's
# workspace; prefer running the script in a fresh R session instead.
rm(list = ls())
# Path to the raw CSV file (adjust to your local copy).
path ='~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
rawdata = read.csv(path)

# Numerical summary of every column.
summary(rawdata)
# Pairwise scatter plots of all columns.
plot(rawdata)

# Quick look: dimensions, first rows, last rows.
dim(rawdata)
head(rawdata)
tail(rawdata)

colnames(rawdata)
# NOTE(review): attach() is discouraged (name-masking surprises); prefer
# rawdata$col or with(rawdata, ...). The attached names do not appear to
# be used below — this call looks removable.
attach(rawdata)

# Keep the numeric columns (columns 1-2 and 4-12) and drop rows with NA.
numdata=na.omit(rawdata[,c(1:2,4:12)])

# Correlation and covariance matrices of the numeric data.
cor(numdata)
cov(numdata)

# NOTE(review): dev.off() errors if no graphics device is open; these calls
# assume a device from the earlier plot() still exists.
dev.off()
# Load the correlation-matrix plotting library.
library(corrplot)
M <- cor(numdata)
#par(mfrow =c (1,2))
#corrplot(M, method = "square")
# Mixed display: numbers in one triangle, circles in the other.
corrplot.mixed(M)


# Row/column counts of the raw data.
# NOTE(review): these assignments shadow base::nrow and base::ncol for the
# rest of the session; pick other names if extending this script.
nrow=nrow(rawdata)
ncol=ncol(rawdata)
c(nrow, ncol)



Nvars=ncol(numdata)
# Check the storage type and per-column classes of the data.
typeof(rawdata)
# NOTE(review): install.packages() inside a script re-installs on every run;
# install once interactively instead. mlbench is loaded but not used below.
install.packages("mlbench")
library(mlbench)
sapply(rawdata, class)

dev.off()
name=colnames(numdata)
Nvars=ncol(numdata)
# One boxplot per numeric column, arranged on a 4x3 grid.
par(mfrow =c (4,3))
for (i in 1:Nvars)
{
  #boxplot(numdata[,i]~numdata[,Nvars],data=data,main=name[i])
  boxplot(numdata[,i],data=numdata,main=name[i])
}

# Histogram with an overlaid normal curve for each numeric column (3x5 grid).
dev.off()
Nvars=ncol(numdata)
name=colnames(numdata)
par(mfrow =c (3,5))
for (i in 1:Nvars)
{
  x<- numdata[,i]
  # freq=TRUE draws counts; h$mids is used below to rescale the normal
  # density onto the count axis.
  h<-hist(x, breaks=10, freq=TRUE, col="blue", xlab=name[i],main=" ", 
            font.lab=1) 
  # Full-length light-gray ticks act as a background grid; short black
  # ticks are drawn on top as the real axis marks.
  axis(1, tck=1, col.ticks="light gray")
  axis(1, tck=-0.015, col.ticks="black")
  axis(2, tck=1, col.ticks="light gray", lwd.ticks="1")
  axis(2, tck=-0.015)
  # Normal density with the sample mean/sd, scaled to histogram counts.
  xfit<-seq(min(x),max(x),length=40) 
  yfit<-dnorm(xfit,mean=mean(x),sd=sd(x)) 
  yfit <- yfit*diff(h$mids[1:2])*length(x) 
  lines(xfit, yfit, col="blue", lwd=2) 
} 

# Faceted histograms of ggplot2's built-in diamonds data, dropping
# columns 2:4 (the factor columns cut/color/clarity) before melting.
library(reshape2)
library(ggplot2)
d <- melt(diamonds[,-c(2:4)])
ggplot(d,aes(x = value)) + 
  facet_wrap(~variable,scales = "free_x") + 
  geom_histogram()
  • Python Source code
'''
Created on Apr 25, 2016
Exploratory data analysis of the Heart data set: summaries, scatter
matrix, histograms, boxplots, and correlation/distribution plots.
@author: Wenqiang Feng
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# scatter_matrix moved from pandas.tools.plotting to pandas.plotting in
# pandas >= 0.20; the old module no longer exists.
from pandas.plotting import scatter_matrix

if __name__ == '__main__':
    # Load the Heart data set.
    path = '~/Dropbox/MachineLearningAlgorithms/python_code/data/Heart.csv'
    rawdata = pd.read_csv(path)

    print("data summary")
    print(rawdata.describe())

    # Summary plot: pairwise scatter plots of all numeric columns.
    scatter_matrix(rawdata, figsize=[15, 15])
    plt.show()

    # Histogram of each numeric column.
    rawdata.hist()
    plt.show()

    # Boxplot of each numeric column.
    rawdata.boxplot()
    plt.show()

    print("Raw data size")
    nrow, ncol = rawdata.shape
    print(nrow, ncol)

    # Second data set: energy efficiency (Excel workbook, first sheet).
    path = ('/home/feng/Dropbox/MachineLearningAlgorithms/python_code/data/'
            'energy_efficiency.xlsx')
    # 'sheetname' was renamed to 'sheet_name' in pandas >= 0.21.
    rawdataEnergy = pd.read_excel(path, sheet_name=0)

    nrow = rawdata.shape[0]  # number of rows
    ncol = rawdata.shape[1]  # number of columns
    print(nrow, ncol)

    col_names = rawdata.columns.tolist()
    print("Column names:")
    print(col_names)
    print("Data Format:")
    print(rawdata.dtypes)

    print("\nSample data:")
    print(rawdata.head(6))

    print("\n correlation Matrix")
    print(rawdata.corr())

    print("\n covariance Matrix")
    print(rawdata.cov())

    print(rawdata[['Age', 'Ca']].corr())

    # define colors list, to be used to plot survived either red (=0) or green (=1)
    colors = ['red', 'green']

    # rawdata.info()

    from scipy import stats
    import seaborn as sns

    # sns.corrplot() was removed from seaborn; a heatmap of the
    # correlation matrix is the modern equivalent.
    sns.heatmap(rawdata.corr(), annot=True, cmap='coolwarm')
    # save to file, remove the big white borders
    #plt.savefig('attribute_correlations.png', bbox_inches='tight')
    plt.show()

    # Distribution of Age with a kernel density estimate.
    # (sns.distplot() was removed in seaborn >= 0.14; histplot replaces it.)
    attr = rawdata['Age'].dropna()
    sns.histplot(attr, kde=True, stat='density')
    plt.show()

    # Histogram with a fitted gamma density. histplot() has no `fit=`
    # argument, so the fitted curve is overlaid manually via scipy.
    sns.histplot(attr, stat='density')
    shape, loc, scale = stats.gamma.fit(attr)
    grid = np.linspace(attr.min(), attr.max(), 200)
    plt.plot(grid, stats.gamma.pdf(grid, shape, loc=loc, scale=scale))
    plt.show()

    # Two stacked subplots: KDE version on top, gamma fit below.
    plt.figure(1)
    plt.subplot(211)  # first plot in a 2-row, 1-column grid
    plt.title('Histogram of Age')
    sns.histplot(attr, kde=True, stat='density')

    plt.subplot(212)  # second plot in the grid
    sns.histplot(attr, stat='density')
    plt.plot(grid, stats.gamma.pdf(grid, shape, loc=loc, scale=scale))

    plt.show()