-Data Summary
Before doing anything else, it is important to understand the structure of the data:
•missing data
•cleaning / tidying
•plotting
•correlations
•outliers
•summary stats
The list below shows the corresponding functions in R. Some of them require additional packages.
General identifying
# --- General identification of the data structure ---
View(data)        # open the data in a spreadsheet-style viewer
glimpse(data)     # compact per-column overview (dplyr)
spec(data)        # column specification -- for data read with readr (csv files); trailing prose made a comment
attributes(data)  # names, class, row.names and other attributes
class(data)       # object class, e.g. "data.frame" or "tbl_df"
In-depth summarizing
1-With summary() from base R
# Overall summary: min / quartiles / mean / max per numeric column,
# level counts for factors
summary(data)
# Grouped summary: split the data by a category column (example: age)
# and apply summary() to each subset
by(data,data$age, summary)
2-skim(), from the skimr package
install.packages("skimr")
library(skimr)
# Descriptive statistics: skim() reports n_missing, complete_rate,
# mean, sd, quantiles and an inline histogram per column
skim(data)
skim(data_excel)
# Grouped version: skim per level of `category` (needs dplyr)
install.packages("dplyr")
library(dplyr)
library(skimr)
group_by(data, category) %>% skim()
3-describe, from the Hmisc package
install.packages("Hmisc")
library(Hmisc)
# Hmisc::describe(): n, missing, distinct values, frequencies per variable
Hmisc::describe(data)
# same call; the explicit Hmisc:: prefix above avoids masking by other
# packages that also define describe() (e.g. psych, below)
describe(data)
4-stat.desc(), from the pastecs package
install.packages("pastecs")
library(pastecs)
# stat.desc(): nbr.val, nbr.na, min, max, mean, sd, var, CI per column
stat.desc(data)
5-describe and describeBy, from the psych package
install.packages("psych")
library(psych)
# psych::describe(): n, mean, sd, median, skew, kurtosis, se per variable
psych::describe(data)
describe(data)
# describeBy(): the same statistics, split by a grouping variable
psych::describeBy(data, data$type)
# mat = TRUE returns the grouped output as a single matrix
# instead of a list of tables
psych::describeBy(data, data$type, mat = TRUE)
6-descr and dfSummary, from the summarytools package
install.packages("summarytools")
library(summarytools)
# descr(): common descriptive statistics per column
summarytools::descr(data)
#Only works with numerical data.
descr(data)
# as a data.frame rendered with knitr::kable()
library(knitr)   # provides kable(); this load was missing in the original notes
kable(as.data.frame(summarytools::descr(data)))
summarytools::descr(data)
#transpose: variables as rows, statistics as columns
summarytools::descr(data, transpose = TRUE)
#Complete overview from summarytools
dfSummary(data)
7-CreateTableOne, from the tableone package
install.packages("tableone")
library(tableone)
# "Table 1"-style cohort description of every variable
CreateTableOne(data = data)
summary(CreateTableOne(data = data))
# stratified by a grouping column, with group-comparison tests
CreateTableOne(strata = "category", data = data)
# nonnormal: report median [IQR] and use a non-parametric test
# for variables that should not be treated as normal, e.g. "score"
print(CreateTableOne(strata = "category", data = data),
nonnormal = "score")
8-desctable, from the desctable package
install.packages("desctable")
library(desctable)
desctable(data)
# grouped version via dplyr's group_by()
group_by(data, category) %>%
desctable()
# highly customisable: pass a named list of statistic functions
desctable(data,stats = list("N" = length, "Mean" = mean,
"SD" = sd, "Min" = min, "Max" = max))
9-ggpairs, from the GGally package
install.packages("GGally")
library(GGally)
# pairwise plot matrix: scatterplots, densities and correlations
ggpairs(data)
# same matrix, coloured by a grouping variable
ggpairs(data, mapping = aes(colour = category))
10-ds_summary_stats from descriptr
install.packages("descriptr")
library(descriptr)
# detailed summary statistics of a single numeric vector
ds_summary_stats(data$score)
# screen the whole data set: types, missing values, levels
ds_screener(data)
# statistics for several variables at once (rows with NA score removed first)
ds_multi_stats(filter(data, !is.na(score)), score, rating)
# frequency table of one categorical variable
ds_freq_table(data$category)
11-With dlookr: An automated report (as pdf or html)
# dlookr works on a data frame; here the raw table qry_neue_DL
data<-qry_neue_DL
# per-variable diagnosis: type, missing count/rate, unique count/rate
View(diagnose(data))
# variables that have missing values, sorted by missing count (descending)
a<-data %>%
diagnose() %>%
select(-unique_count, -unique_rate) %>%
filter(missing_count > 0) %>%
arrange(desc(missing_count))
View(a)
View(diagnose_numeric(data))
View(diagnose_category(data))
#No 1. Missing values: categorical variables whose levels include NA
diagnose_category(data) %>%
filter(is.na(levels))
# rare levels: those making up at most 1% of the rows (ratio <= 0.01);
# top = 500 keeps enough levels per variable to catch them
data %>%
diagnose_category(top = 500) %>%
filter(ratio <= 0.01)
#Diagnosing outliers with diagnose_outlier()
diagnose_outlier(data)
#Numeric variables that contain anomalies are easily found
#with filter():
diagnose_outlier(data) %>%
filter(outliers_cnt > 0)
#Numeric variables whose outlier ratio exceeds 5%, sorted by how
#strongly the outliers shift the mean (outlier mean vs overall mean):
diagnose_outlier(data) %>%
filter(outliers_ratio > 5) %>%
mutate(rate = outliers_mean / with_mean) %>%
arrange(desc(rate)) %>%
select(-outliers_cnt)
#Visualization of outliers using plot_outlier()
data %>%
plot_outlier(Alter)
#Use dplyr together with diagnose_outlier() to visualize all numeric
#variables whose outlier ratio is 0.5% or more
#(the original notes contained this pipeline twice; duplicate removed):
data %>%
plot_outlier(diagnose_outlier(data) %>%
filter(outliers_ratio >= 0.5) %>%
select(variables) %>%
unlist())
# full automated diagnosis report (html or pdf)
data %>%
diagnose_report(output_format = "html",
output_file = "Diagn.html")
12-With DataExplorer package:
Specifics identifying
1-Identify Duplicates values:
Find the duplicates values (only) in primary key
data<-dataset #dataset
#1. Duplicate values: find the duplicated rows, either in the
#primary key only (better) or across all columns of the dataset
#packages:
library(skimr)
library(Hmisc)
# NOTE(review): unique() removes rows duplicated across ALL columns,
# not only in the primary key -- confirm that this is the intent
data_1<-unique(data) #duplicates in primary key
before<-length(data$primarykey)   # row count before de-duplication
before
after<-length(data_1$primarykey)  # row count after de-duplication
after
different<-before-after           # number of duplicate rows removed
different
before_after_matrix<-cbind(before,after)
before_after_matrix
#before:
skim(data)
#after:
skim(data_1)
#or
# Install
install.packages("dplyr")
# Load
library("dplyr")
# distinct() keeps one row per unique value of the given column
test<-distinct(data,PersNr_pseudo)
test
length(test$PersNr_pseudo)
skim(test)
2-Identify NA values (Not Available):
#packages: dplyr, knitr, ggplot2, tidyr, purrr, skimr
# Count the NAs per column and plot them as a sorted horizontal bar chart.
# funs() is defunct in current dplyr; use a lambda (~) instead.
data %>%
summarise_all(~ sum(is.na(.))) %>%
gather %>%
ggplot(aes(x = reorder(key, value), y = value)) + geom_bar(stat = "identity") +
coord_flip() +
xlab("variable") +
ylab("Absolute number of missings")
#With this large number of missings,
#we probably will not find an easy solution.
#At least let's remember which columns have more than 10% missing:
# share of missing values per column (rounded to 2 decimals)
cols_with_NA <- round(colMeans(is.na(data)),2)
cols_with_NA
length(cols_with_NA)
# columns with more than 10% missing
# (the original referenced an undefined `cols_with_some_NA`;
# the variable actually defined above is `cols_with_NA`)
cols_with_too_many_NA <- cols_with_NA[cols_with_NA > .1]
cols_with_too_many_NA
length(cols_with_too_many_NA)
# alternatively: keep only columns with less than 10% missing
# keep only the columns whose share of missing values is below 10%
x<-data %>%
select_if(function(col) mean(is.na(col)) < .1)
# NOTE(review): length() on a data frame counts columns, not rows
length(x)
View(x)
3-Identify outliers:
#Simple Identification
#packages: dplyr, knitr, ggplot2, tidyr, purrr, skimr
#Choose numeric variables only
data %>%
select_if(is.numeric) %>% names
#Histograms are a natural and easy way
#to spot outliers and to learn something about the distribution:
# one facet per numeric column, free scales
data %>%
select_if(is.numeric) %>%
gather %>%
ggplot(aes(x = value)) + facet_wrap(~ key, scales = "free", nrow = 3) +
geom_histogram()
#Box plots (or violin plots/bean plots) may also be a way:
# x = 1 gives a single box per facet; one facet per numeric column
data %>%
select_if(is.numeric) %>%
gather %>%
ggplot(aes(x = 1, y = value)) + facet_wrap(~ key, scales = "free") +
geom_boxplot() +
ylab("Value") +
xlab("Variable")
# the same reshaped data shown as violin plots
data %>%
select_if(is.numeric) %>%
gather %>%
ggplot(aes(x = 1, y = value)) + facet_wrap(~ key, scales = "free") +
geom_violin() +
ylab("Value") +
xlab("Variable")
#Similar to outliers, for categorical variables:
# bar chart of every non-numeric column
library(purrr)
data %>%
select_if(negate(is.numeric)) %>%
select(-matches("essay")) %>%   # drop free-text columns
select(-last_online) %>%        # drop the timestamp column
gather %>%
ggplot(aes(x = value)) + geom_bar() +
facet_wrap(~ key, scales = "free", ncol = 3)
#without plotting: ratio of the most frequent to the least frequent
#level per character column (a large ratio = very unbalanced variable).
# funs() is defunct in current dplyr; use a lambda (~) instead.
# NOTE(review): arrange() without sort columns is a no-op -- supply a
# column to sort by, or drop the step.
y<-data %>%
select_if(is.character) %>%
summarise_all(~ max(table(.)/(min(table(.))))) %>%
arrange %>%
kable
#or with skimr's overview
library(skimr)
skim(data)
View(skim(data))
#package: dplyr
library("dplyr")
# numeric columns only, and their names
select_if(data, is.numeric)
names(select_if(data, is.numeric))
#outliers:
#errors or variance
#Most common causes of outliers on a data set:
#Data entry errors (human errors)
#Measurement errors (instrument errors)
#Experimental errors (data extraction or experiment planning/executing errors)
#Intentional (dummy outliers made to test detection methods)
#Data processing errors (data manipulation or data set unintended mutations)
#Sampling errors (extracting or mixing data from wrong or various sources)
#Natural (not an error, novelties in data)
#methods: visualization, mathematical, or ML:
#univariate (one variable) & Multi-variate (two or more variable)
#parametric (a distributional assumption is required) or non-parametric
#Some of the most mathematical popular methods for outlier detection are:
#Z-Score or Extreme Value Analysis (parametric)
#Probabilistic and Statistical Modeling (parametric)
#Linear Regression Models (PCA, LMS)
#Proximity Based Models (non-parametric)
#Information Theory Models
#High Dimensional Outlier Detection Methods (high dimensional sparse data)
#ML methods
################
#Visualization:
#Univariate
#Boxplot
###############
boxplot(data$parameter, main="Alter")
# boxplot.stats()$out returns the points beyond the whiskers
outlier_values <- boxplot.stats(data$parameter)$out
outlier_values
#################
#Visualization
#Multi-variate
#Boxplot
#################
# boxplot of parameter1 split by the levels of parameter2.
# With a data= argument the formula uses bare column names; the original
# mixed data$ columns from `data` with data=data_1, which is inconsistent.
boxplot(parameter1 ~ parameter2, data=data, main="p1 vs p2")
#################
#Visualization
#Multi-variate
#Scatterplot
#################
# Simple Scatterplot
plot(data$parameter1,data$parameter2, main="Scatterplot",
xlab="Alter ", ylab="Betriebszugehörigkeit (Jahre)", pch=19)
# Add fit lines
# NOTE(review): the fit lines below use Alter / Betriebszugehörigkeit
# while the plot above uses parameter1/parameter2 -- align the columns
abline(lm(data$Alter~data$`Betriebszugehörigkeit (Jahre)`), col="red") # regression line (y~x)
lines(lowess(data$Alter,data$`Betriebszugehörigkeit (Jahre)`), col="blue") # lowess line (x,y)
#more about
#scatterplot: https://www.statmethods.net/graphs/scatterplot.html
###############
#Mathematical
#Z-score
#parametric
###############
#Assumes approximately normally distributed data.
#package: outliers
install.packages("outliers")
library(outliers)
#most extreme value on the right tail:
outlier(data$Alter)
#most extreme value on the left tail:
outlier(data$Alter,opposite = TRUE)
###############
#Mathematical
#IQR: Q3-Q1, Tukey method
#non-parametric
###############
#Method: Tukey fences based on the IQR
#package: outliers
install.packages("outliers")
library(outliers)
# per-observation IQR scores (distance from the nearer quartile in IQR
# units); original used data$old -- made consistent with data$Alter
iqr_score_a<-scores(data$Alter,type = "iqr")
#step_1: quartiles -- Q1 (25th percentile) & Q3 (75th percentile)
q_a<-quantile(data$Alter, na.rm = TRUE)
iqr_a<-unname(q_a[4]-q_a[2])   # IQR = Q3 - Q1
#step_2: Multiply the IQR from Step 1 by 1.5
fence_a<-iqr_a*1.5
#step_3: upper limit = Q3 + 1.5*IQR (original added the per-observation
#scores to Q3, which is not the Tukey fence)
a_upper_limit<-q_a[4] + fence_a
#step_4: lower limit = Q1 - 1.5*IQR
a_lower_limit<-q_a[2] - fence_a
#Count the outliers: values OUTSIDE the fences. This must be | not &
#(a value can never be above the upper AND below the lower limit), and
#the original `outlier_test[outlier_test="TRUE"]` assigned the string
#"TRUE" inside `[` instead of comparing.
outlier_test<-data$Alter>a_upper_limit | data$Alter<a_lower_limit
outlier_test
sum(outlier_test, na.rm = TRUE)
#more : http://www.statisticshowto.com/find-outliers/
#Values outside the lower and upper limits are flagged as outliers
#############################
#Machine Learning Alghorithm
#Local Outlier Factor
############################
install.packages("DMwR")
library(DMwR)
data<-qry_neue_DL
data
# LOF: density-based local outlier factor with k = 5 neighbours
# NOTE(review): lofactor() expects a numeric matrix / data frame;
# passing a single vector column may fail or be very slow -- confirm
outlier.scores<-lofactor(data$Alter,k=5)
outlier.scores
plot(density(outlier.scores))
# error observed on this data set when running the above:
#Error: cannot allocate vector of size 16.9 Gb
#############################
#Machine Learning Alghorithm
#OutliersO3:
############################
# -"HDo" HDoutliers (from HDoutliers)
# -"PCS" FastPCS (FastPCS)
# -"BAC" mvBACON (robustX)
# -"adjOut" adjOutlyingness (robustbase)
# -"DDC" DetectDeviatingCells (cellWise)
# -"MCD" covMCD (robustbase)
#install.packages("OutliersO3")  # note: the package name ends in the letter O + 3, not zero-3
#library(OutliersO3)
#package 'OutliersO3' was not available (for R version 3.5.1)
#############################
#Machine Learning Alghorithm
#k-nearest neighbors
############################
##########################################
#Machine Learning Alghorithm
#RobustX:
##########################################
install.packages("robustX")
library(robustX)
# BACON algorithm: robust distances for multivariate outlier detection
# NOTE(review): mvBACON() expects a numeric matrix (multivariate data);
# a single column may need as.matrix() first -- confirm
distances<-mvBACON(data$Alter)
distances
##########################################
#Machine Learning Alghorithm
#HDoutliers: Leland Wilkinson's Algorithm
##########################################
install.packages("HDoutliers")
library(HDoutliers)
# toy example: a plain sequence 1..100000 (no real outliers expected)
cevi<-1:100000
out.W<-HDoutliers(cevi)
plotHDoutliers(cevi,out.W)
4-Plausibility check: numeric & non numeric
A plausibility check can include checking orders of magnitude and looking for implausible values (e.g. negative body weight), among others. A good starting point is to differentiate between numeric and non-numeric variables.
#numeric: column-wise summary of every numeric variable
data %>%
select_if(is.numeric) %>%
map(summary)
#non-numeric: convert character columns to factors first so that
#summary() shows level counts instead of just the length
View(data %>%
select(-matches("essay")) %>%
select_if(is.character) %>%
mutate_all(factor) %>%
map(summary))
#Alternatively
#more complete with dlookr package
5-Highly correlated & covariance of variables:
#packages: dplyr, knitr, ggplot2, tidyr, purrr, skimr
# correlation matrix of all numeric columns
data %>%
select_if(is.numeric) %>%
cor
# NOTE(review): cor() returns NA for columns that contain missing
# values; consider use = "pairwise.complete.obs" given the NAs above
6-Mode: Unimodal or Bimodal distribution:
7-Principal Components Analysis:
8-Factor Analysis:
9-Bootstrap Resampling:
FULL SUMMARY:
Last updated