The following course goes over the R programming language and statistical concepts in R in three levels: Basics, Intermediate, and Advanced.
Setup and Installation: see "A Installing R and RStudio" in Hands-On Programming with R.
## A Quick Tutorial on R
# Tahir Muhammad | University of Toronto
# Introduction to R! (First Level)
# Get your working directory with getwd(), and set your working directory with the setwd("---File Path---") command.
# Note: use forward slashes (/) in the path on any platform; on Windows you can also use double backslashes (\\).
# rm(list = ls())  # Clears your current working environment
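# A minimal sketch (the paths below are hypothetical; substitute your own):
# getwd()                             # prints the current working directory
# setwd("C:/Users/me/Documents")      # Windows example (forward slashes work)
# setwd("/home/me/Documents")         # Linux/macOS example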
# Arithmetic Operators
5+1
6-9
11*11
7^8
4/5
# Special numbers
pi
exp(1)
exp(2)
# Variables
a <- 3
b <- 5
a*b
# Bedmas still applies
2*a-b
# Storing a vector into a variable, y
y <- c(1,2,3,4,5)
# Basic vector operation
x <- c(1,4,6,-8,5)
# Element-wise addition of two vectors
x+y
# Element-wise multiplication of two vectors
x*y
# Dot (inner) product of x and y
x %*% y
# Summary functions on vectors
mean(x)
var(x)
sd(x)
median(x)
sum(x)
prod(x)
# Plotting
# Qualitative data: Make a frequency table
x = c( "yes", "no", "yes", "no", "yes", "yes", "no", "yes", "yes", "no")
table(x)
# Partition the plotting window into a grid of (rows, columns)
par(mfrow=c(2,2))
# Bar chart
barplot(table(x))
barplot(table(x), col=c("red","blue"))
# Pie chart
par(mfrow=c(2,1)) #2 rows, 1 column
pie(table(x))
pie(table(x), col=c("green","purple"), main="People who like starbucks at UTM")
# Probability distribution
#The command for the Normal distribution CDF is pnorm(q, mean, sd).
#Example 1) What is P(Z < 1.96)? This is the CDF evaluated at 1.96.
pnorm(1.96, 0, 1) #Takes in a value and gives you a probability
#Example 2) Let X~N(25,100), what is P(10 < X < 30)?
pnorm(30, mean = 25, sd = 10) - pnorm(10, mean = 25, sd = 10)
#Note: By default pnorm() uses mean = 0 and sd = 1, i.e. it treats the value as a standardized (z) value.
pnorm(1.96)
#Normal Distribution Quantile
#qnorm() takes a probability and gives you a value (quantile), i.e. "What is the critical value for this much probability under this distribution?"
qnorm(0.975, mean =0, sd =1)
#The default (mean = 0, sd = 1) gives you the value from a standard normal (z) table
qnorm(0.95)
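# Quick sanity check: pnorm() and qnorm() are inverses of each other.
qnorm(pnorm(1.96))  # returns 1.96
pnorm(qnorm(0.95))  # returns 0.95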
#The command for the Gamma distribution CDF is pgamma(q, shape, scale).
#X~gamma(5,5), what is P(1 < X < 6)?
pgamma(6, shape = 5, scale = 5) - pgamma(1, shape = 5, scale = 5)
#Simulating data and making graphs
# The rnorm(), rgamma(), and other r<distribution>() functions do that for us.
#To assess normality, look at the histogram, QQ plot, AND boxplot
z = rnorm(10000)
par(mfrow=c(1,3)) #Showing multiple plots
hist(z)
qqnorm(z) #QQ plot
boxplot(z)
# Gamma distribution
g <- rgamma(100000, shape=2, scale=0.5) #shape is the alpha parameter, scale is the beta parameter
par(mfrow=c(1,3)) #Showing multiple plots
hist(g)
qqnorm(g) #QQ plot
boxplot(g)
# T-distribution
t <- rt(100000, df=6)
par(mfrow=c(1,3))
hist(t)
qqnorm(t)
boxplot(t)
# Entering in data
mydata <- c(74,12,654,12341, 32, 99, 1000)
# Can perform all kinds of descriptive statistics
min(mydata)
max(mydata)
sum(mydata)
var(mydata)
sd(mydata)
median(mydata)
sort(mydata) # Sorts the vector
quantile(mydata) # Gives you the quartiles (0%, 25%, 50%, 75%, 100%) by default
# Five-number summary of the vector: min, lower hinge, median, upper hinge, max
fivenum(mydata)
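# summary() gives a similar overview of the vector, plus the mean.
summary(mydata)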
# Importing and Reading CSV files
reading = read.csv("/home/tahir/Downloads/reading.csv", header = TRUE)
reading #displays your data. Note: It's / for linux, and \\ for windows.
reading$score #Gives us the column score
reading$treatment #Gives us the column treatment
control = reading[reading$treatment == 'Control',] # All rows in the Control group
treated = reading[reading$treatment == 'Treated',] # All rows in the Treated group
mean(control$score)
mean(treated$score)
# Trees dataset
data(trees)
x = trees
summary(x)
#Let's assess the normality of the trees data set (i.e. of Volume, Height and Girth)
par(mfrow=c(3,3))
#Girth
hist(x$Girth)
qqnorm(x$Girth)
qqline(x$Girth,col=4)
boxplot(x$Girth)
# Height
hist(x$Height)
qqnorm(x$Height)
qqline(x$Height,col=2)
boxplot(x$Height)
# Volume
hist(x$Volume)
qqnorm(x$Volume)
qqline(x$Volume)
boxplot(x$Volume)
#Conclusion: Height seems normal, volume does not seem normal.
# Some useful R libraries
#ggplot2 = graphics in R
#tidyr = tidying and cleaning up data
#stringr = working with text/strings
#httr = working with websites and web APIs
#shiny = interactive web applications
# Installing a package with its dependencies
#install.packages("tidyverse", dependencies = TRUE)
# Loading a package
library(dplyr)
library(ggplot2)
library(tidyverse)
# Lets load some data
getwd()
setwd("/home/tahir/Desktop/TA/Teaching/Datasets")
coffee <- read.csv("coffee_ratings.csv")
# Seeing your data
View(coffee) # Views and opens it on another page
glimpse(coffee) # A transposed version of print: columns run down the page, and data runs across.
head(coffee) # Displays the top 6 rows of your dataframe (by default)
# The pipe operator in R
# The pipe, %>%, passes the output of the expression on its left as input to the function on its right.
head(coffee) %>% glimpse() # Notice how the number of rows has changed.
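# A slightly longer pipe sketch (assuming the coffee data has columns species and total_cup_points):
coffee %>%
  group_by(species) %>%
  summarise(mean_rating = mean(total_cup_points, na.rm = TRUE)) %>%
  arrange(desc(mean_rating))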
# If we look at our data, we have several different data types of columns
# Data types in R include:
# logical vectors <lgl>, contain TRUE or FALSE.
# double vectors <dbl>, contain real numbers.
# integer vectors <int>, contain integers.
# character vector <chr>, contain strings made with "".
# dates <date>, record a date (date-times are <dttm>).
# factors <fct>, which represent categorical variables that take one of a fixed, known set of possible values (called the levels).
# All of these combined can be made into a tibble / Dataframe.
glimpse(coffee)
# Lets visualize some of the variables.
# What kind of variable is species? A nominal categorical variable.
# What kind(s) of plot is appropriate to visualize the distribution of this variable? A barplot!
# Let's visualize!
ggplot(data = coffee, aes(x = species, fill = species)) +
  geom_bar() +
  labs(x = "Species of coffee bean") +
  coord_flip()
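# A numeric column can instead be visualized with a histogram; a sketch, again assuming total_cup_points exists in this dataset:
ggplot(data = coffee, aes(x = total_cup_points)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(x = "Total cup points", y = "Count")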
# Tahir Muhammad | University of Toronto
# Intermediate Topics in R: Confidence Intervals & Hypothesis Testing, One-Way & Two-way Anova, and Regression.
###################################################################
####### PART 1: Hypothesis Testing and Confidence Intervals #######
###################################################################
# These types of tests can be broken down into 6 different types of cases/situations.
# Single Population / One Sample: large & small sample sizes for one mean
# Multiple Populations / Two Samples: dependent samples & independent samples. 3 cases: (I) variance known,
# (II) variance unknown but assumed equal, and (III) variance unknown and not assumed equal.
# Note about Confidence Intervals: there is no built-in function that calculates just the confidence interval, hence we will
# need to construct it from scratch for each case. The good thing is, every CI has the form point_estimate +/- margin of error.
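# A rough helper (my own sketch, not part of the original notes) for a z-based CI from a point estimate and standard error:
z_ci <- function(point_estimate, se, conf = 0.95) {
  z <- qnorm(1 - (1 - conf) / 2)  # z_{alpha/2}
  c(lower = point_estimate - z * se, upper = point_estimate + z * se)
}
# e.g. z_ci(mean(salary_data), sd(salary_data)/sqrt(length(salary_data)))  # once salary_data is defined below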
# Single Population / One Sample
# Case I. Large Samples
# Let's get some data. Here are the salaries of 30 data scientists.
salary_data = c(117313, 104002, 113038, 101936, 84560, 113136, 80740, 100536, 105052, 87201, 91986, 94868, 90745,
102848, 85927, 112276, 108637, 96818, 92307, 114564, 109714, 108833, 115295, 89279, 81720,
89344, 114426, 90410, 95118, 113382)
# Calculate a 95% Confidence Interval for the given dataset.
# Since n = 30, we can use the CLT to approximate the sampling distribution of the sample mean by a normal distribution.
x_bar <- mean(salary_data)
s <- sd(salary_data)
n <- 30
# Getting the Z_alpha/2.
# Recall that by default R works with the lower tail, i.e. the area under the curve to the left of the value.
# Since we want a two-sided interval, we split alpha between the two tails.
qnorm(0.975) # Implies 95% probability in between.
#Thus, our 95% CI is:
Lower_tail <- x_bar - qnorm(0.975)*(s/sqrt(n))
Upper_tail <- x_bar + qnorm(0.975)*(s/sqrt(n))
# Hence we conclude that we are 95% confident that the true average salary of a data scientist is between 96-104k.
# Hypothesis Test:
# 1) H0: mu = 113,000 & H1: mu =/= 113,000
# 2) Calculate your test statistic.
t.test(salary_data, mu=113000, alternative="two.sided")
# 3) Make a decision, then make a real-world conclusion.
#Here we can see that since the p-value is below 0.05, we have statistical evidence that the average salary of data scientists is not $113,000.
#Some other cases in R
# Note that by default, R does a two-sided test, i.e. the following is the same as above.
t.test(salary_data, mu=113000)
# Lower tailed test
t.test(salary_data, mu=113000, alternative="less")
# Upper tailed test
t.test(salary_data, mu=113000, alternative="greater")
# Single Population / One Sample
# Case II. Small Sample Size.
# Suppose now we had data of n = 9.
salary_data2 <- c(78000, 90000,75000,117000, 105000, 96000, 89500, 102300, 80000)
# Find the appropriate statistic, taking into consideration the degrees of freedom (if applicable) for 99% confidence
xbar <- mean(salary_data2)
std <- sd(salary_data2)
n <- 9
df <- 8
# Find the 99% CI
# value of t with 0.5% in each tail, i.e. t_{n-1; alpha/2}
tvalue <- qt(0.005, 8, lower.tail = TRUE)
# This gives us the quantile x with P(T <= x) = 0.005 (a negative number).
# Since the t-distribution is symmetric, this is just the negative of the upper-tail quantile; note the flipped signs in the formulas below.
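# Quick check of that symmetry:
qt(0.995, 8)   # upper-tail quantile
-qt(0.005, 8)  # same value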
# Now, we can construct our 99% CI.
Lower_tail<- xbar + tvalue*(std/sqrt(n))
Upper_tail <- xbar - tvalue*(std/sqrt(n))
# Display our CI
Lower_tail
Upper_tail
# Hence we can be 99% confident that the mean salary for a data scientist is between $76,950 to $108,115.
# Notice how the interval got bigger as we increased the confidence level.
# Hypothesis Test
# Question: Test whether the email open rate of one of our firm's competitors is above our company's rate of 0.40.
# Open rate data for a competitor firm
open_rate_data <- c(0.26,0.23,0.42,0.49,0.23,0.59,0.29,0.29,0.57,0.40)
# Test at a significance level of 0.05. (Assume it is 0.05 when a significance level is not given)
# 1) H0: mu <= 0.40 & H1: mu > 0.40
# 2) Calculate your test statistic
# The Student's t distribution works here because the sample is small and the variance is unknown.
t.test(open_rate_data, mu = 0.40, alternative="greater")
# 3) Make a decision: since the p-value > alpha, we fail to reject (FTR) H0.
# Multiple Populations
# Two Samples; Dependent. Here we have paired data: the pairs are independent of each other, but the two measurements within a pair are (usually positively) correlated.
# This test is typically used when measuring the same subject over time -> before & after situations,
# or when looking at cause-and-effect situations.
# The test is the same as the one-mean test above, except it is applied to the differences:
#   difference = before - after
#   dbar = mean(difference)
# The CI is constructed like above, i.e.:
#   CI = dbar +/- (z_{alpha/2} or t_{n-1; alpha/2}) * (sd(difference)/sqrt(n))
# Hypothesis Testing
# Test to see if a drug company's claims are true about their new drug increasing magnesium levels in patients.
# Getting our data
before <- c(2,1.4,1.3,1.1,1.8,1.6,1.5,0.7,0.9,1.5)
after <- c(1.7,1.7,1.8,1.3,1.7,1.5,1.6,1.7,1.7,2.4)
# Check to see if our data is correlated
cor(before,after)
plot(before,after)
# H0: mu_d >= 0 & H1: mu_d < 0
# Two samples, paired t-test. (t.test is two-sided by default; add alternative="less" to match the one-sided H1 above.)
t.test(before,after,paired=T)
# Here we can conclude that at the 5% significance level, we have sufficient evidence that the company's new drug does indeed increase patients' magnesium levels.
# Let's construct a CI for this example as well.
difference <- before - after
# Calculate the standard error
dbar <- mean(difference)
s <- sd(difference)
n <- length(difference)
SE <- s/(sqrt(n))
# The 95% CI
lower_tail <- dbar - qt(0.975,9)*SE
upper_tail <- dbar + qt(0.975,9)*SE
# As we can see, this matches our CI which we got from the built-in t.test command! Nice =).
# Multiple Populations
# Two Samples; Independent. Case I) Population variances NOT assumed equal.
# Calculate the 99% confidence interval for the difference of two means (large samples, so the normal approximation is used below).
# Sample 1
n_1 <- 100
xbar <- 58
sigma_1 <- 10
# Sample 2
n_2 <- 70
ybar <- 65
sigma_2 <-5
# Calculate Standard error
var1 <- (sigma_1^2)/n_1
var2 <- (sigma_2^2)/n_2
standard_error <- sqrt((var1 + var2))
Lower_tail <- (ybar - xbar) + (qnorm(0.005))*(standard_error)
Upper_tail <- (ybar - xbar) - (qnorm(0.005))*(standard_error)
#Hence, we conclude that we are 99% confident that the true difference between the engineering and history students' mean scores is between 4 and 10 points.
# Hypothesis Testing
# Welch Two Sample t-test
# Lets use the same data as the paired example.
before <- c(2,1.4,1.3,1.1,1.8,1.6,1.5,0.7,0.9,1.5)
after <- c(1.7,1.7,1.8,1.3,1.7,1.5,1.6,1.7,1.7,2.4)
# Welch two-sample test: population variances unknown and not assumed equal
t.test(before,after, var.equal = F, alternative="greater")
# Note: by default, t.test() does not assume equal variances, so the following is the same as above.
t.test(before,after, alternative="greater")
# Multiple Populations
#Two Samples; Independent. Case II) Population variance is unknown but assumed equal.
# Question: Estimate the difference in price of apples between New York and LA.
# Lets get some data
NY_PriceOfApples <- c(3.80, 3.76, 3.87, 3.99, 4.02, 4.25, 4.13, 3.98, 3.99, 3.62) # Sample 1
LA_PriceOfApples <- c(3.02, 3.22, 3.24, 3.02, 3.06, 3.15, 3.81, 3.44) # Sample 2
# Calculate the 95% Confidence Interval
# Sample 1
xbar <- mean(NY_PriceOfApples)
n1 <- 10
s1 <- sd(NY_PriceOfApples)
# Sample 2
ybar <- mean(LA_PriceOfApples)
n2 <- 8
s2 <- sd(LA_PriceOfApples)
# Pooled variance
PooledVar_numerator <- ((n1-1)*(s1^2)) + ((n2-1)*(s2^2))
PooledVar_denomenator <- n1 + n2 - 2
PooledVariance <- PooledVar_numerator/PooledVar_denomenator
# Standard Error
SE <- sqrt(((PooledVariance/n1) + (PooledVariance/n2)))
# T-value. DOF = n1 + n2 -2 = 16
t_value <- qt(0.025, 16, lower.tail = FALSE)
# 95% CI
Lower_tail <- (xbar - ybar) - (t_value*SE)
Upper_tail <- (xbar - ybar) + (t_value*SE)
# We can see that the mean difference is between (0.47, 0.92). Since the interval does not include 0, we can conclude that we are
# 95% confident that New York apples are more expensive than the apples in LA.
# Hypothesis Test: Test to see if the true means difference is equal to 0.
# Variance is unknown but assumed equal.
t.test(NY_PriceOfApples,LA_PriceOfApples, var.equal=T)
# Changing confidence levels
t.test(NY_PriceOfApples,LA_PriceOfApples, conf.level = 0.99, var.equal=T)
# Changing alternative hypothesis
t.test(NY_PriceOfApples,LA_PriceOfApples, conf.level = 0.99, var.equal=T, alternative="less")
# Recall that the hypothesis testing techniques we have learnt so far always compare one group mean or two. Now we introduce
# one-way ANOVA, which can be used to test multiple group means at the same time.
# Example: Test if the group means are the same for 4 different groups.
values <- c(65, 54, 56, 60, 55, 58, 62, 65, 64, 67, 70, 74, 60, 64, 68, 70)
groups <- c(rep("a1",4), rep("a2", 4), rep("a3", 4), rep("a4", 4))
dataset <- data.frame(x = groups, y = values)
# Display our constructed relational data table
dataset
# Find group means
with(dataset, tapply(values, groups, mean))
# ybar1 = 58.75, ybar2 = 60, ybar3 = 68.75, ybar4 = 65.5
# Quick comparison of the means
boxplot(values ~ groups)
# Anova: Method 1
anova(lm(values~groups,data = dataset))
# Anova: Method 2 (Both give same output)
anova(aov(values~groups, data=dataset))
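# The same table can also be viewed with summary() on the aov fit (same F statistic and p-value):
summary(aov(values ~ groups, data = dataset))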
# Tahir Muhammad | University of Toronto
# Advanced concepts in R: Simulation, Inverse Transformation Method, Acceptance-Rejection Method, Transformations & Monte Carlo Estimation
# Simulation of random numbers: how do we simulate the uniform distribution?
# How can we simulate any distribution? Two methods:
# I) Inverse Transformation Method & II) Acceptance-Rejection Method
# Transformation Methods: Convolution, Mixtures, Box-Muller Transformation
# Solving integration problems: Monte Carlo Estimation
####################################
#### Simulating random numbers #####
####################################
# How does a computer simulate random numbers?
# Linear Congruential Generators -> use the recurrence r_{i+1} = (a*r_i + b) mod d
seed = 1 # Setting seed to be able to replicate: must be positive
a = 5 # the multiplier
b = 3 # shift
d = 16 #modulus
n = 20 # length of run
r = numeric(n)
r[1] = seed
for (i in 1:(n-1)){
r[i+1] = (a*r[i]+b) %% d
}
# r is now an array filled with random numbers
r
# It is usually good practice to generate numbers over (0,1).
# Why do we do this? --> Because once you have uniform (0,1), you are able to generate any distribution.
# Make the random generated numbers over (0,1)
# Initialize
a = 1093; b = 18257; d = 86436; seed = 2
m = 10000; r = numeric(m); r[1] = seed
# Generate random numbers using Linear Congruential Generator
for (i in 1:(m-1)){
r[i+1] = (a*r[i]+b) %% d
}
# Make all numbers between 0,1
u <- (r+0.5)/d
# Plot to see that these are uniformly distributed! -> Our numbers were (pseudo-)random on the interval (0,d), or (0,1) for u.
hist(u)
hist(r)
# The above is a powerful result. This is one way uniform random numbers can be generated in R under the hood.
# To use R's built in commands, we can just do:
x <- runif(n=10000,min=0,max=1)
hist(x)
########################################
#### Inverse Transformation Method #####
########################################
# Given a density function (and hence its CDF F), we can generate samples from its distribution.
# How? Every CDF takes values between 0 and 1, and for a continuous random variable X, F(X) ~ Unif(0,1).
# However, the Unif(0,1) most likely does not look like the density you are trying to generate. So
# we simulate U ~ Unif(0,1) (which we can generate as shown above), and then apply F inverse to get a draw x from the target density!
# Steps:
# i) Generate a random variable u from U(0,1)
# ii) Deliver x = F_X^{-1}(u)
# Example: Simulate from the density f(x) = 3x^2, 0 < x < 1. Here F(x) = x^3, so F^{-1}(u) = u^(1/3).
n = 1000
u <- runif(n,0,1)
x <- u^(1/3)
hist(x, prob=TRUE, main = bquote(f(x) == 3*x^2), col="green")
# Grid of x values for overlaying the true density
y <- seq(0, 1, .01)
# Density Curve
lines(y, 3*y^2, col="black")
# Inverse transform method, discrete case: compare u with the cumulative probabilities (done recursively or via a lookup).
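# A small sketch of the discrete case (the probabilities below are made up for illustration):
# P(X = 0) = 0.1, P(X = 1) = 0.2, P(X = 2) = 0.3, P(X = 3) = 0.4
p <- c(0.1, 0.2, 0.3, 0.4)
cp <- cumsum(p)                 # cumulative probabilities: 0.1, 0.3, 0.6, 1.0
u <- runif(1000)
x_disc <- findInterval(u, cp)   # returns 0, 1, 2, or 3 depending on which interval u falls into
table(x_disc) / 1000            # empirical proportions should be close to p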
#########################################
######## Mixtures & Convolutions ########
#########################################
# Convolution is a transformation method which allows you to find the distribution of the sum of independent random variables.
# Example: adding the height of a student from class 1 to the height of a student from class 2 (the summands must be independent).
# Some common distributions are convolutions! For example, a Binomial is a sum of independent Bernoullis, a sum of independent Exponentials is a Gamma, etc.
# A mixture is a density derived as a weighted combination of one or more densities. Ex: combining the male height distribution with the female height distribution.
# The weights must add up to 1. Ex: 0.5*F_X1(x) + 0.3*F_X2(x) + 0.2*F_X3(x), since 0.5 + 0.3 + 0.2 = 1.
# Convolution Example: What is the density of S, where S = X1 + X2 + X3, and X1 ~ N(0,1), X2 ~ N(11,20), & X3 ~ N(-1,1)?
n <- 10000
x1 <- rnorm(n,0,1)
x2 <- rnorm(n,11,20)
x3 <- rnorm(n,-1,1)
S <- x1 + x2 + x3
hist(S) # We can see the resulting density is also normal.
# Mixtures Example:
# To simulate mixtures, we apply the composition technique (algorithm). It is briefly summarized below.
# Suppose you want to simulate from: F(x) = (1/15)F_X1(x) + (2/15)F_X2(x) + (3/15)F_X3(x) + (4/15)F_X4(x) + (5/15)F_X5(x)
# Sample integers 1-5, with each integer having the corresponding weight in the equation above as its probability.
# Generate X1 - X5 from their corresponding densities, and for each observation keep the draw from the component picked by the sampled integer.
# Generating corresponding density of X1, X2, X3, X4, X5
n <- 10000
X1 <- rgamma(n,1,1)
X2 <- rnorm(n,0,1)
X3 <- rbeta(n,1,1)
X4 <- runif(n,0,1)
X5 <- rexp(n,1)
# Sample K accordingly. Note that the probabilities of k = 1 to 5 sum to 1.
K <- sample(1:5, size=n, replace=TRUE, prob=(1:5)/15)
# We now have a vector K of 10,000 integers, each sampled according to its probability.
# Let's compute the mixture now using the composition technique:
# for each observation i, keep the draw from the component selected by K[i].
x <- numeric(n)
x[K == 1] <- X1[K == 1]
x[K == 2] <- X2[K == 2]
x[K == 3] <- X3[K == 3]
x[K == 4] <- X4[K == 4]
x[K == 5] <- X5[K == 5]
# We now have a sample x from the mixture of gamma, normal, beta, uniform, and exponential densities!
hist(x)
#########################################
######## Monte Carlo Estimation #########
#########################################
# Monte Carlo estimation is a way to estimate/solve complex integration problems we are unable to compute/solve otherwise.
# The generic algorithm always goes something like this: i) Generate x1, x2, ..., xn from some distribution. ii) Deliver theta_hat = mean of g(x_i), times any constants, etc.
# We can do this in 5 ways:
# 1) Estimate integrals whose limits are [0,1] using the uniform distribution
# 2) Transform bounded intervals to be on [0,1] and then use (1)
# 3) Transform the integral to use any PDF, and estimate using that.
# 4) Hit or Miss / Indicator functions
# 5) Importance Sampling Method: rewrite the integrand g(x) as (g(x)/h(x))*h(x), where h(x) has the following properties (see the sketch after this list):
# a) We know its density / it's easy to simulate from
# b) We choose h(x) to be as close as possible to g(x) in order to be computationally efficient (low variance)
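# A minimal importance-sampling sketch (my own example, not from the original notes):
# estimate theta = int_0^1 exp(-x)/(1+x^2) dx using the importance density h(x) = exp(-x)/(1 - exp(-1)) on (0,1).
n <- 100000
u <- runif(n)
x <- -log(1 - u * (1 - exp(-1)))   # draws from h(x) via the inverse transform method
g <- exp(-x) / (1 + x^2)           # integrand g(x)
h <- exp(-x) / (1 - exp(-1))       # importance density h(x)
theta_hat_is <- mean(g / h)        # importance-sampling estimate
theta_hat_is
integrate(function(t) exp(-t) / (1 + t^2), 0, 1)  # numerical answer for comparison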
# Example 1: Approximate the value of: int(0,pi/3) sin(t) dt, and compare it with the exact value
n <- 100000
u <- runif(n,0,(pi/3))
# Compute exact value
f_x <- function(x){sin(x)}
theta <- integrate(f_x,0,pi/3)
# Compute estimated value
theta_hat <- (pi/3)*mean(sin(u))
# Print out exact and estimated values
theta
theta_hat
# Note that to get the variance of the estimate, we need a lot of theta_hats, which in turn let us estimate that variance.
# To understand the statement above intuitively, recall the idea of a sampling distribution.
y <- replicate(1000, expr = {
u <- runif(n,0,(pi/3))
theta_hat <- (pi/3)*mean(sin(u))
})
mean(y)
sd(y)
# Example 2: Write a Monte Carlo function to estimate P(X <= 1), where X ~ Beta(3,3). (Since the support is (0,1), the true value is 1, which lets us check the estimator.)
# True value
theta <- pbeta(1, 3, 3)
# Estimate
theta_hat <- function(m){
n <- m
u <- runif(n,0,1)
estimate <- 30*mean((u^2)*((1-u)^2))
return(estimate)
}
theta_hat(1000)