# This script simulates why we have to devide by n-1 when calculating the variance of a sample
library(ggplot2)
# Create a 100 random datapoints with a mean of 15 and a standard deviation of 5
population = rnorm(100,15,5)
# Make a dataframe and add an ID column
population = as.data.frame(cbind(ID=1:100,Height=population))
# Let's check out how the values are distributed
ggplot(population,aes(x=Height))+
geom_histogram()
#Create an empty object that will hold the results of the simulation
simulation=NULL
# a so-called loop: Do something n-times (here 1000 times)
for(i in 1:1000){
# take 10 samples from the population
sample = sample(population$Height, 10)
# calculate the variance with n=10
variance_sample = sum((sample-mean(sample))^2)/10 # n
# calculate the variance with n-1
variance_sample_n1 = sum((sample-mean(sample))^2)/9 # n-1
# calculate the variance the variance of the population with N
Variance_Population =sum((population$Height-mean(population$Height))^2)/100
# Bind all the results together and save it in the simulation object
simulation= as.data.frame(rbind(simulation,
rbind(cbind(Run=i,Methode="n",Variance=variance_sample),
cbind(Run=i,Methode="n-1", Variance=variance_sample_n1),
cbind(Run=i,Methode="N", Variance=Variance_Population))))
}
# The Variance column was converted to a factor in the process, so we have to convert it to numeric again
simulation$Variance = as.numeric(as.character(simulation$Variance))
# Plot of the result
ggplot(simulation, aes(x=Methode,y=Variance))+
geom_boxplot()
# You can re-run this code multiple times, n-1 will produce more acurate estimates.