Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ---
- title: "Assignment 1"
- author: "Nikola Danevski"
- date: "9/14/2019"
- output: pdf_document
- ---
- ```{r setup, include=FALSE}
- knitr::opts_chunk$set(echo = TRUE)
- ```
- The histograms can be found below.
- b) The first one has one center, sort of symmetric shape and the data is mostly assembled around the center.
- c) We use ceil(log(2, n)) + 1 bins by the formula, which would result in 13, because n = 2304.
- However, the number of bins used by the hist() function is 10.
- j) We can see that the data is skewed to the right from the histogram. This is also noticeable later when
- we calculate the skewness and it's positive.
- m) We can see that the new histogram has almost perfect distribution, one-centered, symmetrical and data spread around the center. An even better sign for this is after drawing the mean, median and trimmed-mean lines which all seem to overlap.
- This implies the distribution is perfect.
- o) The new histogram with sqrt is almost bell-shaped distribution, it is one-centered and symmetrical with a lot of the data spread around the center.
- ```{r}
- waves <- read.csv("waves.csv")
- summary(waves)
- waves_height_max <- waves$Hmax
- hmax_hist <- hist(waves_height_max, main = "Histogram of Hmax", xlab = "Maximum height", ylab = "Number of waves")
- mean_waves_height <- mean(waves_height_max) #saving the mean and median in an object
- median_waves_height <- median(waves_height_max)
- mean_waves_height #printing the values of the mean and the median
- median_waves_height
- mean_trimmed_20 <- mean(waves_height_max, trim=0.2)
- abline(v = mean_waves_height, col="red")
- abline(v = median_waves_height, col="blue")
- abline(v = mean_trimmed_20, col="green")
- text(mean_waves_height + 0.28, 600,substitute(paste(bar(x),"=",m), list(m=round(mean_waves_height,2))), col="red")
- text(median_waves_height - 0.28, 600,substitute(paste(tilde(x),"=",m), list(m=round(median_waves_height,2))), col="blue")
- text(mean_trimmed_20 - 0.34, 500,substitute(paste(bar(x)[20],"=",m), list(m=round(mean_trimmed_20,2))), col="green")
- ```
- ```{r}
- hmax_hist
- ```
- ```{r}
- q1 <- quantile(waves_height_max, 0.25)
- q3 <- quantile(waves_height_max, 0.75)
- q1
- q3
- IQR_height_max <- IQR(waves_height_max)
- IQR_height_max
- lower_fence <- q1 - 1.5 * IQR_height_max #the lower fence
- upper_fence <- q3 + 1.5 * IQR_height_max #the upper fence
- lower_fence
- upper_fence
- #Now calculating the outliers
- waves[waves_height_max >= upper_fence, ] #upper outliers
- waves[waves_height_max <= lower_fence, ] #lower outliers
- #Now calculating variance, standard deviation and coeff. of variation of the maximum wave height.
- variance <- var(waves_height_max)
- stand_dev <- sd(waves_height_max)
- cv <- stand_dev / mean_waves_height
- variance
- stand_dev
- cv
- #Calculating skewness
- library(moments)
- skewness(waves_height_max)
- #Box-Cox transformation (TODO: WHAT DOES PART k) want from me?)
- #the recommended one is the middle line (page 4)
- #
- #
- library(MASS)
- bc <- boxcox(waves_height_max~1) #original plot, we wanna see the range on the right
- lambda <- bc$x[bc$y == max(bc$y)] #this gives the best value for lambda
- new_data_box_cox <- (waves_height_max^lambda-1)/lambda #according to the formula
- hist(new_data_box_cox, main = "After applying Box-Cox", xlab = "New data with the transformation")
- mean_new_data <- mean(new_data_box_cox)
- median_new_data <- median(new_data_box_cox)
- mean_new_data_trimmed_20 <- mean(new_data_box_cox, trim=0.2)
- abline(v = mean_new_data, col="red")
- abline(v = median_new_data, col="blue")
- abline(v = mean_new_data_trimmed_20, col="green")
- text(mean_new_data + 0.28, 400,substitute(paste(bar(x),"=",m), list(m=round(mean_new_data,2))), col="red")
- text(median_new_data - 0.28, 350,substitute(paste(tilde(x),"=",m), list(m=round(median_new_data,2))), col="blue")
- text(mean_new_data_trimmed_20 - 0.34, 400,substitute(paste(bar(x)[20],"=",m), list(m=round(mean_new_data_trimmed_20,2))), col="green")
- new_data_sqrt <- (sqrt(waves_height_max))
- hist(new_data_sqrt, main = "After applying sqrt transformation", xlab = "New data with sqrt trans")
- mean_sqrt <- mean(new_data_sqrt)
- median_sqrt <- median(new_data_sqrt)
- mean_sqrt_trimmed_20 <- mean(new_data_sqrt, trim = 0.2)
- abline(v = mean_sqrt, col="red")
- abline(v = median_sqrt, col="blue")
- abline(v = mean_sqrt_trimmed_20, col="green")
- text(mean_sqrt + 0.18, 300,substitute(paste(bar(x),"=",m), list(m=round(mean_sqrt,2))), col="red")
- text(median_sqrt - 0.18, 350,substitute(paste(tilde(x),"=",m), list(m=round(median_sqrt,2))), col="blue")
- text(mean_sqrt_trimmed_20 - 0.14, 300,substitute(paste(bar(x)[20],"=",m), list(m=round(mean_sqrt_trimmed_20,2))), col="green")
- summary(new_data_sqrt)
- #Now we draw the QQ plots
- qqnorm(waves_height_max, main = "Normal data", xlab = "", ylab = "")
- qqline(waves_height_max)
- qqnorm(new_data_box_cox, main = "Box cox transformation data", xlab = "", ylab = "")
- qqline(new_data_box_cox)
- qqnorm(new_data_sqrt, main = "Square root data", xlab = "", ylab = "")
- qqline(new_data_sqrt)
- #TODO -> WHAT DOES question p want?
- mat <- matrix(NA, nrow=9, ncol=5)
- rownames(mat) <- c("Original","","","Box-Cox","","","Square Root","","")
- colnames(mat) <- c("k","xbar-k*s", "xbar+k*s", "Theoretical %","Actual %") #TODO fix the bar thing
- ### Fill in known quantities
- mat[,1] <- c(1,2,3)
- mat[,4]<- c(68,95,99.7)
- mat[1,2] <- mean(waves_height_max)-1*sd(waves_height_max)
- mat[2,2] <- mean(waves_height_max)-2*sd(waves_height_max)
- mat[3,2] <- mean(waves_height_max)-3*sd(waves_height_max)
- mat[1,3] <- mean(waves_height_max)+1*sd(waves_height_max)
- mat[2,3] <- mean(waves_height_max)+2*sd(waves_height_max)
- mat[3,3] <- mean(waves_height_max)+3*sd(waves_height_max)
- mat[4,2] <- mean(new_data_box_cox)-1*sd(new_data_box_cox)
- mat[5,2] <- mean(new_data_box_cox)-2*sd(new_data_box_cox)
- mat[6,2] <- mean(new_data_box_cox)-3*sd(new_data_box_cox)
- mat[4,3] <- mean(new_data_box_cox)+1*sd(new_data_box_cox)
- mat[5,3] <- mean(new_data_box_cox)+2*sd(new_data_box_cox)
- mat[6,3] <- mean(new_data_box_cox)+3*sd(new_data_box_cox)
- mat[7,2] <- mean(new_data_box_cox)-1*sd(new_data_box_cox)
- mat[8,2] <- mean(new_data_box_cox)-2*sd(new_data_box_cox)
- mat[9,2] <- mean(new_data_box_cox)-3*sd(new_data_box_cox)
- mat[7,3] <- mean(new_data_box_cox)+1*sd(new_data_box_cox)
- mat[8,3] <- mean(new_data_box_cox)+2*sd(new_data_box_cox)
- mat[9,3] <- mean(new_data_box_cox)+3*sd(new_data_box_cox)
- mat[1,5] <- sum(waves_height_max >mean(waves_height_max)-1*sd(waves_height_max) & waves_height_max < mean(waves_height_max)+1*sd(waves_height_max))/length(waves_height_max)*100
- mat[2,5] <- sum(waves_height_max >mean(waves_height_max)-2*sd(waves_height_max) & waves_height_max < mean(waves_height_max)+2*sd(waves_height_max))/length(waves_height_max)*100
- mat[3,5] <- sum(waves_height_max >mean(waves_height_max)-3*sd(waves_height_max) & waves_height_max < mean(waves_height_max)+3*sd(waves_height_max))/length(waves_height_max)*100
- mat[4,5] <- sum(new_data_box_cox >mean(new_data_box_cox)-1*sd(new_data_box_cox) & new_data_box_cox < mean(new_data_box_cox)+1*sd(new_data_box_cox))/length(new_data_box_cox)*100
- mat[5,5] <- sum(new_data_box_cox >mean(new_data_box_cox)-2*sd(new_data_box_cox) & new_data_box_cox < mean(new_data_box_cox)+2*sd(new_data_box_cox))/length(new_data_box_cox)*100
- mat[6,5] <- sum(new_data_box_cox >mean(new_data_box_cox)-7*sd(new_data_box_cox) & new_data_box_cox < mean(new_data_box_cox)+7*sd(new_data_box_cox))/length(new_data_box_cox)*100
- mat[7,5] <- sum(new_data_sqrt > mean(new_data_sqrt)-1*sd(new_data_sqrt) & new_data_sqrt < mean(new_data_sqrt)+1*sd(new_data_sqrt))/length(new_data_sqrt)*100
- mat[8,5] <- sum(new_data_sqrt > mean(new_data_sqrt)-2*sd(new_data_sqrt) & new_data_sqrt < mean(new_data_sqrt)+2*sd(new_data_sqrt))/length(new_data_sqrt)*100
- mat[9,5] <- sum(new_data_sqrt > mean(new_data_sqrt)-3*sd(new_data_sqrt) & new_data_sqrt < mean(new_data_sqrt)+3*sd(new_data_sqrt))/length(new_data_sqrt)*100
- library(knitr)
- kable(x=mat, digits=2,row.names=T, format="markdown")
- ```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement