Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#=======================================
# BSAD 443, Session 1
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# OLS regression model
#====
# NOTE(review): `realestate` is read and cleaned further down in this file
# (read.csv + factor conversions); run that section first or this call fails
# with "object 'realestate' not found".
model <- lm(price ~ .,realestate) # the period means all variables: regress price on every other column
summary(model)
# But first a bit of R
#====
# Orientation
#=
getwd() # shows us where the working directory is
setwd("c:/temp") # sets the working directory to the "temp" folder under drive "c" -- note direction of slash!
dir.create('BSAD443') # creates a directory called BSAD443 under temp. Note that R, for the most part, accepts both single and double quotation marks
setwd("BSAD443")
getwd()
# [1] "c:/temp/BSAD443"
# Install a package
#=
# use RStudio's gui, or
install.packages("dplyr")
library(dplyr) # makes all functions in dplyr available, or
# dplyr::   <- type this in the RStudio console to bring up a list of the package's
#            functions to choose from. (Left as a comment here: a bare `dplyr::` is
#            an incomplete expression and makes the script fail to parse when sourced.)
# Basic operations
#=
5+6
# [1] 11
8*7
# [1] 56   (original note said 35 -- 8*7 is 56)
10*5 / 0.5
# [1] 100
# Vectors
a <- 5+6
b <- 8*7
c <- a+b # NOTE(review): this shadows base::c for variable lookup, but calls like c(...) below still find the base function
c
x <- "BSAD443"
y <- "is where you will learn that"
z <- "equals"
something <- c(x,y, a, "plus", b, z, c) # c is a function, stands for combine, or concatenate; the numbers are coerced to character here
length(something) # length is also a function
# [1] 7 - there are 7 elements in the "something" vector
# Classes and structures
#=
is.vector(something)
is.vector(something)==FALSE
class(something) # class is also a function
class(c) # the numeric variable c defined above, not the base function
class(a>b) # a comparison returns a logical
a>b
?class # help on a function
help(package="ggplot2") # help on a package (along with its functions)
# Environment
#=
ls() # shows a list of objects that were saved into the R environment (also see them in the top-right pane)
save(a,b,x,y, file="firststeps.Rdata")
rm(list=ls()) # wipes the entire workspace -- fine for this classroom demo, avoid in real scripts
ls()
# character(0)
load("firststeps.Rdata")
ls()
# [1] "a" "b" "x" "y"
# Classes/data type
#=
a <-5
b<-"hello"
c<-FALSE
class(a)
#[1] "numeric"
class(b)
#[1] "character"
class(c)
#[1] "logical"
is.logical(a)
is.numeric(a)==FALSE
class(a)!=class(c)
#. Functions (note that R is a functional prog. language)
a1<-1:10
a2<-seq(1:10) # equivalent to 1:10 here (seq over the indices of 1:10)
identical(a1,a2)
?identical
?seq()
a3 <- seq(from=1, to=10, by=0.01)
round(a3,1)
a3 # oops. don't forget to assign! (round() returned a copy; a3 is unchanged)
a3 <- round(a3,1)
head(a3)
tail(a3)
# Load data and summary stats
#=
# Reads the real-estate CSV from the current working directory (see setwd() above).
realestate <- read.csv("realestate.csv", header=T)
# but let's import through RStudio's interface
head(realestate)
tail(realestate)
dim(realestate)
str(realestate)
summary(realestate)
# We need to fix the data types of all the factor (category) variables.
# Convert them all in one pass instead of seven copy-pasted as.factor() lines --
# same result, and new factor columns only need to be added to this vector:
factor_cols <- c("driveway", "recroom", "fullbase", "gashw",
                 "airco", "garagepl", "prefarea")
realestate[factor_cols] <- lapply(realestate[factor_cols], as.factor)
summary(realestate) # better!
#====================================
# Business Analytics, Session 2
#====================================
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
# Quinlan School of Business - Loyola University Chicago
# Do not distribute or use outside this class without explicit permission from the instructor.
# Load packages
#-
library('readxl') # for reading excel files
library('lubridate') # for handling dates
library('zoo') # for handling dates (as.yearqtr below)
####################################################################
# Go to the City of Chicago Data Portal (https://data.cityofchicago.org/)
# Feel free to check out NYC and LA as well.
# Select a data set that is collected over time. Download it as CSV or Excel file.
# Note the location of the downloaded file on your computer.
####################################################################
# Loading data
#=
# Know where your file is located and file name (e.g., desktop, user/yourname/data/file.csv)
# To read a CSV file
realestate <- read.csv("realestate.csv", header=T, stringsAsFactors=F)
# Use readr::read_csv, if you have a large CSV file (over 50MB)
realestate1 <- readr::read_csv("realestate.csv")
# Or, easier, but not great for replication
realestate1 <- read_csv(file.choose()) # NOTE(review): bare read_csv needs library(readr) attached; above it was namespaced
# Or, even easier, but worse in terms of replication
# Use RStudio's interface
# Now for excel
endowments <- read_excel(file.choose())
dummy <- read_excel(file.choose(), sheet=2)
# Inspect the data
#=
realestate
realestate1
names(realestate)
head(realestate)
tail(realestate, n=10)
dim(realestate) # rows and cols
View(realestate)
View(endowments)
####################################################################
# Load the previously downloaded data into R. Inspect the data.
####################################################################
# Data types
#=
str(realestate)
summary(realestate)
glimpse(realestate) # glimpse() comes from dplyr
# Changing data type
#-
attach(realestate) # puts the data frame on the search path so columns can be typed directly; remember detach() (avoid in production code)
summary(garagepl)
hist(garagepl) # not a "good" numeric variable. Let's turn it into a factor
garagepl_f <- as.factor(garagepl) # saving as a new column to be on the safe side
# other common options: as.numeric, as.character
# Let's see what we've got:
summary(garagepl_f)
plot(garagepl_f) # We will learn much more on plots. This is a not-so-great sneak preview
# Same as
barplot(table(garagepl_f))
# And add some colors
barplot(table(garagepl_f), col=c("orange", "steelblue", "red")) # Not great, but these plots (and variables) will get better
####################################################################
# Convert variables to their "correct" type in your data
####################################################################
# Manipulating dates
#=
# Dates are another important data type. But it is a bit of a process.
glimpse(bp_samp) # random sample of 10% of the entire building permits data
# NOTE(review): bp_samp is not created anywhere in this file -- presumably loaded
# separately from the building-permits download; confirm before running.
detach(realestate)
attach(bp_samp)
summary(ISSUE_DATE) #we'll focus on ISSUE_DATE
head(ISSUE_DATE) # not very useful
as.Date(head(ISSUE_DATE)) #oops. one more time (a format string is needed)
as.Date(head(ISSUE_DATE), format="%m/%d/%Y")
bp_samp$date <- as.Date(ISSUE_DATE, format="%m/%d/%Y") # but who wants to deal with these abbreviations?
# plus, what if we have date AND time? we use lubridate.
mdy(head(ISSUE_DATE)) # voila!
# now, let's assume we had a variable with time
sometimes <- c("1/12/2017 7:00:01 PM", "18/12/2017 7:00:20 AM", "23/12/2017 7:02:15 PM")
dmy_hms(sometimes, tz="UTC") # UTC is actually read automatically from my computer.
# We will learn more about time data when we get to time series analysis. But here's a quick trick:
bp_samp$yearq <- as.yearqtr(bp_samp$date) # as.yearqtr() is from the zoo package
head(bp_samp$date)
head(bp_samp$yearq)
####################################################################
# Convert at least one time variable in your data.
####################################################################
# Manipulating strings (characters)
#=
# We use a common procedure in computing languages, called regular expression (yes, even Excel uses it!)
# It offers a simple yet powerful way to handle messy data. It is used to find, extract, replace, and add characters.
cost <- ESTIMATED_COST[1:50]
cost
grep("$", cost) # row output -- note: an unescaped "$" is the end-of-string anchor, so this matches every element
grep("$", cost, value=T) # value output
grepl("$", cost) # logical output
grep("$50", cost)
grep("\\$50", cost) # We need to escape special symbols, such as $,.() etc.
grep("\\$50|\\$20", cost) # the | stands for "or"
grep("\\b500\\b", cost, value=T) # \\b \\b are string boundaries. we use these for exact match
# Let's replace strings
gsub("50", "IDS", cost)
gsub("\\$", "", cost) # remember to reassign the processed output and convert to the correct data type if needed
bp$ESTIMATED_COST <- as.numeric(gsub("\\$", "", bp$ESTIMATED_COST)) # NOTE(review): uses `bp`, not `bp_samp` -- confirm which data frame is intended
#################
# Find all variables containing odd characters in the data you downloaded.
# Replace these characters with blank or other characters, as needed.
# If you do not have any such characters in your data, find all "/" in the PERMIT_TYPE column and replace them with ":"
# Then, in the same column, remove all the spaces.
# Finally, convert the data into the correct data type (e.g. from character to numeric)
#################
# Tidying data with tidyverse (and some other tools)
#=
library('tidyverse') # this is an entire "universe" of packages, all are useful
# more on this, next time.
#=======================================
# BSAD 443, Session 2
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
# Do not distribute or use outside this class without explicit permission from the instructor.
#=======================================
# Assign a few objects
#Numeric vector
nums <- c(1,3.5,5)
result <-2^sqrt(91)
int <- c(1L,3L,5L) # Notice capitalized L to denote an integer
# Character vector
grades <- c("high", "low", "high") # Character vector
names <- c("Abby", "Ben", "Charlie") # NOTE(review): shadows base::names(); works here, but a different name is safer
mixed<- c(pi, 1, "sometext", TRUE) # Mixed vector, turns into character (coercion to the most general type)
# Logical vector
right <- c(TRUE, T, FALSE) # T and F are equivalent to TRUE and FALSE (but T/F can be reassigned -- prefer the full forms)
wrong <- 2>10
# Let's examine these vectors
typeof(nums)
class(int)
mode(result) # These three functions may be regarded as equivalent
str(grades) # Checks vector structure
str(wrong)
is.na(c(grades, mixed, NA)) # Returns a logical vector testing for NA
length(grades) # Returns the number of observations in a vector
length(c(grades, mixed)) # Pretty straightforward
nchar(names) # Counts the characters in each observation in a character vector.
# Access vector elements
nums[2] # Second element (original comment said "First")
nums[2:3] # Second to third
nums[c(1,3)] # First and third
sort(nums) # Order a vector (returns a sorted copy)
nums # unchanged -- sort() does not modify in place
order(nums) # Nearly the same but not exactly: returns the ordering indices, not the values
class(sort(nums)) # Different class, but for most purposes they are the same
class(sort(int)) # was class(sort(integer)): `integer` is a base function and sort() on it
                 # errors -- the `int` vector defined above was clearly intended
sort(nums, decreasing=T) # Reverse
# Change vector elements
char.new <- c(grades,names) # Input and combine elements and vectors
char.new <- c(char.new, "Alan", "Bo")
char.new <- c(names, "Alan", "Bo") # overwrites the previous char.new
nums[2]<-5 # Substitutes the second element
nums[c(1,3)]<-c(5,10) # Substitutes the first and third elements
nums<-c(nums,5,10)
# Change vector type
is.character(char.new) # First logical test for vector type
is.character(nums)
mixed
as.numeric(mixed) # Really did coerce a vector conversion. Turned non-numeric elements to NAs
grades
as.numeric(grades) # was as.numeric(value.mixed) -- `value.mixed` is never defined anywhere;
                   # coercing `grades` (printed on the previous line) demonstrates the same NA warning
is.factor(grades)
class(grades)
as.factor(grades)
grades # Remember to assign the coerced type
grades<-as.factor(grades)
levels(grades)
is.character(levels(grades)) # Yes, factor levels are characters
typeof(grades) # And factors are a special type of integer
# Missing data
num.new <- c(nums*nums,6,NA,9,NA)
mean(num.new) # NA -- missing values propagate
is.na(num.new)
mean(num.new, na.rm = T)
# Array and matrix
mat<- matrix(1,4,5) # 4x5 matrix filled with 1
mat1<- matrix(1:10,4,5) # Sequence 1-10, 4 rows, 5 columns (1:10 recycled to fill 20 cells)
mat2<- matrix(1:10, ncol=5,nrow=4) # Same code, but easier to understand
identical(mat1,mat2) # Yes, they are the same
mat[1:6] # linear (column-major) indexing of the first six cells
mat[1,c(1:3)] # The first row, data for the first three columns
is.matrix(mat) # Logical expression testing if it is a matrix structure
length(mat[,2]) # Length of the second column of the matrix
ncol(mat) # Number of columns
nrow(mat) # Number of rows
dim(mat) # Dimensions of the matrix (rows and columns)
# Data frame
dat <- data.frame(8:10, c("exec", "mid", "exec"))
dat <- data.frame(score = 8:10, role = c("exec", "mid", "exec")) # Better! This one has clear, understandable column names
dat<-data.frame(nums, char.new,stringsAsFactors=FALSE) # stringsAsFactors=FALSE turns vectors with strings into a character vector
dat<-data.frame(nums, busperf,char.new) # Oops. What is happening here?
# NOTE(review): deliberate error -- `busperf` is only defined in the answers section below
dat.mat <- data.frame(mat)
dat<-data.frame(dat, dept=c("sales",
"accounting", "sales", "IT", "RD"),stringsAsFactors=FALSE)
dat[1:6] # Oops (asks for six columns; dat has fewer -- deliberate)
dat[1,c(1:3)] # First row, column 1 through 3
dat[,c("dept","nums")] # Select only these columns, notice that you can change column order
paygrade<-c(1,3,2,1,3)
dat<-cbind(dat, paygrade) # Added the paygrade vector (column) into the dataframe
dat<-rbind(dat,c(5,"David", "logistics",2)) # Added a row (note: rbind with a character vector coerces numeric columns to character)
colnames(dat)<-c("Years", "Name", "Department", "Paygrade") # This is how you change column names of a dataframe
dat$Paygrade<-NULL # assigning NULL drops the column
# List
Years <- c(2, 3, 5)
Code<- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales<-list(Years, Code, SalesQuota) # list members may have different lengths
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3]<-F # Assigning FALSE to the third member (SalesQuota), and the third element within it
#=======================================
# BSAD 443, Session 1 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# Assign a few objects
#Numeric vector
nums <- c(1,3.5,5)
result <-2^sqrt(91)
int <- c(1L,3L,5L) # Notice capitalized L to denote an integer
# Character vector
grades <- c("high", "low", "high") # Character vector
names <- c("Abby", "Ben", "Charlie")
mixed<- c(pi, 1, "sometext", TRUE) # Mixed vector, turns into character
# Logical vector
right <- c(TRUE, T, FALSE) # T and F are equivalent to TRUE and FALSE
wrong <- 2>10
# Let's examine these vectors
typeof(nums)
class(int)
mode(result) # These three functions may be regarded as equivalent
str(grades) # Checks vector structure
str(wrong)
is.na(c(grades, mixed, NA, "NA", ".","")) # Returns a logical vector testing for NA (only the real NA is TRUE; "NA", ".", "" are just strings)
is.null(c(grades, mixed, "", NA, ".")) # Oops, this function behaves differently
is.null("") # Still not working
is.null(" ") # Nope
is.null(NULL) # Okay! But why?
class(NULL)
class(NA) # That's why. NULL is a class of its own. We used it, for example, to drop data.
length(grades) # Returns the number of observations in a vector
length(c(grades, mixed)) # Pretty straightforward
nchar(names) # Counts the characters in each observation in a character vector.
#### Answers to quiz
busperf<-c(9.5, 8, 9, 8.2, 7.1)
class(busperf)# numeric
mean(busperf) # 8.36
busperf<-c(busperf, 81,7)
mean(busperf) # 18.54
# Access vector elements
nums[2] # Second element (original comment said "First")
nums[2:3] # Second to third
nums[c(1,3)] # First and third
sort(nums) # Order a vector (returns a sorted copy)
nums # unchanged -- remember to assign if you want to keep the sorted order
order(nums) # Nearly the same but not exactly: returns the ordering indices
class(sort(nums)) # Different class, but for most purposes they are the same
class(sort(int)) # was class(sort(integer)): `integer` is a base function, so sort() errors;
                 # the `int` vector defined above was intended
sort(nums, decreasing=T) # Reverse
#### Answers to quiz
nothundred<-seq(4,100)
halfhundred<-nothundred[nothundred>50]
sort(halfhundred, decreasing=T)
# Change vector elements
char.new <- c(grades,names) # Input and combine elements and vectors
char.new <- c(char.new, "Alan", "Bo")
char.new <- c(names, "Alan", "Bo") # overwrites the previous char.new
nums[2]<-5 # Substitutes the second element
nums[c(1,3)]<-c(5,10) # Substitutes the first and third elements
nums<-c(nums,5,10)
#### Answers to quiz
num<-seq(9,99, by=3)
quantile(num) # 31.5 and 76.5
sd(num[num>20]) # 23.81176
# various ways to solve this one. Perhaps the easiest to understand:
n1<-num[num<51]
n2<-num[num>71]
n3<-c(n1,n2)
mean(n3) # 52.25
# Change vector type
is.character(char.new) # First logical test for vector type
is.character(nums)
mixed
as.numeric(mixed) # Really did coerce a vector conversion. Turned non-numeric elements to NAs
grades
as.numeric(grades) # was as.numeric(value.mixed) -- `value.mixed` is never defined anywhere;
                   # coercing `grades` (printed on the previous line) demonstrates the same NA warning
is.factor(grades)
class(grades)
as.factor(grades)
grades # Remember to assign the coerced type
grades<-as.factor(grades)
levels(grades)
is.character(levels(grades)) # Yes, factor levels are characters
typeof(grades) # And factors are a special type of integer
# Missing data
num.new <- c(nums*nums,6,NA,9,NA)
mean(num.new) # NA -- missing values propagate
is.na(num.new)
mean(num.new, na.rm = T)
#### Answers to quiz
busperf<-sort(busperf)
busperf[c(2)]<-7.9
busperf[busperf==9]<-NA # exact match replacement; fine here since 9 is stored exactly
# Array and matrix
mat<- matrix(1,4,5)
mat1<- matrix(1:10,4,5) # Sequence 1-10, 4 rows, 5 columns
mat2<- matrix(1:10, ncol=5,nrow=4) # Same code, but easier to understand
identical(mat1,mat2) # Yes, they are the same
mat[1:6] # linear (column-major) indexing
mat[1,c(1:3)] # The first row, data for the first three columns
is.matrix(mat) # Logical expression testing if it is a matrix structure
length(mat[,2]) # Length of the second column of the matrix
ncol(mat) # Number of columns
nrow(mat) # Number of rows
dim(mat) # Dimensions of the matrix (rows and columns)
#### Answers to quiz
mat<-matrix(1:4,nrow=6, ncol=2) # 1:4 is recycled three times to fill 12 cells
mat[5,2]
mat[5,2]<-"some text"
# All the data elements in the matrix have turned into characters
# Data frame
dat <- data.frame(8:10, c("exec", "mid", "exec"))
dat <- data.frame(score = 8:10, role = c("exec", "mid", "exec")) # Better! This one has clear, understandable column names
dat<-data.frame(nums, char.new,stringsAsFactors=FALSE) # stringsAsFactors=FALSE turns vectors with strings into a character vector
dat<-data.frame(nums, busperf,char.new) # Oops. What is happening here? (deliberate: lengths do not match/recycle)
dat.mat <- data.frame(mat)
dat<-data.frame(dat, dept=c("sales",
"accounting", "sales", "IT", "RD"),stringsAsFactors=FALSE)
dat[1:6] # Oops (asks for more columns than exist -- deliberate)
dat[1,c(1:3)] # First row, column 1 through 3
dat[,c("dept","nums")] # Select only these columns, notice that you can change column order
paygrade<-c(1,3,2,1,3)
dat<-cbind(dat, paygrade) # Added the paygrade vector (column) into the dataframe
dat<-rbind(dat,c(5,"David", "logistics",2)) # Added a row (coerces numeric columns to character)
colnames(dat)<-c("Years", "Name", "Department", "Paygrade") # This is how you change column names of a dataframe
dat$Paygrade<-NULL # assigning NULL drops the column
#### Answers to quiz
hr<-data.frame(
Experience=c(1,2,1,8),
Performance=c(10,8,7,10),
Employees=c(3,4,2,10),
stringsAsFactors=FALSE
)
hr<-data.frame(hr,
ID=c(12,13,14,15),
stringsAsFactors=FALSE)
hr[3,2]<-9
mean(hr$Performance) # 9.25
# List
Years <- c(2, 3, 5)
Code<- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales<-list(Years, Code, SalesQuota)
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3]<-F # Assigning FALSE to the third member (SalesQuota), and the third element within it
#=======================================
# BSAD 443, Session 3 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# List
Years <- c(2013, 2014, 2015)
Code<- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales<-list(Years, Code, SalesQuota)
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3]<-F # Assigning FALSE to the third member (SalesQuota), and the third element within it
# In case you want to, you can assign "column" names to a list like so:
# lapply(sales, setNames, c("Years", "Code", "SalesQuota"))
#### Answers to quiz
hacker<-data.frame(software=c("R","Python","SAS","R","SPSS"), years=c(4,3,10,4,1))
salary<-sample(85:150,20) # NOTE(review): no set.seed() before sample(), so results differ each run
hackersalary<-list(hacker,salary)
hackersalary[[2]][2]<-100
# Inspecting data
data(mtcars) # load data from R.
# If mtcars is not found do the following:
# install.packages('datasets')
# library(datasets)
# data(mtcars)
dim(mtcars) # Number of rows and columns
ncol(mtcars) # Number of columns
nrow(mtcars) # Number of rows
names(mtcars)
colnames(mtcars)
rownames(mtcars)
str(mtcars) # Shows the structure of each vector (column) in the data frame
# I can also use rownames(mtcars) to see rows, but this is less commonly used
head(mtcars) # First 6 rows of the data+column names
tail(mtcars,n=10) # View last 10 rows+column names. We can do the same with head
mtcars[1:10,2:6] # Rows 1-10, columns 2-6
set.seed(123) # We do this before every simulation so results are reproducible.
              # (The original line was a bare `set.seed` with no parentheses,
              # which only prints the function body and seeds nothing.)
smp<-sample(1:nrow(mtcars),10) # 10 random numbers from 1 to number of rows
mtcars[smp,] # Gives 10 random rows from the data
# View(df)  # Pops up the GUI viewer -- needs a data object; View() alone errors
# fix(df)   # To change (manually) -- likewise needs an object
View(mtcars[smp,]) # Using the viewer to examine a random sample
#### Answers to quiz
# install.packages(car) # If you don't already have it (note: the name should be quoted -- install.packages("car"))
library(car)
data(Freedman)
dim(Freedman)
str(Freedman)
rownames(Freedman) # Cities are row names
fix(Freedman) # Manually change the value (opens an interactive editor)
Freedman$population[3]<-600 # That's one way of doing it
set.seed(789)
smp<-sample(1:nrow(Freedman), 25)
Fsamp<-Freedman[smp,]
# Exploratory stats
# Univariate distributions
summary(mtcars) # Basic distribution stats on each numeric variable in the data frame
quantile(mtcars$mpg)
quantile(mtcars$mpg,seq(0.1,1,by=0.1)) # deciles
# Get additional univariate stats from the psych package:
# install.packages('psych')
# library(psych)
# describe(mtcars)
# Plotting distributions
par(mfrow=c(2,2)) #2x2 plots, so 4 plots at the same time
stripchart(mtcars$mpg)
dotchart(mtcars$mpg)
boxplot(mtcars$mpg) # Look for IQR (distance between 1-3 quantile) and median in the box, and min/max values outside
hist(mtcars$mpg) # Determining number of breaks should be decided
hist(mtcars$mpg, breaks=3)
hist(mtcars$mpg, breaks=12)
plot(density(mtcars$mpg)) # Density kernel
d<-density(mtcars$mpg)
str(d)
summary(d)
# Oops!
dev.off() # resets the plotting device (clears the 2x2 layout set above)
hist(mtcars$mpg)
lines(density(mtcars$mpg)) # Oops! (histogram is on a count scale, density is not)
hist(mtcars$mpg, freq=F, ylim=c(0,0.09)) # Better -- freq=F puts the histogram on a density scale
lines(density(mtcars$mpg), col="red")
# Now let's save it as a file
png(filename="mpgplot.png") # redirects plotting to a PNG file until dev.off()
hist(mtcars$mpg, freq=F, ylim=c(0,0.09), main="MPG Distribution")
lines(density(mtcars$mpg), col="red")
rug(mtcars$mpg, col="blue")
box()
dev.off() # closes the PNG device and writes the file
# Quick reference of inspection helpers. (The original lines here were bare
# fragments -- e.g. `View()`, `dim|ncol|nrow()`, a stray `decreasing=T),` -- that
# error or break parsing; kept as notes instead.)
# View(df)                       # GUI viewer
# fix(df)                        # edit a copy manually
# dim(df); ncol(df); nrow(df)    # dimensions
# head(df, n=10); tail(df, n=10) # first/last rows
# names(df); colnames(df); rownames(df)
mtc<-mtcars # work on a copy before hand-editing
fix(mtc)
mtcars<-mtcars[order(-mtcars$mpg),] # sort rows by mpg, descending
#=======================================
# BSAD 443, Session 4 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# Upload csv into R
failedbanks<-read.csv("http://www.fdic.gov/bank/individual/failed/banklist.csv", header=T) # Uploads failed banks data from FDIC into R.
# Instead of a URL it can be a local file e.g.: "y:/data/banklist.csv".
# Upload stock data (also csv)
install.packages('TTR') # installing from inside a script is for class demos only
library(TTR)
link <-"http://finance.yahoo.com/d/quotes.csv?s=AAPL+GOOG+MSFT&f=nab"
# NOTE(review): this Yahoo quotes endpoint has since been discontinued; expect the next line to fail today.
fin <- read.csv(link) # Oy, any ideas why? (hint: it's an easy fix)
# Upload text file into R
Apple2016Q1<-readLines("http://www.sec.gov/Archives/edgar/data/320193/000119312516559625/0001193125-16-559625.txt") # Uploads Apple's most recent 10-K filing.
# Query a database
install.packages(c("DBI", "RPostgreSQL")) # was install.packages("DBI","RPostgreSQL"):
# the second positional argument of install.packages() is `lib` (the install path),
# so "RPostgreSQL" was never installed -- the package names must go in one vector.
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
# NOTE(review): hard-coded credentials are for the class server only;
# never commit real credentials to a script.
con <- dbConnect(drv,
                 dbname="infs494",
                 host="147.126.64.66",
                 port=8432,
                 user="dataminer1", # use dataminer2-9
                 password="summer2016")
dbListTables(con)
dbListFields(con, "crime")
crime.db <- dbReadTable(con, "crime")
# Upload data into a database
dbExistsTable(con, "crime")
dbWriteTable(con, "crime.part1", crime.db[1:50,],
             row.names=FALSE)
test<-dbReadTable(con, "crime.part1")
dbDisconnect(con) # always release the connection
# Website scraping
# install.packages("rvest")
# scrape data from Wikipedia
library(rvest)
tab<-read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
tab<-html_table(tab, fill=TRUE) # fill=TRUE pads ragged table rows; returns a list of tables
tab1<-tab[[1]]
tab1<-data.frame(tab1)
tab2<-data.frame(tab[[2]])
tab2.cln<-tab2[3:nrow(tab2),] # drop the first two header-like rows
cols<-as.character(tab2[2,]) # the second row holds the real column names
colnames(tab2.cln)<-cols
# Scrape data from Yelp
library(httr)
library(rvest)
page<-read_html("http://www.yelp.com/search?find_desc=&find_loc=water+tower+chicago%2C+il&ns=1")
nds<-html_nodes(page, "address")
address<-html_text(nds) # Messy! We'll get to how to "clean" such data in the next session.
#=======================================
# BSAD 443, Session 5 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 5 #
#------------#
# Load messy data
# Figure out what's going on here
credanger<-read.csv("Credit complaints.csv", header=T)
# Messy column names: the real names ended up in the first data row
df <-data.frame(x.1=c("a",seq(1:4)),x.2=c("b",rnorm(4,mean=2,sd=.2)))
head(df)
names(df)
# colnames[df]<-df[1]            # Nope -- tries to subscript the colnames *function*; errors
# colnames[df]<-as.vector(df[1]) # Nope -- same problem (both kept as comments so the script can source)
mat<-as.matrix(df)
cn<-as.vector(mat[1,]) # pull the names out of row 1
colnames(mat)<-cn
df<-data.frame(mat)
# Or simply (for a small number of columns)
colnames(df) <- c("a", "b") # was `colnames(df)<("a", "b")` -- a syntax error that broke
                            # parsing; the colnames<- replacement form was intended
df<-df[-1,] # drop the first row, which held the names
df
# Incorrect column type
df <-data.frame(a=c("1*",seq(1:4)),b=c(letters[1:5]))
summary(df)
str(df)
# Input data and convert column to correct type
df[1,1] <- 1
df$a<-as.numeric(df$a)
# NOTE(review): under pre-R-4.0 defaults df$a is a factor, and as.numeric() on a factor
# returns the level codes, not the displayed digits -- as.numeric(as.character(df$a))
# is the safe form. Verify which R version the class uses.
df$b<-as.character(df$b)
str(df)
# Get rid of odd characters
c1<-c("111", "123*", "145.0")
class(c1)
as.numeric(c1) # Oops. Let's try again:
c1<-c("111", "123*", "145.0")
grep("\\.|\\*|\\,|\\+", c1, ignore.case = T) # Matches pattern
gsub("\\.|\\*|\\,|\\+", c1) # Nope -- gsub() requires a replacement argument; this errors (deliberate)
c1<-gsub("\\*|\\,|\\+", "", c1) # Aha!
c1<- as.numeric(gsub("([0-9]+).*$", "\\1", c1)) # (\\1 means first match). Done!
# Missing data
age <- c(20, 30, NA, -99)
mean(age) # We already know that this isn't going to work
is.na(age) # Logical
which(is.na(age)) # Gives location of the missing data
mean(age, na.rm=T) # Not that easy -- the -99 sentinel still biases the mean
which(is.na(age)|age<18) # So we need to use common sense, and we should also do this:
age[age==-99]<-NA
mean(age,na.rm=T)
#Time formatting
d<-c("2016/01/01", "2016/01/02", "2016/01/03", "2016/01/04")
class(d)
summary(d)
d<-as.Date(d,"%Y/%m/%d")
class(d) # New class: Date.
# But what if we want to extract year/month/day from a date?
# It's best to use a package. The fastest and best today is this one:
install.packages("lubridate") # was install.packages(lubridate): the package name
                              # must be a quoted string, or R looks for an object
                              # called `lubridate` and errors
library(lubridate)
d<-c("2016/01/01", "2016/01/02", "2016/01/03", "2016/01/04")
d1<-ymd(d) # That was easy! And these are easy (and very useful) too:
y<-year(ymd(d1))
m<-month(ymd(d1))
d<-day(ymd(d1))
w<-week(ymd(d1))
q<-quarter(ymd(d1))
#Merge data
A<-data.frame(loanID=c(20,21,22,23),gender=c("f", "m", "m","f"),
              amount=c(1000,5500,2000,6000))
B<-data.frame(loanID=c(23,21,22,20,24),years.employed=c(3,2,4,5,6))
AB<-merge(A,B, by="loanID") # inner join (only IDs present in both)
AB<-merge(A,B, by="loanID", all=T) # full outer join
AB<-merge(A,B, by="loanID", all.x = T) # left join (keep all of A)
AB<-merge(A,B, by="loanID", all.y = T) # right join (keep all of B)
AB<-merge(A,B, by="loanID", all=T)
#=======================================
# BSAD 443, Session 6 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 6 #
#------------#
# Read bank data
path<-file.choose()
bank<-read.csv(path, header=T)
attach(bank) # Puts the data frame on the search path, so you can type variable instead of df$variable.
# When you are finished: detach(df)
# Let's explore the data
dim(bank)
str(bank) # Seems to read it okay
summary(bank)
head(bank)
tail(bank)
# Review distribution of numeric variables
numcol<-sapply(bank, is.numeric) # sapply applies a function to a vector/list, in this case, is.numeric
numbank<-bank[numcol] # Copied the numeric variables so it would be easier to inspect them
sapply(numbank, mean)
sapply(numbank, sd) # All except age seem very skewed. Let's verify.
par(mfrow=c(2,2))
hist(numbank$age)
plot(density(numbank$age))
qqnorm(numbank$age)
qqline(numbank$age, col="red")
# Compared to normal distribution
n<-rnorm(1:200) # 200 standard-normal draws (rnorm(200) is the usual spelling)
qqnorm(n)
qqline(n, col="red")
dev.off()
# There is also a formal test
shapiro.test(numbank$age)
shapiro.test(n) # Well, age is skewed, and it is the best one of our numeric vars! This is common in the BA data.
# See an interesting discussion here: http://stats.stackexchange.com/questions/2492/is-normality-testing-essentially-useless
# Now let's inspect one more numeric variable (you'll do the rest on your own)
summary(numbank$balance) # We have a fair amount of negatives, and it does not seem normally distributed. What about sd?
sd(numbank$balance) # Much higher than the mean. It is definitely skewed.
par(mfrow=c(2,2))
hist(numbank$balance)
plot(density(numbank$balance)) # Yes, it is very skewed,
qqnorm(numbank$balance)
qqline(numbank$balance, col="red")
qqnorm(log(numbank$balance)) # Oops. Log doesn't work with negative numbers. What to do?
# First, how many accounts with negative balance?
tbl<-table(numbank$balance<0) # about 8%. To be exact, follow with prop.table(tbl)
# Do we remove them? No!
lbal<-log(numbank$balance+abs(min(balance))+1) # shift so the minimum becomes 1, then log (uses the attached `balance` column)
plot(density(lbal)) # Better, but not normally distributed. This is fine for now.
# Let's test the transformed vars against the response var:
boxplot(lbal~bank$y)
# One last effort
install.packages('extremevalues')
library(extremevalues)
go<-getOutliers(numbank$age)
go$iRight # indices flagged in the right tail
numbank$age[go$iRight]
# And for fun:
evGui(numbank$age) # interactive GUI from the extremevalues package
# We can then remove the outliers (rows), and normalize the age variable. We could also do box-cox transformation.
# But we'll leave it for now.
# Before we get to examine factors, we need to examine the relationship among the numeric vars
round(cor(numbank),2) # Nothing exciting here, except previous and pdays
# And visually:
pairs(numbank) # This could take some time
# And then the relationship with these vars and our y
boxplot(numbank$age~y) # The ages of those who responded may be a little more distributed (y comes from the attached bank)
boxplot(numbank$balance~y) # Hard to decipher
boxplot(lbal~y) # Ditto
# One more approach to consider is to cut the numeric data into ordinal categories:
catbalance<-cut(numbank$balance, breaks=c(quantile(bank$balance, seq(0,1, by=0.25))))
# NOTE(review): cut() excludes the lowest break, so the minimum balance becomes NA;
# add include.lowest=TRUE to keep it.
table(catbalance) # Equal sizes, but unclear labels
catbalance<-cut(bank$balance, breaks=c(quantile(bank$balance, seq(0,1, by=0.25))), labels=F)
tcbal<-table(catbalance,bank$y) # Not sure that this is ideal either. Consider recoding into sensible levels
prop.table(tcbal) # Let's round it
pcbal<-round(prop.table(tcbal),2)
addmargins(pcbal,c(1,2))
boxplot(catbalance~bank$y) # Interesting!
# And now for factors
# First, the response variable itself
table(y) # Shows frequencies
table(y)/nrow(bank)# Shows proportions
r<-round(table(y)/nrow(bank),2) # Easier to understand
pt<-r*100 # Even better in percentages
plot(bank$y) # Good old barplot, show 88% / 12%
barplot(r) # was `baplot(r)` -- typo; barplot() is the intended function for the table computed above
plot(bank$marital)
plot(bank$housing)
- # Do it for all variables. Which ones seem interesting?
- # Now let's explore bivariate relationships
- tmar<-table(bank$marital, bank$y)
- mtmar<-addmargins(tmar,c(1,2))
- plot(tmar) # Married seemed to be slightly less excited about this campaign
- # Or prop table
- ptmar<-prop.table(tmar)
- addmargins(ptmar, c(1,2))
- # Continue to explore these tables. What did you find?
- # What if we want more than 2-way tables?
- # ftable (flat table)
- ft<-ftable(bank$job,bank$marital, bank$education, bank$y)
- # Or write it this way:
- ftable(bank[c("y", "marital", "education", "y")]) # As many factors as you'd like
- # You can also use xtabs
- xt<-xtabs(~y+marital+education, data=bank) # More readable?
- prop.table(xt)
- # A better organized output with crosstabs
- install.packages('gmodels')
- library(gmodels)
- CrossTable(y=bank$y, x=bank$education) # Nice. There seem to be something going here
- CrossTable(y=bank$y, x=bank$education, chisq=T, format="SPSS") # I like this format,
- # and Chi-square test of independence, rejected. The two vars are related.
- # Finally, if you like pivot tables there are various options.
- # The reshape and plyr packages offer very good alternatives
- # And this one: http://www.magesblog.com/2015/03/pivot-tables-with-r.html cool interactive, web-based pivot tables
#=======================================
# BSAD 443, Session 7 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 7  #
#------------#
# A few bits of information:
# Bank data come from here: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
# This is how you drop rows when a certain column contains NAs
df<-df[!is.na(df$var),]
# or replace with avg.
df$var[is.na(df$var)] <- mean(df$var,na.rm=T)
# Decision trees
# Steps to follow (with any machine learning algorithm)
# 1. Load the data and clean the data if needed
# 2. Chop the data (usually 90-10, 80-20, 75-25, or 67-33)
# 3. Train
# 4. Model
# 5. Evaluate and fine-tune
#1. Load the data (We did this yesterday, in session 6), and randomize it
path<-file.choose()
bank<-read.table(path, header=T, sep=";")
set.seed(199) # Fixed seed so the shuffle is reproducible
bankrand <- bank[order(runif(nrow(bank))), ] # runif() produces a list of random numbers. This one shuffles the data randomly
bankrand<-bankrand[1:(nrow(bankrand)/10),] # I am using only 10% of the data. Otherwise, it will take a long time to run.
#To make sure the random algorithm got it right
tb<-table(bank$y) # FIX: was tb<-(bank$y); prop.table() needs a contingency table, not a raw factor
prop.table(tb)
tb1<-table(bankrand$y) # FIX: same as above
prop.table(tb1) # Looks pretty close. We're good to go.
#2. Chop the data into training (90%) and testing (10%) sets
trn<-1:(nrow(bankrand)*0.90)
tst<-(tail(trn,1)+1):nrow(bankrand)
bank_train <- bankrand[trn, ]
bank_test <- bankrand[tst, ]
# Just to be on the safe side. Looks pretty close. Good!
round(prop.table(table(bank_train$y)),2)
round(prop.table(table(bank_test$y)),2)
#3. Train
install.packages("C50") # There are a few more packages for CART. This one is considered among the best.
library(C50)
bank_model <- C5.0(bank_train[-17], bank_train$y) # Omits the outcome variable (column 17)
summary(bank_model) # FIX: removed a stray closing parenthesis (syntax error). Wow! Only 7% errors.
# plot(bank_model) # This will take a long time, but it is very informative
plot(bank_model, subtree=2)
#But don't jump to conclusions. Decision trees are known for
# having a tendency to overfit the model to the training data.
# For this reason, the error rate reported on training data may be overly optimistic, and it is especially important to evaluate decision trees on a test dataset.
# 4. Model
bank_pred <- predict(bank_model, bank_test)
library(gmodels)
CrossTable(bank_test$y, bank_pred, prop.chisq = F, prop.c = F, prop.r = F,
dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
# About 11% misses.
# 5. Evaluate and fine-tune
# The C5.0 algorithm lets us add additional trials.
# We'll start with 10 trials. That's the most common # of trials,
# as research suggests that this reduces error rates on test data by about 25 percent.
# It also selects the best size of the tree (e.g., "pruning")
# First, let's do some boosting
bank_boost10 <- C5.0(bank_train[-17], bank_train$y, trials = 10)
bank_boost10
summary(bank_boost10) # Summarizes iterations, tree size and classification performance.
plot(bank_boost10, subtree=2)
bank_boost_pred10 <- predict(bank_boost10, bank_test)
CrossTable(bank_test$y, bank_boost_pred10, prop.chisq = F, prop.c = F, prop.r = F,
dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
# Still not great, but we have improved the model by at least 1%
# Now let's reduce the size of the tree ("pruning")
bank_control <- C5.0(bank_train[-17], bank_train$y, control = C5.0Control(minCases = 20), trials=10)
summary(bank_control) # Avg. tree size is 9. That's because of our control parameter
plot(bank_control) # Having fewer nodes allows us to plot the tree model
bank_control_pred <- predict(bank_control, bank_test)
CrossTable(bank_test$y, bank_control_pred, prop.chisq = F, prop.c = F, prop.r = F,
dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
#### Now let's try doing the same with logistic regression
# We already did stage 1 and 2
# 3. Train
logit_train<-glm(y ~.,family=binomial(link = "logit"), data=bank_train)
summary(logit_train)
anova(logit_train, test="Chisq") # Model is clearly performing well when comparing null deviance against the residual deviance.
# 4. Test
logit_test <- predict(logit_train,newdata=bank_test,type='response') # FIX: was logit_mod, an undefined object; the fitted model is logit_train
# 5. Evaluate and fine-tune
logit_test <- ifelse(logit_test > 0.5,1,0) # Classify with a 0.5 probability cutoff
logit_test <- gsub("0", "no", logit_test) # Recode to match the factor levels of bank_test$y
logit_test <- gsub("1", "yes", logit_test)
logit_test <- as.factor(logit_test)
logitError <- table(logit_test == bank_test$y) # Counts of correct (TRUE) vs. incorrect (FALSE) predictions. About 10% errors. Not bad!
#=======================================
# BSAD 443, Session 8 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 8  #
#------------#
############# EVEN MORE FUNCTIONS #############
# 1. CONVERTING A NUMERIC VARIABLE INTO A FACTOR WITH DIFFERENT LEVELS
# Usually cut works, but not always
an<-cut(loans$annual_inc, breaks=3) # Oops.
# Solution 1, use cut2 from the Hmisc package
install.packages("Hmisc") # FIX: the package name must be a quoted string; install.packages(Hmisc) errors because no object called Hmisc exists
library(Hmisc)
ai<-cut2(loans$annual_inc,m=5000) # At least ~5000 observations per group
# Solution 2, (1) convert to integer and then to character. (2) Then match and convert strings into different categories
ai<-as.character(as.integer(loans$annual_inc)) # Converts all non-integers into integers
ai[ai %in% c(1:62410)]<-"low" # I can now select whatever categories I want
ai[ai %in% c(62411:7446395)]<-"high" # This one could be slow because it matches a large number of strings
# 2. BIVARIATE CORRELATIONS
# When missing values are present
cor(loans$loan_amnt, loans$annual_inc) # Gives NA
cor(loans$loan_amnt, loans$annual_inc, use="complete.obs") # 0.3 They are correlated
# Plot correlations
plot(loans$loan_amnt, loans$annual_inc) # Generates a scatterplot that is very messy, and we have an outlier
identify(loans$loan_amnt, loans$annual_inc) # Click the outlier interactively to get its row index. It's 19218
loans<-loans[-19218,] # Drop the outlier row
plot(loans$loan_amnt, loans$annual_inc)
plot(log(loans$loan_amnt), log(loans$annual_inc)) # FIX: added the missing closing parenthesis (syntax error)
abline(lm(log(loans$annual_inc)~log(loans$loan_amnt)), col="red") # This adds a fitted line from a regression model.
#Note that the order of the vars has flipped and note the tilde.
lines(loess.smooth(log(loans$loan_amnt), log(loans$annual_inc)),col="green") # This is a "smoother" which fits a line that may or may not be straight
# 3. DATA SLICING AND AGGREGATION
# Base R has an aggregate function, but it is overly complicated. We'll use dplyr instead.
library(dplyr)
chisal<-read.csv(path, header=T) # Read Chicago's salary data (path chosen earlier)
head(chisal)
dim(chisal)
# Strip the dollar sign so the salary parses as a number.
# NOTE(review): if values also contain thousands separators (commas), as.numeric would
# still return NA -- confirm against the raw data
chisal$Employee.Annual.Salary<-as.numeric(gsub("\\$","", chisal$Employee.Annual.Salary))
names(chisal)
# Slice the data
slice(chisal, c(100:110, 5:50)) # Shows these rows
head(arrange(chisal, desc(Employee.Annual.Salary))) # Orders the data by salary in a descending order
head(select(chisal, Department, Employee.Annual.Salary)) # Selects only these two columns, in this order
head(select(chisal,-(c(Name,Department)))) # Drops these two columns
highpaychipol<-filter(chisal, Department=="POLICE", Employee.Annual.Salary>100000) # Select particular cases
nrow(sample_frac(chisal, 0.1)) # Random sample of 0.1 (10% of rows)
# Aggregate data
by_dept <- group_by(chisal, Department)
sals<-summarize(by_dept, count=n() , avg.salary=mean(Employee.Annual.Salary, na.rm=T))
sals.bigdept<- filter(sals, count > 1000)
# More information, including additional features: https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
# 4. SAVE DATA FRAME
write.csv(df, "newfile.csv", row.names=F) # This will save a dataframe called df as a CSV file called newfile.csv in my working directory.
# 5. MANAGE WORKSPACE
save.image() # Will save all objects in R's session
save(loans, chisal, path, file="test.RData") # Will save the objects called loans, chisal and path into the file test.RData
rm(list=ls(all=TRUE)) # Removes ALL objects in R
load(file="test.RData") # Will open the objects stored in the test.RData file
############# PLOTS #############
install.packages("ggplot2")
x<-seq(-4,9,0.2)
y<-2*x^2+4*x-2 # A simple quadratic to plot
plot(x,y)
plot(x,y, pch=16, col="red")
# pch are symbols. See: http://www.statmethods.net/advgraphs/parameters.html
plot(x,y, pch=16, col="red", xlim=c(-5, 10), ylim=c(-5,210), main="Nice plot!", xlab="The X", ylab="The Y")
lines(x,y, col="green") # Adds a line
abline(h=1) # Horizontal reference line
abline(v=-0.5) # Vertical reference line
text(-0,50,"Intresting!") # Annotation at (0, 50)
legend(-5,100,"Legend here")
# Basic plot types in R: p=points; l=lines; o=overplotted points and lines; b, c=joined by lines; s, S=stair steps; h=histogram-like vertical lines; n=nothing/empty
colors() # Color names (657)
grep("green", colors(), ignore.case = T, value = T) # All variations of "green"
# Or use color codes instead, using Hexadecimal code: http://html-color-codes.com/
# Add multiple trendlines
X<-c(1,2,3,4,5,6,7)
Y1<-c(2,4,5,7,12,14,16)
Y2<-c(3,6,7,8,9,11,12)
Y3<-seq(1,30, by=3)
plot(X,Y1,type="o",pch=17,cex=1.2,col="darkgreen",ylim=c(0,20)) # Initiates the plot
lines(Y2,type="o",pch=16,lty=2,col="blue")
lines(Y3,type="o",pch=2,lty=4,col="darkblue")
title(main="Three vars", col.main="darkgray", font.main=4)
# https://en.wikibooks.org/wiki/R_Programming/Graphics
# Good scripts to convert tables/descriptives to plots
#http://tables2graphs.com/doku.php?id=03_descriptive_statistics#figure_1
# ggplot2 (qplot)
# http://www.statmethods.net/advgraphs/ggplot2.html
# http://www.cookbook-r.com/Graphs/
library(ggplot2)
data("diamonds")
sdiam<-sample_frac(diamonds, 0.01) # Select a random sample of 1%
qplot(carat, price, data = sdiam)
qplot(log(carat), log(price), data = sdiam)
# Now let's add colors. Note how easy it is!
qplot(carat, price, data = sdiam, shape=cut)
qplot(carat, price, data = sdiam, color=cut)
qplot(carat, price, data = sdiam, color = I("blue"))
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3)) # 0=transparent; 1=not
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3), size = I(5)) # Bigger
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3), size = I(5), shape=I(3))
# Shapes are: 1=circle; 2=triangle; 3=plus; 4=cross; 5=diamonds
# You can combine multiple qplot types with c(), as in geom = c("smooth", "point")
qplot(carat, price, data = sdiam, geom = c("smooth", "point")) # FIX: removed trailing comma -- an empty trailing argument is an error in a function call
# NOTES
# qplot can generate other types of plots (the geom parameter)
# geom = "point" scatterplot, default for x,ylab
# geom = "smooth" fits a smoother to the data and displays the smooth and its standard error
# geom = "boxplot" produces a box-and-whisker plot to summarise the distribution of a set of points
# geom = "path" and geom = "line" draw lines between the data points
# Ordinal/continuous vars geom = "histogram"; geom ="freqpoly" a frequency polygon, and geom = "density" creates a density plot
# For discrete variables, geom = "bar" makes a bar chart
# Distribution plots
# Continue using the recipe found here: http://www.cookbook-r.com/Graphs/Plotting_distributions_%28ggplot2%29/
# NOTE(review): binwidth=.5 is likely far too narrow for account balances / credit limits,
# producing thousands of bins -- confirm against the data's range
ggplot(bankrand, aes(x=balance)) + geom_histogram(binwidth=.5)
ggplot(loans, aes(x=tot_hi_cred_lim)) + geom_histogram(binwidth=.5, color="black", fill="blue")
ggplot(loans, aes(x=tot_hi_cred_lim)) + geom_density()
# Correlation plots
numcol<-sapply(loans, is.numeric) # Logical mask of numeric columns
nl<-loans[,numcol]
nl1<-nl[,5:10] # It is hard to interpret many vars. You can select a few at a time.
ctab<-round(cor(nl1, use="complete.obs"),2)
# FIX: plotcorr() lives in the 'ellipse' package, which was never installed/loaded
install.packages('ellipse')
library(ellipse)
plotcorr(ctab, mar = c(0.1, 0.1, 0.1, 0.1))
# Do the same, but with colors corresponding to value
colorfun <- colorRamp(c("#CC0000","white","#3366CC"), space="Lab")
# ctab is in [-1,1]; (ctab+1)/2 rescales to [0,1], which is what colorRamp expects
plotcorr(ctab, col=rgb(colorfun((ctab+1)/2), maxColorValue=255), mar = c(0.1, 0.1, 0.1, 0.1))
# (Same plot repeated, just with the call split across two lines)
colorfun <- colorRamp(c("#CC0000","white","#3366CC"), space="Lab")
plotcorr(ctab, col=rgb(colorfun((ctab+1)/2), maxColorValue=255),
mar = c(0.1, 0.1, 0.1, 0.1))
# INTERACTIVE/WEB PLOTS
# From: http://ramnathv.github.io/rCharts/
require(devtools)
install_github('ramnathv/rCharts') # rCharts is GitHub-only, not on CRAN
names(iris) = gsub("\\.", "", names(iris)) # Drop the dots in the iris column names
rPlot(SepalLength ~ SepalWidth | Species, data = iris, color = 'Species', type = 'point')
data(economics, package = "ggplot2")
econ <- transform(economics, date = as.character(date))
m1 <- mPlot(x = "date", y = c("psavert", "uempmed"), type = "Line", data = econ)
m1
hair_eye_male <- subset(as.data.frame(HairEyeColor), Sex == "Male")
n1 <- nPlot(Freq ~ Hair, group = "Eye", data = hair_eye_male, type = "multiBarChart")
n1$save('mychart.html', standalone = TRUE) # Generates a webpage with embedded interactive chart
# Or publish online
n1$publish('Scatterplot', host = 'gist') # e.g., github
# NOTE(review): library(rCharts) is never called; presumably required before rPlot/mPlot/nPlot -- confirm
#=======================================
# BSAD 443, Session 9 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 9  #
#------------#
######## FROM EXCEL ########
install.packages("readxl")
library(readxl) # FIX: the package was installed but never loaded, so read_excel() would fail
nasa<-read_excel("NASA_Labs_Facilities.xlsx")
read_excel("my-spreadsheet.xls", sheet = "data") # Read a sheet by name
read_excel("my-spreadsheet.xls", sheet = 2) # Or by position
# Write to Excel using XLConnect: https://cran.r-project.org/web/packages/XLConnect/vignettes/XLConnect.pdf
######## WORDCLOUD ########
install.packages(c("wordcloud","RColorBrewer"))
library(wordcloud)
library(RColorBrewer)
pal <- brewer.pal(4,"RdGy") # This means palette 4. Try other ones.
wordcloud("R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis. Polls, surveys of data miners, and studies of scholarly literature databases show that R's popularity has increased substantially in recent years."
, min.freq = 1, scale=c(2,0.5), random.color = T, color = pal) # Word cloud from the first paragraph of Wikipedia's entry on R
# Now you try it with other text. For example:
# Compare annual filings (try to read only 100 lines, or so. Otherwise, it can get slow):
apple94<-readLines("http://www.sec.gov/Archives/edgar/data/320193/0000320193-94-000016.txt")[100:200]
apple2000<-readLines("http://www.sec.gov/Archives/edgar/data/320193/000091205700053623/0000912057-00-053623.txt")[100:200]
# Use the same procedure to plot Trump's address:
# http://blogs.wsj.com/washwire/2015/06/16/donald-trump-transcript-our-country-needs-a-truly-great-leader/
# (FIX: the two URLs above/below were bare lines, which is a syntax error; they are now comments)
# And Clinton's speech:
# http://time.com/3920332/transcript-full-text-hillary-clinton-campaign-launch/
# And whatever you think could be interesting/fun. Project it on the screen so that all can enjoy!
######## TWITTER ########
# 1. http://twitter.com/signup #first sign up. You'd need to provide a mobile number.
# 2. https://apps.twitter.com (sign in with your account)
# 3. Create New App
# 4. Put some information (it doesn't have to be relevant/accurate), and accept
# 5. Click on the "keys and access tokens" tab: You need
#1. Consumer Key (API Key)
# 2. Consumer Secret (API Secret)
# click the "Create my access token" button.
# 3. Access Token
# 4. Access Token Secret
# In R:
install.packages('twitteR')
install.packages('tm')
library(twitteR)
library(tm)
consumer_key <- "" #API Key
consumer_secret <- "" #API Secret
access_token <- "" #Access Token
access_secret <- "" #Access token secret
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) # Select 1
rm(consumer_key, consumer_secret, access_token, access_secret) # Don't leave credentials lying around in the workspace
tweet("Test 1,2,3") # Can you see it?
local_cache <- get("oauth_token", twitteR:::oauth_cache) # FIX: was "oauth_tken" (typo); saves the oauth token so we can reuse it
save(local_cache, file="data/oauth_cache.RData")
# Harvest tweets
Clinton <- searchTwitter("Clinton", n=1500)
length(Clinton)
# Take a few of them
cl<-Clinton[1:100]
# Process text
cl <- sapply(cl, function(x) x$getText()) # Extract the text of each status object
cl_corpus <- Corpus(VectorSource(cl))
cl_corpus<- tm_map(cl_corpus, content_transformer(tolower))
cl_corpus <- tm_map(cl_corpus, removePunctuation)
cl_corpus <- tm_map(cl_corpus, function(x)removeWords(x,stopwords()))
# Plot wordcloud
wordcloud(cl_corpus)
# Search for other terms
# Apply the wordcloud parameters above and share with class
######## DATA MINING ########
wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
# Data are already randomized. We're going to cut it 75/25
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
# The outcome variable is numeric (quality), so we'll use the rpart decision tree algorithm
install.packages('rpart')
library(rpart)
# Build the model (regression tree: quality predicted from all other columns)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
# use the rpart.plot package to create a visualization
install.packages('rpart.plot')
library(rpart.plot)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
# generate predictions for the testing dataset
p.rpart <- predict(m.rpart, wine_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(wine_test$quality)
# compare the correlation
cor(p.rpart, wine_test$quality) # Highly correlated
# Mean absolute error: average magnitude of the prediction errors.
# Arguments: actual, predicted -- numeric vectors of equal length.
# Returns a single number (NA propagates if either vector contains NA).
MAE <- function(actual, predicted) {
  errors <- abs(actual - predicted)
  sum(errors) / length(errors)
}
# mean absolute error between predicted and actual values
MAE(p.rpart, wine_test$quality)
# mean absolute error between actual values and mean value
# NOTE(review): this line only computes the training mean; the "result = 0.59" in the
# comment presumably comes from MAE(mean(wine_train$quality), wine_test$quality) -- confirm
mean(wine_train$quality) # result = 0.59 quality units when the scale is 0-10. Not bad!
# There are ways to fine-tune the model. But that's for another class.
#=======================================
# BSAD 443, Session 9 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# ds1
# 1.
unirank<-read.csv(path, header=T, stringsAsFactors = F)
unirank$score<-as.numeric(unirank$score)
round(mean(unirank$score, na.rm=T),2)
# 47.79
round(range(unirank$score, na.rm=T),2)
# 43.36 100.00
round(IQR(unirank$score, na.rm=T),2)
# 3.07
# 2.
library(dplyr)
unirank$patents<-as.numeric(unirank$patents)
by_country <- group_by(unirank, country)
pat<-summarize(by_country, avg.patents=mean(patents, na.rm=T))
# pat[country=="USA"|country=="Japan"]  # FIX: commented out -- 'country' is not an
# object in this scope, so this line errors at runtime; the correct form is below
pat[pat$country=="USA"|pat$country=="Japan",]
# Japan 353.3962
# USA 296.2792
# 3.
# easy way
cor(unirank$publications, unirank$influence, use="complete.obs")
# 0.8748249
cor(unirank$publications, unirank$patents, use="complete.obs")
# 0.6713219
unirank$influence<-as.numeric(unirank$influence)
cor(unirank$influence, unirank$patents, use="complete.obs")
# 0.6111321
# a bit harder (but better) way
tmp<-unirank[,c("influence", "patents", "publications")]
cor(tmp, use="complete.obs")
# influence patents publications
# influence 1.0000000 0.6111321 0.8747273
# patents 0.6111321 1.0000000 0.6709847
# publications 0.8747273 0.6709847 1.0000000
# They are very highly correlated, just to illustrate:
cor.test(tmp$influence, tmp$publications,use = "complete.obs" )
plot(tmp$influence, tmp$publications)
# FIX: the regression must use the same x variable as the plot (influence), not patents,
# otherwise the fitted line does not belong to the scatterplot being shown
abline(lm(tmp$publications~tmp$influence),col="red", lwd=3)
# ds2
# 1.
cars<-read.csv(path, header=T, stringsAsFactors = F)
# Strip thousands separators before converting to numeric
cars$year<-as.numeric(gsub("\\,","",cars$year))
cars$price<-as.numeric(gsub("\\,","",cars$price))
cars$mileage<-as.numeric(gsub("\\,","",cars$mileage))
d<-as.Date(cars$date, "%d-%b-%y")
library(lubridate)
cars$month<-month(d)
summer<-cars[cars$month>5,] # Months 6-12
sp<-mean(summer$price, na.rm=T)
othermonths<-cars[cars$month<6,] # Months 1-5
op<-mean(othermonths$price, na.rm=T)
op-sp
# 3835.104
# 2.
by_color <- group_by(cars, color)
color<-summarize(by_color, freq=n()) # FIX: removed na.rm=T -- n() takes no arguments; the stray na.rm=T silently became a bogus constant column
color[order(color$freq),]
# least: Gold, Yellow Most: Black, Silver
hotcoldcars<-cars[(cars$color=="Black"|cars$color=="Silver"|cars$color=="Gold"|cars$color=="Yellow"),]
nrow(hotcoldcars)
# 70
# 3.
my<-max(cars$year) # Newest model year in the data
cars$age<-my-cars$year
# 4.
plot(cars$mileage, cars$age)
abline(lm(cars$age~cars$mileage), col="red", lwd=2)
cor(cars$mileage, cars$age)
# 0.7603127
#------------#
# Homework 2 #
#------------#
# Task 1
# 1. character, numeric and logical
# 2. for example, class() or str()
# 3.
# [1] FALSE  (FIX: these two console-output lines were not commented, which is a syntax error)
# [1] TRUE
# 4. The first one is false because 10 is numeric. The second one is true because as.character converted the 10 into a character.
# Task 2
#1. It doesn't work because to extract a data element, you need to use brackets instead
# of parentheses. Parentheses are for functions.
#2.
v<-c(1,2,3,9,15)
v<-as.factor(v)
# Simple way
v[v==1]<-NA
v[v==15]<-NA
# Short way
v[v==1|v==15]<-NA # FIX: was &, but no element equals 1 AND 15 at once, so & never selected anything; | is the correct operator
# Then
levels(v)
droplevels(v)
# Task 3
# 1.
department<-data.frame(age=c(23,30,45), gender=c("M", "F", "M"), level=c(3,6,8), stringsAsFactors = FALSE)
# 2.
# NOTE(review): c(31,"F",7) is a character vector, so rbind() coerces the numeric
# columns (age, level) to character here -- confirm this is acceptable for the exercise
department<-rbind(department, c(31,"F", 7))
# 3.
department<-cbind(department, sales=c(10,9,9,8))
# 4. Keep only these two columns
snapshot<-department[,c("age","sales")]
# 5.
snapshot[snapshot$age>=31,]
# 6.
# There are various ways of doing this. For example:
# You can say which rows to drop
snapshot[-c(2),] # Drop this row
# or
delrows<-c(2)
snapshot[-delrows,]
# You can also say which rows to keep
snapshot[c(1,3:4),] # Keep other rows
# or
keeprows<-c(1,3:4)
snapshot[keeprows,]
# 7.
# Again, there are various ways of doing this. For example:
# Use the NULL option (either line alone removes the column; the second is a no-op after the first)
snapshot[,c("age")]<-NULL
snapshot$age<-NULL
# Or declare a column to keep as a vector, and then refer to this vector.
#The vector can be the name or number of the column.
keepcols<-c("sales")
snapshot[keepcols]
# Or declare which column to drop as a vector, and then refer to this vector.
# The vector can be the name or number of the column.
dropcols<-1
snapshot[-dropcols]
#------------#
# Homework 3 #
#------------#
# Task 1
# 1. A list holding a numeric vector (named ID) and a data frame
lst<-list(ID=seq(1,5), data.frame(name=c("a", "b"), age=c(55,42)))
# 2. Replace the second element of the ID vector with 10
lst[[1]][2]<-10
# 3. Extract row 1, column 2 (the age in the first row) from the data frame inside the list
lst[[2]][1,2]
# Task 2
# 1.
install.packages('car')
library(car)
data(Anscombe)
# 2.
# Multiple ways to do this. This one is simplest:
quantile(Anscombe$education) # Note 25th quantile is 165
low<-rownames(Anscombe[Anscombe$education<=165,]) # Creating a vector makes it easier moving forward
# 3.
sort(low)
# "AL" "AR" "GA" "KY" "LA" "MS" "NC" "NE" "OK" "SC" "TN" "TX" "WV"
# 4. Education spending relative to income
relspend<-Anscombe$education/Anscombe$income
# 5. For example,
mean(relspend)
sd(relspend)
# 6. For example,
hist(relspend)
plot(density(relspend))
# 7. Yes
# 8.
Anscombe$relspend<-relspend
# 9. States spending above the median relative amount
hispenders<-Anscombe[Anscombe$relspend>median(Anscombe$relspend),]
#------------#
# Homework 4 #
#------------#
# Task 1
# Much of the first stage can be done within Excel prior to uploading to R. But here is the way to do it all in R.
# 1. Upload file
fl13<-file.choose()
fbi13<-read.csv(fl13, header=T)
# 2. Data wrangling
head(fbi13) # Need to remove first couple of lines and change column names, still I only care about city name, population and violent crime
fbi13<-fbi13[,c(1:3,10)] # Keep only the columns I need. It will save time.
fbi13<-fbi13[3:nrow(fbi13),] # Drop the first two header/junk rows
tail(fbi13) # Oops. Lots of junk there too. Let's see how far it goes:
tail(fbi13, n=20) # Okay, action starts at line 9295. Let's drop anything after it.
fbi13<-fbi13[-c(9295:nrow(fbi13)),] # Multiple ways of doing this. We can also just declare the rows we want to keep, as opposed to those we want to drop.
colnames(fbi13)<-c("state", "city", "population", "property.crimes") # still need to drop the first row--the one containing original column names.
fbi13<-fbi13[-1,] # Good!
# Variable wrangling: strip thousands separators, then convert to numeric
fbi13$population<-gsub("\\,", "", fbi13$population)
fbi13$population<-as.numeric(fbi13$population)
fbi13$property.crimes<-gsub("\\,", "", fbi13$property.crimes)
fbi13$property.crimes<-as.numeric(fbi13$property.crimes)
# 3. Select big cities
big.cities<-fbi13[fbi13$population>200000,]
View(big.cities) # There are some issues here.
# 1. State has little role to play here, so let's drop it.
# 2. A few rows contain NA values. We need to drop them.
# 3. Some city names have numbers next to them (this was originally a footnote). We need to get rid of these.
# Here we go:
# 1. Remove state column:
big.cities$state<-NULL
# 2. Remove rows with NAs:
big.cities<-big.cities[complete.cases(big.cities),]
# or (equivalent)
big.cities<-na.omit(big.cities)
# 3. Remove numbers and special characters from city names
big.cities$city<-gsub("\\d","",big.cities$city) # \\d removes digits from a string
big.cities$city<-gsub("[[:punct:]]","" , big.cities$city) # removes special characters from city names
# 4. Identify cities with the lowest crime rates
big.cities$crime.ratio<-big.cities$property.crimes/(big.cities$population/1000) # crimes per 1,000 residents
# We can also round it, but this is not needed. It just looks nicer.
big.cities$crime.ratio<-round(big.cities$crime.ratio,2)
# Now let's sort the data
big.cities<-big.cities[order(big.cities$crime.ratio),]
big.cities$city[1:10] # The ten safest big cities
# In case you are wondering, Chicago is here
grep("chicago", big.cities$city, ignore.case=T) # 42 out of 112
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement