Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#=======================================
# BSAD 443, Session 1
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# OLS regression model
#====
# NOTE(review): `realestate` is read and cleaned further down in this file
# (read.csv + factor conversions); run that section first or this call fails
# with "object 'realestate' not found".
model <- lm(price ~ .,realestate) # the period means all variables: regress price on every other column
summary(model)
# But first a bit of R
#====
# Orientation
#=
getwd() # shows us where the working directory is
setwd("c:/temp") # sets the working directory to the "temp" folder under drive "c" -- note direction of slash!
dir.create('BSAD443') # creates a directory called BSAD443 under temp. Note that R, for the most part, accepts both single and double quotation marks
setwd("BSAD443")
getwd()
# [1] "c:/temp/BSAD443"
# Install a package
#=
# use RStudio's gui, or
install.packages("dplyr")
library(dplyr) # makes all functions in dplyr available, or
# dplyr::   <- type this in the RStudio console to bring up a list of the package's
#            functions to choose from. (Left as a comment here: a bare `dplyr::` is
#            an incomplete expression and makes the script fail to parse when sourced.)
# Basic operations
#=
5+6
# [1] 11
8*7
# [1] 56   (original note said 35 -- 8*7 is 56)
10*5 / 0.5
# [1] 100
# Vectors
a <- 5+6
b <- 8*7
c <- a+b # NOTE(review): this shadows base::c for variable lookup, but calls like c(...) below still find the base function
c
x <- "BSAD443"
y <- "is where you will learn that"
z <- "equals"
something <- c(x,y, a, "plus", b, z, c) # c is a function, stands for combine, or concatenate; the numbers are coerced to character here
length(something) # length is also a function
# [1] 7 - there are 7 elements in the "something" vector
# Classes and structures
#=
is.vector(something)
is.vector(something)==FALSE
class(something) # class is also a function
class(c) # the numeric variable c defined above, not the base function
class(a>b) # a comparison returns a logical
a>b
?class # help on a function
help(package="ggplot2") # help on a package (along with its functions)
# Environment
#=
ls() # shows a list of objects that were saved into the R environment (also see them in the top-right pane)
save(a,b,x,y, file="firststeps.Rdata")
rm(list=ls()) # wipes the entire workspace -- fine for this classroom demo, avoid in real scripts
ls()
# character(0)
load("firststeps.Rdata")
ls()
# [1] "a" "b" "x" "y"
# Classes/data type
#=
a <-5
b<-"hello"
c<-FALSE
class(a)
#[1] "numeric"
class(b)
#[1] "character"
class(c)
#[1] "logical"
is.logical(a)
is.numeric(a)==FALSE
class(a)!=class(c)
#. Functions (note that R is a functional prog. language)
a1<-1:10
a2<-seq(1:10) # equivalent to 1:10 here (seq over the indices of 1:10)
identical(a1,a2)
?identical
?seq()
a3 <- seq(from=1, to=10, by=0.01)
round(a3,1)
a3 # oops. don't forget to assign! (round() returned a copy; a3 is unchanged)
a3 <- round(a3,1)
head(a3)
tail(a3)
# Load data and summary stats
#=
# Reads the real-estate CSV from the current working directory (see setwd() above).
realestate <- read.csv("realestate.csv", header=T)
# but let's import through RStudio's interface
head(realestate)
tail(realestate)
dim(realestate)
str(realestate)
summary(realestate)
# We need to fix the data types of all the factor (category) variables.
# Convert them all in one pass instead of seven copy-pasted as.factor() lines --
# same result, and new factor columns only need to be added to this vector:
factor_cols <- c("driveway", "recroom", "fullbase", "gashw",
                 "airco", "garagepl", "prefarea")
realestate[factor_cols] <- lapply(realestate[factor_cols], as.factor)
summary(realestate) # better!
#====================================
# Business Analytics, Session 2
#====================================
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
# Quinlan School of Business - Loyola University Chicago
# Do not distribute or use outside this class without explicit permission from the instructor.
# Load packages
#-
library('readxl') # for reading excel files
library('lubridate') # for handling dates
library('zoo') # for handling dates (as.yearqtr below)
####################################################################
# Go to the City of Chicago Data Portal (https://data.cityofchicago.org/)
# Feel free to check out NYC and LA as well.
# Select a data set that is collected over time. Download it as CSV or Excel file.
# Note the location of the downloaded file on your computer.
####################################################################
# Loading data
#=
# Know where your file is located and file name (e.g., desktop, user/yourname/data/file.csv)
# To read a CSV file
realestate <- read.csv("realestate.csv", header=T, stringsAsFactors=F)
# Use readr::read_csv, if you have a large CSV file (over 50MB)
realestate1 <- readr::read_csv("realestate.csv")
# Or, easier, but not great for replication
realestate1 <- read_csv(file.choose()) # NOTE(review): bare read_csv needs library(readr) attached; above it was namespaced
# Or, even easier, but worse in terms of replication
# Use RStudio's interface
# Now for excel
endowments <- read_excel(file.choose())
dummy <- read_excel(file.choose(), sheet=2)
# Inspect the data
#=
realestate
realestate1
names(realestate)
head(realestate)
tail(realestate, n=10)
dim(realestate) # rows and cols
View(realestate)
View(endowments)
####################################################################
# Load the previously downloaded data into R. Inspect the data.
####################################################################
# Data types
#=
str(realestate)
summary(realestate)
glimpse(realestate) # glimpse() comes from dplyr
# Changing data type
#-
attach(realestate) # puts the data frame on the search path so columns can be typed directly; remember detach() (avoid in production code)
summary(garagepl)
hist(garagepl) # not a "good" numeric variable. Let's turn it into a factor
garagepl_f <- as.factor(garagepl) # saving as a new column to be on the safe side
# other common options: as.numeric, as.character
# Let's see what we've got:
summary(garagepl_f)
plot(garagepl_f) # We will learn much more on plots. This is a not-so-great sneak preview
# Same as
barplot(table(garagepl_f))
# And add some colors
barplot(table(garagepl_f), col=c("orange", "steelblue", "red")) # Not great, but these plots (and variables) will get better
####################################################################
# Convert variables to their "correct" type in your data
####################################################################
# Manipulating dates
#=
# Dates are another important data type. But it is a bit of a process.
glimpse(bp_samp) # random sample of 10% of the entire building permits data
# NOTE(review): bp_samp is not created anywhere in this file -- presumably loaded
# separately from the building-permits download; confirm before running.
detach(realestate)
attach(bp_samp)
summary(ISSUE_DATE) #we'll focus on ISSUE_DATE
head(ISSUE_DATE) # not very useful
as.Date(head(ISSUE_DATE)) #oops. one more time (a format string is needed)
as.Date(head(ISSUE_DATE), format="%m/%d/%Y")
bp_samp$date <- as.Date(ISSUE_DATE, format="%m/%d/%Y") # but who wants to deal with these abbreviations?
# plus, what if we have date AND time? we use lubridate.
mdy(head(ISSUE_DATE)) # voila!
# now, let's assume we had a variable with time
sometimes <- c("1/12/2017 7:00:01 PM", "18/12/2017 7:00:20 AM", "23/12/2017 7:02:15 PM")
dmy_hms(sometimes, tz="UTC") # UTC is actually read automatically from my computer.
# We will learn more about time data when we get to time series analysis. But here's a quick trick:
bp_samp$yearq <- as.yearqtr(bp_samp$date) # as.yearqtr() is from the zoo package
head(bp_samp$date)
head(bp_samp$yearq)
####################################################################
# Convert at least one time variable in your data.
####################################################################
# Manipulating strings (characters)
#=
# We use a common procedure in computing languages, called regular expression (yes, even Excel uses it!)
# It offers a simple yet powerful way to handle messy data. It is used to find, extract, replace, and add characters.
cost <- ESTIMATED_COST[1:50]
cost
grep("$", cost) # row output -- note: an unescaped "$" is the end-of-string anchor, so this matches every element
grep("$", cost, value=T) # value output
grepl("$", cost) # logical output
grep("$50", cost)
grep("\\$50", cost) # We need to escape special symbols, such as $,.() etc.
grep("\\$50|\\$20", cost) # the | stands for "or"
grep("\\b500\\b", cost, value=T) # \\b \\b are string boundaries. we use these for exact match
# Let's replace strings
gsub("50", "IDS", cost)
gsub("\\$", "", cost) # remember to reassign the processed output and convert to the correct data type if needed
bp$ESTIMATED_COST <- as.numeric(gsub("\\$", "", bp$ESTIMATED_COST)) # NOTE(review): uses `bp`, not `bp_samp` -- confirm which data frame is intended
#################
# Find all variables containing odd characters in the data you downloaded.
# Replace these characters with blank or other characters, as needed.
# If you do not have any such characters in your data, find all "/" in the PERMIT_TYPE column and replace them with ":"
# Then, in the same column, remove all the spaces.
# Finally, convert the data into the correct data type (e.g. from character to numeric)
#################
# Tidying data with tidyverse (and some other tools)
#=
library('tidyverse') # this is an entire "universe" of packages, all are useful
# more on this, next time.
#=======================================
# BSAD 443, Session 2
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
# Do not distribute or use outside this class without explicit permission from the instructor.
#=======================================
# Assign a few objects
#Numeric vector
nums <- c(1,3.5,5)
result <-2^sqrt(91)
int <- c(1L,3L,5L) # Notice capitalized L to denote an integer
# Character vector
grades <- c("high", "low", "high") # Character vector
names <- c("Abby", "Ben", "Charlie") # NOTE(review): shadows base::names(); works here, but a different name is safer
mixed<- c(pi, 1, "sometext", TRUE) # Mixed vector, turns into character (coercion to the most general type)
# Logical vector
right <- c(TRUE, T, FALSE) # T and F are equivalent to TRUE and FALSE (but T/F can be reassigned -- prefer the full forms)
wrong <- 2>10
# Let's examine these vectors
typeof(nums)
class(int)
mode(result) # These three functions may be regarded as equivalent
str(grades) # Checks vector structure
str(wrong)
is.na(c(grades, mixed, NA)) # Returns a logical vector testing for NA
length(grades) # Returns the number of observations in a vector
length(c(grades, mixed)) # Pretty straightforward
nchar(names) # Counts the characters in each observation in a character vector.
# Access vector elements
nums[2] # Second element (original comment said "First")
nums[2:3] # Second to third
nums[c(1,3)] # First and third
sort(nums) # Order a vector (returns a sorted copy)
nums # unchanged -- sort() does not modify in place
order(nums) # Nearly the same but not exactly: returns the ordering indices, not the values
class(sort(nums)) # Different class, but for most purposes they are the same
class(sort(int)) # was class(sort(integer)): `integer` is a base function and sort() on it
                 # errors -- the `int` vector defined above was clearly intended
sort(nums, decreasing=T) # Reverse
# Change vector elements
char.new <- c(grades,names) # Input and combine elements and vectors
char.new <- c(char.new, "Alan", "Bo")
char.new <- c(names, "Alan", "Bo") # overwrites the previous char.new
nums[2]<-5 # Substitutes the second element
nums[c(1,3)]<-c(5,10) # Substitutes the first and third elements
nums<-c(nums,5,10)
# Change vector type
is.character(char.new) # First logical test for vector type
is.character(nums)
mixed
as.numeric(mixed) # Really did coerce a vector conversion. Turned non-numeric elements to NAs
grades
as.numeric(grades) # was as.numeric(value.mixed) -- `value.mixed` is never defined anywhere;
                   # coercing `grades` (printed on the previous line) demonstrates the same NA warning
is.factor(grades)
class(grades)
as.factor(grades)
grades # Remember to assign the coerced type
grades<-as.factor(grades)
levels(grades)
is.character(levels(grades)) # Yes, factor levels are characters
typeof(grades) # And factors are a special type of integer
# Missing data
num.new <- c(nums*nums,6,NA,9,NA)
mean(num.new) # NA -- missing values propagate
is.na(num.new)
mean(num.new, na.rm = T)
# Array and matrix
mat<- matrix(1,4,5) # 4x5 matrix filled with 1
mat1<- matrix(1:10,4,5) # Sequence 1-10, 4 rows, 5 columns (1:10 recycled to fill 20 cells)
mat2<- matrix(1:10, ncol=5,nrow=4) # Same code, but easier to understand
identical(mat1,mat2) # Yes, they are the same
mat[1:6] # linear (column-major) indexing of the first six cells
mat[1,c(1:3)] # The first row, data for the first three columns
is.matrix(mat) # Logical expression testing if it is a matrix structure
length(mat[,2]) # Length of the second column of the matrix
ncol(mat) # Number of columns
nrow(mat) # Number of rows
dim(mat) # Dimensions of the matrix (rows and columns)
# Data frame
dat <- data.frame(8:10, c("exec", "mid", "exec"))
dat <- data.frame(score = 8:10, role = c("exec", "mid", "exec")) # Better! This one has clear, understandable column names
dat<-data.frame(nums, char.new,stringsAsFactors=FALSE) # stringsAsFactors=FALSE turns vectors with strings into a character vector
dat<-data.frame(nums, busperf,char.new) # Oops. What is happening here?
# NOTE(review): deliberate error -- `busperf` is only defined in the answers section below
dat.mat <- data.frame(mat)
dat<-data.frame(dat, dept=c("sales",
"accounting", "sales", "IT", "RD"),stringsAsFactors=FALSE)
dat[1:6] # Oops (asks for six columns; dat has fewer -- deliberate)
dat[1,c(1:3)] # First row, column 1 through 3
dat[,c("dept","nums")] # Select only these columns, notice that you can change column order
paygrade<-c(1,3,2,1,3)
dat<-cbind(dat, paygrade) # Added the paygrade vector (column) into the dataframe
dat<-rbind(dat,c(5,"David", "logistics",2)) # Added a row (note: rbind with a character vector coerces numeric columns to character)
colnames(dat)<-c("Years", "Name", "Department", "Paygrade") # This is how you change column names of a dataframe
dat$Paygrade<-NULL # assigning NULL drops the column
# List
Years <- c(2, 3, 5)
Code<- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales<-list(Years, Code, SalesQuota) # list members may have different lengths
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3]<-F # Assigning FALSE to the third member (SalesQuota), and the third element within it
#=======================================
# BSAD 443, Session 1 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# Assign a few objects
#Numeric vector
nums <- c(1,3.5,5)
result <-2^sqrt(91)
int <- c(1L,3L,5L) # Notice capitalized L to denote an integer
# Character vector
grades <- c("high", "low", "high") # Character vector
names <- c("Abby", "Ben", "Charlie")
mixed<- c(pi, 1, "sometext", TRUE) # Mixed vector, turns into character
# Logical vector
right <- c(TRUE, T, FALSE) # T and F are equivalent to TRUE and FALSE
wrong <- 2>10
# Let's examine these vectors
typeof(nums)
class(int)
mode(result) # These three functions may be regarded as equivalent
str(grades) # Checks vector structure
str(wrong)
is.na(c(grades, mixed, NA, "NA", ".","")) # Returns a logical vector testing for NA (only the real NA is TRUE; "NA", ".", "" are just strings)
is.null(c(grades, mixed, "", NA, ".")) # Oops, this function behaves differently
is.null("") # Still not working
is.null(" ") # Nope
is.null(NULL) # Okay! But why?
class(NULL)
class(NA) # That's why. NULL is a class of its own. We used it, for example, to drop data.
length(grades) # Returns the number of observations in a vector
length(c(grades, mixed)) # Pretty straightforward
nchar(names) # Counts the characters in each observation in a character vector.
#### Answers to quiz
busperf<-c(9.5, 8, 9, 8.2, 7.1)
class(busperf)# numeric
mean(busperf) # 8.36
busperf<-c(busperf, 81,7)
mean(busperf) # 18.54
# Access vector elements
nums[2] # Second element (original comment said "First")
nums[2:3] # Second to third
nums[c(1,3)] # First and third
sort(nums) # Order a vector (returns a sorted copy)
nums # unchanged -- remember to assign if you want to keep the sorted order
order(nums) # Nearly the same but not exactly: returns the ordering indices
class(sort(nums)) # Different class, but for most purposes they are the same
class(sort(int)) # was class(sort(integer)): `integer` is a base function, so sort() errors;
                 # the `int` vector defined above was intended
sort(nums, decreasing=T) # Reverse
#### Answers to quiz
nothundred<-seq(4,100)
halfhundred<-nothundred[nothundred>50]
sort(halfhundred, decreasing=T)
# Change vector elements
char.new <- c(grades,names) # Input and combine elements and vectors
char.new <- c(char.new, "Alan", "Bo")
char.new <- c(names, "Alan", "Bo") # overwrites the previous char.new
nums[2]<-5 # Substitutes the second element
nums[c(1,3)]<-c(5,10) # Substitutes the first and third elements
nums<-c(nums,5,10)
#### Answers to quiz
num<-seq(9,99, by=3)
quantile(num) # 31.5 and 76.5
sd(num[num>20]) # 23.81176
# various ways to solve this one. Perhaps the easiest to understand:
n1<-num[num<51]
n2<-num[num>71]
n3<-c(n1,n2)
mean(n3) # 52.25
# Change vector type
is.character(char.new) # First logical test for vector type
is.character(nums)
mixed
as.numeric(mixed) # Really did coerce a vector conversion. Turned non-numeric elements to NAs
grades
as.numeric(grades) # was as.numeric(value.mixed) -- `value.mixed` is never defined anywhere;
                   # coercing `grades` (printed on the previous line) demonstrates the same NA warning
is.factor(grades)
class(grades)
as.factor(grades)
grades # Remember to assign the coerced type
grades<-as.factor(grades)
levels(grades)
is.character(levels(grades)) # Yes, factor levels are characters
typeof(grades) # And factors are a special type of integer
# Missing data
num.new <- c(nums*nums,6,NA,9,NA)
mean(num.new) # NA -- missing values propagate
is.na(num.new)
mean(num.new, na.rm = T)
#### Answers to quiz
busperf<-sort(busperf)
busperf[c(2)]<-7.9
busperf[busperf==9]<-NA # exact match replacement; fine here since 9 is stored exactly
# Array and matrix
mat<- matrix(1,4,5)
mat1<- matrix(1:10,4,5) # Sequence 1-10, 4 rows, 5 columns
mat2<- matrix(1:10, ncol=5,nrow=4) # Same code, but easier to understand
identical(mat1,mat2) # Yes, they are the same
mat[1:6] # linear (column-major) indexing
mat[1,c(1:3)] # The first row, data for the first three columns
is.matrix(mat) # Logical expression testing if it is a matrix structure
length(mat[,2]) # Length of the second column of the matrix
ncol(mat) # Number of columns
nrow(mat) # Number of rows
dim(mat) # Dimensions of the matrix (rows and columns)
#### Answers to quiz
mat<-matrix(1:4,nrow=6, ncol=2) # 1:4 is recycled three times to fill 12 cells
mat[5,2]
mat[5,2]<-"some text"
# All the data elements in the matrix have turned into characters
# Data frame
dat <- data.frame(8:10, c("exec", "mid", "exec"))
dat <- data.frame(score = 8:10, role = c("exec", "mid", "exec")) # Better! This one has clear, understandable column names
dat<-data.frame(nums, char.new,stringsAsFactors=FALSE) # stringsAsFactors=FALSE turns vectors with strings into a character vector
dat<-data.frame(nums, busperf,char.new) # Oops. What is happening here? (deliberate: lengths do not match/recycle)
dat.mat <- data.frame(mat)
dat<-data.frame(dat, dept=c("sales",
"accounting", "sales", "IT", "RD"),stringsAsFactors=FALSE)
dat[1:6] # Oops (asks for more columns than exist -- deliberate)
dat[1,c(1:3)] # First row, column 1 through 3
dat[,c("dept","nums")] # Select only these columns, notice that you can change column order
paygrade<-c(1,3,2,1,3)
dat<-cbind(dat, paygrade) # Added the paygrade vector (column) into the dataframe
dat<-rbind(dat,c(5,"David", "logistics",2)) # Added a row (coerces numeric columns to character)
colnames(dat)<-c("Years", "Name", "Department", "Paygrade") # This is how you change column names of a dataframe
dat$Paygrade<-NULL # assigning NULL drops the column
#### Answers to quiz
hr<-data.frame(
Experience=c(1,2,1,8),
Performance=c(10,8,7,10),
Employees=c(3,4,2,10),
stringsAsFactors=FALSE
)
hr<-data.frame(hr,
ID=c(12,13,14,15),
stringsAsFactors=FALSE)
hr[3,2]<-9
mean(hr$Performance) # 9.25
# List
Years <- c(2, 3, 5)
Code<- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales<-list(Years, Code, SalesQuota)
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3]<-F # Assigning FALSE to the third member (SalesQuota), and the third element within it
#=======================================
# BSAD 443, Session 3 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# List
Years <- c(2013, 2014, 2015)
Code<- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales<-list(Years, Code, SalesQuota)
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3]<-F # Assigning FALSE to the third member (SalesQuota), and the third element within it
# In case you want to, you can assign "column" names to a list like so:
# lapply(sales, setNames, c("Years", "Code", "SalesQuota"))
#### Answers to quiz
hacker<-data.frame(software=c("R","Python","SAS","R","SPSS"), years=c(4,3,10,4,1))
salary<-sample(85:150,20) # NOTE(review): no set.seed() before sample(), so results differ each run
hackersalary<-list(hacker,salary)
hackersalary[[2]][2]<-100
# Inspecting data
data(mtcars) # load data from R.
# If mtcars is not found do the following:
# install.packages('datasets')
# library(datasets)
# data(mtcars)
dim(mtcars) # Number of rows and columns
ncol(mtcars) # Number of columns
nrow(mtcars) # Number of rows
names(mtcars)
colnames(mtcars)
rownames(mtcars)
str(mtcars) # Shows the structure of each vector (column) in the data frame
# I can also use rownames(mtcars) to see rows, but this is less commonly used
head(mtcars) # First 6 rows of the data+column names
tail(mtcars,n=10) # View last 10 rows+column names. We can do the same with head
mtcars[1:10,2:6] # Rows 1-10, columns 2-6
set.seed(123) # We do this before every simulation so results are reproducible.
              # (The original line was a bare `set.seed` with no parentheses,
              # which only prints the function body and seeds nothing.)
smp<-sample(1:nrow(mtcars),10) # 10 random numbers from 1 to number of rows
mtcars[smp,] # Gives 10 random rows from the data
# View(df)  # Pops up the GUI viewer -- needs a data object; View() alone errors
# fix(df)   # To change (manually) -- likewise needs an object
View(mtcars[smp,]) # Using the viewer to examine a random sample
#### Answers to quiz
# install.packages(car) # If you don't already have it (note: the name should be quoted -- install.packages("car"))
library(car)
data(Freedman)
dim(Freedman)
str(Freedman)
rownames(Freedman) # Cities are row names
fix(Freedman) # Manually change the value (opens an interactive editor)
Freedman$population[3]<-600 # That's one way of doing it
set.seed(789)
smp<-sample(1:nrow(Freedman), 25)
Fsamp<-Freedman[smp,]
# Exploratory stats
# Univariate distributions
summary(mtcars) # Basic distribution stats on each numeric variable in the data frame
quantile(mtcars$mpg)
quantile(mtcars$mpg,seq(0.1,1,by=0.1)) # deciles
# Get additional univariate stats from the psych package:
# install.packages('psych')
# library(psych)
# describe(mtcars)
# Plotting distributions
par(mfrow=c(2,2)) #2x2 plots, so 4 plots at the same time
stripchart(mtcars$mpg)
dotchart(mtcars$mpg)
boxplot(mtcars$mpg) # Look for IQR (distance between 1-3 quantile) and median in the box, and min/max values outside
hist(mtcars$mpg) # Determining number of breaks should be decided
hist(mtcars$mpg, breaks=3)
hist(mtcars$mpg, breaks=12)
plot(density(mtcars$mpg)) # Density kernel
d<-density(mtcars$mpg)
str(d)
summary(d)
# Oops!
dev.off() # resets the plotting device (clears the 2x2 layout set above)
hist(mtcars$mpg)
lines(density(mtcars$mpg)) # Oops! (histogram is on a count scale, density is not)
hist(mtcars$mpg, freq=F, ylim=c(0,0.09)) # Better -- freq=F puts the histogram on a density scale
lines(density(mtcars$mpg), col="red")
# Now let's save it as a file
png(filename="mpgplot.png") # redirects plotting to a PNG file until dev.off()
hist(mtcars$mpg, freq=F, ylim=c(0,0.09), main="MPG Distribution")
lines(density(mtcars$mpg), col="red")
rug(mtcars$mpg, col="blue")
box()
dev.off() # closes the PNG device and writes the file
# Quick reference of inspection helpers. (The original lines here were bare
# fragments -- e.g. `View()`, `dim|ncol|nrow()`, a stray `decreasing=T),` -- that
# error or break parsing; kept as notes instead.)
# View(df)                       # GUI viewer
# fix(df)                        # edit a copy manually
# dim(df); ncol(df); nrow(df)    # dimensions
# head(df, n=10); tail(df, n=10) # first/last rows
# names(df); colnames(df); rownames(df)
mtc<-mtcars # work on a copy before hand-editing
fix(mtc)
mtcars<-mtcars[order(-mtcars$mpg),] # sort rows by mpg, descending
#=======================================
# BSAD 443, Session 4 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# Upload csv into R
failedbanks<-read.csv("http://www.fdic.gov/bank/individual/failed/banklist.csv", header=T) # Uploads failed banks data from FDIC into R.
# Instead of a URL it can be a local file e.g.: "y:/data/banklist.csv".
# Upload stock data (also csv)
install.packages('TTR') # installing from inside a script is for class demos only
library(TTR)
link <-"http://finance.yahoo.com/d/quotes.csv?s=AAPL+GOOG+MSFT&f=nab"
# NOTE(review): this Yahoo quotes endpoint has since been discontinued; expect the next line to fail today.
fin <- read.csv(link) # Oy, any ideas why? (hint: it's an easy fix)
# Upload text file into R
Apple2016Q1<-readLines("http://www.sec.gov/Archives/edgar/data/320193/000119312516559625/0001193125-16-559625.txt") # Uploads Apple's most recent 10-K filing.
# Query a database
install.packages(c("DBI", "RPostgreSQL")) # was install.packages("DBI","RPostgreSQL"):
# the second positional argument of install.packages() is `lib` (the install path),
# so "RPostgreSQL" was never installed -- the package names must go in one vector.
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
# NOTE(review): hard-coded credentials are for the class server only;
# never commit real credentials to a script.
con <- dbConnect(drv,
                 dbname="infs494",
                 host="147.126.64.66",
                 port=8432,
                 user="dataminer1", # use dataminer2-9
                 password="summer2016")
dbListTables(con)
dbListFields(con, "crime")
crime.db <- dbReadTable(con, "crime")
# Upload data into a database
dbExistsTable(con, "crime")
dbWriteTable(con, "crime.part1", crime.db[1:50,],
             row.names=FALSE)
test<-dbReadTable(con, "crime.part1")
dbDisconnect(con) # always release the connection
# Website scraping
# install.packages("rvest")
# scrape data from Wikipedia
library(rvest)
tab<-read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
tab<-html_table(tab, fill=TRUE) # fill=TRUE pads ragged table rows; returns a list of tables
tab1<-tab[[1]]
tab1<-data.frame(tab1)
tab2<-data.frame(tab[[2]])
tab2.cln<-tab2[3:nrow(tab2),] # drop the first two header-like rows
cols<-as.character(tab2[2,]) # the second row holds the real column names
colnames(tab2.cln)<-cols
# Scrape data from Yelp
library(httr)
library(rvest)
page<-read_html("http://www.yelp.com/search?find_desc=&find_loc=water+tower+chicago%2C+il&ns=1")
nds<-html_nodes(page, "address")
address<-html_text(nds) # Messy! We'll get to how to "clean" such data in the next session.
#=======================================
# BSAD 443, Session 5 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 5 #
#------------#
# Load messy data
# Figure out what's going on here
credanger<-read.csv("Credit complaints.csv", header=T)
# Messy column names: the real names ended up in the first data row
df <-data.frame(x.1=c("a",seq(1:4)),x.2=c("b",rnorm(4,mean=2,sd=.2)))
head(df)
names(df)
# colnames[df]<-df[1]            # Nope -- tries to subscript the colnames *function*; errors
# colnames[df]<-as.vector(df[1]) # Nope -- same problem (both kept as comments so the script can source)
mat<-as.matrix(df)
cn<-as.vector(mat[1,]) # pull the names out of row 1
colnames(mat)<-cn
df<-data.frame(mat)
# Or simply (for a small number of columns)
colnames(df) <- c("a", "b") # was `colnames(df)<("a", "b")` -- a syntax error that broke
                            # parsing; the colnames<- replacement form was intended
df<-df[-1,] # drop the first row, which held the names
df
# Incorrect column type
df <-data.frame(a=c("1*",seq(1:4)),b=c(letters[1:5]))
summary(df)
str(df)
# Input data and convert column to correct type
df[1,1] <- 1
df$a<-as.numeric(df$a)
# NOTE(review): under pre-R-4.0 defaults df$a is a factor, and as.numeric() on a factor
# returns the level codes, not the displayed digits -- as.numeric(as.character(df$a))
# is the safe form. Verify which R version the class uses.
df$b<-as.character(df$b)
str(df)
# Get rid of odd characters
c1<-c("111", "123*", "145.0")
class(c1)
as.numeric(c1) # Oops. Let's try again:
c1<-c("111", "123*", "145.0")
grep("\\.|\\*|\\,|\\+", c1, ignore.case = T) # Matches pattern
gsub("\\.|\\*|\\,|\\+", c1) # Nope -- gsub() requires a replacement argument; this errors (deliberate)
c1<-gsub("\\*|\\,|\\+", "", c1) # Aha!
c1<- as.numeric(gsub("([0-9]+).*$", "\\1", c1)) # (\\1 means first match). Done!
# Missing data
age <- c(20, 30, NA, -99)
mean(age) # We already know that this isn't going to work
is.na(age) # Logical
which(is.na(age)) # Gives location of the missing data
mean(age, na.rm=T) # Not that easy -- the -99 sentinel still biases the mean
which(is.na(age)|age<18) # So we need to use common sense, and we should also do this:
age[age==-99]<-NA
mean(age,na.rm=T)
#Time formatting
d<-c("2016/01/01", "2016/01/02", "2016/01/03", "2016/01/04")
class(d)
summary(d)
d<-as.Date(d,"%Y/%m/%d")
class(d) # New class: Date.
# But what if we want to extract year/month/day from a date?
# It's best to use a package. The fastest and best today is this one:
install.packages("lubridate") # was install.packages(lubridate): the package name
                              # must be a quoted string, or R looks for an object
                              # called `lubridate` and errors
library(lubridate)
d<-c("2016/01/01", "2016/01/02", "2016/01/03", "2016/01/04")
d1<-ymd(d) # That was easy! And these are easy (and very useful) too:
y<-year(ymd(d1))
m<-month(ymd(d1))
d<-day(ymd(d1))
w<-week(ymd(d1))
q<-quarter(ymd(d1))
#Merge data
A<-data.frame(loanID=c(20,21,22,23),gender=c("f", "m", "m","f"),
              amount=c(1000,5500,2000,6000))
B<-data.frame(loanID=c(23,21,22,20,24),years.employed=c(3,2,4,5,6))
AB<-merge(A,B, by="loanID") # inner join (only IDs present in both)
AB<-merge(A,B, by="loanID", all=T) # full outer join
AB<-merge(A,B, by="loanID", all.x = T) # left join (keep all of A)
AB<-merge(A,B, by="loanID", all.y = T) # right join (keep all of B)
AB<-merge(A,B, by="loanID", all=T)
#=======================================
# BSAD 443, Session 6 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 6 #
#------------#
# Read bank data
path<-file.choose()
bank<-read.csv(path, header=T)
attach(bank) # Puts the data frame on the search path, so you can type variable instead of df$variable.
# When you are finished: detach(df)
# Let's explore the data
dim(bank)
str(bank) # Seems to read it okay
summary(bank)
head(bank)
tail(bank)
# Review distribution of numeric variables
numcol<-sapply(bank, is.numeric) # sapply applies a function to a vector/list, in this case, is.numeric
numbank<-bank[numcol] # Copied the numeric variables so it would be easier to inspect them
sapply(numbank, mean)
sapply(numbank, sd) # All except age seem very skewed. Let's verify.
par(mfrow=c(2,2))
hist(numbank$age)
plot(density(numbank$age))
qqnorm(numbank$age)
qqline(numbank$age, col="red")
# Compared to normal distribution
n<-rnorm(1:200) # 200 standard-normal draws (rnorm(200) is the usual spelling)
qqnorm(n)
qqline(n, col="red")
dev.off()
# There is also a formal test
shapiro.test(numbank$age)
shapiro.test(n) # Well, age is skewed, and it is the best one of our numeric vars! This is common in the BA data.
# See an interesting discussion here: http://stats.stackexchange.com/questions/2492/is-normality-testing-essentially-useless
# Now let's inspect one more numeric variable (you'll do the rest on your own)
summary(numbank$balance) # We have a fair amount of negatives, and it does not seem normally distributed. What about sd?
sd(numbank$balance) # Much higher than the mean. It is definitely skewed.
par(mfrow=c(2,2))
hist(numbank$balance)
plot(density(numbank$balance)) # Yes, it is very skewed,
qqnorm(numbank$balance)
qqline(numbank$balance, col="red")
qqnorm(log(numbank$balance)) # Oops. Log doesn't work with negative numbers. What to do?
# First, how many accounts with negative balance?
tbl<-table(numbank$balance<0) # about 8%. To be exact, follow with prop.table(tbl)
# Do we remove them? No!
lbal<-log(numbank$balance+abs(min(balance))+1) # shift so the minimum becomes 1, then log (uses the attached `balance` column)
plot(density(lbal)) # Better, but not normally distributed. This is fine for now.
# Let's test the transformed vars against the response var:
boxplot(lbal~bank$y)
# One last effort
install.packages('extremevalues')
library(extremevalues)
go<-getOutliers(numbank$age)
go$iRight # indices flagged in the right tail
numbank$age[go$iRight]
# And for fun:
evGui(numbank$age) # interactive GUI from the extremevalues package
# We can then remove the outliers (rows), and normalize the age variable. We could also do box-cox transformation.
# But we'll leave it for now.
# Before we get to examine factors, we need to examine the relationship among the numeric vars
round(cor(numbank),2) # Nothing exciting here, except previous and pdays
# And visually:
pairs(numbank) # This could take some time
# And then the relationship with these vars and our y
boxplot(numbank$age~y) # The ages of those who responded may be a little more distributed (y comes from the attached bank)
boxplot(numbank$balance~y) # Hard to decipher
boxplot(lbal~y) # Ditto
# One more approach to consider is to cut the numeric data into ordinal categories:
catbalance<-cut(numbank$balance, breaks=c(quantile(bank$balance, seq(0,1, by=0.25))))
# NOTE(review): cut() excludes the lowest break, so the minimum balance becomes NA;
# add include.lowest=TRUE to keep it.
table(catbalance) # Equal sizes, but unclear labels
catbalance<-cut(bank$balance, breaks=c(quantile(bank$balance, seq(0,1, by=0.25))), labels=F)
tcbal<-table(catbalance,bank$y) # Not sure that this is ideal either. Consider recoding into sensible levels
prop.table(tcbal) # Let's round it
pcbal<-round(prop.table(tcbal),2)
addmargins(pcbal,c(1,2))
boxplot(catbalance~bank$y) # Interesting!
# And now for factors
# First, the response variable itself
table(y) # Shows frequencies
table(y)/nrow(bank)# Shows proportions
r<-round(table(y)/nrow(bank),2) # Easier to understand
pt<-r*100 # Even better in percentages
plot(bank$y) # Good old barplot, show 88% / 12%
barplot(r) # was `baplot(r)` -- typo; barplot() is the intended function for the table computed above
plot(bank$marital)
plot(bank$housing)
- # Do it for all variables. Which ones seem interesting?
- # Now let's explore bivariate relationships
- tmar<-table(bank$marital, bank$y)
- mtmar<-addmargins(tmar,c(1,2))
- plot(tmar) # Married seemed to be slightly less excited about this campaign
- # Or prop table
- ptmar<-prop.table(tmar)
- addmargins(ptmar, c(1,2))
- # Continue to explore these tables. What did you find?
- # What if we want more than 2-way tables?
- # ftable (flat table)
- ft<-ftable(bank$job,bank$marital, bank$education, bank$y)
- # Or write it this way:
- ftable(bank[c("y", "marital", "education", "y")]) # As many factors as you'd like
- # You can also use xtabs
- xt<-xtabs(~y+marital+education, data=bank) # More readable?
- prop.table(xt)
- # A better organized output with crosstabs
- install.packages('gmodels')
- library(gmodels)
- CrossTable(y=bank$y, x=bank$education) # Nice. There seem to be something going here
- CrossTable(y=bank$y, x=bank$education, chisq=T, format="SPSS") # I like this format,
- # and Chi-square test of independence, rejected. The two vars are related.
- # Finally, if you like pivot tables there are various options.
- # The reshape and plyr packages offer very good alternatives
- # And this one: http://www.magesblog.com/2015/03/pivot-tables-with-r.html cool interactive, web-based pivot tables
#=======================================
# BSAD 443, Session 7 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 7  #
#------------#
# A few bits of information:
# Bank data come from here: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
# This is how you drop rows when a certain column contains NAs
df<-df[!is.na(df$var),]
# or replace with avg.
df$var[is.na(df$var)] <- mean(df$var,na.rm=T)
# Decision trees
# Steps to follow (with any machine learning algorithm)
# 1. Load the data and clean the data if needed
# 2. Chop the data (usually 90-10, 80-20, 75-25, or 67-33)
# 3. Train
# 4. Model
# 5. Evaluate and fine-tune
#1. Load the data (We did this yesterday, in session 6), and randomize it
path<-file.choose()
bank<-read.table(path, header=T, sep=";")
set.seed(199) # Fixed seed so the shuffle is reproducible
bankrand <- bank[order(runif(nrow(bank))), ] # runif() produces a list of random numbers. This one shuffles the data randomly
bankrand<-bankrand[1:(nrow(bankrand)/10),] # I am using only 10% of the data. Otherwise, it will take a long time to run.
#To make sure the random algorithm got it right
tb<-table(bank$y) # FIX: was tb<-(bank$y); prop.table() needs a contingency table, not a raw factor
prop.table(tb)
tb1<-table(bankrand$y) # FIX: same as above
prop.table(tb1) # Looks pretty close. We're good to go.
#2. Chop the data into training (90%) and testing (10%) sets
trn<-1:(nrow(bankrand)*0.90)
tst<-(tail(trn,1)+1):nrow(bankrand)
bank_train <- bankrand[trn, ]
bank_test <- bankrand[tst, ]
# Just to be on the safe side. Looks pretty close. Good!
round(prop.table(table(bank_train$y)),2)
round(prop.table(table(bank_test$y)),2)
#3. Train
install.packages("C50") # There are a few more packages for CART. This one is considered among the best.
library(C50)
bank_model <- C5.0(bank_train[-17], bank_train$y) # Omits the outcome variable (column 17)
summary(bank_model) # FIX: removed a stray closing parenthesis (syntax error). Wow! Only 7% errors.
# plot(bank_model) # This will take a long time, but it is very informative
plot(bank_model, subtree=2)
#But don't jump to conclusions. Decision trees are known for
# having a tendency to overfit the model to the training data.
# For this reason, the error rate reported on training data may be overly optimistic, and it is especially important to evaluate decision trees on a test dataset.
# 4. Model
bank_pred <- predict(bank_model, bank_test)
library(gmodels)
CrossTable(bank_test$y, bank_pred, prop.chisq = F, prop.c = F, prop.r = F,
dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
# About 11% misses.
# 5. Evaluate and fine-tune
# The C5.0 algorithm lets us add additional trials.
# We'll start with 10 trials. That's the most common # of trials,
# as research suggests that this reduces error rates on test data by about 25 percent.
# It also selects the best size of the tree (e.g., "pruning")
# First, let's do some boosting
bank_boost10 <- C5.0(bank_train[-17], bank_train$y, trials = 10)
bank_boost10
summary(bank_boost10) # Summarizes iterations, tree size and classification performance.
plot(bank_boost10, subtree=2)
bank_boost_pred10 <- predict(bank_boost10, bank_test)
CrossTable(bank_test$y, bank_boost_pred10, prop.chisq = F, prop.c = F, prop.r = F,
dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
# Still not great, but we have improved the model by at least 1%
# Now let's reduce the size of the tree ("pruning")
bank_control <- C5.0(bank_train[-17], bank_train$y, control = C5.0Control(minCases = 20), trials=10)
summary(bank_control) # Avg. tree size is 9. That's because of our control parameter
plot(bank_control) # Having fewer nodes allows us to plot the tree model
bank_control_pred <- predict(bank_control, bank_test)
CrossTable(bank_test$y, bank_control_pred, prop.chisq = F, prop.c = F, prop.r = F,
dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
#### Now let's try doing the same with logistic regression
# We already did stage 1 and 2
# 3. Train
logit_train<-glm(y ~.,family=binomial(link = "logit"), data=bank_train)
summary(logit_train)
anova(logit_train, test="Chisq") # Model is clearly performing well when comparing null deviance against the residual deviance.
# 4. Test
logit_test <- predict(logit_train,newdata=bank_test,type='response') # FIX: was logit_mod, an undefined object; the fitted model is logit_train
# 5. Evaluate and fine-tune
logit_test <- ifelse(logit_test > 0.5,1,0) # Classify with a 0.5 probability cutoff
logit_test <- gsub("0", "no", logit_test) # Recode to match the factor levels of bank_test$y
logit_test <- gsub("1", "yes", logit_test)
logit_test <- as.factor(logit_test)
logitError <- table(logit_test == bank_test$y) # Counts of correct (TRUE) vs. incorrect (FALSE) predictions. About 10% errors. Not bad!
#=======================================
# BSAD 443, Session 8 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 8  #
#------------#
############# EVEN MORE FUNCTIONS #############
# 1. CONVERTING A NUMERIC VARIABLE INTO A FACTOR WITH DIFFERENT LEVELS
# Usually cut works, but not always
an<-cut(loans$annual_inc, breaks=3) # Oops.
# Solution 1, use cut2 from the Hmisc package
install.packages("Hmisc") # FIX: the package name must be a quoted string; install.packages(Hmisc) errors because no object called Hmisc exists
library(Hmisc)
ai<-cut2(loans$annual_inc,m=5000) # At least ~5000 observations per group
# Solution 2, (1) convert to integer and then to character. (2) Then match and convert strings into different categories
ai<-as.character(as.integer(loans$annual_inc)) # Converts all non-integers into integers
ai[ai %in% c(1:62410)]<-"low" # I can now select whatever categories I want
ai[ai %in% c(62411:7446395)]<-"high" # This one could be slow because it matches a large number of strings
# 2. BIVARIATE CORRELATIONS
# When missing values are present
cor(loans$loan_amnt, loans$annual_inc) # Gives NA
cor(loans$loan_amnt, loans$annual_inc, use="complete.obs") # 0.3 They are correlated
# Plot correlations
plot(loans$loan_amnt, loans$annual_inc) # Generates a scatterplot that is very messy, and we have an outlier
identify(loans$loan_amnt, loans$annual_inc) # Click the outlier interactively to get its row index. It's 19218
loans<-loans[-19218,] # Drop the outlier row
plot(loans$loan_amnt, loans$annual_inc)
plot(log(loans$loan_amnt), log(loans$annual_inc)) # FIX: added the missing closing parenthesis (syntax error)
abline(lm(log(loans$annual_inc)~log(loans$loan_amnt)), col="red") # This adds a fitted line from a regression model.
#Note that the order of the vars has flipped and note the tilde.
lines(loess.smooth(log(loans$loan_amnt), log(loans$annual_inc)),col="green") # This is a "smoother" which fits a line that may or may not be straight
# 3. DATA SLICING AND AGGREGATION
# Base R has an aggregate function, but it is overly complicated. We'll use dplyr instead.
library(dplyr)
chisal<-read.csv(path, header=T) # Read Chicago's salary data (path chosen earlier)
head(chisal)
dim(chisal)
# Strip the dollar sign so the salary parses as a number.
# NOTE(review): if values also contain thousands separators (commas), as.numeric would
# still return NA -- confirm against the raw data
chisal$Employee.Annual.Salary<-as.numeric(gsub("\\$","", chisal$Employee.Annual.Salary))
names(chisal)
# Slice the data
slice(chisal, c(100:110, 5:50)) # Shows these rows
head(arrange(chisal, desc(Employee.Annual.Salary))) # Orders the data by salary in a descending order
head(select(chisal, Department, Employee.Annual.Salary)) # Selects only these two columns, in this order
head(select(chisal,-(c(Name,Department)))) # Drops these two columns
highpaychipol<-filter(chisal, Department=="POLICE", Employee.Annual.Salary>100000) # Select particular cases
nrow(sample_frac(chisal, 0.1)) # Random sample of 0.1 (10% of rows)
# Aggregate data
by_dept <- group_by(chisal, Department)
sals<-summarize(by_dept, count=n() , avg.salary=mean(Employee.Annual.Salary, na.rm=T))
sals.bigdept<- filter(sals, count > 1000)
# More information, including additional features: https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
# 4. SAVE DATA FRAME
write.csv(df, "newfile.csv", row.names=F) # This will save a dataframe called df as a CSV file called newfile.csv in my working directory.
# 5. MANAGE WORKSPACE
save.image() # Will save all objects in R's session
save(loans, chisal, path, file="test.RData") # Will save the objects called loans, chisal and path into the file test.RData
rm(list=ls(all=TRUE)) # Removes ALL objects in R
load(file="test.RData") # Will open the objects stored in the test.RData file
############# PLOTS #############
install.packages("ggplot2")
x<-seq(-4,9,0.2)
y<-2*x^2+4*x-2 # A simple quadratic to plot
plot(x,y)
plot(x,y, pch=16, col="red")
# pch are symbols. See: http://www.statmethods.net/advgraphs/parameters.html
plot(x,y, pch=16, col="red", xlim=c(-5, 10), ylim=c(-5,210), main="Nice plot!", xlab="The X", ylab="The Y")
lines(x,y, col="green") # Adds a line
abline(h=1) # Horizontal reference line
abline(v=-0.5) # Vertical reference line
text(-0,50,"Intresting!") # Annotation at (0, 50)
legend(-5,100,"Legend here")
# Basic plot types in R: p=points; l=lines; o=overplotted points and lines; b, c=joined by lines; s, S=stair steps; h=histogram-like vertical lines; n=nothing/empty
colors() # Color names (657)
grep("green", colors(), ignore.case = T, value = T) # All variations of "green"
# Or use color codes instead, using Hexadecimal code: http://html-color-codes.com/
# Add multiple trendlines
X<-c(1,2,3,4,5,6,7)
Y1<-c(2,4,5,7,12,14,16)
Y2<-c(3,6,7,8,9,11,12)
Y3<-seq(1,30, by=3)
plot(X,Y1,type="o",pch=17,cex=1.2,col="darkgreen",ylim=c(0,20)) # Initiates the plot
lines(Y2,type="o",pch=16,lty=2,col="blue")
lines(Y3,type="o",pch=2,lty=4,col="darkblue")
title(main="Three vars", col.main="darkgray", font.main=4)
# https://en.wikibooks.org/wiki/R_Programming/Graphics
# Good scripts to convert tables/descriptives to plots
#http://tables2graphs.com/doku.php?id=03_descriptive_statistics#figure_1
# ggplot2 (qplot)
# http://www.statmethods.net/advgraphs/ggplot2.html
# http://www.cookbook-r.com/Graphs/
library(ggplot2)
data("diamonds")
sdiam<-sample_frac(diamonds, 0.01) # Select a random sample of 1%
qplot(carat, price, data = sdiam)
qplot(log(carat), log(price), data = sdiam)
# Now let's add colors. Note how easy it is!
qplot(carat, price, data = sdiam, shape=cut)
qplot(carat, price, data = sdiam, color=cut)
qplot(carat, price, data = sdiam, color = I("blue"))
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3)) # 0=transparent; 1=not
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3), size = I(5)) # Bigger
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3), size = I(5), shape=I(3))
# Shapes are: 1=circle; 2=triangle; 3=plus; 4=cross; 5=diamonds
# You can combine multiple qplot types with c(), as in geom = c("smooth", "point")
qplot(carat, price, data = sdiam, geom = c("smooth", "point")) # FIX: removed trailing comma -- an empty trailing argument is an error in a function call
# NOTES
# qplot can generate other types of plots (the geom parameter)
# geom = "point" scatterplot, default for x,ylab
# geom = "smooth" fits a smoother to the data and displays the smooth and its standard error
# geom = "boxplot" produces a box-and-whisker plot to summarise the distribution of a set of points
# geom = "path" and geom = "line" draw lines between the data points
# Ordinal/continuous vars geom = "histogram"; geom ="freqpoly" a frequency polygon, and geom = "density" creates a density plot
# For discrete variables, geom = "bar" makes a bar chart
# Distribution plots
# Continue using the recipe found here: http://www.cookbook-r.com/Graphs/Plotting_distributions_%28ggplot2%29/
# NOTE(review): binwidth=.5 is likely far too narrow for account balances / credit limits,
# producing thousands of bins -- confirm against the data's range
ggplot(bankrand, aes(x=balance)) + geom_histogram(binwidth=.5)
ggplot(loans, aes(x=tot_hi_cred_lim)) + geom_histogram(binwidth=.5, color="black", fill="blue")
ggplot(loans, aes(x=tot_hi_cred_lim)) + geom_density()
# Correlation plots
numcol<-sapply(loans, is.numeric) # Logical mask of numeric columns
nl<-loans[,numcol]
nl1<-nl[,5:10] # It is hard to interpret many vars. You can select a few at a time.
ctab<-round(cor(nl1, use="complete.obs"),2)
# FIX: plotcorr() lives in the 'ellipse' package, which was never installed/loaded
install.packages('ellipse')
library(ellipse)
plotcorr(ctab, mar = c(0.1, 0.1, 0.1, 0.1))
# Do the same, but with colors corresponding to value
colorfun <- colorRamp(c("#CC0000","white","#3366CC"), space="Lab")
# ctab is in [-1,1]; (ctab+1)/2 rescales to [0,1], which is what colorRamp expects
plotcorr(ctab, col=rgb(colorfun((ctab+1)/2), maxColorValue=255), mar = c(0.1, 0.1, 0.1, 0.1))
# (Same plot repeated, just with the call split across two lines)
colorfun <- colorRamp(c("#CC0000","white","#3366CC"), space="Lab")
plotcorr(ctab, col=rgb(colorfun((ctab+1)/2), maxColorValue=255),
mar = c(0.1, 0.1, 0.1, 0.1))
# INTERACTIVE/WEB PLOTS
# From: http://ramnathv.github.io/rCharts/
require(devtools)
install_github('ramnathv/rCharts') # rCharts is GitHub-only, not on CRAN
names(iris) = gsub("\\.", "", names(iris)) # Drop the dots in the iris column names
rPlot(SepalLength ~ SepalWidth | Species, data = iris, color = 'Species', type = 'point')
data(economics, package = "ggplot2")
econ <- transform(economics, date = as.character(date))
m1 <- mPlot(x = "date", y = c("psavert", "uempmed"), type = "Line", data = econ)
m1
hair_eye_male <- subset(as.data.frame(HairEyeColor), Sex == "Male")
n1 <- nPlot(Freq ~ Hair, group = "Eye", data = hair_eye_male, type = "multiBarChart")
n1$save('mychart.html', standalone = TRUE) # Generates a webpage with embedded interactive chart
# Or publish online
n1$publish('Scatterplot', host = 'gist') # e.g., github
# NOTE(review): library(rCharts) is never called; presumably required before rPlot/mPlot/nPlot -- confirm
#=======================================
# BSAD 443, Session 9 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
#------------#
# Session 9  #
#------------#
######## FROM EXCEL ########
install.packages("readxl")
library(readxl) # FIX: the package was installed but never loaded, so read_excel() would fail
nasa<-read_excel("NASA_Labs_Facilities.xlsx")
read_excel("my-spreadsheet.xls", sheet = "data") # Read a sheet by name
read_excel("my-spreadsheet.xls", sheet = 2) # Or by position
# Write to Excel using XLConnect: https://cran.r-project.org/web/packages/XLConnect/vignettes/XLConnect.pdf
######## WORDCLOUD ########
install.packages(c("wordcloud","RColorBrewer"))
library(wordcloud)
library(RColorBrewer)
pal <- brewer.pal(4,"RdGy") # This means palette 4. Try other ones.
wordcloud("R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis. Polls, surveys of data miners, and studies of scholarly literature databases show that R's popularity has increased substantially in recent years."
, min.freq = 1, scale=c(2,0.5), random.color = T, color = pal) # Word cloud from the first paragraph of Wikipedia's entry on R
# Now you try it with other text. For example:
# Compare annual filings (try to read only 100 lines, or so. Otherwise, it can get slow):
apple94<-readLines("http://www.sec.gov/Archives/edgar/data/320193/0000320193-94-000016.txt")[100:200]
apple2000<-readLines("http://www.sec.gov/Archives/edgar/data/320193/000091205700053623/0000912057-00-053623.txt")[100:200]
# Use the same procedure to plot Trump's address:
# http://blogs.wsj.com/washwire/2015/06/16/donald-trump-transcript-our-country-needs-a-truly-great-leader/
# (FIX: the two URLs above/below were bare lines, which is a syntax error; they are now comments)
# And Clinton's speech:
# http://time.com/3920332/transcript-full-text-hillary-clinton-campaign-launch/
# And whatever you think could be interesting/fun. Project it on the screen so that all can enjoy!
######## TWITTER ########
# 1. http://twitter.com/signup #first sign up. You'd need to provide a mobile number.
# 2. https://apps.twitter.com (sign in with your account)
# 3. Create New App
# 4. Put some information (it doesn't have to be relevant/accurate), and accept
# 5. Click on the "keys and access tokens" tab: You need
#1. Consumer Key (API Key)
# 2. Consumer Secret (API Secret)
# click the "Create my access token" button.
# 3. Access Token
# 4. Access Token Secret
# In R:
install.packages('twitteR')
install.packages('tm')
library(twitteR)
library(tm)
consumer_key <- "" #API Key
consumer_secret <- "" #API Secret
access_token <- "" #Access Token
access_secret <- "" #Access token secret
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) # Select 1
rm(consumer_key, consumer_secret, access_token, access_secret) # Don't leave credentials lying around in the workspace
tweet("Test 1,2,3") # Can you see it?
local_cache <- get("oauth_token", twitteR:::oauth_cache) # FIX: was "oauth_tken" (typo); saves the oauth token so we can reuse it
save(local_cache, file="data/oauth_cache.RData")
# Harvest tweets
Clinton <- searchTwitter("Clinton", n=1500)
length(Clinton)
# Take a few of them
cl<-Clinton[1:100]
# Process text
cl <- sapply(cl, function(x) x$getText()) # Extract the text of each status object
cl_corpus <- Corpus(VectorSource(cl))
cl_corpus<- tm_map(cl_corpus, content_transformer(tolower))
cl_corpus <- tm_map(cl_corpus, removePunctuation)
cl_corpus <- tm_map(cl_corpus, function(x)removeWords(x,stopwords()))
# Plot wordcloud
wordcloud(cl_corpus)
# Search for other terms
# Apply the wordcloud parameters above and share with class
######## DATA MINING ########
wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
# Data are already randomized. We're going to cut it 75/25
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
# The outcome variable is numeric (quality), so we'll use the rpart decision tree algorithm
install.packages('rpart')
library(rpart)
# Build the model (regression tree: quality predicted from all other columns)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
# use the rpart.plot package to create a visualization
install.packages('rpart.plot')
library(rpart.plot)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
# generate predictions for the testing dataset
p.rpart <- predict(m.rpart, wine_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(wine_test$quality)
# compare the correlation
cor(p.rpart, wine_test$quality) # Highly correlated
# Mean absolute error: average magnitude of the prediction errors.
# Arguments: actual, predicted -- numeric vectors of equal length.
# Returns a single number (NA propagates if either vector contains NA).
MAE <- function(actual, predicted) {
  errors <- abs(actual - predicted)
  sum(errors) / length(errors)
}
# mean absolute error between predicted and actual values
MAE(p.rpart, wine_test$quality)
# mean absolute error between actual values and mean value
# NOTE(review): this line only computes the training mean; the "result = 0.59" in the
# comment presumably comes from MAE(mean(wine_train$quality), wine_test$quality) -- confirm
mean(wine_train$quality) # result = 0.59 quality units when the scale is 0-10. Not bad!
# There are ways to fine-tune the model. But that's for another class.
#=======================================
# BSAD 443, Session 9 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================
# ds1
# 1.
unirank<-read.csv(path, header=T, stringsAsFactors = F)
unirank$score<-as.numeric(unirank$score)
round(mean(unirank$score, na.rm=T),2)
# 47.79
round(range(unirank$score, na.rm=T),2)
# 43.36 100.00
round(IQR(unirank$score, na.rm=T),2)
# 3.07
# 2.
library(dplyr)
unirank$patents<-as.numeric(unirank$patents)
by_country <- group_by(unirank, country)
pat<-summarize(by_country, avg.patents=mean(patents, na.rm=T))
# pat[country=="USA"|country=="Japan"]  # FIX: commented out -- 'country' is not an
# object in this scope, so this line errors at runtime; the correct form is below
pat[pat$country=="USA"|pat$country=="Japan",]
# Japan 353.3962
# USA 296.2792
# 3.
# easy way
cor(unirank$publications, unirank$influence, use="complete.obs")
# 0.8748249
cor(unirank$publications, unirank$patents, use="complete.obs")
# 0.6713219
unirank$influence<-as.numeric(unirank$influence)
cor(unirank$influence, unirank$patents, use="complete.obs")
# 0.6111321
# a bit harder (but better) way
tmp<-unirank[,c("influence", "patents", "publications")]
cor(tmp, use="complete.obs")
# influence patents publications
# influence 1.0000000 0.6111321 0.8747273
# patents 0.6111321 1.0000000 0.6709847
# publications 0.8747273 0.6709847 1.0000000
# They are very highly correlated, just to illustrate:
cor.test(tmp$influence, tmp$publications,use = "complete.obs" )
plot(tmp$influence, tmp$publications)
# FIX: the regression must use the same x variable as the plot (influence), not patents,
# otherwise the fitted line does not belong to the scatterplot being shown
abline(lm(tmp$publications~tmp$influence),col="red", lwd=3)
# ds2
# 1.
cars<-read.csv(path, header=T, stringsAsFactors = F)
# Strip thousands separators before converting to numeric
cars$year<-as.numeric(gsub("\\,","",cars$year))
cars$price<-as.numeric(gsub("\\,","",cars$price))
cars$mileage<-as.numeric(gsub("\\,","",cars$mileage))
d<-as.Date(cars$date, "%d-%b-%y")
library(lubridate)
cars$month<-month(d)
summer<-cars[cars$month>5,] # Months 6-12
sp<-mean(summer$price, na.rm=T)
othermonths<-cars[cars$month<6,] # Months 1-5
op<-mean(othermonths$price, na.rm=T)
op-sp
# 3835.104
# 2.
by_color <- group_by(cars, color)
color<-summarize(by_color, freq=n()) # FIX: removed na.rm=T -- n() takes no arguments; the stray na.rm=T silently became a bogus constant column
color[order(color$freq),]
# least: Gold, Yellow Most: Black, Silver
hotcoldcars<-cars[(cars$color=="Black"|cars$color=="Silver"|cars$color=="Gold"|cars$color=="Yellow"),]
nrow(hotcoldcars)
# 70
# 3.
my<-max(cars$year) # Newest model year in the data
cars$age<-my-cars$year
# 4.
plot(cars$mileage, cars$age)
abline(lm(cars$age~cars$mileage), col="red", lwd=2)
cor(cars$mileage, cars$age)
# 0.7603127
#------------#
# Homework 2 #
#------------#
# Task 1
# 1. character, numeric and logical
# 2. for example, class() or str()
# 3.
# [1] FALSE  (FIX: these two console-output lines were not commented, which is a syntax error)
# [1] TRUE
# 4. The first one is false because 10 is numeric. The second one is true because as.character converted the 10 into a character.
# Task 2
#1. It doesn't work because to extract a data element, you need to use brackets instead
# of parentheses. Parentheses are for functions.
#2.
v<-c(1,2,3,9,15)
v<-as.factor(v)
# Simple way
v[v==1]<-NA
v[v==15]<-NA
# Short way
v[v==1|v==15]<-NA # FIX: was &, but no element equals 1 AND 15 at once, so & never selected anything; | is the correct operator
# Then
levels(v)
droplevels(v)
# Task 3
# 1.
department<-data.frame(age=c(23,30,45), gender=c("M", "F", "M"), level=c(3,6,8), stringsAsFactors = FALSE)
# 2.
# NOTE(review): c(31,"F",7) is a character vector, so rbind() coerces the numeric
# columns (age, level) to character here -- confirm this is acceptable for the exercise
department<-rbind(department, c(31,"F", 7))
# 3.
department<-cbind(department, sales=c(10,9,9,8))
# 4. Keep only these two columns
snapshot<-department[,c("age","sales")]
# 5.
snapshot[snapshot$age>=31,]
# 6.
# There are various ways of doing this. For example:
# You can say which rows to drop
snapshot[-c(2),] # Drop this row
# or
delrows<-c(2)
snapshot[-delrows,]
# You can also say which rows to keep
snapshot[c(1,3:4),] # Keep other rows
# or
keeprows<-c(1,3:4)
snapshot[keeprows,]
# 7.
# Again, there are various ways of doing this. For example:
# Use the NULL option (either line alone removes the column; the second is a no-op after the first)
snapshot[,c("age")]<-NULL
snapshot$age<-NULL
# Or declare a column to keep as a vector, and then refer to this vector.
#The vector can be the name or number of the column.
keepcols<-c("sales")
snapshot[keepcols]
# Or declare which column to drop as a vector, and then refer to this vector.
# The vector can be the name or number of the column.
dropcols<-1
snapshot[-dropcols]
#------------#
# Homework 3 #
#------------#
# Task 1
# 1. A list holding a numeric vector (named ID) and a data frame
lst<-list(ID=seq(1,5), data.frame(name=c("a", "b"), age=c(55,42)))
# 2. Replace the second element of the ID vector with 10
lst[[1]][2]<-10
# 3. Extract row 1, column 2 (the age in the first row) from the data frame inside the list
lst[[2]][1,2]
# Task 2
# 1.
install.packages('car')
library(car)
data(Anscombe)
# 2.
# Multiple ways to do this. This one is simplest:
quantile(Anscombe$education) # Note 25th quantile is 165
low<-rownames(Anscombe[Anscombe$education<=165,]) # Creating a vector makes it easier moving forward
# 3.
sort(low)
# "AL" "AR" "GA" "KY" "LA" "MS" "NC" "NE" "OK" "SC" "TN" "TX" "WV"
# 4. Education spending relative to income
relspend<-Anscombe$education/Anscombe$income
# 5. For example,
mean(relspend)
sd(relspend)
# 6. For example,
hist(relspend)
plot(density(relspend))
# 7. Yes
# 8.
Anscombe$relspend<-relspend
# 9. States spending above the median relative amount
hispenders<-Anscombe[Anscombe$relspend>median(Anscombe$relspend),]
#------------#
# Homework 4 #
#------------#
# Task 1
# Much of the first stage can be done within Excel prior to uploading to R. But here is the way to do it all in R.
# 1. Upload file
fl13<-file.choose()
fbi13<-read.csv(fl13, header=T)
# 2. Data wrangling
head(fbi13) # Need to remove first couple of lines and change column names, still I only care about city name, population and violent crime
fbi13<-fbi13[,c(1:3,10)] # Keep only the columns I need. It will save time.
fbi13<-fbi13[3:nrow(fbi13),] # Drop the first two header/junk rows
tail(fbi13) # Oops. Lots of junk there too. Let's see how far it goes:
tail(fbi13, n=20) # Okay, action starts at line 9295. Let's drop anything after it.
fbi13<-fbi13[-c(9295:nrow(fbi13)),] # Multiple ways of doing this. We can also just declare the rows we want to keep, as opposed to those we want to drop.
colnames(fbi13)<-c("state", "city", "population", "property.crimes") # still need to drop the first row--the one containing original column names.
fbi13<-fbi13[-1,] # Good!
# Variable wrangling: strip thousands separators, then convert to numeric
fbi13$population<-gsub("\\,", "", fbi13$population)
fbi13$population<-as.numeric(fbi13$population)
fbi13$property.crimes<-gsub("\\,", "", fbi13$property.crimes)
fbi13$property.crimes<-as.numeric(fbi13$property.crimes)
# 3. Select big cities
big.cities<-fbi13[fbi13$population>200000,]
View(big.cities) # There are some issues here.
# 1. State has little role to play here, so let's drop it.
# 2. A few rows contain NA values. We need to drop them.
# 3. Some city names have numbers next to them (this was originally a footnote). We need to get rid of these.
# Here we go:
# 1. Remove state column:
big.cities$state<-NULL
# 2. Remove rows with NAs:
big.cities<-big.cities[complete.cases(big.cities),]
# or (equivalent)
big.cities<-na.omit(big.cities)
# 3. Remove numbers and special characters from city names
big.cities$city<-gsub("\\d","",big.cities$city) # \\d removes digits from a string
big.cities$city<-gsub("[[:punct:]]","" , big.cities$city) # removes special characters from city names
# 4. Identify cities with the lowest crime rates
big.cities$crime.ratio<-big.cities$property.crimes/(big.cities$population/1000) # crimes per 1,000 residents
# We can also round it, but this is not needed. It just looks nicer.
big.cities$crime.ratio<-round(big.cities$crime.ratio,2)
# Now let's sort the data
big.cities<-big.cities[order(big.cities$crime.ratio),]
big.cities$city[1:10] # The ten safest big cities
# In case you are wondering, Chicago is here
grep("chicago", big.cities$city, ignore.case=T) # 42 out of 112
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement