BSAD 443 - Business Analytics - Zack Kertcher

#=======================================
# BSAD 443, Session 1
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

# OLS regression model
#====

model <- lm(price ~ ., realestate) # the period means "all other variables"
summary(model)


# But first, a bit of R
#====

# Orientation
#=
getwd() # shows us where the working directory is
setwd("c:/temp") # sets the working directory to the "temp" folder on drive "c" -- note the direction of the slash!
dir.create('BSAD443') # creates a directory called BSAD443 under temp. Note that R, for the most part, accepts both single and double quotation marks
setwd("BSAD443")
getwd()
# [1] "c:/temp/BSAD443"

# Install a package
#=
# use RStudio's GUI, or
install.packages("dplyr")
library(dplyr) # makes all functions in dplyr available, or
# dplyr::  # typing this in RStudio brings up a list of the package's functions to choose from

# Basic operations
#=
5+6
# [1] 11
8*7
# [1] 56
10*5 / 0.5
# [1] 100

# Vectors
a <- 5+6
b <- 8*7
c <- a+b
c

x <- "BSAD443"
y <- "is where you will learn that"
z <- "equals"
something <- c(x, y, a, "plus", b, z, c) # c is a function; it stands for combine, or concatenate
length(something) # length is also a function
# [1] 7 - there are 7 elements in the "something" vector


# Classes and structures
#=
is.vector(something)
is.vector(something)==FALSE
class(something) # class is also a function
class(c)
class(a>b)
a>b
?class # help on a function
help(package="ggplot2") # help on a package (along with its functions)

# Environment
#=
ls() # shows a list of objects that were saved into the R environment (also see them in the top-right pane)
save(a,b,x,y, file="firststeps.Rdata")
rm(list=ls())
ls()
# character(0)
load("firststeps.Rdata")
ls()
# [1] "a" "b" "x" "y"

# Classes/data types
#=
a <- 5
b <- "hello"
c <- FALSE

class(a)
# [1] "numeric"
class(b)
# [1] "character"
class(c)
# [1] "logical"

is.logical(a)
is.numeric(a)==FALSE
class(a)!=class(c)


# Functions (note that R is a functional programming language)
a1 <- 1:10
a2 <- seq(1:10)
identical(a1,a2)
?identical
?seq
a3 <- seq(from=1, to=10, by=0.01)
round(a3,1)
a3 # oops. don't forget to assign!
a3 <- round(a3,1)
head(a3)
tail(a3)

# Load data and summary stats
#=
realestate <- read.csv("realestate.csv", header=T)

# but let's import through RStudio's interface
head(realestate)
tail(realestate)
dim(realestate)
str(realestate)
summary(realestate)
# We need to fix the data types of all the factor (category) variables
realestate$driveway <- as.factor(realestate$driveway)
realestate$recroom <- as.factor(realestate$recroom)
realestate$fullbase <- as.factor(realestate$fullbase)
realestate$gashw <- as.factor(realestate$gashw)
realestate$airco <- as.factor(realestate$airco)
realestate$garagepl <- as.factor(realestate$garagepl)
realestate$prefarea <- as.factor(realestate$prefarea)
summary(realestate) # better!
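
# The seven as.factor() calls above can be written more compactly (a sketch, assuming the same column names):
factor_cols <- c("driveway","recroom","fullbase","gashw","airco","garagepl","prefarea")
realestate[factor_cols] <- lapply(realestate[factor_cols], as.factor)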


#====================================
# Business Analytics, Session 2
#====================================
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
# Quinlan School of Business - Loyola University Chicago
# Do not distribute or use outside this class without explicit permission from the instructor.

# Load packages
#-
library('readxl') # for reading Excel files
library('lubridate') # for handling dates
library('zoo') # for handling dates

####################################################################
# Go to the City of Chicago Data Portal (https://data.cityofchicago.org/)
# Feel free to check out NYC and LA as well.
# Select a data set that is collected over time. Download it as a CSV or Excel file.
# Note the location of the downloaded file on your computer.
####################################################################

# Loading data
#=
# Know where your file is located and its name (e.g., desktop, user/yourname/data/file.csv)

# To read a CSV file
realestate <- read.csv("realestate.csv", header=T, stringsAsFactors=F)

# Use readr::read_csv if you have a large CSV file (over 50MB)
realestate1 <- readr::read_csv("realestate.csv")

# Or, easier, but not great for replication
realestate1 <- readr::read_csv(file.choose())

# Or, even easier, but worse in terms of replication:
# use RStudio's interface

# Now for Excel
endowments <- read_excel(file.choose())
dummy <- read_excel(file.choose(), sheet=2)

# Inspect the data
#=
realestate
realestate1
names(realestate)
head(realestate)
tail(realestate, n=10)
dim(realestate) # rows and columns
View(realestate)
View(endowments)

####################################################################
# Load the previously downloaded data into R. Inspect the data.
####################################################################

# Data types
#=
str(realestate)
summary(realestate)
glimpse(realestate)

# Changing data types
#-
attach(realestate)
summary(garagepl)
hist(garagepl) # not a "good" numeric variable. Let's turn it into a factor
garagepl_f <- as.factor(garagepl) # saving as a new column to be on the safe side
# other common options: as.numeric, as.character

# Let's see what we've got:
summary(garagepl_f)
plot(garagepl_f) # We will learn much more about plots. This is a not-so-great sneak preview
# Same as
barplot(table(garagepl_f))
# And add some colors
barplot(table(garagepl_f), col=c("orange", "steelblue", "red")) # Not great, but these plots (and variables) will get better

####################################################################
# Convert variables to their "correct" type in your data (a sketch follows)
####################################################################
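
# e.g., a hedged sketch (the data frame "mydata" and its character column "ward" are hypothetical placeholders):
# mydata$ward <- as.factor(mydata$ward)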

# Manipulating dates
#=
# Dates are another important data type, but handling them is a bit of a process.
glimpse(bp_samp) # a random sample of 10% of the entire building permits data
detach(realestate)
attach(bp_samp)
summary(ISSUE_DATE) # we'll focus on ISSUE_DATE
head(ISSUE_DATE) # not very useful
as.Date(head(ISSUE_DATE)) # oops. one more time
as.Date(head(ISSUE_DATE), format="%m/%d/%Y")
bp_samp$date <- as.Date(ISSUE_DATE, format="%m/%d/%Y") # but who wants to deal with these abbreviations?
# Plus, what if we have date AND time? We use lubridate.
mdy(head(ISSUE_DATE)) # voila!
# Now, let's assume we had a variable with time
sometimes <- c("1/12/2017 7:00:01 PM", "18/12/2017 7:00:20 AM", "23/12/2017 7:02:15 PM")
dmy_hms(sometimes, tz="UTC") # the tz argument defaults to UTC when not supplied
# We will learn more about time data when we get to time series analysis. But here's a quick trick:
bp_samp$yearq <- as.yearqtr(bp_samp$date)
head(bp_samp$date)
head(bp_samp$yearq)

####################################################################
# Convert at least one time variable in your data.
####################################################################

# Manipulating strings (characters)
#=
# We use a procedure common across computing languages, called regular expressions (yes, even Excel uses them!)
# They offer a simple yet powerful way to handle messy data: find, extract, replace, and add characters.
cost <- ESTIMATED_COST[1:50]
cost
grep("$", cost) # index output
grep("$", cost, value=T) # value output
grepl("$", cost) # logical output
grep("$50", cost)
grep("\\$50", cost) # We need to escape special symbols, such as $ . ( ) etc.
grep("\\$50|\\$20", cost) # the | stands for "or"
grep("\\b500\\b", cost, value=T) # \\b ... \\b are word boundaries. We use these for an exact match

# Let's replace strings
gsub("50", "IDS", cost)
gsub("\\$", "", cost) # remember to reassign the processed output and convert it to the correct data type if needed
bp$ESTIMATED_COST <- as.numeric(gsub("\\$", "", bp$ESTIMATED_COST)) # bp: the full building permits data

#################
# Find all variables containing odd characters in the data you downloaded.
# Replace these characters with blanks or other characters, as needed.
# If you do not have any such characters in your data, find all "/" in the PERMIT_TYPE column and replace them with ":".
# Then, in the same column, remove all the spaces.
# Finally, convert the data to the correct data type (e.g., from character to numeric). A sketch follows.
#################
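
# A hedged sketch of the PERMIT_TYPE exercise above (assumes bp_samp has a character PERMIT_TYPE column):
bp_samp$PERMIT_TYPE <- gsub("/", ":", bp_samp$PERMIT_TYPE) # replace all "/" with ":"
bp_samp$PERMIT_TYPE <- gsub(" ", "", bp_samp$PERMIT_TYPE)  # remove all spaces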

# Tidying data with tidyverse (and some other tools)
#=
library('tidyverse') # this is an entire "universe" of packages, all of them useful
# more on this next time.
#=======================================
# BSAD 443, Session 2
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
# Do not distribute or use outside this class without explicit permission from the instructor.
#=======================================

# Assign a few objects

# Numeric vector
nums  <- c(1,3.5,5)
result <- 2^sqrt(91)
int <- c(1L,3L,5L) # Note the capitalized L to denote an integer

# Character vector
grades <- c("high", "low", "high")  # Character vector
names <- c("Abby", "Ben", "Charlie")
mixed <- c(pi, 1, "sometext", TRUE) # Mixed vector, coerced to character

# Logical vector
right <- c(TRUE, T, FALSE) # T and F are equivalent to TRUE and FALSE
wrong <- 2>10

# Let's examine these vectors
typeof(nums)
class(int)
mode(result) # These three functions are similar, though not strictly equivalent
str(grades) # Checks vector structure
str(wrong)
is.na(c(grades, mixed, NA)) # Returns a logical vector testing for NA
length(grades) # Returns the number of observations in a vector
length(c(grades, mixed)) # Pretty straightforward
nchar(names) # Counts the characters in each observation of a character vector

# Access vector elements

nums[2]  # Second
nums[2:3]  # Second to third
nums[c(1,3)] # First and third
sort(nums) # Order a vector
nums
order(nums) # Nearly the same, but not exactly: it returns the ordering indices
class(sort(nums)) # Different class, but for most purposes they are the same
class(sort(int))
sort(nums, decreasing=T) # Reverse

# Change vector elements
char.new <- c(grades, names) # Input and combine elements and vectors
char.new <- c(char.new, "Alan", "Bo")
char.new <- c(names, "Alan", "Bo")
nums[2] <- 5   # Substitutes the second element
nums[c(1,3)] <- c(5,10) # Substitutes the first and third elements
nums <- c(nums,5,10)

# Change vector type
is.character(char.new) # First, a logical test for vector type
is.character(nums)
mixed
as.numeric(mixed) # Coerces the vector; non-numeric elements are turned into NAs
grades
as.numeric(grades) # all NAs: none of these strings look numeric
is.factor(grades)
class(grades)
as.factor(grades)
grades # Remember to assign the coerced type
grades <- as.factor(grades)
levels(grades)
is.character(levels(grades)) # Yes, factor levels are characters
typeof(grades) # And factors are a special type of integer

# Missing data
num.new <- c(nums*nums,6,NA,9,NA)
mean(num.new)
is.na(num.new)
mean(num.new, na.rm = T)

# Array and matrix
mat <- matrix(1,4,5)
mat1 <- matrix(1:10,4,5) # The sequence 1-10 (recycled), 4 rows, 5 columns
mat2 <- matrix(1:10, ncol=5, nrow=4) # Same result, but easier to understand
identical(mat1,mat2) # Yes, they are the same
mat[1:6]
mat[1,c(1:3)] # The first row, data for the first three columns
is.matrix(mat) # Logical expression testing whether it is a matrix structure
length(mat[,2]) # Length of the second column of the matrix
ncol(mat) # Number of columns
nrow(mat) # Number of rows
dim(mat) # Dimensions of the matrix (rows and columns)


# Data frame
dat <- data.frame(8:10, c("exec", "mid", "exec"))
dat <- data.frame(score = 8:10, role = c("exec", "mid", "exec")) # Better! This one has clear, understandable column names
dat <- data.frame(nums, char.new, stringsAsFactors=FALSE) # stringsAsFactors=FALSE keeps string vectors as characters instead of factors
dat <- data.frame(nums, busperf, char.new) # Oops. What is happening here?
dat.mat <- data.frame(mat)
dat <- data.frame(dat, dept=c("sales",
    "accounting", "sales", "IT", "RD"), stringsAsFactors=FALSE)
dat[1:6] # Oops
dat[1,c(1:3)]  # First row, columns 1 through 3
dat[,c("dept","nums")] # Select only these columns; notice that you can change the column order
paygrade <- c(1,3,2,1,3)
dat <- cbind(dat, paygrade) # Adds the paygrade vector (column) to the data frame
dat <- rbind(dat, c(5,"David", "logistics",2)) # Adds a row
colnames(dat) <- c("Years", "Name", "Department", "Paygrade") # This is how you change the column names of a data frame
dat$Paygrade <- NULL

# List
Years <- c(2, 3, 5)
Code <- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales <- list(Years, Code, SalesQuota)
sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3] <- F # Assigns FALSE to the third element of the third member (SalesQuota)


#=======================================
# BSAD 443, Session 1 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

# Assign a few objects

# Numeric vector
nums  <- c(1,3.5,5)
result <- 2^sqrt(91)
int <- c(1L,3L,5L) # Note the capitalized L to denote an integer

# Character vector
grades <- c("high", "low", "high")  # Character vector
names <- c("Abby", "Ben", "Charlie")
mixed <- c(pi, 1, "sometext", TRUE) # Mixed vector, coerced to character

# Logical vector
right <- c(TRUE, T, FALSE) # T and F are equivalent to TRUE and FALSE
wrong <- 2>10

# Let's examine these vectors
typeof(nums)
class(int)
mode(result) # These three functions are similar, though not strictly equivalent

str(grades) # Checks vector structure
str(wrong)

is.na(c(grades, mixed, NA, "NA", ".", "")) # Returns a logical vector testing for NA

is.null(c(grades, mixed, "", NA, ".")) # Oops, this function behaves differently
is.null("") # Still not working
is.null(" ") # Nope
is.null(NULL) # Okay! But why?
class(NULL)
class(NA) # That's why. NULL is a class of its own. We used it, for example, to drop data.

length(grades) # Returns the number of observations in a vector
length(c(grades, mixed)) # Pretty straightforward

nchar(names) # Counts the characters in each observation of a character vector

#### Answers to quiz
busperf <- c(9.5, 8, 9, 8.2, 7.1)
class(busperf) # numeric
mean(busperf) # 8.36
busperf <- c(busperf, 81, 7)
mean(busperf) # 18.54

# Access vector elements

nums[2]  # Second
nums[2:3]  # Second to third
nums[c(1,3)] # First and third

sort(nums) # Order a vector
nums
order(nums) # Nearly the same, but not exactly: it returns the ordering indices
class(sort(nums)) # Different class, but for most purposes they are the same
class(sort(int))
sort(nums, decreasing=T) # Reverse

#### Answers to quiz
nothundred <- seq(4,100)
halfhundred <- nothundred[nothundred>50]
sort(halfhundred, decreasing=T)

# Change vector elements
char.new <- c(grades, names) # Input and combine elements and vectors
char.new <- c(char.new, "Alan", "Bo")
char.new <- c(names, "Alan", "Bo")

nums[2] <- 5   # Substitutes the second element
nums[c(1,3)] <- c(5,10) # Substitutes the first and third elements
nums <- c(nums,5,10)

#### Answers to quiz
num <- seq(9,99, by=3)
quantile(num) # 31.5 and 76.5
sd(num[num>20]) # 23.81176
# There are various ways to solve this one. Perhaps the easiest to understand:
n1 <- num[num<51]
n2 <- num[num>71]
n3 <- c(n1,n2)
mean(n3) # 52.25

# Change vector type
is.character(char.new) # First, a logical test for vector type
is.character(nums)
mixed
as.numeric(mixed) # Coerces the vector; non-numeric elements are turned into NAs
grades
as.numeric(grades) # all NAs: none of these strings look numeric
is.factor(grades)
class(grades)
as.factor(grades)
grades # Remember to assign the coerced type
grades <- as.factor(grades)
levels(grades)
is.character(levels(grades)) # Yes, factor levels are characters
typeof(grades) # And factors are a special type of integer

# Missing data
num.new <- c(nums*nums,6,NA,9,NA)
mean(num.new)
is.na(num.new)
mean(num.new, na.rm = T)

#### Answers to quiz
busperf <- sort(busperf)
busperf[c(2)] <- 7.9
busperf[busperf==9] <- NA

# Array and matrix
mat <- matrix(1,4,5)
mat1 <- matrix(1:10,4,5) # The sequence 1-10 (recycled), 4 rows, 5 columns
mat2 <- matrix(1:10, ncol=5, nrow=4) # Same result, but easier to understand
identical(mat1,mat2) # Yes, they are the same

mat[1:6]
mat[1,c(1:3)] # The first row, data for the first three columns
is.matrix(mat) # Logical expression testing whether it is a matrix structure
length(mat[,2]) # Length of the second column of the matrix
ncol(mat) # Number of columns
nrow(mat) # Number of rows
dim(mat) # Dimensions of the matrix (rows and columns)

#### Answers to quiz
mat <- matrix(1:4, nrow=6, ncol=2) # 1:4 is recycled to fill the 12 cells (with a warning)
mat[5,2]
mat[5,2] <- "some text"
# All the data elements in the matrix have turned into characters

# Data frame
dat <- data.frame(8:10, c("exec", "mid", "exec"))
dat <- data.frame(score = 8:10, role = c("exec", "mid", "exec")) # Better! This one has clear, understandable column names
dat <- data.frame(nums, char.new, stringsAsFactors=FALSE) # stringsAsFactors=FALSE keeps string vectors as characters instead of factors
dat <- data.frame(nums, busperf, char.new) # Oops. What is happening here?
dat.mat <- data.frame(mat)
dat <- data.frame(dat, dept=c("sales",
    "accounting", "sales", "IT", "RD"), stringsAsFactors=FALSE)

dat[1:6] # Oops
dat[1,c(1:3)]  # First row, columns 1 through 3
dat[,c("dept","nums")] # Select only these columns; notice that you can change the column order

paygrade <- c(1,3,2,1,3)
dat <- cbind(dat, paygrade) # Adds the paygrade vector (column) to the data frame
dat <- rbind(dat, c(5,"David", "logistics",2)) # Adds a row
colnames(dat) <- c("Years", "Name", "Department", "Paygrade") # This is how you change the column names of a data frame
dat$Paygrade <- NULL

#### Answers to quiz
hr <- data.frame(
    Experience=c(1,2,1,8),
    Performance=c(10,8,7,10),
    Employees=c(3,4,2,10),
    stringsAsFactors=FALSE
    )

hr <- data.frame(hr,
    ID=c(12,13,14,15),
    stringsAsFactors=FALSE)

hr[3,2] <- 9
mean(hr$Performance) # 9.25


# List
Years <- c(2, 3, 5)
Code <- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales <- list(Years, Code, SalesQuota)

sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3] <- F # Assigns FALSE to the third element of the third member (SalesQuota)


#=======================================
# BSAD 443, Session 3 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

# List
Years <- c(2013, 2014, 2015)
Code <- c("a1", "b1", "a1", "a1", "b1")
SalesQuota <- c(TRUE, FALSE, TRUE, FALSE)
sales <- list(Years, Code, SalesQuota)

sales[[1]] # All the elements in the first member (in this case, Years)
sales[[3]][3] # TRUE
sales[[3]][3] <- F # Assigns FALSE to the third element of the third member (SalesQuota)

# In case you want to, you can assign names to the list members like so:
# names(sales) <- c("Years", "Code", "SalesQuota")

#### Answers to quiz
hacker <- data.frame(software=c("R","Python","SAS","R","SPSS"), years=c(4,3,10,4,1))
salary <- sample(85:150, 20)
hackersalary <- list(hacker, salary)
hackersalary[[2]][2] <- 100

# Inspecting data
data(mtcars) # load a dataset that ships with R
    # If mtcars is not found, do the following:
    # library(datasets)
    # data(mtcars)

dim(mtcars) # Number of rows and columns
ncol(mtcars) # Number of columns
nrow(mtcars) # Number of rows
names(mtcars)
colnames(mtcars)
rownames(mtcars)
str(mtcars) # Shows the structure of each vector (column) in the data frame
# I can also use rownames(mtcars) to see rows, but this is less commonly used
head(mtcars) # First 6 rows of the data + column names
tail(mtcars, n=10) # Last 10 rows + column names. We can do the same with head
mtcars[1:10,2:6] # Rows 1-10, columns 2-6
set.seed(123) # Set a seed (any number) before sampling so the "random" result is reproducible
smp <- sample(1:nrow(mtcars), 10) # 10 random numbers from 1 to the number of rows
mtcars[smp,] # Gives 10 random rows from the data
View(mtcars) # Pops up the GUI viewer
fix(mtcars) # To change values (manually)
View(mtcars[smp,]) # Using the viewer to examine a random sample

#### Answers to quiz
# install.packages('car') # If you don't already have it
library(car)
data(Freedman)
dim(Freedman)
str(Freedman)
rownames(Freedman) # Cities are row names
fix(Freedman) # Manually change the value
Freedman$population[3] <- 600 # That's one way of doing it
set.seed(789)
smp <- sample(1:nrow(Freedman), 25)
Fsamp <- Freedman[smp,]

# Exploratory stats
# Univariate distributions
summary(mtcars) # Basic distribution stats on each numeric variable in the data frame
quantile(mtcars$mpg)
quantile(mtcars$mpg, seq(0.1,1,by=0.1))
# Get additional univariate stats from the psych package:
# install.packages('psych')
# library(psych)
# describe(mtcars)

# Plotting distributions
par(mfrow=c(2,2)) # a 2x2 grid, so 4 plots at the same time
stripchart(mtcars$mpg)
dotchart(mtcars$mpg)
boxplot(mtcars$mpg) # Look for the IQR (distance between the 1st and 3rd quartiles) and the median in the box, and min/max values outside
hist(mtcars$mpg) # The number of breaks has to be chosen
hist(mtcars$mpg, breaks=3)
hist(mtcars$mpg, breaks=12)
plot(density(mtcars$mpg)) # Density kernel

d <- density(mtcars$mpg)
str(d)
summary(d)

# Oops!
dev.off()
hist(mtcars$mpg)
lines(density(mtcars$mpg)) # Oops!
hist(mtcars$mpg, freq=F, ylim=c(0,0.09)) # Better
lines(density(mtcars$mpg), col="red")

# Now let's save it as a file
png(filename="mpgplot.png")
hist(mtcars$mpg, freq=F, ylim=c(0,0.09), main="MPG Distribution")
lines(density(mtcars$mpg), col="red")
rug(mtcars$mpg, col="blue")
box()
dev.off()
# Quick recap of the inspection functions:
# View(df); fix(df)                      # GUI viewer / manual editor
# dim(df); ncol(df); nrow(df)            # dimensions
# head(df, n=10); tail(df, n=10)         # first/last rows
# names(df); colnames(df); rownames(df)  # column/row names
mtc <- mtcars
fix(mtc)
mtcars <- mtcars[order(mtcars$mpg, decreasing=T),] # same as order(-mtcars$mpg)


#=======================================
# BSAD 443, Session 4 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

# Load a CSV into R
failedbanks <- read.csv("http://www.fdic.gov/bank/individual/failed/banklist.csv", header=T) # Loads failed-banks data from the FDIC into R.
# Instead of a URL it can be a local file, e.g.: "y:/data/banklist.csv".

# Load stock data (also CSV)
install.packages('TTR')
library(TTR)
link <- "http://finance.yahoo.com/d/quotes.csv?s=AAPL+GOOG+MSFT&f=nab"
fin <- read.csv(link) # Oy, any ideas why? (hint: it's an easy fix)

# Load a text file into R
Apple2016Q1 <- readLines("http://www.sec.gov/Archives/edgar/data/320193/000119312516559625/0001193125-16-559625.txt") # Loads Apple's most recent 10-K filing.

# Query a database
install.packages(c("DBI","RPostgreSQL"))
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv,
    dbname="infs494",
    host="147.126.64.66",
    port=8432,
    user="dataminer1",            # use dataminer2-9
    password="summer2016")
dbListTables(con)
dbListFields(con, "crime")
crime.db <- dbReadTable(con, "crime")

# Upload data into a database
dbExistsTable(con, "crime")
dbWriteTable(con, "crime.part1", crime.db[1:50,],
    row.names=FALSE)
test <- dbReadTable(con, "crime.part1")
dbDisconnect(con)

# Website scraping
# install.packages("rvest")
# Scrape data from Wikipedia
library(rvest)
tab <- read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
tab <- html_table(tab, fill=TRUE)
tab1 <- tab[[1]]
tab1 <- data.frame(tab1)
tab2 <- data.frame(tab[[2]])
tab2.cln <- tab2[3:nrow(tab2),]
cols <- as.character(tab2[2,])
colnames(tab2.cln) <- cols

# Scrape data from Yelp
library(httr)
library(rvest)
page <- read_html("http://www.yelp.com/search?find_desc=&find_loc=water+tower+chicago%2C+il&ns=1")
nds <- html_nodes(page, "address")
address <- html_text(nds) # Messy! We'll get to how to "clean" such data in the next session.


#=======================================
# BSAD 443, Session 5 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

#------------#
# Session 5  #
#------------#

# Load messy data
# Figure out what's going on here
credanger <- read.csv("Credit complaints.csv", header=T)

# Messy column names
df <- data.frame(x.1=c("a",seq(1:4)), x.2=c("b",rnorm(4,mean=2,sd=.2)))
head(df)
names(df)
colnames[df] <- df[1] # Nope
colnames[df] <- as.vector(df[1]) # Nope
mat <- as.matrix(df)
cn <- as.vector(mat[1,])
colnames(mat) <- cn
df <- data.frame(mat)

# Or simply (for a small number of columns)
colnames(df) <- c("a", "b")
df <- df[-1,]
df

# Incorrect column type
df <- data.frame(a=c("1*",seq(1:4)), b=c(letters[1:5]))
summary(df)
str(df)

# Input data and convert the column to the correct type
df[1,1] <- 1
df$a <- as.numeric(as.character(df$a)) # convert factors via character first, or you'll get the level codes
df$b <- as.character(df$b)
str(df)

# Get rid of odd characters
c1 <- c("111", "123*", "145.0")
class(c1)
as.numeric(c1) # Oops. Let's try again:
c1 <- c("111", "123*", "145.0")
grep("\\.|\\*|\\,|\\+", c1, ignore.case = T) # Matches the pattern
gsub("\\.|\\*|\\,|\\+", c1) # Nope -- the replacement argument is missing
c1 <- gsub("\\*|\\,|\\+", "", c1) # Aha!
c1 <- as.numeric(gsub("([0-9]+).*$", "\\1", c1)) # (\\1 means the first match). Done!

# Missing data
age <- c(20, 30, NA, -99)
mean(age) # We already know that this isn't going to work
is.na(age) # Logical
which(is.na(age)) # Gives the location of the missing data
mean(age, na.rm=T) # Not that easy
which(is.na(age) | age<18) # So we need to use common sense, and we should also do this:
age[age==-99] <- NA
mean(age, na.rm=T)

# Time formatting
d <- c("2016/01/01", "2016/01/02", "2016/01/03", "2016/01/04")
class(d)
summary(d)
d <- as.Date(d, "%Y/%m/%d")
class(d) # New class: Date.

# But what if we want to extract the year/month/day from a date?
# It's best to use a package. The fastest and best today is this one:
install.packages("lubridate")
library(lubridate)
d <- c("2016/01/01", "2016/01/02", "2016/01/03", "2016/01/04")
d1 <- ymd(d) # That was easy! And these are easy (and very useful) too:
y <- year(d1)
m <- month(d1)
d <- day(d1)
w <- week(d1)
q <- quarter(d1)

# Merge data
A <- data.frame(loanID=c(20,21,22,23), gender=c("f", "m", "m","f"),
              amount=c(1000,5500,2000,6000))
B <- data.frame(loanID=c(23,21,22,20,24), years.employed=c(3,2,4,5,6))
AB <- merge(A, B, by="loanID")            # inner join: only matching IDs
AB <- merge(A, B, by="loanID", all=T)     # full outer join
AB <- merge(A, B, by="loanID", all.x=T)   # left join
AB <- merge(A, B, by="loanID", all.y=T)   # right join


#=======================================
# BSAD 443, Session 6 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

#------------#
# Session 6  #
#------------#

# Read bank data
path <- file.choose()
bank <- read.csv(path, header=T)
attach(bank) # This function adds the data frame to R's search path, so you don't have to type df$variable. Just type variable.
# When you are finished: detach(df)

# Let's explore the data
dim(bank)
str(bank)   # Seems to read it okay
summary(bank)
head(bank)
tail(bank)

# Review the distributions of the numeric variables
numcol <- sapply(bank, is.numeric)    # sapply applies a function to a vector/list, in this case is.numeric
numbank <- bank[numcol]   # Copies the numeric variables so they are easier to inspect

sapply(numbank, mean)
sapply(numbank, sd)     # All except age seem very skewed. Let's verify.

par(mfrow=c(2,2))
hist(numbank$age)
plot(density(numbank$age))
qqnorm(numbank$age)
qqline(numbank$age, col="red")
# Compare to a normal distribution
n <- rnorm(200)
qqnorm(n)
qqline(n, col="red")
dev.off()
# There is also a formal test
shapiro.test(numbank$age)
shapiro.test(n) # Well, age is skewed, and it is the best one of our numeric vars! This is common in BA data.
# See an interesting discussion here: http://stats.stackexchange.com/questions/2492/is-normality-testing-essentially-useless
# Now let's inspect one more numeric variable (you'll do the rest on your own)
summary(numbank$balance) # We have a fair number of negatives, and it does not seem normally distributed. What about the sd?
sd(numbank$balance) # Much higher than the mean. It is definitely skewed.
par(mfrow=c(2,2))
hist(numbank$balance)
plot(density(numbank$balance)) # Yes, it is very skewed
qqnorm(numbank$balance)
qqline(numbank$balance, col="red")
qqnorm(log(numbank$balance))    # Oops. Log doesn't work with negative numbers. What to do?
# First, how many accounts have a negative balance?
tbl <- table(numbank$balance<0)
round(prop.table(tbl), 2) # about 8%
# Do we remove them? No!
lbal <- log(numbank$balance + abs(min(numbank$balance)) + 1) # shift so the minimum becomes 1 before taking the log
plot(density(lbal)) # Better, but not normally distributed. This is fine for now.

# Let's test the transformed variable against the response variable:
boxplot(lbal ~ bank$y)
# One last effort
install.packages('extremevalues')
library(extremevalues)
go <- getOutliers(numbank$age)
go$iRight
numbank$age[go$iRight]
# And for fun:
evGui(numbank$age)

# We can then remove the outliers (rows) and normalize the age variable. We could also do a Box-Cox transformation.
# But we'll leave it for now. (A small sketch follows.)
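
# A hedged sketch of that cleanup (assumes go$iRight from above; numbank's rows align with bank's):
# bank_noout <- bank[-go$iRight, ]     # drop the right-tail outliers
# age_scaled <- scale(bank_noout$age)  # z-score normalization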

# Before we get to examine the factors, we need to examine the relationships among the numeric vars
round(cor(numbank),2) # Nothing exciting here, except previous and pdays
# And visually:
pairs(numbank) # This could take some time


# And then the relationships between these vars and our y
boxplot(numbank$age ~ y) # The ages of those who responded may be a little more spread out
boxplot(numbank$balance ~ y) # Hard to decipher
boxplot(lbal ~ y) # Ditto

# One more approach to consider is to cut the numeric data into ordinal categories:
catbalance <- cut(numbank$balance, breaks=c(quantile(bank$balance, seq(0,1, by=0.25))))
table(catbalance)  # Equal sizes, but unclear labels
catbalance <- cut(bank$balance, breaks=c(quantile(bank$balance, seq(0,1, by=0.25))), labels=F)
tcbal <- table(catbalance, bank$y) # Not sure that this is ideal either. Consider recoding into sensible levels
prop.table(tcbal) # Let's round it
pcbal <- round(prop.table(tcbal),2)
addmargins(pcbal, c(1,2))
boxplot(catbalance ~ bank$y) # Interesting!

# And now for factors
# First, the response variable itself
table(y) # Shows frequencies
table(y)/nrow(bank) # Shows proportions
r <- round(table(y)/nrow(bank), 2) # Easier to understand
pt <- r*100 # Even better in percentages

plot(bank$y) # Good old barplot, shows 88% / 12%
barplot(r) # The table from a few lines earlier
plot(bank$marital)
plot(bank$housing)

# Do it for all variables. Which ones seem interesting?

# Now let's explore bivariate relationships
tmar <- table(bank$marital, bank$y)
mtmar <- addmargins(tmar, c(1,2))
plot(tmar) # The married seem to be slightly less excited about this campaign
# Or a prop table
ptmar <- prop.table(tmar)
addmargins(ptmar, c(1,2))

# Continue to explore these tables. What did you find?

# What if we want more than 2-way tables?
# ftable (flat table)
ft <- ftable(bank$job, bank$marital, bank$education, bank$y)
# Or write it this way:
ftable(bank[c("job", "marital", "education", "y")]) # As many factors as you'd like

# You can also use xtabs
xt <- xtabs(~ y + marital + education, data=bank) # More readable?
prop.table(xt)

# A better-organized output with crosstabs
install.packages('gmodels')
library(gmodels)
CrossTable(y=bank$y, x=bank$education) # Nice. There seems to be something going on here
CrossTable(y=bank$y, x=bank$education, chisq=T, format="SPSS") # I like this format,
# and the chi-square test of independence rejects the null: the two vars are related.

# Finally, if you like pivot tables, there are various options.
# The reshape and plyr packages offer very good alternatives (a sketch below),
# and this one has cool interactive, web-based pivot tables: http://www.magesblog.com/2015/03/pivot-tables-with-r.html
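
# A hedged pivot-table-style sketch with reshape2 (the successor of reshape; assumes it is installed):
# library(reshape2)
# dcast(bank, marital ~ y, length, value.var="age") # counts rows per marital x y combination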


#=======================================
# BSAD 443, Session 7 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

#------------#
# Session 7  #
#------------#

# A few bits of information:
# Bank data come from here: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

# This is how you drop rows when a certain column contains NAs
df <- df[!is.na(df$var),]
# or replace NAs with the average
df$var[is.na(df$var)] <- mean(df$var, na.rm=T)

# Decision trees
# Steps to follow (with any machine learning algorithm):
# 1. Load the data and clean it if needed
# 2. Chop the data (usually 90-10, 80-20, 75-25, or 67-33)
# 3. Train
# 4. Model
# 5. Evaluate and fine-tune

# 1. Load the data (we did this yesterday, in session 6), and randomize it
path <- file.choose()
bank <- read.table(path, header=T, sep=";")
set.seed(199)
bankrand <- bank[order(runif(nrow(bank))), ] # runif() produces random numbers; ordering by them shuffles the data randomly
bankrand <- bankrand[1:(nrow(bankrand)/10),] # I am using only 10% of the data. Otherwise, it will take a long time to run.

# To make sure the random shuffle got it right
tb <- table(bank$y)
prop.table(tb)
tb1 <- table(bankrand$y)
prop.table(tb1) # Looks pretty close. We're good to go.

# 2. Chop the data into training (90%) and testing (10%) sets
trn <- 1:(nrow(bankrand)*0.90)
tst <- (tail(trn,1)+1):nrow(bankrand)
bank_train <- bankrand[trn, ]
bank_test <- bankrand[tst, ]

# Just to be on the safe side. Looks pretty close. Good!
round(prop.table(table(bank_train$y)), 2)
round(prop.table(table(bank_test$y)), 2)

# 3. Train
install.packages("C50") # There are a few more packages for CART. This one is considered among the best.
library(C50)
bank_model <- C5.0(bank_train[-17], bank_train$y) # omits the outcome variable (column 17)
summary(bank_model) # Wow! Only 7% errors.
# plot(bank_model) # This will take a long time, but it is very informative
plot(bank_model, subtree=2)

# But don't jump to conclusions. Decision trees are known for
# their tendency to overfit the model to the training data.
# For this reason, the error rate reported on training data may be overly optimistic,
# and it is especially important to evaluate decision trees on a test dataset.

# 4. Model
bank_pred <- predict(bank_model, bank_test)
library(gmodels)
CrossTable(bank_test$y, bank_pred, prop.chisq = F, prop.c = F, prop.r = F,
    dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
# About 11% misses.

# 5. Evaluate and fine-tune

# The C5.0 algorithm lets us add additional trials (boosting).
# We'll start with 10 trials. That's the most common number of trials,
# as research suggests it reduces error rates on test data by about 25 percent.
# It also selects the best size of the tree (i.e., "pruning").

# First, let's do some boosting
bank_boost10 <- C5.0(bank_train[-17], bank_train$y, trials = 10)
bank_boost10
summary(bank_boost10) # Summarizes iterations, tree size, and classification performance.
plot(bank_boost10, subtree=2)
bank_boost_pred10 <- predict(bank_boost10, bank_test)
CrossTable(bank_test$y, bank_boost_pred10, prop.chisq = F, prop.c = F, prop.r = F,
    dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))
# Still not great, but we have improved the model by at least 1%

# Now let's reduce the size of the tree ("pruning")
bank_control <- C5.0(bank_train[-17], bank_train$y, control = C5.0Control(minCases = 20), trials=10)
summary(bank_control) # Avg. tree size is 9. That's because of our control parameter
plot(bank_control) # Having fewer nodes allows us to plot the tree model
bank_control_pred <- predict(bank_control, bank_test)
CrossTable(bank_test$y, bank_control_pred, prop.chisq = F, prop.c = F, prop.r = F,
           dnn = c('actual purchase', 'predicted purchase'), format=c("SPSS"))

#### Now let's try doing the same with logistic regression
# We already did stages 1 and 2
# 3. Train
logit_train <- glm(y ~ ., family=binomial(link="logit"), data=bank_train)
summary(logit_train)
anova(logit_train, test="Chisq") # The model is clearly performing well when comparing the null deviance against the residual deviance.
# 4. Test
logit_test <- predict(logit_train, newdata=bank_test, type='response')

# 5. Evaluate and fine-tune
logit_test <- ifelse(logit_test > 0.5, 1, 0)
logit_test <- gsub("0", "no", logit_test)
logit_test <- gsub("1", "yes", logit_test)
logit_test <- as.factor(logit_test)
logitError <- table(logit_test == bank_test$y) # About 10% errors. Not bad!
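
# To turn that table into an error rate (a sketch):
# round(prop.table(logitError), 2) # the FALSE share is the error rate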


#=======================================
# BSAD 443, Session 8 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

#------------#
# Session 8  #
#------------#

############# EVEN MORE FUNCTIONS #############

# 1. CONVERTING A NUMERIC VARIABLE INTO A FACTOR WITH DIFFERENT LEVELS
# Usually cut works, but not always
an <- cut(loans$annual_inc, breaks=3) # Oops.

# Solution 1: use cut2 from the Hmisc package
install.packages("Hmisc")
library(Hmisc)
ai <- cut2(loans$annual_inc, m=5000)

# Solution 2: (1) convert to integer and then to character; (2) then match and convert the strings into different categories
ai <- as.character(as.integer(loans$annual_inc))  # Converts all non-integers into integers
ai[ai %in% c(1:62410)] <- "low" # I can now select whatever categories I want
ai[ai %in% c(62411:7446395)] <- "high" # This one could be slow because it matches a large number of strings

# 2. BIVARIATE CORRELATIONS
# When missing values are present
cor(loans$loan_amnt, loans$annual_inc) # Gives NA
cor(loans$loan_amnt, loans$annual_inc, use="complete.obs") # 0.3 -- they are correlated

# Plot correlations
plot(loans$loan_amnt, loans$annual_inc) # Generates a scatterplot that is very messy, and we have an outlier
identify(loans$loan_amnt, loans$annual_inc) # It's 19218
loans <- loans[-19218,]
plot(loans$loan_amnt, loans$annual_inc)
plot(log(loans$loan_amnt), log(loans$annual_inc))
abline(lm(log(loans$annual_inc) ~ log(loans$loan_amnt)), col="red") # This adds a fitted line from a regression model.
# Note that the order of the vars has flipped, and note the tilde.
lines(loess.smooth(log(loans$loan_amnt), log(loans$annual_inc)), col="green") # This is a "smoother": it fits a line that may or may not be straight
  1120. # 3. DATA SLICING AND AGGREGATION
  1121. # Base R has an aggregate function, but it is overly complicated. We'll use dplyr instead.
  1122. library(dplyr)
  1123. chisal<-read.csv(path, header=T) # Read Chicago's salary data
  1124. head(chisal)
  1125. dim(chisal)
  1126. chisal$Employee.Annual.Salary<-as.numeric(gsub("\\$","", chisal$Employee.Annual.Salary))
  1127. names(chisal)
  1128.  
  1129. # Slice the data
  1130. slice(chisal, c(100:110, 5:50)) # Shows these rows
  1131. head(arrange(chisal, desc(Employee.Annual.Salary))) # Orders the data by salar in a descending order
  1132. head(select(chisal, Department, Employee.Annual.Salary)) # Selects only these two columns, in this order
  1133. head(select(chisal,-(c(Name,Department))))  # Drops these two columns
  1134. highpaychipol<-filter(chisal, Department=="POLICE", Employee.Annual.Salary>100000) # Select particular cases
  1135. nrow(sample_frac(chisal, 0.1)) # Random sample of 0.1 (10% or rows)
  1136. # Aggregate data
  1137. by_dept <- group_by(chisal, Department)
  1138. sals<-summarize(by_dept, count=n() , avg.salary=mean(Employee.Annual.Salary, na.rm=T))
  1139. sals.bigdept<- filter(sals, count > 1000)
# More information, including additional features: https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
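
# For comparison, base R's aggregate() can do the group summary in one call (a sketch):
# aggregate(Employee.Annual.Salary ~ Department, data=chisal, FUN=mean)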

# 4. SAVE A DATA FRAME
write.csv(df, "newfile.csv", row.names=F)  # Saves a data frame called df as a CSV file called newfile.csv in the working directory.

# 5. MANAGE THE WORKSPACE
save.image() # Saves all objects in R's session
save(loans, chisal, path, file="test.RData") # Saves the objects loans, chisal, and path into the file test.RData
rm(list=ls(all=TRUE)) # Removes ALL objects in R
load(file="test.RData") # Loads the objects stored in the test.RData file

############# PLOTS #############
install.packages("ggplot2")
x <- seq(-4,9,0.2)
y <- 2*x^2+4*x-2
plot(x,y)
plot(x,y, pch=16, col="red")
# pch sets the point symbol. See: http://www.statmethods.net/advgraphs/parameters.html
plot(x,y, pch=16, col="red", xlim=c(-5, 10), ylim=c(-5,210), main="Nice plot!", xlab="The X", ylab="The Y")
lines(x,y, col="green") # Adds a line
abline(h=1)
abline(v=-0.5)
text(0,50,"Interesting!")
legend(-5,100,"Legend here")
# Basic plot types in R: p=points; l=lines; o=overplotted points and lines; b, c=joined by lines; s, S=stair steps; h=histogram-like vertical lines; n=nothing/empty (an example follows)
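# For example, the same data drawn as stair steps (a sketch):
# plot(x, y, type="s")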
colors() # Color names (657)
grep("green", colors(), ignore.case = T, value = T) # All variations of "green"
# Or use color codes instead, using hexadecimal codes: http://html-color-codes.com/

# Add multiple trendlines
X <- c(1,2,3,4,5,6,7)
Y1 <- c(2,4,5,7,12,14,16)
Y2 <- c(3,6,7,8,9,11,12)
Y3 <- seq(1,30, by=3)
plot(X,Y1, type="o", pch=17, cex=1.2, col="darkgreen", ylim=c(0,20)) # Initiates the plot
lines(Y2, type="o", pch=16, lty=2, col="blue")
lines(Y3, type="o", pch=2, lty=4, col="darkblue")
title(main="Three vars", col.main="darkgray", font.main=4)

# https://en.wikibooks.org/wiki/R_Programming/Graphics
# Good scripts for converting tables/descriptives to plots:
# http://tables2graphs.com/doku.php?id=03_descriptive_statistics#figure_1


# ggplot2 (qplot)
# http://www.statmethods.net/advgraphs/ggplot2.html
# http://www.cookbook-r.com/Graphs/

library(ggplot2)
data("diamonds")
sdiam <- sample_frac(diamonds, 0.01) # Select a random sample of 1%
qplot(carat, price, data = sdiam)
qplot(log(carat), log(price), data = sdiam)

# Now let's add colors. Note how easy it is!
qplot(carat, price, data = sdiam, shape=cut)
qplot(carat, price, data = sdiam, color=cut)
qplot(carat, price, data = sdiam, color = I("blue"))
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3)) # 0=transparent; 1=opaque
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3), size = I(5)) # Bigger
qplot(carat, price, data = sdiam, color = I("blue"), alpha = I(0.3), size = I(5), shape=I(3))
# Shapes are: 1=circle; 2=triangle; 3=plus; 4=cross; 5=diamond

# You can combine multiple qplot types with c(), as in geom = c("smooth", "point")
qplot(carat, price, data = sdiam, geom = c("smooth", "point"))

# There are multiple smoothers, but loess (the most common) is the default.
# Control loess with span: span near 1 gives a relatively straight line (underfitting); span near 0 gives a curvy line (overfitting).
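
# For example (a sketch; in newer ggplot2 versions span may need to go through geom_smooth() instead):
# qplot(carat, price, data = sdiam, geom = c("smooth", "point"), span = 0.2) # curvy
# qplot(carat, price, data = sdiam, geom = c("smooth", "point"), span = 1)   # smoother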

# NOTES
# qplot can generate other types of plots (the geom parameter):
# geom = "point" -- scatterplot, the default for x,y input
# geom = "smooth" -- fits a smoother to the data and displays the smooth and its standard error
# geom = "boxplot" -- produces a box-and-whisker plot to summarize the distribution of a set of points
# geom = "path" and geom = "line" -- draw lines between the data points

# Ordinal/continuous vars: geom = "histogram"; geom = "freqpoly" (a frequency polygon); geom = "density" (a density plot)
# For discrete variables, geom = "bar" makes a bar chart



# Distribution plots
# Continue using the recipes found here: http://www.cookbook-r.com/Graphs/Plotting_distributions_%28ggplot2%29/
ggplot(bankrand, aes(x=balance)) + geom_histogram(binwidth=.5)
ggplot(loans, aes(x=tot_hi_cred_lim)) + geom_histogram(binwidth=.5, color="black", fill="blue")
ggplot(loans, aes(x=tot_hi_cred_lim)) + geom_density()

# Correlation plots
library(ellipse) # plotcorr() comes from the ellipse package
numcol <- sapply(loans, is.numeric)
nl <- loans[,numcol]
nl1 <- nl[,5:10] # It is hard to interpret many vars at once. You can select a few at a time.
ctab <- round(cor(nl1, use="complete.obs"), 2)
plotcorr(ctab, mar = c(0.1, 0.1, 0.1, 0.1))

# Do the same, but with colors corresponding to the values
colorfun <- colorRamp(c("#CC0000","white","#3366CC"), space="Lab")
plotcorr(ctab, col=rgb(colorfun((ctab+1)/2), maxColorValue=255), mar = c(0.1, 0.1, 0.1, 0.1))

# INTERACTIVE/WEB PLOTS
# From: http://ramnathv.github.io/rCharts/
require(devtools)
install_github('ramnathv/rCharts')
library(rCharts)
names(iris) = gsub("\\.", "", names(iris))
rPlot(SepalLength ~ SepalWidth | Species, data = iris, color = 'Species', type = 'point')

data(economics, package = "ggplot2")
econ <- transform(economics, date = as.character(date))
m1 <- mPlot(x = "date", y = c("psavert", "uempmed"), type = "Line", data = econ)
m1
hair_eye_male <- subset(as.data.frame(HairEyeColor), Sex == "Male")
n1 <- nPlot(Freq ~ Hair, group = "Eye", data = hair_eye_male, type = "multiBarChart")
n1$save('mychart.html', standalone = TRUE) # Generates a webpage with an embedded interactive chart
# Or publish online
n1$publish('Scatterplot', host = 'gist') # e.g., GitHub


#=======================================
# BSAD 443, Session 9 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

#------------#
# Session 9  #
#------------#

######## FROM EXCEL ########
install.packages("readxl")
library(readxl)
nasa <- read_excel("NASA_Labs_Facilities.xlsx")
read_excel("my-spreadsheet.xls", sheet = "data")
read_excel("my-spreadsheet.xls", sheet = 2)

# Write to Excel using XLConnect: https://cran.r-project.org/web/packages/XLConnect/vignettes/XLConnect.pdf


######## WORDCLOUD ########

install.packages(c("wordcloud","RColorBrewer"))
library(wordcloud)
library(RColorBrewer)

pal <- brewer.pal(4,"RdGy") # 4 colors from the "RdGy" palette. Try other ones.

wordcloud("R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis. Polls, surveys of data miners, and studies of scholarly literature databases show that R's popularity has increased substantially in recent years.",
    min.freq = 1, scale=c(2,0.5), random.color = T, colors = pal) # Word cloud from the first paragraph of Wikipedia's entry on R

# Now you try it with other text. For example:

# Compare annual filings (try to read only 100 lines or so; otherwise it can get slow):
apple94 <- readLines("http://www.sec.gov/Archives/edgar/data/320193/0000320193-94-000016.txt")[100:200]
apple2000 <- readLines("http://www.sec.gov/Archives/edgar/data/320193/000091205700053623/0000912057-00-053623.txt")[100:200]
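
# A hedged sketch for plotting one of the filings (collapse the lines into a single string first):
# wordcloud(paste(apple94, collapse=" "), min.freq=2, scale=c(2,0.5), colors=pal)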

# Use the same procedure to plot Trump's address:
# http://blogs.wsj.com/washwire/2015/06/16/donald-trump-transcript-our-country-needs-a-truly-great-leader/

# And Clinton's speech:
# http://time.com/3920332/transcript-full-text-hillary-clinton-campaign-launch/

# And whatever you think could be interesting/fun. Project it on the screen so that all can enjoy!

  1305.  
  1306. ######## TWITTER ########  
  1307. # 1. http://twitter.com/signup #first sign up. You'd need to provide a mobile number.
  1308. # 2. https://apps.twitter.com (sign in with your account)
  1309. # 3. Create New App
  1310. # 4. Put some information (it doesn't have to be relevant/accurate), and accept
  1311. # 5. Click on the “keys and access token”: You need
  1312.     #1. Consumer Key (API Key)
  1313.     # 2. Consumer Secret (API Secret)
  1314.     # click the “Create my access token” button.
  1315.     # 3. Access Token
  1316.     # 4. Access Token Secret
  1317. # In R:
  1318. install.packages('twitteR')
  1319. install.packages('tm')
  1320. library(twitteR)
  1321. library(tm)
  1322. consumer_key <- "" #API Key
  1323. consumer_secret <- "" #API Secret
  1324. access_token <- "" #Access Token
  1325. access_secret <- "" #Access token secret
  1326. setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret) # Select 1
  1327. rm(consumer_key, consumer_secret, access_token, access_secret)
  1328. tweet("Test 1,2,3") # Can you see it?
  1329. local_cache <- get("oauth_tken", twitteR:::oauth_cache) # saves the oauth token so we can reuse it
  1330. save(local_cache, file="data/oauth_cache.RData")
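# A minimal sketch of reusing the saved token in a later session, assuming your
# version of twitteR exports use_oauth_token():
load("data/oauth_cache.RData") # restores local_cache
use_oauth_token(local_cache)   # re-authenticates without re-entering the keys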

# Harvest tweets
Clinton <- searchTwitter("Clinton", n=1500)
length(Clinton)
# Take a few of them
cl <- Clinton[1:100]
# Process text
cl <- sapply(cl, function(x) x$getText())
cl_corpus <- Corpus(VectorSource(cl))
cl_corpus <- tm_map(cl_corpus, content_transformer(tolower))
cl_corpus <- tm_map(cl_corpus, removePunctuation)
cl_corpus <- tm_map(cl_corpus, removeWords, stopwords())
# Plot wordcloud
wordcloud(cl_corpus)

# Search for other terms
# Apply the wordcloud parameters from above and share with the class (see the sketch below)
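# A minimal sketch that wraps the pipeline above in a reusable helper
# (the function name tweet_cloud is hypothetical, not part of any package):
tweet_cloud <- function(term, n = 100) {
  tweets <- searchTwitter(term, n = n)
  txt <- sapply(tweets, function(x) x$getText())
  corp <- Corpus(VectorSource(txt))
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, removeWords, stopwords())
  wordcloud(corp, min.freq = 2, scale = c(2, 0.5), colors = pal)
}
tweet_cloud("Trump")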


######## DATA MINING ########
wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
# The data are already randomized, so we can split them 75/25 into training and test sets
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
# The outcome variable (quality) is numeric, so we'll use rpart, a regression-tree algorithm
install.packages('rpart')
library(rpart)
# Build the model
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
# use the rpart.plot package to create a visualization
install.packages('rpart.plot')
library(rpart.plot)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
# generate predictions for the testing dataset
p.rpart <- predict(m.rpart, wine_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(wine_test$quality)
# check how strongly predictions correlate with actual quality
cor(p.rpart, wine_test$quality)
# function to calculate the mean absolute error
MAE <- function(actual, predicted) {
  mean(abs(actual - predicted))
}
# mean absolute error between predicted and actual values
MAE(p.rpart, wine_test$quality) # about 0.59 quality units on a 0-10 scale. Not bad!
# as a baseline, compare against simply predicting the training mean for every wine
mean(wine_train$quality)
MAE(mean(wine_train$quality), wine_test$quality) # if this is larger than the model's MAE, the tree adds value

# There are ways to fine-tune the model (a quick taste below), but a full treatment is for another class.
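# For example, a minimal sketch of one tuning approach, using rpart's complexity
# parameter (cp): inspect the cross-validation table, then prune the tree.
# (The cp value 0.01 below is just an illustration, not a recommendation.)
printcp(m.rpart)                      # cross-validated error at each cp value
m.pruned <- prune(m.rpart, cp = 0.01) # keep only splits that pay for their complexity
MAE(predict(m.pruned, wine_test), wine_test$quality)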


#=======================================
# BSAD 443, Session 9 - Answers
# Quinlan School of Business - Loyola University Chicago
# Copyright Zack Kertcher, PhD, 2017. All rights reserved.
#=======================================

# ds1

# 1.
# path holds the location of the unirank CSV file, e.g. from file.choose()
unirank <- read.csv(path, header=T, stringsAsFactors = F)
unirank$score <- as.numeric(unirank$score)
round(mean(unirank$score, na.rm=T), 2)
# 47.79
round(range(unirank$score, na.rm=T), 2)
# 43.36 100.00
round(IQR(unirank$score, na.rm=T), 2)
# 3.07

# 2.
library(dplyr)
unirank$patents <- as.numeric(unirank$patents)
by_country <- group_by(unirank, country)
pat <- summarize(by_country, avg.patents=mean(patents, na.rm=T))
# pat[country=="USA"|country=="Japan"] # fails: columns must be referenced as pat$country, and the row/column comma is missing
pat[pat$country=="USA"|pat$country=="Japan",]
# Japan    353.3962
# USA      296.2792
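# The same answer reads more naturally with dplyr's pipe operator, %>%,
# which chains each step into the next (a sketch of the same computation):
unirank %>%
  group_by(country) %>%
  summarize(avg.patents = mean(patents, na.rm = TRUE)) %>%
  filter(country %in% c("USA", "Japan"))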

# 3.
# easy way
cor(unirank$publications, unirank$influence, use="complete.obs")
# 0.8748249
cor(unirank$publications, unirank$patents, use="complete.obs")
# 0.6713219

unirank$influence <- as.numeric(unirank$influence)
cor(unirank$influence, unirank$patents, use="complete.obs")
# 0.6111321

# a bit harder (but better) way
tmp <- unirank[, c("influence", "patents", "publications")]
cor(tmp, use="complete.obs")

#             influence   patents publications
# influence    1.0000000 0.6111321    0.8747273
# patents      0.6111321 1.0000000    0.6709847
# publications 0.8747273 0.6709847    1.0000000

# Influence and publications are very highly correlated. Just to illustrate:
cor.test(tmp$influence, tmp$publications) # cor.test() drops incomplete pairs on its own
plot(tmp$influence, tmp$publications)
abline(lm(tmp$publications ~ tmp$influence), col="red", lwd=3) # regress y (publications) on x (influence) to match the plot


# ds2

# 1.
# path holds the location of the cars CSV file
cars <- read.csv(path, header=T, stringsAsFactors = F)
cars$year <- as.numeric(gsub("\\,", "", cars$year))
cars$price <- as.numeric(gsub("\\,", "", cars$price))
cars$mileage <- as.numeric(gsub("\\,", "", cars$mileage))
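# The three cleaning lines above can also be written once with lapply, a sketch
# that applies the same gsub/as.numeric step to all three columns (num_cols is
# just a helper vector introduced here):
num_cols <- c("year", "price", "mileage")
cars[num_cols] <- lapply(cars[num_cols], function(x) as.numeric(gsub(",", "", x)))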
d <- as.Date(cars$date, "%d-%b-%y")
library(lubridate)
cars$month <- month(d)
summer <- cars[cars$month > 5, ]
sp <- mean(summer$price, na.rm=T)
othermonths <- cars[cars$month < 6, ]
op <- mean(othermonths$price, na.rm=T)
op - sp
# 3835.104

# 2.

by_color <- group_by(cars, color)
color <- summarize(by_color, freq=n()) # n() counts rows, so no na.rm argument is needed
color[order(color$freq), ]
# least: Gold, Yellow  Most: Black, Silver
hotcoldcars <- cars[(cars$color=="Black"|cars$color=="Silver"|cars$color=="Gold"|cars$color=="Yellow"), ]
nrow(hotcoldcars)
# 70

# 3.
my <- max(cars$year)
cars$age <- my - cars$year

# 4.
plot(cars$mileage, cars$age)
abline(lm(cars$age ~ cars$mileage), col="red", lwd=2)
cor(cars$mileage, cars$age)
# 0.7603127


#------------#
# Homework 2 #
#------------#
# Task 1

# 1. character, numeric and logical
# 2. for example, class() or str()
# 3.
# [1] FALSE
# [1] TRUE
# 4. The first one is FALSE because 10 is numeric. The second one is TRUE because as.character() converted the 10 into a character.

# Task 2
# 1. It doesn't work because to extract a data element, you need to use brackets instead
# of parentheses. Parentheses are for functions.

# 2.
v <- c(1,2,3,9,15)
v <- as.factor(v)
# Simple way
v[v==1] <- NA
v[v==15] <- NA
# Short way
v[v==1 | v==15] <- NA # note | (or), not & -- no value can equal both 1 and 15
# Then
levels(v)
v <- droplevels(v) # assign the result, or the unused levels remain

# Task 3
# 1.
department <- data.frame(age=c(23,30,45), gender=c("M", "F", "M"), level=c(3,6,8), stringsAsFactors = FALSE)
# 2.
department <- rbind(department, c(31, "F", 7))
# 3.
department <- cbind(department, sales=c(10,9,9,8))
# 4.
snapshot <- department[, c("age","sales")]
# 5.
snapshot[snapshot$age>=31, ]
# 6.
# There are various ways of doing this. For example:
# You can say which rows to drop
snapshot[-c(2), ] # Drop this row
# or
delrows <- c(2)
snapshot[-delrows, ]
# You can also say which rows to keep
snapshot[c(1, 3:4), ] # Keep the other rows
# or
keeprows <- c(1, 3:4)
snapshot[keeprows, ]

# 7.
# Again, there are various ways of doing this. For example:
# Use the NULL option
snapshot[, c("age")] <- NULL
snapshot$age <- NULL
# Or declare a column to keep as a vector, and then refer to this vector.
# The vector can be the name or number of the column.
keepcols <- c("sales")
snapshot[keepcols]
# Or declare which column to drop as a vector, and then refer to this vector.
# The vector can be the name or number of the column.
dropcols <- 1
snapshot[-dropcols]
  1549.  
  1550. #------------#
  1551. # Homework 3 #
  1552. #------------#
  1553. # Task 1
  1554. # 1.
  1555. lst<-list(ID=seq(1,5), data.frame(name=c("a", "b"), age=c(55,42)))
  1556.  
  1557. # 2.
  1558. lst[[1]][2]<-10
  1559. # 3.
  1560. lst[[2]][1,2]
  1561.  
  1562. # Task 2
  1563. # 1.
  1564. install.packages('car')
  1565. library(car)
  1566. data(Anscombe)
  1567.  
  1568. # 2.
  1569. # Multiple ways to do this. This one is simplest:
  1570. quantile(Anscombe$education) # Note 25th quantile is 165
  1571. low<-rownames(Anscombe[Anscombe$education<=165,])  # Creating a vector to makes it easier moving forward
  1572.  
  1573. # 3.
  1574. sort(low)
  1575. # "AL" "AR" "GA" "KY" "LA" "MS" "NC" "NE" "OK" "SC" "TN" "TX" "WV"
  1576.  
  1577. # 4.
  1578. relspend<-Anscombe$education/Anscombe$income
  1579.  
  1580. # 5. For example,
  1581. mean(relspend)
  1582. sd(relspend)
  1583.  
  1584. # 6. For example,
  1585. hist(relspend)
  1586. plot(density(relspend))
  1587.  
  1588. # 7. Yes
  1589.  
  1590. # 8.
  1591. Anscombe$relspend<-relspend
  1592.  
  1593. # 9.
  1594. hispenders<-Anscombe[Anscombe$relspend>median(Anscombe$relspend),]
  1595.  


#------------#
# Homework 4 #
#------------#

# Task 1

# Much of the first stage can be done in Excel before loading the data into R. But here is how to do it all in R.

# 1. Load the file
fl13 <- file.choose()
fbi13 <- read.csv(fl13, header=T)

# 2. Data wrangling
head(fbi13) # We need to remove the first couple of lines and change the column names. Also, I only care about city name, population, and property crimes
fbi13 <- fbi13[, c(1:3, 10)] # Keep only the columns I need. It will save time.
fbi13 <- fbi13[3:nrow(fbi13), ]
tail(fbi13) # Oops. Lots of junk there too. Let's see how far it goes:
tail(fbi13, n=20) # Okay, the junk starts at row 9295. Let's drop everything from there on.
fbi13 <- fbi13[-c(9295:nrow(fbi13)), ] # Multiple ways of doing this. We can also declare the rows we want to keep, as opposed to those we want to drop.
colnames(fbi13) <- c("state", "city", "population", "property.crimes") # still need to drop the first row--the one containing the original column names.
fbi13 <- fbi13[-1, ] # Good!

# Variable wrangling
fbi13$population <- gsub("\\,", "", fbi13$population)
fbi13$population <- as.numeric(fbi13$population)
fbi13$property.crimes <- gsub("\\,", "", fbi13$property.crimes)
fbi13$property.crimes <- as.numeric(fbi13$property.crimes)

# 3. Select big cities
big.cities <- fbi13[fbi13$population > 200000, ]
View(big.cities) # There are some issues here.
    # 1. State has little role to play here, so let's drop it.
    # 2. A few rows contain NA values. We need to drop them.
    # 3. Some city names have numbers next to them (originally footnotes). We need to get rid of these.
# Here we go:
    # 1. Remove the state column:
    big.cities$state <- NULL
    # 2. Remove rows with NAs:
    big.cities <- big.cities[complete.cases(big.cities), ]
    # or
    big.cities <- na.omit(big.cities)
    # 3. Remove numbers and special characters from city names
    big.cities$city <- gsub("\\d", "", big.cities$city) # \\d matches digits in a string
    big.cities$city <- gsub("[[:punct:]]", "", big.cities$city) # removes punctuation from city names
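    # A small extra cleaning step worth considering (an assumption about the data,
    # not part of the original answer): deleting digits can leave trailing spaces
    # in the city names, which trimws() removes.
    big.cities$city <- trimws(big.cities$city)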

# 4. Identify cities with the lowest crime rates
big.cities$crime.ratio <- big.cities$property.crimes/(big.cities$population/1000) # property crimes per 1,000 residents
# We can also round it, but this is not needed. It just looks nicer.
big.cities$crime.ratio <- round(big.cities$crime.ratio, 2)
# Now let's sort the data
big.cities <- big.cities[order(big.cities$crime.ratio), ]
big.cities$city[1:10]

# In case you are wondering, Chicago is here
grep("chicago", big.cities$city, ignore.case=T) # 42 out of 112
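# To see Chicago's full row (population, property crimes, and crime ratio),
# use the same grep() result as a row index:
big.cities[grep("chicago", big.cities$city, ignore.case=T), ]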