Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# NOTE(review): wiping the global environment from inside a script is
# discouraged (it silently destroys the caller's workspace). It is kept here
# only so the script behaves as before; consider removing it.
rm(list = ls())

# Read data from disk
car_data <- read.csv("../car_data.csv")

# Contingency table of column 1 (the customer id, "CID") against column 5,
# plus the row-wise class proportions and each row's share of all records.
absfreq <- table(car_data[, c(1, 5)])
freq <- prop.table(absfreq, 1)
freqSum <- rowSums(prop.table(absfreq))

# Gini impurity of every customer row, computed for however many rows the
# table actually has instead of assuming exactly 20. The vector is named by
# the table's row labels.
GINI_CUSTOMERS <- 1 - freq[, "No"]^2 - freq[, "Yes"]^2

# Weighted Gini of splitting on the customer id: each row's impurity weighted
# by that row's share of the records.
GINI_ID <- sum(freqSum * GINI_CUSTOMERS)
#' Gini impurity of a split, computed from a contingency table
#'
#' @param absfreq Two-dimensional table (or matrix) of counts with one row per
#'   group produced by the split and one column per class label.
#' @param splitvar Currently unused; retained so existing calls stay valid.
#' @return Numeric vector holding the Gini impurity of every row (in row
#'   order, named by row label when the table has row names) followed by the
#'   weighted Gini of the whole split (named "GINI"). For a two-row table this
#'   is the same three values, in the same positions, as before.
gini_process <- function(absfreq, splitvar = NULL) {
  stopifnot(length(dim(absfreq)) == 2L)

  # Class proportions within each row, and each row's share of all records.
  freq <- prop.table(absfreq, 1)
  freqSum <- rowSums(prop.table(absfreq))

  # Gini impurity per row: one minus the sum of squared class proportions.
  gini_rows <- 1 - rowSums(freq^2)

  # Rows with no observations have undefined (NaN) proportions; they carry
  # zero weight, so leave them out of the weighted sum rather than letting
  # the NaN propagate into the overall value.
  non_empty <- which(freqSum > 0)
  gini_total <- sum(freqSum[non_empty] * gini_rows[non_empty])

  c(gini_rows, GINI = gini_total)
}
# Create tables with frequencies for Sex
absfreq <- table(car_data[, c(2, 5)])

# gini_process() returns the per-row Gini values in the table's row order,
# followed by the weighted Gini as the last element. Base R has no
# `list[a, b, c] <- value` destructuring, and the row order follows the
# table's row labels (typically sorted, so "F" comes before "M"), so each
# value is looked up by its row label rather than by an assumed position.
sex_gini <- gini_process(absfreq)
sex_rows <- rownames(absfreq)
GINI_MALE <- sex_gini[[match("M", sex_rows)]]
GINI_FEMALE <- sex_gini[[match("F", sex_rows)]]
GINI_SEX <- sex_gini[[length(sex_gini)]]

# Same quantities computed step by step: class proportions within each sex
# and each sex's share of all records.
freq <- prop.table(absfreq, 1)
freqSum <- rowSums(prop.table(absfreq))

# Calculate GINI index of Sex
GINI_Male <- 1 - freq["M", "No"]^2 - freq["M", "Yes"]^2
GINI_Female <- 1 - freq["F", "No"]^2 - freq["F", "Yes"]^2
GINI_Sex <- freqSum["M"] * GINI_Male + freqSum["F"] * GINI_Female
# ---- CarType, binary split A: {Family, Sedan} vs Sport ----
# CarType takes the values Sedan / Family / Sport. Take columns 3 and 5 and
# collapse Sedan and Family into a single "FamilySedan" label.
splita <- car_data[, c(3, 5)]
splita$CarType <- as.character(splita$CarType)
splita$CarType[splita$CarType %in% c("Sedan", "Family")] <- "FamilySedan"

# Counts, within-group class proportions, and each group's share of records.
absfreq <- table(splita)
freq <- prop.table(absfreq, margin = 1)
freqSum <- rowSums(prop.table(absfreq))

# Gini impurity of each side of the split.
GINI_FamilySedan <- local({
  p <- freq["FamilySedan", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})
GINI_Sport <- local({
  p <- freq["Sport", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})

# Weighted Gini of the FamilySedan-vs-Sport split.
GINI_SplitA <- GINI_FamilySedan * freqSum["FamilySedan"] +
  GINI_Sport * freqSum["Sport"]
# ---- CarType, binary split B: {Family, Sport} vs Sedan ----
# Take columns 3 and 5 and collapse Family and Sport into a single
# "FamilySport" label.
splitb <- car_data[, c(3, 5)]
splitb$CarType <- as.character(splitb$CarType)
splitb$CarType[splitb$CarType %in% c("Family", "Sport")] <- "FamilySport"

# Counts, within-group class proportions, and each group's share of records.
absfreq <- table(splitb)
freq <- prop.table(absfreq, margin = 1)
freqSum <- rowSums(prop.table(absfreq))

# Gini impurity of each side of the split.
GINI_FamilySport <- local({
  p <- freq["FamilySport", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})
GINI_Sedan <- local({
  p <- freq["Sedan", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})

# Weighted Gini of the FamilySport-vs-Sedan split.
GINI_SplitB <- GINI_FamilySport * freqSum["FamilySport"] +
  GINI_Sedan * freqSum["Sedan"]
# ---- CarType, binary split C: {Sport, Sedan} vs Family ----
# Take columns 3 and 5 and collapse Sport and Sedan into a single
# "SportSedan" label.
splitc <- car_data[, c(3, 5)]
splitc$CarType <- as.character(splitc$CarType)
splitc$CarType[splitc$CarType %in% c("Sport", "Sedan")] <- "SportSedan"

# Counts, within-group class proportions, and each group's share of records.
absfreq <- table(splitc)
freq <- prop.table(absfreq, margin = 1)
freqSum <- rowSums(prop.table(absfreq))

# Gini impurity of each side of the split.
GINI_SportSedan <- local({
  p <- freq["SportSedan", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})
GINI_Family <- local({
  p <- freq["Family", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})

# Weighted Gini of the SportSedan-vs-Family split.
GINI_SplitC <- GINI_SportSedan * freqSum["SportSedan"] +
  GINI_Family * freqSum["Family"]
# ---- Section headed "Budget" ----
# NOTE(review): despite the heading, this section never touches a Budget
# column. It recomputes the CarType split {Family, Sport} vs Sedan on
# columns 3 and 5 and overwrites splitb, absfreq, freq, freqSum,
# GINI_FamilySport, GINI_Sedan and GINI_SplitB. Presumably a Budget analysis
# was intended here -- confirm and replace.
splitb <- car_data[, c(3, 5)]
splitb$CarType <- as.character(splitb$CarType)
splitb$CarType[splitb$CarType %in% c("Family", "Sport")] <- "FamilySport"

# Counts, within-group class proportions, and each group's share of records.
absfreq <- table(splitb)
freq <- prop.table(absfreq, margin = 1)
freqSum <- rowSums(prop.table(absfreq))

# Gini impurity of each side of the split.
GINI_FamilySport <- local({
  p <- freq["FamilySport", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})
GINI_Sedan <- local({
  p <- freq["Sedan", ]
  1 - p[["No"]]^2 - p[["Yes"]]^2
})

# Weighted Gini of the FamilySport-vs-Sedan split.
GINI_SplitB <- GINI_FamilySport * freqSum["FamilySport"] +
  GINI_Sedan * freqSum["Sedan"]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement