Advertisement
gakonst

gini.R

Jan 22nd, 2018
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 3.82 KB | None | 0 0
  # NOTE: the original script started with rm(list = ls()). It was dropped:
  # wiping the caller's workspace is a destructive side effect and does not
  # give a clean session anyway (loaded packages and options survive). Run the
  # script in a fresh R session instead.

  # Read data from disk
  car_data <- read.csv("../car_data.csv")

  # Contingency table of column 1 (the "CID" split, per the original comment)
  # against column 5, whose levels "No"/"Yes" are indexed below.
  absfreq <- table(car_data[, c(1, 5)])
  # Proportion of "No"/"Yes" inside each row of the table.
  freq <- prop.table(absfreq, 1)
  # Share of all observations that falls into each row (the branch weights).
  freqSum <- rowSums(prop.table(absfreq))

  # Gini impurity of every row, computed for however many rows the table has
  # (the original hard-coded exactly 20). The result is named by the table's
  # row labels.
  GINI_CUSTOMERS <- 1 - freq[, "No"]^2 - freq[, "Yes"]^2
  # Weighted Gini of the split: branch weight times branch impurity, summed.
  GINI_ID <- sum(freqSum * GINI_CUSTOMERS)
  17.  
  #' Gini impurity of a categorical split
  #'
  #' @param absfreq Two-way table (or matrix) of counts: one row per branch of
  #'   the split, one column per class.
  #' @param splitvar Unused; kept only so existing calls stay valid.
  #' @return Unnamed numeric vector holding the Gini impurity of each row, in
  #'   row order, followed by the weighted Gini of the whole split as the last
  #'   element. For a two-row table this is `c(GINI_1, GINI_2, GINI)`.
  gini_process <- function(absfreq, splitvar = NULL) {
    stopifnot(length(dim(absfreq)) == 2L)

    # Class proportions inside each branch (rows sum to 1).
    freq <- prop.table(absfreq, 1)
    # Share of all observations that falls into each branch.
    weights <- rowSums(prop.table(absfreq))

    # 1 - sum(p^2) per row; works for any number of class columns, not only
    # "No"/"Yes".
    node_gini <- 1 - rowSums(freq^2)

    # A branch without observations has an undefined (NaN) impurity and zero
    # weight; leave it out so it cannot turn the weighted total into NaN.
    occupied <- weights > 0
    split_gini <- sum(weights[occupied] * node_gini[occupied])

    # Drop names: callers consume the result by position.
    unname(c(node_gini, split_gini))
  }
  30.  
  # Create tables with frequencies for Sex (column 2) against column 5
  absfreq <- table(car_data[, c(2, 5)])

  # gini_process() returns the per-row Gini values in the table's row order,
  # followed by the weighted Gini as the last element. table() orders rows by
  # factor level (alphabetical by default), so "F" comes before "M"; pick the
  # values by row label rather than by position so they cannot be swapped.
  # (The original used `list[a, b, c] = ...`, which is not base R and needs
  # the gsubfn package; no library() call for it is visible in this script.)
  sex_gini <- gini_process(absfreq)
  GINI_MALE <- sex_gini[[match("M", rownames(absfreq))]]
  GINI_FEMALE <- sex_gini[[match("F", rownames(absfreq))]]
  GINI_SEX <- sex_gini[[length(sex_gini)]]

  # Proportion of "No"/"Yes" inside each sex, and each sex's share of the data.
  freq <- prop.table(absfreq, 1)
  freqSum <- rowSums(prop.table(absfreq))

  # Calculate GINI index of Sex (same quantities, computed by hand)
  GINI_Male <- 1 - freq["M", "No"]^2 - freq["M", "Yes"]^2
  GINI_Female <- 1 - freq["F", "No"]^2 - freq["F", "Yes"]^2
  GINI_Sex <- freqSum["M"] * GINI_Male + freqSum["F"] * GINI_Female
  41.  
  # ---- CarType, binary split A: {Family, Sedan} vs Sport ----
  # CarType takes the values Sedan / Family / Sport; merge two of them into a
  # single label to evaluate a two-way split.
  splita <- car_data[, c(3, 5)]
  splita$CarType <- as.character(splita$CarType)
  splita$CarType[splita$CarType %in% c("Sedan", "Family")] <- "FamilySedan"

  # Counts, within-branch class proportions, and branch weights.
  absfreq <- table(splita)
  freq <- prop.table(absfreq, margin = 1)
  freqSum <- rowSums(prop.table(absfreq))

  # Gini impurity of each branch, then the weighted Gini of the split.
  GINI_FamilySedan <-
    1 - freq["FamilySedan", "No"]^2 - freq["FamilySedan", "Yes"]^2
  GINI_Sport <-
    1 - freq["Sport", "No"]^2 - freq["Sport", "Yes"]^2
  GINI_SplitA <-
    freqSum["FamilySedan"] * GINI_FamilySedan + freqSum["Sport"] * GINI_Sport
  57.  
  # ---- CarType, binary split B: {Family, Sport} vs Sedan ----
  splitb <- car_data[, c(3, 5)]
  splitb$CarType <- as.character(splitb$CarType)
  splitb$CarType[splitb$CarType %in% c("Family", "Sport")] <- "FamilySport"

  # Counts, within-branch class proportions, and branch weights.
  absfreq <- table(splitb)
  freq <- prop.table(absfreq, margin = 1)
  freqSum <- rowSums(prop.table(absfreq))

  # Gini impurity of each branch when splitting Family-Sport vs Sedan, then
  # the weighted Gini of the split.
  GINI_FamilySport <-
    1 - freq["FamilySport", "No"]^2 - freq["FamilySport", "Yes"]^2
  GINI_Sedan <-
    1 - freq["Sedan", "No"]^2 - freq["Sedan", "Yes"]^2
  GINI_SplitB <-
    freqSum["FamilySport"] * GINI_FamilySport + freqSum["Sedan"] * GINI_Sedan
  72.  
  # ---- CarType, binary split C: {Sport, Sedan} vs Family ----
  splitc <- car_data[, c(3, 5)]
  splitc$CarType <- as.character(splitc$CarType)
  splitc$CarType[splitc$CarType %in% c("Sport", "Sedan")] <- "SportSedan"

  # Counts, within-branch class proportions, and branch weights.
  absfreq <- table(splitc)
  freq <- prop.table(absfreq, margin = 1)
  freqSum <- rowSums(prop.table(absfreq))

  # Gini impurity of each branch when splitting Sport-Sedan vs Family, then
  # the weighted Gini of the split.
  GINI_SportSedan <-
    1 - freq["SportSedan", "No"]^2 - freq["SportSedan", "Yes"]^2
  GINI_Family <-
    1 - freq["Family", "No"]^2 - freq["Family", "Yes"]^2
  GINI_SplitC <-
    freqSum["SportSedan"] * GINI_SportSedan + freqSum["Family"] * GINI_Family
  86.  
  # ---- Budget ----
  # NOTE(review): despite the "Budget" heading, the code below only recomputes
  # the CarType split {Family, Sport} vs Sedan (column 3), using the same
  # variable names as that split. No Budget computation is visible here;
  # presumably it was still to be written. TODO confirm.

  # ---- Family-Sport vs Sedan ----
  splitb <- car_data[, c(3, 5)]
  splitb$CarType <- as.character(splitb$CarType)
  splitb$CarType[splitb$CarType %in% c("Family", "Sport")] <- "FamilySport"

  # Counts, within-branch class proportions, and branch weights.
  absfreq <- table(splitb)
  freq <- prop.table(absfreq, margin = 1)
  freqSum <- rowSums(prop.table(absfreq))

  # Gini impurity of each branch when splitting Family-Sport vs Sedan, then
  # the weighted Gini of the split.
  GINI_FamilySport <-
    1 - freq["FamilySport", "No"]^2 - freq["FamilySport", "Yes"]^2
  GINI_Sedan <-
    1 - freq["Sedan", "No"]^2 - freq["Sedan", "Yes"]^2
  GINI_SplitB <-
    freqSum["FamilySport"] * GINI_FamilySport + freqSum["Sedan"] * GINI_Sedan
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement