Advertisement
Guest User

Untitled

a guest
Mar 24th, 2017
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 3.80 KB | None | 0 0
  library(data.table)
  library(ggplot2)
  library(plyr)
  library(ez)
  library(corrgram)

  # Goal: visualise how gender, age, nationality and family history of mental
  # illness relate to having a mental illness among people working for tech
  # companies.

  # Location of the raw survey export. Kept in one variable so it can be
  # changed without touching the read call.
  survey_path <- "/home/georgiy/MATLAB/help/survey.csv"
  stopifnot(file.exists(survey_path))

  # Read the survey as a data.table; strings stay character (no factors).
  mental <- as.data.table(read.csv(
    survey_path,
    header = TRUE, sep = ",", dec = ".", fill = TRUE,
    stringsAsFactors = FALSE
  ))

  # View() opens a GUI data viewer, so only call it in an interactive session.
  if (interactive()) View(mental)

  # Clean and prepare data.
  # Keep only the columns we plan to evaluate, renamed to a consistent
  # capitalised style.
  mental1 <- mental[, .(
    Age,
    Gender,
    Country,
    Family_history = family_history,
    Treatment = treatment,
    Tech_company = tech_company
  )]

  ## Let's visualise each parameter
  ## Age
  summary(mental1$Age)
  barplot(table(mental1$Age), main = "Age distribution", xlab = "Age")

  # The distribution contains impossible values. Ages below 16 or above 90 are
  # treated as invalid, since nobody outside that range would be employed.
  outlying_age <- subset(mental1, Age < 16 | Age > 90)
  nrow(outlying_age)
  outlying_age

  # Blank out the invalid ages in a single step (no intermediate sentinel
  # value), then drop every row without a usable age.
  mental1[Age < 16 | Age > 90, Age := NA]
  mental <- mental1[!is.na(Age)]

  # Check maximum and minimum values after cleaning.
  max(mental$Age, na.rm = TRUE)
  min(mental$Age, na.rm = TRUE)

  # Statistics
  boxplot(mental$Age, col = "blue", main = "General age statistics", ylab = "Age")
  summary(mental$Age)

  # The boxplot still shows outliers, so get rid of them in the data.table.
  # Only the upper tail is trimmed: rows above mean + 2 standard deviations.
  age_cap <- mean(mental$Age, na.rm = TRUE) + 2 * sd(mental$Age, na.rm = TRUE)
  mental <- mental[Age <= age_cap]
  boxplot(mental$Age, col = "blue", main = "General age statistics", ylab = "Age")

  ## Gender
  # Map free-text gender answers to numeric codes (1 = female, 2 = male) and
  # drop every answer that cannot be mapped (e.g. "something kinda male").
  # Show the raw spellings first, including missing values.
  table(mental$Gender, useNA = "ifany")

  # Match on a normalised key (trimmed, lower-case) so that variants such as
  # "Male ", "MALE" or " f" are recognised instead of being silently dropped.
  gender_codes <- c(male = 2, m = 2, female = 1, f = 1)
  gender_key <- tolower(trimws(as.character(mental$Gender)))

  # Looking up an unknown key in a named vector yields NA, which marks the row
  # for removal without the coercion warnings that as.numeric() would raise.
  mental$Gender <- unname(gender_codes[gender_key])
  mental <- mental[!is.na(mental$Gender)]

  # Clean the country column.
  # See which countries are present, most frequent first.
  a <- setorder(mental[, .N, by = Country], -N)
  a

  # Three-country shortlist. The filter below does not use it; it is kept so
  # the variable stays defined.
  entries <- c('United States', 'United Kingdom', 'Canada')

  # Keep only respondents from these five countries.
  kept_countries <- c('United States', 'United Kingdom', 'Canada', 'Germany', 'Ireland')
  countryInd <- mental$Country %in% kept_countries
  mental <- mental[countryInd]

  # Recode the Yes/No survey answers to numeric 1/0.
  # A named lookup gives a numeric column. Assigning 1/0 into the existing
  # character column would only produce the strings "1"/"0". Any answer other
  # than "Yes"/"No" becomes NA.
  yes_no <- c(No = 0, Yes = 1)

  # Family history
  mental$Family_history <- unname(yes_no[as.character(mental$Family_history)])

  # Treatment
  mental$Treatment <- unname(yes_no[as.character(mental$Treatment)])

  # Tech company or not: keep only tech company workers, coded as 1.
  # %in% treats missing answers as "not Yes", so they are dropped as well.
  mental <- mental[mental$Tech_company %in% "Yes"]
  mental$Tech_company <- rep(1, nrow(mental))

  # Frequency tables for the recoded columns.
  plyr::count(mental$Family_history)
  plyr::count(mental$Treatment)
  gencount <- plyr::count(mental$Gender)
  famcount <- plyr::count(mental$Family_history)
  countryCount <- plyr::count(mental$Country)

  # Rows of respondents who did / did not receive treatment.
  is_treated <- mental$Treatment %in% 1
  is_untreated <- mental$Treatment %in% 0

  # Share of treated respondents per group.
  #   totals:  result of plyr::count() on `group` (columns x, freq)
  #   group:   grouping vector, one entry per respondent
  #   treated: logical vector, TRUE where the respondent was treated
  # Returns `totals` with freq replaced by treated / total for each x.
  # Treated counts are looked up by group value, not by position, so a group
  # with no treated respondents gets a share of 0 instead of shifting the
  # following rows or breaking the division.
  treated_share <- function(totals, group, treated) {
    hits <- tapply(treated, group, sum)
    totals$freq <- as.vector(hits[as.character(totals$x)]) / totals$freq
    totals
  }

  # Gender
  treatedGender <- treated_share(gencount, mental$Gender, is_treated)
  nonTreatedGender <- plyr::count(mental[is_untreated]$Gender)

  # Family history
  treatedFam <- treated_share(famcount, mental$Family_history, is_treated)
  nonTreatedFam <- plyr::count(mental[is_untreated]$Family_history)

  # Country
  treatedCountry <- treated_share(countryCount, mental$Country, is_treated)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement