Guest User

Untitled

a guest
Dec 3rd, 2018
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.37 KB | None | 0 0
  1. library(treemapify)
  2. library(ggplot2)
  3. library(reshape2)
  4. library(zoo)
  5. require('RPostgreSQL')
  6. drv <- dbDriver('PostgreSQL')
  7. con <- dbConnect(drv, dbname = 'inspection_results',host = '35.231.189.188', port = 5432,user = 'postgres', password = 'pwd4APAN5310')
  8.  
  9.  
  10. ##Q1
  11. q1 = "select health_dept_name, count(distinct zipcode) from addresses a
  12. join facilities f on a.address_id = f.address_id
  13. join local_health_department l on f.health_dept_id = l.health_dept_id
  14. group by health_dept_name
  15. order by count(distinct zipcode) desc limit 10
  16. ;"
  17. df1 <- dbGetQuery(con, q1)
  18. df1$health_dept_name <- factor(df1$health_dept_name, levels = df1$health_dept_name)
  19. ggplot(data=df1, aes(x=health_dept_name, y=count, fill=health_dept_name)) +geom_bar(colour="black", stat="identity")+theme_minimal()+ coord_flip()+ labs(title="How Many Zipcode each Health Department Oversee", subtitle="Top 10",x = "Health Department", y="Number of ZipCodes")+scale_x_discrete(limits = rev(levels(df1$health_dept_name)))+guides(fill=guide_legend(title="Health Department"))
  20.  
  21. ##Q2
  22. q2 = "select inspection_type as inspections,
  23. CAST(SUM(critical_violations) AS float)/CAST(count(inspection_id) AS float)as critical_violations,
  24. CAST(SUM(noncritical_violations) AS float)/CAST(count(inspection_id) AS float)as non_critical_violation
  25. from inspections i
  26. join violations v on i.violation_id = v.violation_id
  27. where EXTRACT (year FROM i.last_inspected_date) = 2018
  28. group by inspection_type
  29. ;"
  30. df2 <- dbGetQuery(con, q2)
  31. df2a<- melt(df2, id.vars='inspections')
  32. ggplot(df2a, aes(x=inspections, y=value, fill=variable)) + geom_bar(stat='identity')+labs(title="Violations Per Inspection",subtitle="In 2018",x="Inpection Type", y="Number of Violations")+theme_minimal()+guides(fill=guide_legend(title="Inspection Type"))
  33.  
  34. ##Q3
  35. q3 = "select permit_expiration_date,count(p.permission_id) from permissions p
  36. join facilities f on p.permission_id = f.permission_id
  37. join local_health_department l on f.health_dept_id = l.health_dept_id
  38. where permit_expiration_date is not null and health_dept_name = 'Orange County'
  39. group by permit_expiration_date
  40. ;"
  41. df3 <- dbGetQuery(con, q3)
  42. ggplot(df3, aes(x=permit_expiration_date)) + geom_line(aes(y=count))+labs(title="Permit Expiration Date in Orange County",x="Permit Expiration Date", y="Number of Permit Expire")+theme_minimal()
  43.  
  44.  
  45. ##Q4
  46. q4 = "SELECT m.county, sum(critical_violations) AS total_number_of_critical_violations
  47. FROM (SELECT * FROM violations v
  48. join inspections i on v.violation_id = i.violation_id
  49. join records r on i.inspection_id = r.record_id
  50. join facilities f on r.facility_id = f.facility_id
  51. join addresses a on f.address_id = a.address_id) AS m
  52. GROUP BY m.county
  53. ORDER BY total_number_of_critical_violations DESC LIMIT 10
  54. ;"
  55. df4 <- dbGetQuery(con, q4)
  56.  
  57. df4$county <- factor(df4$county, levels = df4$county)
  58. ggplot(data=df4, aes(x=county, y=total_number_of_critical_violations, fill=county)) +geom_bar(colour="black", stat="identity",width=0.7)+theme_minimal()+theme(axis.text.x = element_text(angle=65, vjust=0.6))+labs(title="Highest Critical Violation",subtitle="Top 10 Counties",x = "County", y="Number of Critical Violations")+ guides(fill=FALSE)
  59.  
  60. ##Q5
  61.  
  62. q5 = "SELECT m.county,
  63. Round(CAST(SUM(critical_violations) AS DECIMAL(10,2))/CAST(count(facility_name) AS DECIMAL(10,2)),2) as Crit_violation_per_facility
  64. FROM (SELECT * FROM violations v
  65. join inspections i on v.violation_id = i.violation_id
  66. join records r on i.inspection_id = r.record_id
  67. join facilities f on r.facility_id = f.facility_id
  68. join addresses a on f.address_id = a.address_id) AS m
  69. GROUP BY m.county
  70. ORDER BY Round(CAST(SUM(critical_violations) AS DECIMAL(10,2))/CAST(count(facility_name) AS DECIMAL(10,2)),2) asc
  71. ;"
  72.  
  73. df5 <- dbGetQuery(con, q5)
  74. mean(df5$crit_violation_per_facility)
  75. df5$grade <- ifelse(df5$crit_violation_per_facility < 0.25, "below average", "above average")
  76. df5$county <- factor(df5$county, levels = df5$county)
  77.  
  78. ggplot(df5, aes(x=county, y=crit_violation_per_facility, label=crit_violation_per_facility)) +
  79. geom_point(stat='identity', aes(col=grade), size=6) +
  80. scale_color_manual(name="Critical Violations",labels = c("Above Average", "Below Average"),
  81. values = c("above average"="#D55E00", "below average"="#0072B2")) +
  82. geom_text(color="white", size=2) +
  83. labs(title="Critical Violation Per Facility",
  84. subtitle="County",xlab="critical violation per facility") +
  85. ylim(0.1, 0.55) +
  86. coord_flip()
  87.  
  88.  
  89.  
  90.  
  91. ##q6
  92.  
  93. q6 = "SELECT m.health_dept_name,count(m.facility_name) AS the_number_of_no_violations
  94. FROM (SELECT * FROM violations v
  95. JOIN inspections i ON v.violation_id = i.violation_id
  96. JOIN records r ON i.inspection_id = r.record_id
  97. JOIN facilities f ON r.facility_id = f.facility_id
  98. JOIN addresses a ON f.address_id = a.address_id
  99. JOIN local_health_department l on l.health_dept_id = f.health_dept_id) AS m
  100. WHERE m.critical_violations = 0
  101. AND m.critical_not_corrected = 0
  102. AND m.noncritical_violations = 0
  103. GROUP BY m.health_dept_name
  104. ORDER BY the_number_of_no_violations DESC
  105. ;"
  106.  
  107. df6<- dbGetQuery(con, q6)
  108. df6$health_dept_name <- factor(df6$health_dept_name, levels = df6$health_dept_name)
  109.  
  110. ggplot(data=df6, aes(x=health_dept_name, y=the_number_of_no_violations, fill=health_dept_name)) +geom_bar(colour="black", stat="identity")+theme_minimal()+ coord_flip()+ labs(title="Number of no-violation facilities", subtitle="County",x = "Health Department", y="Number of facilities have no violation")+scale_x_discrete(limits = rev(levels(df6$health_dept_name)))+guides(fill=FALSE)
  111.  
  112.  
  113. ##Q7
  114. q7 = "select to_char(i.last_inspected_date, 'YYYY-MM') as Date,
  115. count(i.inspection_id) as num_inspection,
  116. (sum(critical_violations)+sum(critical_not_corrected)+sum(noncritical_violations)) as total_violations
  117. from inspections i
  118. join violations v on i.violation_id = v.violation_id
  119. group by to_char(i.last_inspected_date, 'YYYY-MM')
  120. having (sum(critical_violations)+sum(critical_not_corrected)+sum(noncritical_violations)) is not null
  121. order by to_char(i.last_inspected_date, 'YYYY-MM') asc
  122. ;"
  123.  
  124. df7<- dbGetQuery(con, q7)
  125.  
  126. df7$date <- as.yearmon(df7$date)
  127. ggplot(df7, aes(x=date)) + geom_line(aes(y=num_inspection, col="num_inspection")) + geom_line(aes(y=total_violations, col="total_violations"))+labs(title="Total Inspections and Violation", subtitle="From 06-2016 to 09-2018", y="Value")
  128.  
  129.  
  130.  
  131. ##Q8
  132. q8="SELECT inspection_comments,count(inspection_comments) as frequency from inspections i
  133. join violations v on i.violation_id=v.violation_id
  134. where critical_violations >2
  135. group by inspection_comments
  136. order by count(inspection_comments) desc limit 10;"
  137.  
  138. df8<- dbGetQuery(con, q8)
  139.  
  140. ##Q9:
  141.  
  142. q9="select a.county,
  143. Round(Cast(count(i.inspection_id) As DECIMAL(10,2))/CAST(count(distinct facility_name) AS DECIMAL(10,2)),2) as Inspection_per_facility,
  144. Round(CAST(SUM(noncritical_violations) AS DECIMAL(10,2))/CAST(count(distinct facility_name) AS DECIMAL(10,2)),2) as Crit_violation_per_facility
  145. From inspections i
  146. join violations v on i.violation_id=v.violation_id
  147. join records r on i.inspection_id = r.record_id
  148. join facilities f on r.facility_id = f.facility_id
  149. join addresses a on f.address_id = a.address_id
  150. group by a.county
  151. order by Round(Cast(count(i.inspection_id) As DECIMAL(10,2))/CAST(count(distinct facility_name) AS DECIMAL(10,2)),2) desc
  152. ;"
  153.  
  154. df9<- dbGetQuery(con, q9)
  155.  
  156. ggplot(df9, aes(x=crit_violation_per_facility, y=inspection_per_facility))+labs(title="More inspections discover more critical violation?",subtitle="County",x="Critical Violation Per Facility", y="Inspection Per Facility")+theme_minimal()+guides(fill=FALSE)+geom_text(aes(label=county),hjust=0, vjust=0.3,size=3,check_overlap = TRUE,color="black")+geom_point(color='red')
  157.  
  158.  
  159. ##Q10
  160. q10="SELECT o.description, COUNT(o.description) AS num_of_operation
  161. FROM operations o
  162. JOIN inspections i ON o.operation_id = i.operation_id
  163. WHERE inspection_type = 'Re-Inspection'
  164. AND EXTRACT (year FROM i.last_inspected_date) = 2018
  165. OR EXTRACT (year FROM i.last_inspected_date) = 2017
  166. GROUP BY o.description
  167. ORDER BY count(o.description) DESC
  168. ;"
  169.  
  170. df10<- dbGetQuery(con, q10)
  171.  
  172. ggplot(df10, aes(x = "", y=num_of_operation, fill = description)) + geom_bar(width = 1, stat = "identity")+ theme(axis.line = element_blank(), plot.title = element_text(hjust=1)) + labs(fill="Operation Type", x=NULL, y=NULL, title="Pie Chart for Operation Type in Year 2018 and 2017 for Re-Inspection")+coord_polar("y")+theme(legend.position="bottom",legend.text=element_text(size=8),legend.title=element_blank(),legend.key.size = unit(1,"line"),legend.direction="vertical")
Add Comment
Please, Sign In to add comment