Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(treemapify)
- library(ggplot2)
- library(reshape2)
- library(zoo)
- require('RPostgreSQL')
- drv <- dbDriver('PostgreSQL')
- con <- dbConnect(drv, dbname = 'inspection_results',host = '35.231.189.188', port = 5432,user = 'postgres', password = 'pwd4APAN5310')
- ##Q1
- q1 = "select health_dept_name, count(distinct zipcode) from addresses a
- join facilities f on a.address_id = f.address_id
- join local_health_department l on f.health_dept_id = l.health_dept_id
- group by health_dept_name
- order by count(distinct zipcode) desc limit 10
- ;"
- df1 <- dbGetQuery(con, q1)
- df1$health_dept_name <- factor(df1$health_dept_name, levels = df1$health_dept_name)
- ggplot(data=df1, aes(x=health_dept_name, y=count, fill=health_dept_name)) +geom_bar(colour="black", stat="identity")+theme_minimal()+ coord_flip()+ labs(title="How Many Zipcode each Health Department Oversee", subtitle="Top 10",x = "Health Department", y="Number of ZipCodes")+scale_x_discrete(limits = rev(levels(df1$health_dept_name)))+guides(fill=guide_legend(title="Health Department"))
- ##Q2
- q2 = "select inspection_type as inspections,
- CAST(SUM(critical_violations) AS float)/CAST(count(inspection_id) AS float)as critical_violations,
- CAST(SUM(noncritical_violations) AS float)/CAST(count(inspection_id) AS float)as non_critical_violation
- from inspections i
- join violations v on i.violation_id = v.violation_id
- where EXTRACT (year FROM i.last_inspected_date) = 2018
- group by inspection_type
- ;"
- df2 <- dbGetQuery(con, q2)
- df2a<- melt(df2, id.vars='inspections')
- ggplot(df2a, aes(x=inspections, y=value, fill=variable)) + geom_bar(stat='identity')+labs(title="Violations Per Inspection",subtitle="In 2018",x="Inpection Type", y="Number of Violations")+theme_minimal()+guides(fill=guide_legend(title="Inspection Type"))
- ##Q3
- q3 = "select permit_expiration_date,count(p.permission_id) from permissions p
- join facilities f on p.permission_id = f.permission_id
- join local_health_department l on f.health_dept_id = l.health_dept_id
- where permit_expiration_date is not null and health_dept_name = 'Orange County'
- group by permit_expiration_date
- ;"
- df3 <- dbGetQuery(con, q3)
- ggplot(df3, aes(x=permit_expiration_date)) + geom_line(aes(y=count))+labs(title="Permit Expiration Date in Orange County",x="Permit Expiration Date", y="Number of Permit Expire")+theme_minimal()
- ##Q4
- q4 = "SELECT m.county, sum(critical_violations) AS total_number_of_critical_violations
- FROM (SELECT * FROM violations v
- join inspections i on v.violation_id = i.violation_id
- join records r on i.inspection_id = r.record_id
- join facilities f on r.facility_id = f.facility_id
- join addresses a on f.address_id = a.address_id) AS m
- GROUP BY m.county
- ORDER BY total_number_of_critical_violations DESC LIMIT 10
- ;"
- df4 <- dbGetQuery(con, q4)
- df4$county <- factor(df4$county, levels = df4$county)
- ggplot(data=df4, aes(x=county, y=total_number_of_critical_violations, fill=county)) +geom_bar(colour="black", stat="identity",width=0.7)+theme_minimal()+theme(axis.text.x = element_text(angle=65, vjust=0.6))+labs(title="Highest Critical Violation",subtitle="Top 10 Counties",x = "County", y="Number of Critical Violations")+ guides(fill=FALSE)
- ##Q5
- q5 = "SELECT m.county,
- Round(CAST(SUM(critical_violations) AS DECIMAL(10,2))/CAST(count(facility_name) AS DECIMAL(10,2)),2) as Crit_violation_per_facility
- FROM (SELECT * FROM violations v
- join inspections i on v.violation_id = i.violation_id
- join records r on i.inspection_id = r.record_id
- join facilities f on r.facility_id = f.facility_id
- join addresses a on f.address_id = a.address_id) AS m
- GROUP BY m.county
- ORDER BY Round(CAST(SUM(critical_violations) AS DECIMAL(10,2))/CAST(count(facility_name) AS DECIMAL(10,2)),2) asc
- ;"
- df5 <- dbGetQuery(con, q5)
- mean(df5$crit_violation_per_facility)
- df5$grade <- ifelse(df5$crit_violation_per_facility < 0.25, "below average", "above average")
- df5$county <- factor(df5$county, levels = df5$county)
- ggplot(df5, aes(x=county, y=crit_violation_per_facility, label=crit_violation_per_facility)) +
- geom_point(stat='identity', aes(col=grade), size=6) +
- scale_color_manual(name="Critical Violations",labels = c("Above Average", "Below Average"),
- values = c("above average"="#D55E00", "below average"="#0072B2")) +
- geom_text(color="white", size=2) +
- labs(title="Critical Violation Per Facility",
- subtitle="County",xlab="critical violation per facility") +
- ylim(0.1, 0.55) +
- coord_flip()
- ##q6
- q6 = "SELECT m.health_dept_name,count(m.facility_name) AS the_number_of_no_violations
- FROM (SELECT * FROM violations v
- JOIN inspections i ON v.violation_id = i.violation_id
- JOIN records r ON i.inspection_id = r.record_id
- JOIN facilities f ON r.facility_id = f.facility_id
- JOIN addresses a ON f.address_id = a.address_id
- JOIN local_health_department l on l.health_dept_id = f.health_dept_id) AS m
- WHERE m.critical_violations = 0
- AND m.critical_not_corrected = 0
- AND m.noncritical_violations = 0
- GROUP BY m.health_dept_name
- ORDER BY the_number_of_no_violations DESC
- ;"
- df6<- dbGetQuery(con, q6)
- df6$health_dept_name <- factor(df6$health_dept_name, levels = df6$health_dept_name)
- ggplot(data=df6, aes(x=health_dept_name, y=the_number_of_no_violations, fill=health_dept_name)) +geom_bar(colour="black", stat="identity")+theme_minimal()+ coord_flip()+ labs(title="Number of no-violation facilities", subtitle="County",x = "Health Department", y="Number of facilities have no violation")+scale_x_discrete(limits = rev(levels(df6$health_dept_name)))+guides(fill=FALSE)
- ##Q7
- q7 = "select to_char(i.last_inspected_date, 'YYYY-MM') as Date,
- count(i.inspection_id) as num_inspection,
- (sum(critical_violations)+sum(critical_not_corrected)+sum(noncritical_violations)) as total_violations
- from inspections i
- join violations v on i.violation_id = v.violation_id
- group by to_char(i.last_inspected_date, 'YYYY-MM')
- having (sum(critical_violations)+sum(critical_not_corrected)+sum(noncritical_violations)) is not null
- order by to_char(i.last_inspected_date, 'YYYY-MM') asc
- ;"
- df7<- dbGetQuery(con, q7)
- df7$date <- as.yearmon(df7$date)
- ggplot(df7, aes(x=date)) + geom_line(aes(y=num_inspection, col="num_inspection")) + geom_line(aes(y=total_violations, col="total_violations"))+labs(title="Total Inspections and Violation", subtitle="From 06-2016 to 09-2018", y="Value")
- ##Q8
- q8="SELECT inspection_comments,count(inspection_comments) as frequency from inspections i
- join violations v on i.violation_id=v.violation_id
- where critical_violations >2
- group by inspection_comments
- order by count(inspection_comments) desc limit 10;"
- df8<- dbGetQuery(con, q8)
- ##Q9:
- q9="select a.county,
- Round(Cast(count(i.inspection_id) As DECIMAL(10,2))/CAST(count(distinct facility_name) AS DECIMAL(10,2)),2) as Inspection_per_facility,
- Round(CAST(SUM(noncritical_violations) AS DECIMAL(10,2))/CAST(count(distinct facility_name) AS DECIMAL(10,2)),2) as Crit_violation_per_facility
- From inspections i
- join violations v on i.violation_id=v.violation_id
- join records r on i.inspection_id = r.record_id
- join facilities f on r.facility_id = f.facility_id
- join addresses a on f.address_id = a.address_id
- group by a.county
- order by Round(Cast(count(i.inspection_id) As DECIMAL(10,2))/CAST(count(distinct facility_name) AS DECIMAL(10,2)),2) desc
- ;"
- df9<- dbGetQuery(con, q9)
- ggplot(df9, aes(x=crit_violation_per_facility, y=inspection_per_facility))+labs(title="More inspections discover more critical violation?",subtitle="County",x="Critical Violation Per Facility", y="Inspection Per Facility")+theme_minimal()+guides(fill=FALSE)+geom_text(aes(label=county),hjust=0, vjust=0.3,size=3,check_overlap = TRUE,color="black")+geom_point(color='red')
- ##Q10
- q10="SELECT o.description, COUNT(o.description) AS num_of_operation
- FROM operations o
- JOIN inspections i ON o.operation_id = i.operation_id
- WHERE inspection_type = 'Re-Inspection'
- AND EXTRACT (year FROM i.last_inspected_date) = 2018
- OR EXTRACT (year FROM i.last_inspected_date) = 2017
- GROUP BY o.description
- ORDER BY count(o.description) DESC
- ;"
- df10<- dbGetQuery(con, q10)
- ggplot(df10, aes(x = "", y=num_of_operation, fill = description)) + geom_bar(width = 1, stat = "identity")+ theme(axis.line = element_blank(), plot.title = element_text(hjust=1)) + labs(fill="Operation Type", x=NULL, y=NULL, title="Pie Chart for Operation Type in Year 2018 and 2017 for Re-Inspection")+coord_polar("y")+theme(legend.position="bottom",legend.text=element_text(size=8),legend.title=element_blank(),legend.key.size = unit(1,"line"),legend.direction="vertical")
Add Comment
Please, Sign In to add comment