Advertisement
Guest User

Untitled

a guest
Jul 27th, 2016
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.25 KB | None | 0 0
  # Session setup ####
  # NOTE(review): rm() and setwd() change global session state. They are kept
  # because the relative read.csv() paths below are resolved against this
  # working directory.
  rm(list = ls())
  setwd("/Volumes/data/data/data_ger/")
  library(dplyr)
  library(ggplot2)
  library(magrittr)

  # Load data ####
  # All four exports are semicolon-separated files with a header row.
  # TRUE/FALSE are spelled out because T and F are ordinary variables that
  # can be reassigned.
  master_df <- read.csv(
    "/Volumes/data/data/data_ger/OrgKomplettAnonymisiert_DE.csv",
    header = TRUE,
    sep = ";",
    stringsAsFactors = FALSE
  )

  # NOTE(review): na.strings is given the logical NA rather than a string
  # such as "NA"; confirm this is the intended missing-value marker.
  rechnungen_df <- read.csv(
    "RechnungenAnonymisiert_DE.csv",
    header = TRUE,
    sep = ";",
    stringsAsFactors = FALSE,
    na.strings = NA
  )

  stifter_df <- read.csv(
    "StifterAnonymisiert_DE.csv",
    header = TRUE,
    sep = ";",
    stringsAsFactors = FALSE,
    na.strings = NA
  )

  # Unlike the other three tables, this one is read with strings as factors.
  reaktivierung_df <- read.csv(
    "ReaktivierungAnonymisiert_DE.csv",
    header = TRUE,
    sep = ";",
    stringsAsFactors = TRUE
  )
  29.  
  # names tolower ####
  # Give every table lower-case column names so the rest of the script can
  # address columns without depending on the capitalisation in the exports.
  master_df <- setNames(master_df, tolower(names(master_df)))
  rechnungen_df <- setNames(rechnungen_df, tolower(names(rechnungen_df)))
  stifter_df <- setNames(stifter_df, tolower(names(stifter_df)))
  reaktivierung_df <- setNames(
    reaktivierung_df,
    tolower(names(reaktivierung_df))
  )
  35.  
  36.  
  # sort foerderzweck and clean ####
  # Show the distinct raw values before cleaning.
  master_df$foerderzweck %>%
    unique() %>%
    sort()

  # Rewrite the \001 separator, innermost call first: drop it at the start of
  # the value, drop it at the end, then turn every remaining one into ", ".
  master_df$foerderzweck <- gsub(
    "\\\001", ", ",
    gsub(
      "\\\001$", "",
      gsub("^\\\001", "", master_df$foerderzweck)
    )
  )

  # gen list of foerderzweck and agg ####
  # Split every value on commas, trim surrounding whitespace and keep the
  # sorted set of distinct category labels.
  temp_cat <- master_df$foerderzweck %>%
    strsplit(",") %>%
    unlist() %>%
    stringi::stri_trim_both() %>%
    unique() %>%
    sort()
  47.  
  # Count the rows of `master_df` whose `foerderzweck` text contains the
  # category label `x`.
  #
  # x: a single category label (character scalar).
  # Returns: the number of matching rows as an integer; rows with a missing
  #   `foerderzweck` never match.
  #
  # The label is matched as literal text (fixed = TRUE). Without it the label
  # is interpreted as a regular expression, so labels containing characters
  # such as "(", ")", "." or "+" are counted incorrectly.
  # This is still a substring match: a label that is contained in a longer
  # label is counted for both.
  findCat <- function(x) {
    length(grep(x, master_df$foerderzweck, fixed = TRUE))
  }
  51.  
  # One row per category label (row names are the labels) with the number of
  # master records that mention it in the single column "frequency".
  numberCat <- setNames(
    data.frame(mapply(findCat, temp_cat)),
    "frequency"
  )
  54.  
  # map foerderzweck on rechnungen_df ####
  # Lookup table with one row per master record. The column names are the
  # ones the rest of the script relies on: the join key `orgnr` plus
  # `master_df.foerderzweck` and `master_df.created`.
  # stringsAsFactors = FALSE keeps the text columns as character on every R
  # version; before R 4.0 the default turned them into factors, which the
  # later strsplit() on `master_df.foerderzweck` cannot handle.
  data_master <- data.frame(
    orgnr = master_df$orgnr,
    master_df.foerderzweck = master_df$foerderzweck,
    master_df.created = master_df$created,
    stringsAsFactors = FALSE
  )

  # Left join: every invoice row is kept, master columns are NA where no
  # master record has the same orgnr.
  # NOTE(review): if orgnr is not unique in master_df the merge duplicates
  # invoice rows; confirm uniqueness.
  rechnungen_df <- merge(
    rechnungen_df,
    data_master,
    by = "orgnr",
    all.x = TRUE
  )
  59.  
  # group by foerderzweck and artikelbezeichnung ####
  # The text "NULL" in fmvbetrag stands for a missing amount. %in% never
  # yields NA, so the mask is also valid when the column already contains NA
  # (a data-frame assignment with NA in the row index raises an error).
  rechnungen_df$fmvbetrag[rechnungen_df$fmvbetrag %in% "NULL"] <- NA
  rechnungen_df$fmvbetrag <- as.numeric(rechnungen_df$fmvbetrag)

  # Totals per (foerderzweck, artikelbezeichnung) and their difference.
  # NOTE(review): sum() is called without na.rm, so a group containing any
  # missing amount gets NA totals; confirm that is intended.
  agg_foerderzweck <- rechnungen_df %>%
    group_by(master_df.foerderzweck, artikelbezeichnung) %>%
    summarise(
      summe_fmvbetrag = sum(fmvbetrag),
      summe_artikelbetrag = sum(artikelbetrag)
    ) %>%
    mutate(
      differenz = summe_fmvbetrag - summe_artikelbetrag
    )
  72.  
  73.  
  74.  
  # group by id and artikelbezeichnung ####
  # Totals of both amount columns per (orgnr, artikelbezeichnung), plus the
  # difference between the two totals.
  agg_id <- rechnungen_df %>%
    group_by(orgnr, artikelbezeichnung) %>%
    summarise(
      summe_fmvbetrag = sum(fmvbetrag),
      summe_artikelbetrag = sum(artikelbetrag)
    ) %>%
    mutate(differenz = summe_fmvbetrag - summe_artikelbetrag)
  84.  
  # compute diff between create and first invoice ####
  # Show the current column types.
  str(rechnungen_df)

  # For both timestamp columns: remove a "." followed by three digits
  # (presumably the millisecond suffix), then parse as POSIXct. Only the
  # year-month-day part is described by the format.
  rechnungen_df$rechnungsdatum <- as.POSIXct(
    strptime(
      gsub("\\.[0-9]{3}", "", rechnungen_df$rechnungsdatum),
      format = "%Y-%m-%d"
    )
  )
  rechnungen_df$master_df.created <- as.POSIXct(
    strptime(
      gsub("\\.[0-9]{3}", "", rechnungen_df$master_df.created),
      format = "%Y-%m-%d"
    )
  )
  92.  
  # Number of calendar months from `start_date` to `end_date`.
  #
  # end_date, start_date: anything as.POSIXlt() accepts; vectors are handled
  #   element-wise.
  # Returns: 12 * (difference in years) + (difference in month numbers).
  #   The day of the month is ignored, and the result is negative when
  #   `end_date` lies in an earlier month than `start_date`.
  elapsed_months <- function(end_date, start_date) {
    end_lt <- as.POSIXlt(end_date)
    start_lt <- as.POSIXlt(start_date)
    year_gap <- end_lt$year - start_lt$year
    month_gap <- end_lt$mon - start_lt$mon
    12 * year_gap + month_gap
  }
  # Months between the master record's creation date and each invoice date.
  rechnungen_df$createinvoice_diff <- with(
    rechnungen_df,
    elapsed_months(rechnungsdatum, master_df.created)
  )
  class(rechnungen_df$createinvoice_diff)

  # Mean of that month difference per organisation ...
  agg_meantime_id <- rechnungen_df %>%
    group_by(orgnr) %>%
    summarise(mean_time = mean(createinvoice_diff))

  # ... and per foerderzweck value.
  agg_meantime_foerderzweck <- rechnungen_df %>%
    group_by(master_df.foerderzweck) %>%
    summarise(mean_time = mean(createinvoice_diff))
  110.  
  111.  
  112.  
  library(stringi)

  # Category labels found in the aggregated foerderzweck values: split on
  # commas and trim surrounding whitespace. as.character() keeps strsplit()
  # working if the column arrives as a factor.
  temp_cat <- stri_trim_both(
    unlist(
      strsplit(
        as.character(agg_meantime_foerderzweck$master_df.foerderzweck),
        ","
      )
    )
  )

  # Spot check: number of master records mentioning the first label.
  # The original looked this up in `master_data_sel$support_objective`, an
  # object this script never creates; the matching column in this script is
  # `master_df$foerderzweck`. The label is matched as literal text.
  length(grep(temp_cat[1], master_df$foerderzweck, fixed = TRUE))

  # Reduce to the sorted set of distinct labels.
  temp_cat <- sort(unique(temp_cat))
  121.  
  # Count the rows of `master_df` whose `foerderzweck` text contains the
  # category label `x`.
  #
  # x: a single category label (character scalar).
  # Returns: the number of matching rows as an integer; rows with a missing
  #   `foerderzweck` never match.
  #
  # The original body searched `master_data_sel$support_objective`, an object
  # this script never creates, so every call failed. It now searches
  # `master_df$foerderzweck` and matches the label as literal text
  # (fixed = TRUE) instead of as a regular expression.
  findCat <- function(x) {
    length(grep(x, master_df$foerderzweck, fixed = TRUE))
  }
  125.  
  126.  
  # Show the distinct postal codes in sorted order before cleaning.
  master_df$plz %>%
    unique() %>%
    sort()
  128.  
  # zip ####

  # Replace every value that is not made up of digits only with NA.
  #
  # x: a vector of postal codes (character, or anything grepl() can coerce).
  # Returns: `x` with the same length and order; elements containing any
  #   non-digit character, empty strings and missing values are NA.
  #
  # The original test ("contains a non-digit") let empty strings through as
  # if they were valid codes; requiring at least one digit and nothing else
  # closes that gap.
  cleaning <- function(x) {
    x[!grepl("^[0-9]+$", x)] <- NA
    x
  }
  136.  
  # Overwrite the postal-code column with its cleaned version.
  master_df$plz <- master_df$plz %>%
    cleaning()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement