Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Setup and data import ----
# Packages used by this script.
library(dplyr)
library(ggplot2)
library(magrittr)

# Directory that holds the anonymised CSV exports. Every path is built from
# it, so the script neither changes the working directory nor wipes the
# caller's workspace.
data_dir <- "/Volumes/data/data/data_ger"

# Read one export from `data_dir`.
#
# file_name  name of the CSV file inside `data_dir`
# ...        further arguments forwarded to read.csv()
# Returns the data frame produced by read.csv(); all exports are
# semicolon-separated and have a header row.
read_export <- function(file_name, ...) {
  read.csv(file.path(data_dir, file_name), header = TRUE, sep = ";", ...)
}

master_df <- read_export(
  "OrgKomplettAnonymisiert_DE.csv",
  stringsAsFactors = FALSE
)
# na.strings is documented as a character vector, hence "NA" rather than the
# logical constant NA.
rechnungen_df <- read_export(
  "RechnungenAnonymisiert_DE.csv",
  stringsAsFactors = FALSE,
  na.strings = "NA"
)
stifter_df <- read_export(
  "StifterAnonymisiert_DE.csv",
  stringsAsFactors = FALSE,
  na.strings = "NA"
)
# Unlike the other exports, this one is read with strings as factors.
reaktivierung_df <- read_export(
  "ReaktivierungAnonymisiert_DE.csv",
  stringsAsFactors = TRUE
)
# names tolower ####
# Lower-case the column names of all four data frames so that later code can
# refer to them with a single spelling.
names(master_df) <- tolower(names(master_df))
names(rechnungen_df) <- tolower(names(rechnungen_df))
names(stifter_df) <- tolower(names(stifter_df))
names(reaktivierung_df) <- tolower(names(reaktivierung_df))

# sort foerderzweck and clean ####
# Inspect the distinct raw values (the result is not stored).
sort(unique(master_df$foerderzweck))
# The three substitutions drop a leading and a trailing \001 delimiter and
# turn every remaining one into ", ".
master_df$foerderzweck <- master_df$foerderzweck %>%
  gsub("^\\\001", "", .) %>%
  gsub("\\\001$", "", .) %>%
  gsub("\\\001", ", ", .)

# gen list of foerderzweck and agg ####
# Split every entry at the commas, trim the pieces and keep each distinct
# category label once, in sorted order.
temp_cat <- unlist(strsplit(master_df$foerderzweck, ","))
temp_cat <- sort(unique(stringi::stri_trim_both(temp_cat)))
# Count the entries whose comma-separated category list contains `x`.
#
# x       a single category label (character scalar)
# values  character vector of comma-separated category lists; defaults to
#         master_df$foerderzweck
# Returns an integer count.
#
# The label is compared literally against each trimmed list element. Using it
# as a regular expression would miscount labels that contain metacharacters
# such as "(" and would also count labels that merely contain `x`.
findCat <- function(x, values = master_df$foerderzweck) {
  parts_per_entry <- strsplit(as.character(values), ",", fixed = TRUE)
  sum(vapply(
    parts_per_entry,
    function(parts) x %in% trimws(parts),
    logical(1)
  ))
}
# Frequency of every category label: one row per label (the labels become the
# row names), one column `frequency`. vapply() keeps the result an integer
# vector even when temp_cat is empty.
numberCat <- data.frame(
  frequency = vapply(temp_cat, findCat, integer(1))
)

# map foerderzweck on rechnungen_df ####
# Lookup table with the master columns needed on invoice level. The
# `master_df.` prefixes are kept because later code refers to the merged
# columns by these names. stringsAsFactors is set explicitly so the text
# columns stay character under R versions before 4.0 as well.
data_master <- data.frame(
  orgnr = master_df$orgnr,
  master_df.foerderzweck = master_df$foerderzweck,
  master_df.created = master_df$created,
  stringsAsFactors = FALSE
)
# Left join: every invoice is kept, including those without a master record.
rechnungen_df <- merge(
  rechnungen_df,
  data_master,
  by = "orgnr",
  all.x = TRUE
)
# group by foerderzweck and artikelbezeichnung ####
# The literal string "NULL" is turned into NA before the numeric conversion.
# %in% never returns NA, so the replacement also works when the column
# already contains missing values (a logical index with NA is rejected in
# data frame subscript assignment).
rechnungen_df$fmvbetrag[rechnungen_df$fmvbetrag %in% "NULL"] <- NA
rechnungen_df$fmvbetrag <- as.numeric(rechnungen_df$fmvbetrag)

# Sum fmvbetrag and artikelbetrag per group and add their difference.
#
# df   data frame with the columns fmvbetrag and artikelbetrag
# ...  unquoted grouping columns, forwarded to group_by()
# Returns an ungrouped data frame with one row per group and the columns
# summe_fmvbetrag, summe_artikelbetrag and differenz. Missing amounts are not
# removed, so a group containing an NA amount gets an NA sum.
sum_betraege <- function(df, ...) {
  df %>%
    group_by(...) %>%
    summarise(
      summe_fmvbetrag = sum(fmvbetrag),
      summe_artikelbetrag = sum(artikelbetrag)
    ) %>%
    ungroup() %>%
    mutate(differenz = summe_fmvbetrag - summe_artikelbetrag)
}

agg_foerderzweck <- sum_betraege(
  rechnungen_df,
  master_df.foerderzweck,
  artikelbezeichnung
)

# group by id and artikelbezeichnung ####
agg_id <- sum_betraege(rechnungen_df, orgnr, artikelbezeichnung)
# compute diff between create and first invoice ####
# Inspect the column types before converting the date columns.
str(rechnungen_df)

# Convert timestamp strings to POSIXct using only their date part.
#
# x  character vector of timestamps that start with year-month-day
# Returns a POSIXct vector; values that do not match the format become NA.
parse_day <- function(x) {
  # Drop a dot followed by three digits (presumably a millisecond suffix;
  # confirm against the export).
  without_fraction <- gsub("\\.[0-9]{3}", "", x)
  as.POSIXct(strptime(without_fraction, format = "%Y-%m-%d"))
}

rechnungen_df$rechnungsdatum <- parse_day(rechnungen_df$rechnungsdatum)
rechnungen_df$master_df.created <- parse_day(rechnungen_df$master_df.created)
# Number of calendar months from `start_date` to `end_date`.
#
# end_date, start_date  date-like values accepted by as.POSIXlt()
# Returns a numeric vector: 12 times the difference in years plus the
# difference in month numbers. The day of the month is ignored, so
# 2015-01-31 to 2015-02-01 counts as one month. The result is negative when
# `end_date` lies in an earlier month than `start_date`.
elapsed_months <- function(end_date, start_date) {
  end_lt <- as.POSIXlt(end_date)
  start_lt <- as.POSIXlt(start_date)
  year_gap <- end_lt$year - start_lt$year
  month_gap <- end_lt$mon - start_lt$mon
  12 * year_gap + month_gap
}
# Months between the creation date of the organisation and each invoice.
rechnungen_df$createinvoice_diff <- elapsed_months(
  rechnungen_df$rechnungsdatum,
  rechnungen_df$master_df.created
)
class(rechnungen_df$createinvoice_diff)

# Mean month difference per organisation. The mean runs over all invoices of
# a group, not only the first one, and is NA as soon as one difference in the
# group is NA.
agg_meantime_id <- rechnungen_df %>%
  group_by(orgnr) %>%
  summarise(mean_time = mean(createinvoice_diff))

# Same mean, grouped by the foerderzweck string of the organisation.
agg_meantime_foerderzweck <- rechnungen_df %>%
  group_by(master_df.foerderzweck) %>%
  summarise(mean_time = mean(createinvoice_diff))
library(stringi)
# Split the foerderzweck group keys into single, trimmed category labels.
# as.character() guards against the key column being a factor.
raw_cat <- stri_trim_both(unlist(strsplit(
  as.character(agg_meantime_foerderzweck$master_df.foerderzweck),
  ","
)))
# Spot check: number of master records that contain the first label. The
# lookup uses master_df$foerderzweck; the object referenced here before
# (master_data_sel) is not defined anywhere in this script.
length(grep(raw_cat[1], master_df$foerderzweck, fixed = TRUE))
# Distinct labels in sorted order.
temp_cat <- sort(unique(raw_cat))
# Count the entries whose comma-separated category list contains `x`.
# This redefines findCat at this point of the script.
#
# x       a single category label (character scalar)
# values  character vector of comma-separated category lists; defaults to
#         master_df$foerderzweck (the object referenced here before,
#         master_data_sel, is not defined anywhere in this script)
# Returns an integer count.
#
# The label is compared literally against each trimmed list element, so
# regex metacharacters in a label and labels that merely contain `x` do not
# distort the count.
findCat <- function(x, values = master_df$foerderzweck) {
  parts_per_entry <- strsplit(as.character(values), ",", fixed = TRUE)
  sum(vapply(
    parts_per_entry,
    function(parts) x %in% trimws(parts),
    logical(1)
  ))
}
# zip ####
# Inspect the distinct postal codes (the result is not stored).
sort(unique(master_df$plz))
# Replace every value that contains a non-digit character with NA.
#
# x  vector of codes (compared as text)
# Returns `x` with the offending elements set to NA. Values made of digits
# only, empty strings and existing NA values are left unchanged.
cleaning <- function(x) {
  has_non_digit <- grepl("[^0-9]", x)
  x[has_non_digit] <- NA
  x
}
# Set postal codes that contain anything other than digits to NA.
master_df$plz <- cleaning(master_df$plz)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement