Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- setwd("/Users/liufangyi/Documents/sql/CP4")
- list.files()
- install.packages('RPostgreSQL')
- install.packages('dplyr')
- install.packages('tidyr')
- library('RPostgreSQL')
- library('dplyr')
- library('tidyr')
- drv <- dbDriver('PostgreSQL')
- con <- dbConnect(drv, dbname = 'web_4',
- host = 's19db.apan5310.com', port = 50202,
- user = 'postgres', password = '64ddswmi')
- df1 <- read.csv(file = 'DIM_ONLINE_PANELIST - DIM_ONLINE_PANELIST.csv',stringsAsFactors=FALSE)
- df2 <- read.csv(file = 'FACT_ONLINE_BEHAVIOR - FACT_ONLINE_BEHAVIOR.csv',stringsAsFactors=FALSE)
- df3 <- read.csv(file = 'DIM_ONLINE_DICTIONARY_CATEGORY - DIM_ONLINE_DICTIONARY_CATEGORY.csv',stringsAsFactors=FALSE)
- df4 <- read.csv(file = 'CONNEXIONS(update).csv',stringsAsFactors=FALSE)
- df4 <- df4[1:53,]
- df5 <- read.csv(file = 'employment.csv',stringsAsFactors=FALSE)
- df1 <- df1 [1:100000,]
- sub <- read.csv(file = 'sub.csv',stringsAsFactors=FALSE)
- #employment
- employment <- df5
- dbWriteTable(con,name = 'employment', value = employment,
- row.names = FALSE, append = TRUE)
- #race
- df6 <- df4 %>% select(race_id,ethnicity) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(101:106,nrow(df6), replace = TRUE),df6)
- dbWriteTable(con,name = 'race', value = df6,
- row.names = FALSE, append = TRUE)
- #education
- df6 <- df4 %>% select(edu_id,edu_level) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(102:107,nrow(df6)),df6)
- dbWriteTable(con,name = 'education', value = df6,
- row.names = FALSE, append = TRUE)
- #house_income
- df6 <- df4 %>% select(hi_id,hi_level) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(104:111,nrow(df6)),df6)
- dbWriteTable(con,name = 'housing_income', value = df6,
- row.names = FALSE, append = TRUE)
- #occpucation
- df6 <- df4 %>% select(oc_id,occupation) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(108:114,nrow(df6)),df6)
- dbWriteTable(con,name = 'occupation', value = df6,
- row.names = FALSE, append = TRUE)
- #age
- df6 <-df4 %>% select(age_id,age_range) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(102:108,nrow(df6)),df6)
- dbWriteTable(con,name = 'age', value = df6,
- row.names = FALSE, append = TRUE)
- #urbanicity
- df6 <- df4 %>% select(urb_id,urb_level) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(107:110,nrow(df6)),df6)
- dbWriteTable(con,name = 'urbanicity', value = df6,
- row.names = FALSE, append = TRUE)
- #technodoption#####
- df6 <- df4 %>% select(tech_id,tech_level) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(109:114,nrow(df6)),df6)
- df6 <- df6[-c(4),]
- dbWriteTable(con,name = 'technodoption_class', value = df6,
- row.names = FALSE, append = TRUE)
- # tenure
- df6 <- df4 %>% select(tenure_id,tenure_status) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(103:108,nrow(df6)),df6)
- dbWriteTable(con,name = 'tenure', value = df6,
- row.names = FALSE, append = TRUE)
- #companion
- df6 <- df4 %>% select(comp_id,comp_status) %>% distinct()
- df6 <- bind_cols('employee_id' = sample(101:104,nrow(df6)),df6)
- dbWriteTable(con,name = 'companion', value = df6,
- row.names = FALSE, append = TRUE)
- #action_time
- df6 <- df2 %>% select('action_time' = 'ACTION_TIMESTAMP', 'action_date_id' = 'ACTION_DATE_ID',
- 'action_time_id' = 'ACTION_TIME_ID') %>% distinct()
- dbWriteTable(con,name = 'action_time', value = df6,
- row.names = FALSE, append = TRUE)
- #Cat_ID, category, subcategory
- sub <- read.csv("sub.csv")
- sub <- sub[1:86,]
- df10 <- df3 %>% select('category_id' = 'CATEGORY_ID', 'subcategory_name' = 'SUBCATEGORY_NAME') %>% distinct()
- df10$subcategory_name <- factor(df10$subcategory_name)
- sub$subcategory_name <- factor(sub$subcategory_name)
- levels(sub$subcategory_name) = levels(df10$subcategory_name)
- df20 <- df10 %>% inner_join(y = sub, by = 'subcategory_name')
- Cat_IDs <- df20 %>% select(category_id, subcategory_id) %>% distinct()
- Category <- df3 %>% select('category_id' = 'CATEGORY_ID', 'category_name' = 'CATEGORY_NAME') %>% distinct()
- subcategories <- sub
- dbWriteTable(con,name = 'categories', value = Category,
- row.names = FALSE, append = TRUE)
- dbWriteTable(con,name = 'subcategories', value = subcategories,
- row.names = FALSE, append = TRUE)
- dbWriteTable(con,name = 'category_ids', value = Cat_IDs,
- row.names = FALSE, append = TRUE)
- #segmentation
- df6 <- df4 %>% select('connections_code' = 'CODE', 'segment_nickname' = 'SEGMENT_NICKNAME', urb_id, hi_id, age_id, comp_id,
- tenure_id, edu_id, race_id, tech_id, oc_id) %>% distinct()
- dbWriteTable(con,name = 'segmentation', value = df6,
- row.names = FALSE, append = TRUE)
- #subdomain
- df6 <- df2 %>% select('domain_name' = 'DOMAIN_NAME', 'subdomain_name' = 'SUBDOMAIN_NAME') %>% distinct(domain_name,.keep_all = TRUE)
- dbWriteTable(con,name = 'subdomain', value = df6,
- row.names = FALSE, append = TRUE)
- #online_behavior
- df6 <- df2 %>% select('id' = 'ID', 'meter_member_hk' = 'METER_MEMBER_HK', 'category_id' = 'CATEGORY_ID', 'subcategory_id' = 'SUBCATEGORY_ID',
- 'domain_name' = 'DOMAIN_NAME', 'referring_domain' = 'REFERRING_DOMAIN', 'duration_in_sec' = 'DURATION_IN_SECONDS',
- 'action_time' = 'ACTION_TIMESTAMP') %>% distinct(meter_member_hk,.keep_all = TRUE)
- df6 <- na.omit(df6)
- dbWriteTable(con,name = 'online_behavior', value = df6,
- row.names = FALSE, append = TRUE)
- #user_info
- df6 <- df1 %>% select('meter_member_hk' = 'METER_MEMBER_HK', 'dma'='DMA', 'connections_code' = 'CONNEXIONS_CODE') %>% distinct(meter_member_hk,.keep_all = TRUE)
- df6 <- df6 %>% dplyr::filter(!(connections_code==''))
- df6 <- unique(df6)
- df6 <- subset(df6,connections_code <=53)
- dbWriteTable(con,name = 'user_info', value = df6,
- row.names = FALSE, append = TRUE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement