SHARE
TWEET

Untitled

a guest Feb 18th, 2020 93 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import sys
  5. import getopt
  6.  
  7. from datetime import datetime
  8. from sqlalchemy import create_engine
  9.  
  10. import pandas as pd
  11.  
  12.  
  13. if __name__ == "__main__":
  14.     unixOptions = "s:e"
  15.     gnuOptions = ["start_dt=", "end_dt="]
  16.    
  17.     fullCmdArguments = sys.argv
  18.     argumentList = fullCmdArguments[1:]
  19.    
  20.     try:
  21.         arguments, values = getopt.getopt(argumentList, unixOptions, gnuOptions)
  22.     except getopt.error as err:
  23.         print (str(err))
  24.         sys.exit(2)
  25.    
  26.     start_dt = ''
  27.     end_dt = ''
  28.     for currentArgument, currentValue in arguments:
  29.         if currentArgument in ("-s", "--start_dt"):
  30.             start_dt = currentValue
  31.         elif currentArgument in ("-e", "--end_dt"):
  32.             end_dt = currentValue
  33.  
  34.     db_config = {'user': 'my_user','pwd': 'my_user_password','host': 'localhost','port': 5432,'db': 'zen'}
  35.  
  36.     connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
  37.                                                             db_config['pwd'],
  38.                                                             db_config['host'],
  39.                                                             db_config['port'],
  40.                                                             db_config['db'])
  41.  
  42.     engine = create_engine(connection_string)
  43.  
  44.     query = '''SELECT event_id, age_segment, event, item_id, item_topic,item_type, source_id, source_topic, source_type,TO_TIMESTAMP(ts / 1000) AT TIME ZONE 'Etc/UTC' as dt, user_id
  45.                 FROM log_raw
  46.                 WHERE TO_TIMESTAMP(ts / 1000) AT TIME ZONE 'Etc/UTC' BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP
  47.             '''.format(start_dt, end_dt)
  48.  
  49.     log_raw = pd.io.sql.read_sql(query, con = engine, index_col = 'event_id')
  50.  
  51.     print(log_raw.head(5))
  52.  
  53.     log_raw['dt'] = pd.to_datetime(log_raw['dt']).dt.round('min')
  54.  
  55.     dash_visits = log_raw.groupby(['item_topic','source_topic','age_segment','dt']).agg({'user_id':'count'})
  56.  
  57.     dash_visits = dash_visits.rename(columns={'user_id':'visits'})
  58.  
  59.     print(dash_visits.head(5))
  60.  
  61.     dash_engagement = log_raw.groupby(['dt','item_topic','event','age_segment']).agg({'user_id':'nunique'})
  62.  
  63.     dash_engagement = dash_engagement.rename(columns={'user_id':'unique_users'})
  64.  
  65.     print(dash_engagement.head(5))
  66.  
  67.     tables = {'dash_visits': dash_visits,
  68.               'dash_engagement': dash_engagement}
  69.  
  70.     for table_name, table_data in tables.items():
  71.         query = '''
  72.                 DELETE FROM {} WHERE dt BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP
  73.                 '''.format(table_name, start_dt, end_dt)
  74.         engine.execute(query)
  75.         table_data.to_sql(name = table_name, con = engine, if_exists = 'append', index = False)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top