Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
import getopt
import sys
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine, text
# Default processing window, used when no period is given on the command line.
DEFAULT_START_DT = '2019-09-24 18:00:00'
DEFAULT_END_DT = '2019-09-24 19:00:00'
# Format expected for both period boundaries.
DT_FORMAT = '%Y-%m-%d %H:%M:%S'

# Columns of the raw log that are normalised before aggregation.
COLUMNS_NUMERIC = ['event_id', 'item_id', 'source_id', 'user_id']
COLUMNS_STR = ['age_segment', 'event', 'item_topic', 'item_type',
               'source_topic', 'source_type']


def parse_period(argument_list):
    """Parse the processing period from command-line arguments.

    Accepted options are ``-s`` / ``--start_dt`` and ``-e`` / ``--end_dt``,
    each followed by a value in ``DT_FORMAT``. The short forms replace the
    previous ``-sdt`` / ``-edt`` spelling, which getopt could never match
    because short options are single characters.

    Args:
        argument_list: Command-line arguments without the script name.

    Returns:
        A ``(start_dt, end_dt)`` tuple of ``datetime`` objects.

    Raises:
        SystemExit: With code 2 when an option is unknown or a value does
            not match ``DT_FORMAT``.
    """
    try:
        arguments, _ = getopt.getopt(argument_list, 's:e:',
                                     ['start_dt=', 'end_dt='])
    except getopt.error as err:
        # Report the problem and stop with an error code.
        print(str(err))
        sys.exit(2)

    start_dt = DEFAULT_START_DT
    end_dt = DEFAULT_END_DT
    for option, value in arguments:
        if option in ('-s', '--start_dt'):
            start_dt = value
        elif option in ('-e', '--end_dt'):
            end_dt = value

    try:
        return (datetime.strptime(start_dt, DT_FORMAT),
                datetime.strptime(end_dt, DT_FORMAT))
    except ValueError as err:
        # A malformed date is a usage error, same as an unknown option.
        print(str(err))
        sys.exit(2)


def build_dashboards(log_raw):
    """Aggregate raw log rows into the dashboard tables.

    Args:
        log_raw: DataFrame with the columns listed in ``COLUMNS_NUMERIC`` and
            ``COLUMNS_STR`` plus ``ts``. ``ts`` is treated as epoch
            milliseconds, because the SELECT that loads it divides it by 1000
            before calling TO_TIMESTAMP.

    Returns:
        A dict mapping the target table name (``dash_visits``,
        ``dash_engagement``) to the DataFrame to store in it. The grouping
        keys are regular columns, so they survive ``to_sql(index=False)``.
    """
    # Work on a copy so the caller's frame is not modified.
    log_raw = log_raw.copy()
    for column in COLUMNS_STR:
        log_raw[column] = log_raw[column].astype(str)
    for column in COLUMNS_NUMERIC:
        log_raw[column] = pd.to_numeric(log_raw[column], errors='coerce')

    # Without unit='ms' pandas would read the integers as nanoseconds.
    # NOTE(review): the previous code grouped by a 'timestamp' column that the
    # script never created; 'dt' (ts rounded to the minute) takes its place.
    # Confirm log_raw has no separate 'timestamp' column that was intended.
    log_raw['dt'] = pd.to_datetime(log_raw['ts'], unit='ms').dt.round('min')

    # reset_index() turns the group keys back into columns; otherwise they
    # would sit in the index and be dropped when written with index=False.
    dash_visits = (
        log_raw
        .groupby(['item_topic', 'source_topic', 'age_segment', 'dt'])
        .agg({'user_id': 'count'})
        .rename(columns={'user_id': 'visits'})
        .reset_index()
    )
    dash_engagement = (
        log_raw
        .groupby(['dt', 'item_topic', 'event', 'age_segment'])
        .agg({'user_id': 'nunique'})
        .rename(columns={'user_id': 'unique_users'})
        .reset_index()
    )
    return {'dash_visits': dash_visits,
            'dash_engagement': dash_engagement}


if __name__ == "__main__":
    # Read the input parameters.
    start_dt, end_dt = parse_period(sys.argv[1:])
    period = {'start_dt': start_dt, 'end_dt': end_dt}

    db_config = {'user': 'my_user',
                 'pwd': 'my_user_password',
                 'host': 'localhost',
                 'port': 5432,
                 'db': 'zen'}
    connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(
        db_config['user'],
        db_config['pwd'],
        db_config['host'],
        db_config['port'],
        db_config['db'])
    engine = create_engine(connection_string)

    # Select only the rows logged between start_dt and end_dt. The boundaries
    # are passed as bound parameters instead of being formatted into the SQL.
    select_query = text('''
        SELECT * FROM log_raw
        WHERE TO_TIMESTAMP(ts / 1000) AT TIME ZONE 'Etc/UTC'
              BETWEEN :start_dt AND :end_dt
    ''')
    # NOTE(review): 'game_id' is not referenced anywhere else in this script;
    # confirm that log_raw really has this column.
    log_raw = pd.read_sql(select_query, con=engine, params=period,
                          index_col='game_id')

    tables = build_dashboards(log_raw)

    # Remove the old records between start_dt and end_dt, then append the
    # fresh aggregates. One transaction covers both steps, so a failed insert
    # does not leave the period empty. Table names come from the fixed dict
    # above, never from user input.
    with engine.begin() as connection:
        for table_name, table_data in tables.items():
            delete_query = text(
                'DELETE FROM {} WHERE dt BETWEEN :start_dt AND :end_dt'
                .format(table_name))
            connection.execute(delete_query, period)
            table_data.to_sql(name=table_name, con=connection,
                              if_exists='append', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement