Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import sys
- import getopt
- from datetime import datetime
- import pandas as pd
- from sqlalchemy import create_engine
- if __name__ == "__main__":
- #input params setup
- unixOptions = "sdt:edt"
- gnuOptions = ["start_dt=", "end_dt="]
- fullCmdArguments = sys.argv
- argumentList = fullCmdArguments[1:] #excluding script name
- try:
- arguments, values = getopt.getopt(argumentList, unixOptions, gnuOptions)
- except getopt.error as err:
- # output error, and return with an error code
- print (str(err))
- sys.exit(2)
- start_dt = '1981-01-01'
- end_dt = '1998-01-01'
- for currentArgument, currentValue in arguments:
- if currentArgument in ("-sdt", "--start_dt"):
- start_dt = currentValue
- elif currentArgument in ("-edt", "--end_dt"):
- end_dt = currentValue
- db_config = {'user': 'my_user',
- 'pwd': 'my_user_password',
- 'host': 'localhost',
- 'port': 5432,
- 'db': 'games'}
- connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
- db_config['pwd'],
- db_config['host'],
- db_config['port'],
- db_config['db'])
- engine = create_engine(connection_string)
- query = ''' SELECT *
- FROM data_raw
- WHERE year_of_release::TIMESTAMP BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP
- '''.format(start_dt, end_dt)
- data_raw = pd.io.sql.read_sql(query, con = engine, index_col = 'game_id')
- columns_str = ['name', 'platform', 'genre', 'rating']
- columns_numeric = ['na_players', 'eu_players', 'jp_players', 'other_players', 'critic_score', 'user_score']
- columns_datetime = ['year_of_release']
- for column in columns_str: data_raw[column] = data_raw[column].astype(str)
- for column in columns_numeric: data_raw[column] = pd.to_numeric(data_raw[column], errors='coerce')
- for column in columns_datetime: data_raw[column] = pd.to_datetime(data_raw[column])
- data_raw['total_copies_sold'] = data_raw[['na_players', 'eu_players', 'jp_players', 'other_players']].sum(axis = 1)
- agg_games_year_genre_platform = data_raw.groupby(['year_of_release', 'genre', 'platform']).agg({'name': 'count', 'total_copies_sold': 'sum'})
- agg_games_year_score = data_raw.groupby(['year_of_release', 'genre', 'platform']).agg({'critic_score': 'mean', 'user_score': 'mean'})
- agg_games_year_genre_platform = agg_games_year_genre_platform.rename(columns = {'name': 'count', 'total_copies_sold':'sum'})
- agg_games_year_score = agg_games_year_score.rename(columns = {'critic_score': 'avg_critic_score', 'user_score': 'avg_user_score'})
- agg_games_year_genre_platform = agg_games_year_genre_platform.fillna(0).reset_index()
- agg_games_year_score = agg_games_year_score.fillna(0).reset_index()
- tables = {'agg_games_year_genre_platform': 'agg_games_year_genre_platform',
- 'agg_games_year_score': 'agg_games_year_score'}
- for table_name, table_data in tables.items():
- query = '''
- DELETE FROM {} WHERE year_of_release BETWEEN '{}'::TIMESTAMP AND '{}'::TIMESTAMP
- '''.format(table_name, start_dt, end_dt)
- engine.execute(query)
- table_data.to_sql(name = table_name, con = engine, if_exists = 'append', index = False)
- print('All done.')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement