Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import matplotlib.pyplot as plt
- from scipy.stats import ttest_ind, pearsonr
- import pandas as pd
- import numpy as np
- import psycopg2
- import os
# Filenames of the SQL files run against Redshift (contents are quoted in the
# reference string at the bottom of this module).
RELEVANT_QUERY = 'relevantid_query.pgsql'
NON_RELEVANT_QUERY = 'non_relevantid_query.pgsql'
class NestedDictionary(dict):
    """Autovivifying dict: reading a missing key creates, stores and returns
    an empty NestedDictionary, so ``d[a][b][c] = v`` works with no setup.

    Uses dict's ``__missing__`` hook instead of overriding ``__getitem__``
    with try/except — same behavior, but successful lookups go straight
    through the built-in fast path.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ only when `key` is absent.
        value = self[key] = type(self)()
        return value
def connect_to_database():
    """Open and return a psycopg2 connection to the Redshift warehouse.

    Credentials come from the REDSHIFT_USER_NAME / REDSHIFT_PASSWORD
    environment variables; host, port and database are fixed.
    """
    user_name = os.environ.get('REDSHIFT_USER_NAME')
    user_password = os.environ.get('REDSHIFT_PASSWORD')
    host = 'lucid-research.ctr4i8xtibzz.us-east-1.redshift.amazonaws.com'
    port = '5439'
    database_name = 'dw'
    # Build the libpq keyword/value DSN in one expression.
    conn_string = (
        f'host={host} port={port} dbname={database_name}'
        f' user={user_name} password={user_password}'
    )
    return psycopg2.connect(conn_string)
def get_from_database(query_file, connection, in_params=None):
    """Run the SQL stored in ``query_file`` and return the result DataFrame.

    query_file: path to a file containing one SQL statement.
    connection: open DB-API connection passed through to pandas.
    in_params: optional value bound to the ``ids`` named parameter; a
        params dict is always passed (with ``None`` when absent), matching
        the original behavior.
    """
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(query_file) as handle:
        sql = handle.read()
    return pd.read_sql(sql, connection, params={'ids': in_params})
def transform_data(rel_data, non_rel_data):
    """Merge the two query results into one nested mapping.

    Resulting shape: dictionary[date][responsestatus_id] -> stats dict with
    raw counts ('_sessions', '_relevant_sessions') and, after the second
    pass, within-date probabilities ('sessions', 'relevant_sessions').

    rel_data / non_rel_data: DataFrames with columns 'date',
    'responsestatus_id' and 'count_session_id' (see the queries quoted at
    the bottom of this file).
    """
    dictionary = NestedDictionary()
    # First pass: copy the raw counts out of both DataFrames.
    for _, row in rel_data.iterrows():
        date = row['date']
        responsestatus = row['responsestatus_id']
        rel_count = row['count_session_id']
        dictionary[date][responsestatus]['_relevant_sessions'] = rel_count
    for _, row in non_rel_data.iterrows():
        date = row['date']
        responsestatus = row['responsestatus_id']
        count = row['count_session_id']
        dictionary[date][responsestatus]['_sessions'] = count
    # Earlier single-DataFrame variant, kept for reference:
    # for _, row in data.iterrows():
    # date = str(row['date'])
    # response_status = row['responsestatus_id']
    # is_relevantid = row['is_relevantid']
    # if response_status in dictionary[date]:
    # dictionary[date][response_status]['_sessions'] += row['count_session_id']
    # else:
    # dictionary[date][response_status]['_sessions'] = row['count_session_id']
    # if is_relevantid:
    # dictionary[date][response_status]['_relevant_sessions'] = row['count_relevant_id']
    # Second pass: per date, total the counts, then convert each status's
    # counts into probabilities of that date's totals.
    for date, values in dictionary.items():
        session_total = 0
        relevant_total = 0
        for response, stats in values.items():
            # Reading a missing key autovivifies an empty NestedDictionary,
            # so `int += {}` raises TypeError (not KeyError); a missing
            # count is treated as zero.
            try:
                session_total += stats['_sessions']
            except TypeError:
                session_total += 0
            try:
                relevant_total += stats['_relevant_sessions']
            except TypeError:
                relevant_total += 0
        for response, stats in values.items():
            # Same TypeError convention: missing count -> probability 0.
            # NOTE(review): a zero total would raise ZeroDivisionError here,
            # which is NOT caught — confirm every date has nonzero totals.
            try:
                prob_sessions = stats['_sessions'] / session_total
            except TypeError:
                prob_sessions = 0
            try:
                prob_relevant = stats['_relevant_sessions'] / relevant_total
            except TypeError:
                prob_relevant = 0
            dictionary[date][response]['sessions'] = prob_sessions
            dictionary[date][response]['relevant_sessions'] = prob_relevant
    return dictionary
def effect_size(data_one, data_two):
    """Return Cohen's d for two independent samples.

    Standardizes the difference of means by the pooled standard deviation
    computed from the sample variances (ddof=1).
    """
    size_a, size_b = len(data_one), len(data_two)
    var_a = np.var(data_one, ddof=1)
    var_b = np.var(data_two, ddof=1)
    # Pooled standard deviation across both samples.
    pooled_sd = np.sqrt(
        ((size_a - 1) * var_a + (size_b - 1) * var_b) / (size_a + size_b - 2)
    )
    mean_difference = np.mean(data_one) - np.mean(data_two)
    return mean_difference / pooled_sd
def response_list(dictionary, response_status):
    """Collect per-date probabilities for one response status.

    Returns two parallel lists (one entry per date, in the mapping's
    iteration order): overall session probabilities and relevant-session
    probabilities.
    """
    per_date = dictionary.values()
    sessions = [day[response_status]['sessions'] for day in per_date]
    relevants = [day[response_status]['relevant_sessions'] for day in per_date]
    return sessions, relevants
def run():
    """Fetch relevant / non-relevant session counts from Redshift and compare
    the two per-date probability distributions for response status 3.

    Prints the sample means, the independent two-sample t-test p-value,
    Cohen's d, and the Pearson correlation between the two lists.
    """
    connection = connect_to_database()
    try:
        rel_data = get_from_database(RELEVANT_QUERY, connection)
        non_rel_data = get_from_database(NON_RELEVANT_QUERY, connection)
    finally:
        # Always release the Redshift connection, even if a query fails
        # (the original never closed it).
        connection.close()

    response_dict = transform_data(rel_data, non_rel_data)
    session_list, relevant_list = response_list(response_dict, 3)

    ttest_results = ttest_ind(session_list, relevant_list)
    effect_statistic = effect_size(session_list, relevant_list)
    correlation_statistic, _ = pearsonr(session_list, relevant_list)

    print('Mean of Sessions:', np.mean(session_list))
    print('Mean of Relevant:', np.mean(relevant_list))
    # ttest_results[1] is the t-test p-value; label typo fixed
    # ('Probabililty' -> 'Probability').
    print('Probability of Happening:', ttest_results[1])
    print('Effect Size:', effect_statistic)
    print('Pearsons Correlation:', correlation_statistic)
# Script entry point: run the analysis only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    run()
- """
- Relevant ID Surveys:
- SELECT DISTINCT TO_CHAR(bis.entrydate ,'YYYY-MM-DD') AS date,
- bis.lk_responsestatusid AS responsestatus_id,
- COUNT(bis.sessionid) AS count_session_id
- FROM bi_sessions AS bis
- WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
- AND bis.surveyid IN (
- SELECT DISTINCT bis.surveyid AS survey_id
- FROM bi_sessions AS bis
- INNER JOIN sessionrelevantiddata AS rid
- ON bis.sessionid = rid.sessionid
- WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
- GROUP BY bis.surveyid, rid.sessionid
- ORDER BY bis.surveyid
- )
- GROUP BY date, responsestatus_id;
- Non Relevant ID Surveys:
- SELECT DISTINCT TO_CHAR(bis.entrydate ,'YYYY-MM-DD') AS date,
- bis.lk_responsestatusid AS responsestatus_id,
- COUNT(bis.sessionid) AS count_session_id
- FROM bi_sessions AS bis
- WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
- AND bis.surveyid NOT IN (
- SELECT DISTINCT bis.surveyid AS survey_id
- FROM bi_sessions AS bis
- INNER JOIN sessionrelevantiddata AS rid
- ON bis.sessionid = rid.sessionid
- WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
- GROUP BY bis.surveyid, rid.sessionid
- ORDER BY bis.surveyid
- )
- GROUP BY date, responsestatus_id;
- """
Add Comment
Please, Sign In to add comment