Guest User

Untitled

a guest
Jul 12th, 2018
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.38 KB | None | 0 0
  1. import matplotlib.pyplot as plt
  2. from scipy.stats import ttest_ind, pearsonr
  3.  
  4. import pandas as pd
  5. import numpy as np
  6. import psycopg2
  7. import os
  8.  
  9. RELEVANT_QUERY = 'relevantid_query.pgsql'
  10. NON_RELEVANT_QUERY = 'non_relevantid_query.pgsql'
  11.  
  12.  
  13. class NestedDictionary(dict):
  14. def __getitem__(self, item):
  15. try:
  16. return dict.__getitem__(self, item)
  17. except KeyError:
  18. value = self[item] = type(self)()
  19. return value
  20.  
  21.  
  22. def connect_to_database():
  23. user_name = os.environ.get('REDSHIFT_USER_NAME')
  24. user_password = os.environ.get('REDSHIFT_PASSWORD')
  25. host = 'lucid-research.ctr4i8xtibzz.us-east-1.redshift.amazonaws.com'
  26. port = '5439'
  27. database_name = 'dw'
  28. conn_string = (
  29. 'host=' + host +
  30. ' port=' + port +
  31. ' dbname=' + database_name +
  32. ' user=' + user_name +
  33. ' password=' + user_password
  34. )
  35. conn = psycopg2.connect(conn_string)
  36. return conn
  37.  
  38.  
  39. def get_from_database(query_file, connection, in_params=None):
  40. sql = open(query_file).read()
  41. df = pd.read_sql(sql, connection, params={'ids': in_params})
  42. return df
  43.  
  44.  
  45. def transform_data(rel_data, non_rel_data):
  46. dictionary = NestedDictionary()
  47. for _, row in rel_data.iterrows():
  48. date = row['date']
  49. responsestatus = row['responsestatus_id']
  50. rel_count = row['count_session_id']
  51. dictionary[date][responsestatus]['_relevant_sessions'] = rel_count
  52. for _, row in non_rel_data.iterrows():
  53. date = row['date']
  54. responsestatus = row['responsestatus_id']
  55. count = row['count_session_id']
  56. dictionary[date][responsestatus]['_sessions'] = count
  57. # for _, row in data.iterrows():
  58. # date = str(row['date'])
  59. # response_status = row['responsestatus_id']
  60. # is_relevantid = row['is_relevantid']
  61. # if response_status in dictionary[date]:
  62. # dictionary[date][response_status]['_sessions'] += row['count_session_id']
  63. # else:
  64. # dictionary[date][response_status]['_sessions'] = row['count_session_id']
  65. # if is_relevantid:
  66. # dictionary[date][response_status]['_relevant_sessions'] = row['count_relevant_id']
  67. for date, values in dictionary.items():
  68. session_total = 0
  69. relevant_total = 0
  70. for response, stats in values.items():
  71. try:
  72. session_total += stats['_sessions']
  73. except TypeError:
  74. session_total += 0
  75. try:
  76. relevant_total += stats['_relevant_sessions']
  77. except TypeError:
  78. relevant_total += 0
  79. for response, stats in values.items():
  80. try:
  81. prob_sessions = stats['_sessions'] / session_total
  82. except TypeError:
  83. prob_sessions = 0
  84. try:
  85. prob_relevant = stats['_relevant_sessions'] / relevant_total
  86. except TypeError:
  87. prob_relevant = 0
  88. dictionary[date][response]['sessions'] = prob_sessions
  89. dictionary[date][response]['relevant_sessions'] = prob_relevant
  90. return dictionary
  91.  
  92.  
  93. def effect_size(data_one, data_two):
  94. n1 = len(data_one)
  95. n2 = len(data_two)
  96. s1 = np.var(data_one, ddof=1)
  97. s2 = np.var(data_two, ddof=1)
  98. s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
  99. u1 = np.mean(data_one)
  100. u2 = np.mean(data_two)
  101. return (u1 - u2) / s
  102.  
  103.  
  104. def response_list(dictionary, response_status):
  105. session_list = []
  106. relevant_list = []
  107. for date, values in dictionary.items():
  108. relevant_list.append(values[response_status]['relevant_sessions'])
  109. session_list.append(values[response_status]['sessions'])
  110. return session_list, relevant_list
  111.  
  112.  
  113. def run():
  114. connection = connect_to_database()
  115. rel_data = get_from_database(RELEVANT_QUERY, connection)
  116. non_rel_data = get_from_database(NON_RELEVANT_QUERY, connection)
  117. response_dict = transform_data(rel_data, non_rel_data)
  118. # for date, values in response_dict.items():
  119. # for response, stats in values.items():
  120. # if response == 3:
  121. # print(date, stats)
  122. session_list, relevant_list = response_list(response_dict, 3)
  123. ttest_results = ttest_ind(session_list, relevant_list)
  124. effect_statistic = effect_size(session_list, relevant_list)
  125. correlation_statistic, _ = pearsonr(session_list, relevant_list)
  126. print('Mean of Sessions:', np.mean(session_list))
  127. print('Mean of Relevant:', np.mean(relevant_list))
  128. print('Probabililty of Happening:', ttest_results[1])
  129. print('Effect Size:', effect_statistic)
  130. print('Pearsons Correlation:', correlation_statistic)
  131. # print('Std Sessions:', np.std(session_list))
  132. # print('Mean Sessions:', np.mean(session_list))
  133. # print('SEM Sessions:', sem(session_list))
  134. # print('Std Relevant:', np.std(relevant_list))
  135. # print('Mean Relevant:', np.mean(relevant_list))
  136. # print('SEM Relevant:', sem(relevant_list))
  137. # print('Relevant N:', len(relevant_list))
  138. # plt.hist(session_list, 25)
  139. # plt.show()
  140. # plt.hist(relevant_list, 25)
  141. # plt.show()
  142.  
  143.  
  144. if __name__ == '__main__':
  145. run()
  146.  
  147.  
  148. """
  149. Relevant ID Surveys:
  150.  
  151. SELECT DISTINCT TO_CHAR(bis.entrydate ,'YYYY-MM-DD') AS date,
  152. bis.lk_responsestatusid AS responsestatus_id,
  153. COUNT(bis.sessionid) AS count_session_id
  154. FROM bi_sessions AS bis
  155. WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
  156. AND bis.surveyid IN (
  157. SELECT DISTINCT bis.surveyid AS survey_id
  158. FROM bi_sessions AS bis
  159. INNER JOIN sessionrelevantiddata AS rid
  160. ON bis.sessionid = rid.sessionid
  161. WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
  162. GROUP BY bis.surveyid, rid.sessionid
  163. ORDER BY bis.surveyid
  164. )
  165. GROUP BY date, responsestatus_id;
  166.  
  167. Non Relevant ID Surveys:
  168.  
  169. SELECT DISTINCT TO_CHAR(bis.entrydate ,'YYYY-MM-DD') AS date,
  170. bis.lk_responsestatusid AS responsestatus_id,
  171. COUNT(bis.sessionid) AS count_session_id
  172. FROM bi_sessions AS bis
  173. WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
  174. AND bis.surveyid NOT IN (
  175. SELECT DISTINCT bis.surveyid AS survey_id
  176. FROM bi_sessions AS bis
  177. INNER JOIN sessionrelevantiddata AS rid
  178. ON bis.sessionid = rid.sessionid
  179. WHERE bis.entrydate BETWEEN '2018-01-01 00:00:00.0000' AND '2018-03-31 23:59:59.9999'
  180. GROUP BY bis.surveyid, rid.sessionid
  181. ORDER BY bis.surveyid
  182. )
  183. GROUP BY date, responsestatus_id;
  184. """
Add Comment
Please, Sign In to add comment