Advertisement
Guest User

Untitled

a guest
Sep 11th, 2019
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.62 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import scipy.stats
  4. from math import sqrt
  5.  
  6.  
  def cramers_v(x, y):
      """
      Compute the bias-corrected Cramer's V association between two categorical variables.

      The chi-squared statistic of the ``x`` by ``y`` contingency table is
      turned into phi-squared, and both phi-squared and the table dimensions
      are shrunk by their small-sample correction terms before taking the
      ratio.

      :param x: first categorical variable (list/array/series)
      :param y: second categorical variable, same length as ``x``
      :return: Cramer's V; 0.0 when the table is degenerate (fewer than two
               observations, fewer than two categories on either side, or a
               corrected dimension term that collapses to zero)
      """
      confusion_matrix = pd.crosstab(x, y)
      n = confusion_matrix.sum().sum()
      r, k = confusion_matrix.shape

      # With a single category (or a single observation) no association can be
      # measured and the formula below would evaluate 0 / 0.
      if n < 2 or r < 2 or k < 2:
          return 0.0

      # NOTE: chi2_contingency applies Yates' continuity correction by default
      # on 2x2 tables; that default is kept unchanged here.
      chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0]
      phi2 = chi2 / n
      phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
      rcorr = r - ((r - 1) ** 2) / (n - 1)
      kcorr = k - ((k - 1) ** 2) / (n - 1)

      # The corrected dimension reaches zero when n equals the number of
      # categories (every observation is its own category); phi2corr is zero
      # as well in that case, so report "no association" instead of NaN.
      denominator = min(kcorr - 1, rcorr - 1)
      if denominator <= 0:
          return 0.0
      return np.sqrt(phi2corr / denominator)
  17.  
  18.  
  def gini(list_of_values):
      """
      Compute the Gini coefficient.

      The values are sorted in ascending order and the area under the
      resulting cumulative curve is compared with the area under the line of
      perfect equality.

      :param list_of_values: iterable of numbers (list/series)
      :return: Gini coefficient; 1 when a bare float is passed instead of a
               collection, 0.0 when there is nothing to distribute (empty
               input or values summing to zero)
      """
      # NOTE(review): a lone float presumably stands in for a missing value
      # (e.g. NaN) coming from the caller -- confirm; behaviour kept as is.
      if isinstance(list_of_values, float):
          return 1

      sorted_list = sorted(list_of_values)
      height, area = 0, 0
      for value in sorted_list:
          height += value
          # trapezoid under the cumulative curve for this step
          area += height - value / 2.

      fair_area = height * len(sorted_list) / 2.
      # Empty input or an all-zero total would otherwise divide by zero.
      if fair_area == 0:
          return 0.0
      return (fair_area - area) / fair_area
  34.  
  35.  
  def mean_confidence_interval(data, confidence=0.95):
      """
      Half-width of the Student-t confidence interval around the mean.

      :param data: numeric list/array/series of observations
      :param confidence: confidence level of the interval (default 0.95)
      :return: the margin ``h`` such that the interval is ``mean - h`` to
               ``mean + h``; only the margin is returned, not the mean
      """
      samples = np.array(data) * 1.0  # force a float array
      degrees_of_freedom = len(samples) - 1
      upper_quantile = (1 + confidence) / 2.
      t_critical = scipy.stats.t.ppf(upper_quantile, degrees_of_freedom)
      return scipy.stats.sem(samples) * t_critical
  47.  
  48.  
  def median_confidence_interval(data, confidence=0.95):
      """
      Get the confidence margin used for the median.

      The result depends only on the number of observations: it is
      ``sqrt(n * 0.5 * 0.5)`` multiplied by the Student-t critical value with
      ``n - 1`` degrees of freedom.

      :param data: an array/series of numeric observations
      :param confidence: confidence level (default 0.95)
      :return: the computed margin
      """
      # NOTE(review): sqrt(n * 0.25) is the standard deviation of a
      # Binomial(n, 0.5) count, so the margin looks like an offset in rank
      # positions rather than in data units -- confirm with the callers.
      samples = np.array(data) * 1.0  # force a float array
      count = len(samples)
      spread = sqrt(count * .5 * .5)
      upper_quantile = (1 + confidence) / 2.
      return spread * scipy.stats.t.ppf(upper_quantile, count - 1)
  61.  
  62.  
  def test_diff(a, b):
      """
      Run the Mann-Whitney U test on two samples and return its p-value.

      The test is called with SciPy's default options (two-sided alternative
      in SciPy >= 1.7).

      :param a: first list/series
      :param b: second list/series
      :return: the p-value only (the U statistic is not returned); 1.0 when
               either sample is empty or both samples are element-wise equal
      """
      a, b = np.array(a), np.array(b)
      # Nothing to compare on an empty sample, and identical samples cannot
      # differ, so skip the test and report "no evidence of a difference".
      if len(a) == 0 or len(b) == 0 or np.array_equal(a, b):
          return 1.0
      return scipy.stats.mannwhitneyu(a, b)[1]


  # The name matches pytest's ``test_*`` discovery pattern; without this flag,
  # importing the helper into a test module makes pytest collect it and fail
  # on the missing fixtures ``a`` and ``b``.
  test_diff.__test__ = False
  78.  
  79.  
  def crombach_alpha(two_columns_df):
      """
      Compute Cronbach's alpha for the items (columns) of a data frame.

      Alpha is ``N * c / (v + (N - 1) * c)`` where ``N`` is the number of
      items, ``v`` the mean item variance and ``c`` the mean covariance
      between distinct items.

      :param two_columns_df: df with one column per item
      :return: Cronbach's alpha; NaN when fewer than two items are given,
               because the mean inter-item covariance is undefined
      """
      nr_items = len(two_columns_df.columns)
      if nr_items < 2:
          return float("nan")

      # Work on the raw array: np.sum() applied to a DataFrame reduces
      # per-column or to a scalar depending on the pandas version.
      cov_mtr = two_columns_df.cov().values  # variance - covariance matrix
      total_variance = np.diagonal(cov_mtr).sum()

      mean_var_item = total_variance / nr_items  # mean variance
      # sum all the covariances among items, excluding the variances,
      # then average over the N * (N - 1) off-diagonal cells
      mean_cov_ii = (cov_mtr.sum() - total_variance) / (nr_items * (nr_items - 1))

      return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii)
  95.  
  96.  
  def median_split(two_column_dataset):
      """
      Split a dataset in two halves around the median of its second column.

      Rows are ordered by the second column; the first ``n // 2`` rows form
      the low subset and the remaining rows (including the middle row when
      ``n`` is odd) form the high subset.

      :param two_column_dataset: df whose second column holds the scale
      :return: low, high subset
      """
      scale_column = two_column_dataset.columns[1]
      ordered = two_column_dataset.sort_values(by=scale_column)
      halfway = ordered.shape[0] // 2  # row position of the median
      return ordered.iloc[:halfway], ordered.iloc[halfway:]
  112.  
  113.  
  def low_high_split(two_column_dataset):
      """
      Keep the bottom and top thirds of a dataset ordered by its second column.

      Rows are ordered by the second column; the first third is returned as
      the low subset, the last third as the high subset, and the middle third
      is dropped.

      :param two_column_dataset: df whose second column holds the scale
      :return: low, high subset
      """
      scale_column = two_column_dataset.columns[1]
      ordered = two_column_dataset.sort_values(by=scale_column)
      row_count = ordered.shape[0]
      lower_cut = row_count // 3       # end of the bottom third
      upper_cut = 2 * row_count // 3   # start of the top third
      return ordered.iloc[:lower_cut], ordered.iloc[upper_cut:]
  130.  
  131.  
  if __name__ == "__main__":
      # Placeholder entry point: only prints a marker when the module is run
      # as a script; importing the module prints nothing.
      print("MAIN")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement