SHARE
TWEET

Untitled

a guest Sep 11th, 2019 102 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import numpy as np
  2. import pandas as pd
  3. import scipy.stats
  4. from math import sqrt
  5.  
  6.  
  7. def cramers_v(x, y):
  8.     confusion_matrix = pd.crosstab(x, y)
  9.     chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0]
  10.     n = confusion_matrix.sum().sum()
  11.     phi2 = chi2 / n
  12.     r, k = confusion_matrix.shape
  13.     phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
  14.     rcorr = r - ((r - 1) ** 2) / (n - 1)
  15.     kcorr = k - ((k - 1) ** 2) / (n - 1)
  16.     return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
  17.  
  18.  
  19. def gini(list_of_values):
  20.     '''
  21.     Compute the Gini coefficient.
  22.     :param list_of_values: list/series
  23.     :return: Gini coefficient
  24.     '''
  25.     if isinstance(list_of_values, float):
  26.         return 1
  27.     sorted_list = sorted(list_of_values)
  28.     height, area = 0, 0
  29.     for value in sorted_list:
  30.         height += value
  31.         area += height - value / 2.
  32.     fair_area = height * len(list_of_values) / 2.
  33.     return (fair_area - area) / fair_area
  34.  
  35.  
  36. def mean_confidence_interval(data, confidence=0.95):
  37.     '''
  38.     Confidence interval of the mean
  39.     :param data:
  40.     :param confidence:
  41.     :return:
  42.     '''
  43.     a = 1.0 * np.array(data)
  44.     n, se = len(a), scipy.stats.sem(a)
  45.     h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
  46.     return h
  47.  
  48.  
  49. def median_confidence_interval(data, confidence=0.95):
  50.     '''
  51.     Get the confidence interval over the median
  52.     :param data: an array/series
  53.     :param confidence:
  54.     :return:
  55.     '''
  56.     a = 1.0 * np.array(data)
  57.     n = len(a)
  58.     n, se = len(a), sqrt(n * .5 * .5)
  59.     h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
  60.     return h
  61.  
  62.  
  63. def test_diff(a, b):
  64.     '''
  65.     Run the Mann Whitney test
  66.     :param a: first list/series
  67.     :param b: second list/series
  68.     :return: U and p-val
  69.     '''
  70.     a, b = np.array(a), np.array(b)
  71.     if len(a) == 0 or len(b) == 0:
  72.         return 1
  73.     if np.array_equal(a, b):
  74.         return 1
  75.  
  76.     t, p = scipy.stats.mannwhitneyu(a, b)
  77.     return p  # "U = {}, p = {}".format(t, p)
  78.  
  79.  
  80. def crombach_alpha(two_columns_df):
  81.     '''
  82.     Compute the Crombach alpha
  83.     :param two_columns_df: df
  84.     :return: the Crombach Alpha
  85.     '''
  86.     nr_items = len(two_columns_df.columns)
  87.     cov_mtr = two_columns_df.cov()  # variance - covariance matrix
  88.  
  89.     mean_var_item = sum(np.diagonal(cov_mtr)) / nr_items  # mean variance
  90.     mean_cov_ii = sum(np.sum(cov_mtr)) - sum(
  91.         np.diagonal(cov_mtr))  # sume all the covariance among item, and remove the variance
  92.     mean_cov_ii = mean_cov_ii / (nr_items * nr_items - nr_items)  # average
  93.  
  94.     return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii)
  95.  
  96.  
  97. def median_split(two_column_dataset):
  98.     """
  99. Split dataset on the median of the second column
  100.     :param two_column_dataset
  101.     :return: low, high subset
  102.     """
  103.  
  104.     two_column_dataset = two_column_dataset.sort_values(by=two_column_dataset.columns[1])  # order by scale
  105.     median_position = int(two_column_dataset.shape[0] / 2)  # find median index
  106.  
  107.     # split
  108.     low = two_column_dataset.iloc[:median_position, ]
  109.     high = two_column_dataset.iloc[median_position:, ]
  110.  
  111.     return low, high
  112.  
  113.  
  114. def low_high_split(two_column_dataset):
  115.     """
  116. Split dataset in three of the second column
  117.     :param two_column_dataset
  118.     :return: low, high subset
  119.     """
  120.  
  121.     two_column_dataset = two_column_dataset.sort_values(by=two_column_dataset.columns[1])  # order by scale
  122.     low_position = int(two_column_dataset.shape[0] / 3)  # find median index
  123.     high_position = int(2 * two_column_dataset.shape[0]/3)
  124.  
  125.     # split
  126.     low = two_column_dataset.iloc[:low_position, ]
  127.     high = two_column_dataset.iloc[high_position:, ]
  128.  
  129.     return low, high
  130.  
  131.  
  132. if __name__ == "__main__":
  133.     print("MAIN")
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top