# Untitled

# Pasted by a guest on Sep 11th, 2019 (paste-site metadata: 102, Never).
from math import sqrt

import numpy as np
import pandas as pd
import scipy.stats
5.
6.
def cramers_v(x, y):
    """
    Compute the bias-corrected Cramer's V between two categorical variables.

    The contingency table of ``x`` against ``y`` is built with
    ``pd.crosstab`` and its chi-squared statistic is taken from
    ``scipy.stats.chi2_contingency``. Phi-squared and both table dimensions
    are then shrunk by the small-sample bias correction before the ratio is
    taken.

    :param x: first categorical variable (array/series accepted by ``pd.crosstab``)
    :param y: second categorical variable, aligned with ``x``
    :return: the corrected Cramer's V, or ``nan`` when it is undefined
             (empty input, a single observation, a variable with only one
             category, or a corrected dimension that collapses to 1)
    """
    contingency = pd.crosstab(x, y)
    n_rows, n_cols = contingency.shape
    total = float(contingency.to_numpy().sum())

    # With fewer than two categories on either side (or fewer than two
    # observations) the formula below degenerates to 0/0, so answer
    # explicitly instead of letting NumPy emit a warning and a NaN.
    if total <= 1 or n_rows < 2 or n_cols < 2:
        return np.nan

    chi2 = scipy.stats.chi2_contingency(contingency)[0]
    phi2 = chi2 / total

    # Bias correction: subtract the expected value of phi2 under independence
    # and shrink the table dimensions accordingly.
    phi2_corr = max(0.0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_corr = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corr = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    denominator = min(cols_corr - 1, rows_corr - 1)
    # The corrected dimension reaches 1 when every observation is its own
    # category; the statistic is undefined there as well.
    if denominator <= 0:
        return np.nan
    return np.sqrt(phi2_corr / denominator)
17.
18.
def gini(list_of_values):
    """
    Compute the Gini coefficient.

    The values are sorted and the area under their cumulative-sum curve is
    compared with the area under the straight line of perfect equality.

    :param list_of_values: list/series of numbers
    :return: Gini coefficient; ``1`` when a bare float is passed instead of
             a collection, ``0.0`` when the input is empty or sums to zero
    """
    # A bare float is not iterable; the original contract answers 1 for it.
    if isinstance(list_of_values, float):
        return 1

    ordered = sorted(list_of_values)
    cumulative, area = 0, 0
    for value in ordered:
        cumulative += value
        # Unit-width trapezoid between the previous and the new running total.
        area += cumulative - value / 2.
    fair_area = cumulative * len(ordered) / 2.

    # Empty input or a zero total would divide by zero below; there is no
    # inequality to report in that case.
    if fair_area == 0:
        return 0.0
    return (fair_area - area) / fair_area
34.
35.
def mean_confidence_interval(data, confidence=0.95):
    """
    Half-width of the Student-t confidence interval of the mean.

    The interval itself is ``mean(data) - h`` to ``mean(data) + h``; only
    ``h`` is returned.

    :param data: list/array/series of numbers
    :param confidence: two-sided confidence level, between 0 and 1
    :return: the half-width ``h`` (standard error times the t quantile with
             ``n - 1`` degrees of freedom)
    """
    # Multiplying by 1.0 forces a float array and rejects non-numeric data.
    values = 1.0 * np.array(data)
    n = len(values)
    standard_error = scipy.stats.sem(values)
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return standard_error * t_critical
47.
48.
def median_confidence_interval(data, confidence=0.95):
    """
    Get the confidence interval over the median.

    The result is ``sqrt(n * 0.5 * 0.5)`` multiplied by the Student-t
    quantile with ``n - 1`` degrees of freedom. Only the number of
    observations enters the computation, not their values.

    NOTE(review): ``sqrt(n * 0.5 * 0.5)`` is the standard deviation of a
    Binomial(n, 0.5) count, so the result looks like an offset in ranks
    around the median position rather than a width on the data scale.
    Confirm against the callers.

    :param data: an array/series of numbers
    :param confidence: two-sided confidence level, between 0 and 1
    :return: the half-width ``h`` described above
    """
    # Multiplying by 1.0 forces a float array and rejects non-numeric data.
    values = 1.0 * np.array(data)
    n = len(values)
    spread = sqrt(n * .5 * .5)
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return spread * t_critical
61.
62.
def test_diff(a, b):
    """
    Run the Mann-Whitney U test and return its p-value.

    The U statistic is computed but not returned. The alternative hypothesis
    is whatever ``scipy.stats.mannwhitneyu`` uses by default in the installed
    SciPy.

    :param a: first list/series
    :param b: second list/series
    :return: the p-value; ``1.0`` when either sample is empty, when both
             samples are equal, or when every pooled value is identical
    """
    a, b = np.array(a), np.array(b)
    if len(a) == 0 or len(b) == 0:
        return 1.0
    if np.array_equal(a, b):
        return 1.0

    # Constant samples of different lengths carry no rank information; some
    # SciPy releases raise on them, so short-circuit to "no difference".
    pooled = np.concatenate([a.ravel(), b.ravel()])
    if np.all(pooled == pooled[0]):
        return 1.0

    _, p_value = scipy.stats.mannwhitneyu(a, b)
    return p_value


# The name starts with "test_", so pytest would try to collect this helper as
# a test case and fail looking for fixtures named "a" and "b".
test_diff.__test__ = False
78.
79.
def crombach_alpha(two_columns_df):
    """
    Compute Cronbach's alpha from the item covariance matrix.

    Each column of the frame is treated as one item. Alpha is
    ``N * c / (v + (N - 1) * c)``, where ``N`` is the number of items, ``v``
    the mean item variance and ``c`` the mean covariance between distinct
    items.

    :param two_columns_df: DataFrame with one column per item (any number
                           of columns, despite the parameter name)
    :return: Cronbach's alpha, or ``nan`` with fewer than two items
    """
    nr_items = len(two_columns_df.columns)
    # With fewer than two items there are no inter-item covariances.
    if nr_items < 2:
        return np.nan

    # Work on the plain ndarray: reducing the DataFrame through np.sum
    # depends on the pandas version (per-column Series vs. a single scalar).
    cov_mtr = two_columns_df.cov().to_numpy()  # variance-covariance matrix
    total_variance = np.trace(cov_mtr)

    mean_var_item = total_variance / nr_items  # mean variance
    # Sum every entry, drop the variances on the diagonal, and average over
    # the N * (N - 1) off-diagonal entries.
    mean_cov_ii = (cov_mtr.sum() - total_variance) / (nr_items * nr_items - nr_items)

    return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii)
95.
96.
def median_split(two_column_dataset):
    """
    Split a dataset into a low and a high half on its second column.

    Rows are ordered by the second column and cut at the middle position.
    With an odd number of rows the extra row goes to the high half.

    :param two_column_dataset: DataFrame whose second column is the scale
    :return: low, high subset
    """
    scale_column = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=scale_column)  # order by scale
    median_position = ordered.shape[0] // 2  # positional middle, rounded down

    low = ordered.iloc[:median_position]
    high = ordered.iloc[median_position:]
    return low, high
112.
113.
def low_high_split(two_column_dataset):
    """
    Keep the bottom and top thirds of a dataset, ranked on its second column.

    Rows are ordered by the second column; the first third becomes ``low``,
    the last third becomes ``high``, and the middle third is dropped.

    :param two_column_dataset: DataFrame whose second column is the scale
    :return: low, high subset
    """
    scale_column = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=scale_column)  # order by scale
    row_count = ordered.shape[0]
    low_position = row_count // 3  # end of the lowest third
    high_position = (2 * row_count) // 3  # start of the highest third

    low = ordered.iloc[:low_position]
    high = ordered.iloc[high_position:]
    return low, high
130.
131.
if __name__ == "__main__":
    # Placeholder entry point: running the module directly only prints a marker.
    print("MAIN")
