• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest Sep 11th, 2019 102 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import numpy as np
2. import pandas as pd
3. import scipy.stats
4. from math import sqrt
5.
6.
def cramers_v(x, y):
    """
    Compute the bias-corrected Cramer's V between two categorical variables.

    The contingency table of ``x`` against ``y`` is built with
    ``pd.crosstab`` and its chi-squared statistic is taken from
    ``scipy.stats.chi2_contingency`` (called with its default settings).
    Phi-squared and both table dimensions are bias-corrected before the
    final square root is taken.

    :param x: first categorical series/array
    :param y: second categorical series/array, aligned with ``x``
    :return: the corrected Cramer's V; 0.0 when the statistic is undefined
             (fewer than two categories on either axis, or a corrected
             table dimension that collapses to 1)
    """
    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape

    # A variable with fewer than two observed categories cannot be associated
    # with anything; without this guard the final division is 0 / 0.
    if r < 2 or k < 2:
        return 0.0

    chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n

    # Bias correction: clip phi2 at zero and shrink the table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The corrected dimension reaches 1 when every category was observed
    # exactly once (n == r or n == k); dividing would yield nan or inf.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)
17.
18.
def gini(list_of_values):
    """
    Compute the Gini coefficient.

    The values are sorted in ascending order and the area under the
    resulting cumulative curve is compared with the area obtained when the
    total is spread evenly over all values.

    :param list_of_values: list/series (any iterable) of numeric values
    :return: Gini coefficient; 1 when a bare float is passed instead of a
             collection, 0.0 when the input is empty or sums to zero
    """
    # Kept from the original: a lone float instead of a collection is
    # reported as 1.
    if isinstance(list_of_values, float):
        return 1

    sorted_list = sorted(list_of_values)
    height, area = 0, 0
    for value in sorted_list:
        height += value
        # Each value adds a trapezoid slice of unit width under the curve.
        area += height - value / 2.

    # Use the sorted copy for the length so generators are supported too.
    fair_area = height * len(sorted_list) / 2.

    # Empty or all-zero input: nothing to distribute, so report no inequality
    # instead of dividing by zero.
    if fair_area == 0:
        return 0.0
    return (fair_area - area) / fair_area
34.
35.
def mean_confidence_interval(data, confidence=0.95):
    """
    Half-width of the Student-t confidence interval of the mean.

    :param data: numeric list/array/series of observations
    :param confidence: confidence level (0.95 by default)
    :return: the standard error of the mean multiplied by the two-sided
             t critical value with ``len(data) - 1`` degrees of freedom
    """
    values = np.array(data) * 1.0
    sample_size = len(values)
    standard_error = scipy.stats.sem(values)
    # Two-sided critical value: split the remaining probability over both tails.
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., sample_size - 1)
    return standard_error * t_critical
47.
48.
def median_confidence_interval(data, confidence=0.95):
    """
    Margin used for the confidence interval over the median.

    The margin is ``sqrt(n * 0.5 * 0.5)`` scaled by the two-sided Student-t
    critical value with ``n - 1`` degrees of freedom, where ``n`` is the
    number of observations. Only the number of observations influences the
    result, not their values.

    NOTE(review): the margin looks like an offset in rank positions rather
    than in data units -- confirm against the callers.

    :param data: an array/series of numeric observations
    :param confidence: confidence level (0.95 by default)
    :return: the margin described above
    """
    # The numeric conversion is kept so non-numeric input fails as before.
    values = np.array(data) * 1.0
    count = len(values)
    spread = sqrt(count * .5 * .5)
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., count - 1)
    return spread * t_critical
61.
62.
def test_diff(a, b):
    """
    Run the two-sided Mann-Whitney U test and return its p-value.

    :param a: first list/series
    :param b: second list/series
    :return: the p-value only (the U statistic is discarded); 1.0 when a
             sample is empty, when both samples are identical, or when
             every pooled observation has the same value
    """
    a, b = np.array(a), np.array(b)
    if len(a) == 0 or len(b) == 0:
        return 1.0
    if np.array_equal(a, b):
        return 1.0
    # If every pooled value is tied the ranks carry no information, and some
    # scipy releases raise ValueError for such input.
    if np.unique(np.concatenate((a.ravel(), b.ravel()))).size == 1:
        return 1.0

    # Request the two-sided test explicitly: the default alternative differs
    # between scipy releases, which would silently change the p-value.
    _, p = scipy.stats.mannwhitneyu(a, b, alternative='two-sided')
    return p


# The "test_" prefix makes pytest collect this helper as a test whenever it
# is imported into a test module; opt out explicitly.
test_diff.__test__ = False
78.
79.
def crombach_alpha(two_columns_df):
    """
    Compute Cronbach's alpha for the items (columns) of a DataFrame.

    Alpha is derived from the mean item variance and the mean inter-item
    covariance of the variance-covariance matrix returned by ``df.cov()``.

    :param two_columns_df: df with one column per item (two or more columns)
    :return: Cronbach's alpha; nan when fewer than two items are given,
             because no inter-item covariance exists
    """
    nr_items = len(two_columns_df.columns)
    if nr_items < 2:
        return float('nan')

    # Work on the plain ndarray: reducing the DataFrame with np.sum depends
    # on the axis=None behaviour of DataFrame.sum, which pandas is changing.
    cov_mtr = two_columns_df.cov().to_numpy()  # variance - covariance matrix

    total_variance = np.trace(cov_mtr)  # the diagonal holds the item variances
    mean_var_item = total_variance / nr_items  # mean variance

    # Sum every entry, drop the variances, then average over the
    # nr_items * (nr_items - 1) off-diagonal covariances.
    mean_cov_ii = (cov_mtr.sum() - total_variance) / (nr_items * (nr_items - 1))

    return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii)
95.
96.
def median_split(two_column_dataset):
    """
    Sort the dataset by its second column and cut it in two at the middle row.

    :param two_column_dataset: DataFrame whose second column is the sort key
    :return: (low, high) -- the first half of the sorted rows and the
             remaining rows; with an odd row count the extra row is in high
    """
    sort_key = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=sort_key)

    # Position of the cut: half the row count, rounded down.
    middle = len(ordered) // 2

    return ordered.iloc[:middle], ordered.iloc[middle:]
112.
113.
def low_high_split(two_column_dataset):
    """
    Sort the dataset by its second column and keep its two outer thirds.

    :param two_column_dataset: DataFrame whose second column is the sort key
    :return: (low, high) -- the rows before the one-third position and the
             rows from the two-thirds position onward; the rows in between
             are dropped
    """
    sort_key = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=sort_key)

    # Cut positions at one third and two thirds of the rows, rounded down.
    row_count = len(ordered)
    first_cut = row_count // 3
    second_cut = (2 * row_count) // 3

    return ordered.iloc[:first_cut], ordered.iloc[second_cut:]
130.
131.
# Script entry point: only announces itself when the module is run directly.
if __name__ == "__main__":
    print("MAIN")
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.
Not a member of Pastebin yet?