Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import scipy.stats
- from math import sqrt
def cramers_v(x, y):
    """
    Compute the bias-corrected Cramer's V association between two categorical variables.

    :param x: first categorical list/series
    :param y: second categorical list/series, aligned with x
    :return: Cramer's V; 0.0 when the statistic is undefined (a single
        observation, a variable with a single level, or a table in which
        every level of one variable occurs exactly once)
    """
    confusion_matrix = pd.crosstab(x, y)
    # NOTE(review): scipy applies Yates' continuity correction to 2x2 tables
    # by default, so a perfect 2x2 association yields a value below 1.
    chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    if n < 2:
        # The bias correction divides by (n - 1).
        return 0.0
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Degenerate table: the ratio would be 0/0, so report "no association"
        # instead of NaN plus a RuntimeWarning.
        return 0.0
    return np.sqrt(phi2corr / denominator)
def gini(list_of_values):
    """
    Compute the Gini coefficient.

    :param list_of_values: list/series (any iterable) of numbers, presumably
        non-negative; a bare float is also accepted
    :return: Gini coefficient; 1 for a bare float, 0.0 when there are no
        values or the values sum to zero
    """
    if isinstance(list_of_values, float):
        # Kept from the original contract: a scalar float maps to 1.
        return 1
    sorted_list = sorted(list_of_values)
    height, area = 0, 0
    for value in sorted_list:
        # Accumulate the area under the Lorenz curve, one trapezoid per value.
        height += value
        area += height - value / 2.
    # Area under the line of perfect equality.
    fair_area = height * len(sorted_list) / 2.
    if fair_area == 0:
        # Nothing to distribute (empty input or all zeros): treat as perfect
        # equality instead of dividing by zero.
        return 0.0
    return (fair_area - area) / fair_area
def mean_confidence_interval(data, confidence=0.95):
    """
    Half-width of the Student-t confidence interval of the mean.

    :param data: list/series of numbers
    :param confidence: confidence level of the interval (0.95 by default)
    :return: the half-width h only; the interval is mean - h .. mean + h
    """
    values = 1.0 * np.array(data)
    degrees_of_freedom = len(values) - 1
    standard_error = scipy.stats.sem(values)
    # Two-sided critical value of the t distribution.
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., degrees_of_freedom)
    return standard_error * t_critical
def median_confidence_interval(data, confidence=0.95):
    """
    Get the confidence interval half-width over the median.

    NOTE(review): the result depends only on the number of observations,
    not on their values - presumably it is expressed in rank positions
    around the median; confirm with the callers.

    :param data: an array/series
    :param confidence: confidence level (0.95 by default)
    :return: sqrt(n * 0.25) scaled by the two-sided t critical value with n - 1 degrees of freedom
    """
    sample_size = len(1.0 * np.array(data))
    spread = sqrt(sample_size * .5 * .5)
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., sample_size - 1)
    return spread * t_critical
def test_diff(a, b):
    """
    Run the Mann-Whitney U test on two samples.

    :param a: first list/series
    :param b: second list/series
    :return: the p-value only (the U statistic is discarded); 1.0 when either
        sample is empty or both samples are element-wise identical
    """
    a, b = np.array(a), np.array(b)
    if len(a) == 0 or len(b) == 0 or np.array_equal(a, b):
        # Nothing to compare, or no difference at all: report "not significant"
        # as a float so the return type matches the normal path.
        return 1.0
    return scipy.stats.mannwhitneyu(a, b)[1]


# The name matches pytest's default ``test_*`` collection pattern; opt out so
# importing this helper into a test module does not get it collected as a
# test (which would fail on the unknown fixtures ``a`` and ``b``).
test_diff.__test__ = False
def crombach_alpha(two_columns_df):
    '''
    Compute Cronbach's alpha (standard spelling; the function name is kept
    for backward compatibility).

    :param two_columns_df: df with one numeric column per item; at least two
        columns are needed, otherwise the average covariance is 0/0
    :return: Cronbach's alpha
    '''
    nr_items = len(two_columns_df.columns)
    # Work on a plain ndarray: reducing a DataFrame through np.sum() forwards
    # axis=None, whose column-wise meaning pandas has deprecated.
    cov_mtr = two_columns_df.cov().to_numpy()  # variance - covariance matrix
    total_variance = np.diagonal(cov_mtr).sum()
    mean_var_item = total_variance / nr_items  # mean variance
    # Sum all the covariances among items, leaving out the variances on the
    # diagonal, then average over the off-diagonal cells.
    mean_cov_ii = (cov_mtr.sum() - total_variance) / (nr_items * nr_items - nr_items)
    return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii)
def median_split(two_column_dataset):
    """
    Split a dataset into two halves around the median of its second column.

    The rows are ordered by the second column; with an odd number of rows
    the extra row goes to the upper half.

    :param two_column_dataset: dataframe whose second column holds the scale
    :return: low, high subset
    """
    scale_column = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=scale_column)
    halfway = ordered.shape[0] // 2  # positional index of the median row
    return ordered.iloc[:halfway], ordered.iloc[halfway:]
def low_high_split(two_column_dataset):
    """
    Keep the bottom and top thirds of a dataset ordered by its second column.

    The middle third of the rows is dropped.

    :param two_column_dataset: dataframe whose second column holds the scale
    :return: low, high subset
    """
    scale_column = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=scale_column)
    row_count = ordered.shape[0]
    lower_cut = row_count // 3  # end of the bottom third
    upper_cut = (2 * row_count) // 3  # start of the top third
    return ordered.iloc[:lower_cut], ordered.iloc[upper_cut:]
# Runs only when the file is executed directly as a script; importing the
# module skips this branch. It just prints a marker string.
if __name__ == "__main__":
    print("MAIN")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement