Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import random
- import math
- import pandas as pd
- from matplotlib import pyplot as plt
- from google.colab import drive
- drive.mount('/content/drive')  # mount Google Drive at /content/drive (google.colab helper)
- data=pd.read_csv("drive/MyDrive/penguins_size.csv")  # relative path: resolved against the current working directory
- data  # bare expression: the notebook displays the DataFrame as the cell output
- "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
- species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
- 0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
- 1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
- 2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
- 3 Adelie Torgersen NaN NaN NaN NaN NaN
- 4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE
- ... ... ... ... ... ... ... ...
- 339 Gentoo Biscoe NaN NaN NaN NaN NaN
- 340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 FEMALE
- 341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 MALE
- 342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 FEMALE
- 343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 MALE
- 344 rows × 7 columns"
- data.info()  # print column names, non-null counts and dtypes of the raw table
- "<class 'pandas.core.frame.DataFrame'>
- RangeIndex: 344 entries, 0 to 343
- Data columns (total 7 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 species 344 non-null object
- 1 island 344 non-null object
- 2 culmen_length_mm 342 non-null float64
- 3 culmen_depth_mm 342 non-null float64
- 4 flipper_length_mm 342 non-null float64
- 5 body_mass_g 342 non-null float64
- 6 sex 334 non-null object
- dtypes: float64(4), object(3)
- memory usage: 18.9+ KB"
# Keep only the body_mass_g column. All other columns are dropped in a single
# call instead of building an intermediate DataFrame for every column.
data = data.drop(
    [
        'species',
        'island',
        'culmen_length_mm',
        'culmen_depth_mm',
        'flipper_length_mm',
        'sex',
    ],
    axis='columns',
)
data.info()
- "<class 'pandas.core.frame.DataFrame'>
- RangeIndex: 344 entries, 0 to 343
- Data columns (total 1 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 body_mass_g 342 non-null float64
- dtypes: float64(1)
- memory usage: 2.8 KB"
- data = data.dropna()  # remove rows with a missing value (only body_mass_g is left at this point)
- data.info()  # print the summary again to check the non-null row count
- "<class 'pandas.core.frame.DataFrame'>
- Index: 342 entries, 0 to 343
- Data columns (total 1 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 body_mass_g 342 non-null float64
- dtypes: float64(1)
- memory usage: 5.3 KB"
- distribution=data["body_mass_g"].to_numpy()  # body-mass column as a NumPy array: the source sample
def generateFromEmpiricalDistributionFunction(distribution):
    """Draw one random value from the piecewise-linear empirical CDF of a sample.

    The sample range is split into equal-width intervals; their number follows
    Sturges' rule, ``1 + 3.322 * log10(N)``. A uniform random number selects an
    interval through the cumulative interval probabilities, and the result is
    interpolated linearly inside that interval (inverse-transform sampling).

    Args:
        distribution: One-dimensional array-like of numeric observations.
            It is not modified; a sorted copy is used internally.

    Returns:
        The generated value, rounded to the nearest integer with ``round()``.

    Raises:
        ValueError: If ``distribution`` is empty.
    """
    # np.sort returns a sorted copy, so the caller's array keeps its order.
    sortedValues = np.sort(np.asarray(distribution, dtype=float))
    N = sortedValues.size
    if N == 0:
        raise ValueError("distribution must contain at least one value")

    intervalsNumber = int(1 + 3.322 * math.log10(N))
    minValue = sortedValues[0]
    maxValue = sortedValues[-1]
    intervalLength = (maxValue - minValue) / intervalsNumber

    # linspace pins the last border to maxValue exactly. Building the borders
    # by repeated addition can leave the last one slightly below the maximum,
    # so the largest observations would never be counted.
    intervalBorders = np.linspace(minValue, maxValue, intervalsNumber + 1)

    # cumulativeCounts[k] is the number of observations <= right border of
    # interval k; the first interval therefore also contains the minimum.
    cumulativeCounts = np.searchsorted(
        sortedValues, intervalBorders[1:], side="right"
    )
    intervalProbabilities = np.diff(cumulativeCounts, prepend=0) / N
    # Dividing the integer counts keeps the last cumulative value at exactly 1.
    intervalProbabilitiesSum = cumulativeCounts / N

    r = random.random()
    # First interval whose cumulative probability reaches r. Empty intervals
    # are skipped automatically because they do not raise the cumulative sum.
    i = min(
        int(np.searchsorted(intervalProbabilitiesSum, r, side="left")),
        intervalsNumber - 1,
    )

    # Position of r inside the selected interval, as a share of its probability.
    lowerProbability = intervalProbabilitiesSum[i] - intervalProbabilities[i]
    fraction = (r - lowerProbability) / intervalProbabilities[i]
    return round(intervalBorders[i] + intervalLength * fraction)
- plt.hist(distribution, bins = 20)  # histogram of the source sample, 20 bins
- plt.xlabel("Вес, грамм")  # x-axis label: "Weight, grams"
- plt.ylabel("Количество")  # y-axis label: "Count"
- plt.show()
# Size of the generated sample. The former `N = distribution.size` line was a
# dead store: it was overwritten immediately by this assignment.
N = 10000

# Draw N values from the empirical distribution of the source sample.
generatedDistribution = np.array(
    [generateFromEmpiricalDistributionFunction(distribution) for _ in range(N)],
    dtype=float,
)
# Two histograms on a shared y axis: source sample (left), generated sample (right).
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, tight_layout=True)

# A weight of 1/n per observation turns bar heights into relative frequencies.
weights1 = np.ones_like(distribution) / distribution.size
weights2 = np.ones_like(generatedDistribution) / generatedDistribution.size

axs[0].hist(distribution, bins=9, weights=weights1)
axs[1].hist(generatedDistribution, bins=9, weights=weights2)
plt.show()
from scipy.stats import gaussian_kde
from numpy import linspace

# Fit a Gaussian kernel density estimate to each sample.
kde1 = gaussian_kde(distribution)
kde2 = gaussian_kde(generatedDistribution)

# Points at which each estimate is evaluated: 100 evenly spaced values
# spanning the range of the corresponding sample.
dist_space1 = linspace(min(distribution), max(distribution), num=100)
dist_space2 = linspace(
    min(generatedDistribution), max(generatedDistribution), num=100
)

# Draw both estimated densities on the same axes.
plt.plot(dist_space1, kde1(dist_space1), label="Исходная выборка")
plt.plot(dist_space2, kde2(dist_space2), label="Сгенерированная выборка")
plt.legend(loc="upper right")
plt.show()
# Overlaid relative-frequency histograms of the source and generated samples.
# The 1/n weights already scale every bar to a share of its own sample, which
# is what the "Частость" (relative frequency) axis label describes.
# density=True was removed: it also divides by the bin width, which turned the
# bars into a probability density and made the weights redundant.
plt.hist(
    [distribution, generatedDistribution],
    weights=[weights1, weights2],
    histtype='step',
    linewidth=2,
    alpha=0.7,
    label=['Исходная выборка', 'Сгенерированная выборка'],
)
plt.legend(loc="upper right")
plt.xlabel("Вес, грамм")
plt.ylabel("Частость")
plt.show()
# Sample mean and unbiased sample variance (divisor n - 1) of both samples.
# NumPy reductions replace the hand-written accumulation loops; values may
# differ from a naive running sum only in the last floating-point digits.
MEmpiricalDistribution = np.mean(distribution)
DEmpiricalDistribution = np.var(distribution, ddof=1)
MGeneratedDistribution = np.mean(generatedDistribution)
DGeneratedDistribution = np.var(generatedDistribution, ddof=1)

# Report: mean of the source sample, mean of the generated sample, then the
# two variances in the same order.
output = (
    f'Мат. ожидание исходной выборки: {MEmpiricalDistribution}\n'
    f'Мат. ожидание сгенерированной выборки: {MGeneratedDistribution}\n'
    f'Дисперсия исходной выборки: {DEmpiricalDistribution}\n'
    f'Дисперсия сгенерированной выборки: {DGeneratedDistribution}\n'
)
print(output)
- "Мат. ожидание исходной выборки: 4201.754385964912
- Мат. ожидание сгенерированной выборки: 4181.9988
- Дисперсия исходной выборки: 643131.0773267483
- Дисперсия сгенерированной выборки: 663124.3964382047"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement