import numpy as np
import random
import math
import pandas as pd
from matplotlib import pyplot as plt

from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv("drive/MyDrive/penguins_size.csv")
data

"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount('/content/drive', force_remount=True).

     species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  body_mass_g     sex
0     Adelie  Torgersen              39.1             18.7              181.0       3750.0    MALE
1     Adelie  Torgersen              39.5             17.4              186.0       3800.0  FEMALE
2     Adelie  Torgersen              40.3             18.0              195.0       3250.0  FEMALE
3     Adelie  Torgersen               NaN              NaN                NaN          NaN     NaN
4     Adelie  Torgersen              36.7             19.3              193.0       3450.0  FEMALE
..       ...        ...               ...              ...                ...          ...     ...
339   Gentoo     Biscoe               NaN              NaN                NaN          NaN     NaN
340   Gentoo     Biscoe              46.8             14.3              215.0       4850.0  FEMALE
341   Gentoo     Biscoe              50.4             15.7              222.0       5750.0    MALE
342   Gentoo     Biscoe              45.2             14.8              212.0       5200.0  FEMALE
343   Gentoo     Biscoe              49.9             16.1              213.0       5400.0    MALE

344 rows × 7 columns"

data.info()

"<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   species            344 non-null    object
 1   island             344 non-null    object
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object
dtypes: float64(4), object(3)
memory usage: 18.9+ KB"

# keep only the body mass column
data = data.drop(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
                  'flipper_length_mm', 'sex'], axis='columns')
data.info()

"<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   body_mass_g  342 non-null    float64
dtypes: float64(1)
memory usage: 2.8 KB"

data = data.dropna()
data.info()

"<class 'pandas.core.frame.DataFrame'>
Index: 342 entries, 0 to 343
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   body_mass_g  342 non-null    float64
dtypes: float64(1)
memory usage: 5.3 KB"

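# A hedged aside, not part of the original run: the same cleanup can be done in one
# step by selecting only the column of interest and dropping missing rows. The variable
# name data_alt is hypothetical.
data_alt = pd.read_csv("drive/MyDrive/penguins_size.csv")[["body_mass_g"]].dropna()
data_alt.info()  # should likewise report 342 non-null float64 entries
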
distribution = data["body_mass_g"].to_numpy()

def generateFromEmpiricalDistributionFunction(distribution):
    distribution.sort()
    N = distribution.size
    # Sturges' rule for the number of histogram intervals
    intervalsNumber = int(1 + 3.322 * math.log10(N))
    intervalBorders = np.zeros(intervalsNumber + 1)
    intervalCounts = np.zeros(intervalsNumber)
    intervalProbabilities = np.zeros(intervalsNumber)
    intervalProbabilitiesSum = np.zeros(intervalsNumber)
    minValue = distribution[0]
    maxValue = distribution[N - 1]
    intervalLength = (maxValue - minValue) / intervalsNumber
    intervalBorders[0] = minValue
    j = 0
    # build the interval table: borders, counts, probabilities and cumulative probabilities
    for i in range(0, intervalsNumber):
        n = 0
        intervalBorders[i + 1] = intervalBorders[i] + intervalLength
        while j < N and distribution[j] <= intervalBorders[i + 1]:
            j += 1
            n += 1
        intervalCounts[i] = n
        intervalProbabilities[i] = intervalCounts[i] / N
        intervalProbabilitiesSum[i] = intervalProbabilities[i]
        if i > 0:
            intervalProbabilitiesSum[i] += intervalProbabilitiesSum[i - 1]
    # inverse-transform sampling: pick the interval whose cumulative probability covers r,
    # then interpolate linearly inside that interval
    r = random.random()
    i = 0
    while i < intervalsNumber - 1 and r > intervalProbabilitiesSum[i]:
        i += 1
    number = round(intervalBorders[i] + intervalLength * ((intervalProbabilities[i] - intervalProbabilitiesSum[i] + r) / intervalProbabilities[i]))

    return number

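# A hedged cross-check, not part of the original code: the same inverse-transform idea
# can be written with numpy's vectorized interpolation over the empirical CDF, which
# avoids rebuilding the interval table on every call. The function name below is an
# illustrative assumption.
def generateFromEmpiricalCdfVectorized(sample, size):
    sortedSample = np.sort(sample)
    # empirical CDF value at each sorted observation
    cdf = np.arange(1, sortedSample.size + 1) / sortedSample.size
    # push uniform random numbers through the interpolated inverse CDF
    u = np.random.random(size)
    return np.interp(u, cdf, sortedSample)

# Example usage (hedged): generatedAlt = generateFromEmpiricalCdfVectorized(distribution, 10000)
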
plt.hist(distribution, bins=20)
plt.xlabel("Weight, g")
plt.ylabel("Count")
plt.show()

N = distribution.size

# override: generate a larger sample of 10000 values
N = 10000

generatedDistribution = np.zeros(N)
for i in range(0, N):
    generatedDistribution[i] = generateFromEmpiricalDistributionFunction(distribution)

fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)

# weights turn counts into relative frequencies so samples of different sizes are comparable
weights1 = np.ones_like(distribution) / len(distribution)
weights2 = np.ones_like(generatedDistribution) / len(generatedDistribution)

axs[0].hist(distribution, weights=weights1, bins=9)
axs[1].hist(generatedDistribution, weights=weights2, bins=9)

plt.show()

from scipy.stats import gaussian_kde
from numpy import linspace

# this creates the kernels; given an array, gaussian_kde estimates the probability density over those values
kde1 = gaussian_kde(distribution)
kde2 = gaussian_kde(generatedDistribution)
# these are the points at which the kernels will be evaluated
dist_space1 = linspace(min(distribution), max(distribution), 100)
dist_space2 = linspace(min(generatedDistribution), max(generatedDistribution), 100)

# plot the results
plt.plot(dist_space1, kde1(dist_space1), label="Original sample")
plt.plot(dist_space2, kde2(dist_space2), label="Generated sample")
plt.legend(loc="upper right")
plt.show()

# weights normalize each histogram to relative frequencies, matching the y-axis label
plt.hist([distribution, generatedDistribution], weights=[weights1, weights2],
         histtype='step', linewidth=2, alpha=0.7,
         label=['Original sample', 'Generated sample'])
plt.legend(loc="upper right")
plt.xlabel("Weight, g")
plt.ylabel("Relative frequency")
plt.show()

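# A hedged addition, not in the original analysis: a two-sample Kolmogorov-Smirnov test
# gives a numeric measure of how close the generated sample is to the original one.
from scipy.stats import ks_2samp
ksStatistic, pValue = ks_2samp(distribution, generatedDistribution)
print('KS statistic:', ksStatistic, 'p-value:', pValue)
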
MEmpiricalDistribution = 0
DEmpiricalDistribution = 0
MGeneratedDistribution = 0
DGeneratedDistribution = 0

# sample mean of the original data
for i in range(0, distribution.size):
    MEmpiricalDistribution += distribution[i]
MEmpiricalDistribution /= distribution.size

# unbiased sample variance of the original data
for i in range(0, distribution.size):
    DEmpiricalDistribution += (MEmpiricalDistribution - distribution[i]) * (MEmpiricalDistribution - distribution[i])
DEmpiricalDistribution /= (distribution.size - 1)

# sample mean of the generated data
for i in range(0, generatedDistribution.size):
    MGeneratedDistribution += generatedDistribution[i]
MGeneratedDistribution /= generatedDistribution.size

# unbiased sample variance of the generated data
for i in range(0, generatedDistribution.size):
    DGeneratedDistribution += (MGeneratedDistribution - generatedDistribution[i]) * (MGeneratedDistribution - generatedDistribution[i])
DGeneratedDistribution /= (generatedDistribution.size - 1)

output = ('Mean of the original sample: ' + str(MEmpiricalDistribution) + '\n'
          + 'Mean of the generated sample: ' + str(MGeneratedDistribution) + '\n'
          + 'Variance of the original sample: ' + str(DEmpiricalDistribution) + '\n'
          + 'Variance of the generated sample: ' + str(DGeneratedDistribution) + '\n')
print(output)

"Mean of the original sample: 4201.754385964912
Mean of the generated sample: 4181.9988
Variance of the original sample: 643131.0773267483
Variance of the generated sample: 663124.3964382047"
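
# A hedged cross-check, not part of the original output: numpy's built-in estimators
# should reproduce the hand-rolled mean and unbiased variance computed above.
print('Mean (numpy):', np.mean(distribution), np.mean(generatedDistribution))
print('Variance (numpy, ddof=1):', np.var(distribution, ddof=1), np.var(generatedDistribution, ddof=1))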