Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- #
- #How to generate new samples from an unknown distribution of which you only have a given set of samples?
- #
- #This script illustrates the Inverse-Empirical-Cumulative-Distribution-Sampling method:
- #1. make an ECDF of the given samples x_1...x_n of unknown distribution ECDF_x
- #2. sample u_1...u_k from a uniform distribution [0,1]
- #3. find the image of u_1...u_k under the inverse of ECDF_x.
- #
- #larsupilami73
- import os
- from scipy import *
- import matplotlib.pyplot as plt
- random.seed(42)
- N = 10000 #number of samples to construct the ecdf from
- #ecdf from samples
- def ecdf(x):
- X = sort(x)
- Y = linspace(0,1,len(X))
- return X,Y
- #a simple sampler class
- class sampler():
- def __init__(self,x,name='unknown'):
- self.X, self.Y = ecdf(x)
- self.name = name
- def get_ecdf(self):
- return self.X,self.Y
- def get_samples(self,size=1000):
- """Sample the unknown distribution by first sampling a uniform distribution between 0 and 1 and
- then calculating the inverse ecdf on these samples"""
- U = random.uniform(low=0,high=1,size=size)
- #Find the index in Y where a uniform sample u is closest to.
- #Then find the corresponding X value: this is the inverse ecdf and your sample!
- #Note this only gives back samples identical to the original unknown samples...
- #S = [self.X[abs(u-self.Y).argmin()] for u in U] #works!
- #...however better is to interpolate X between corresponding Y values that U falls inbetween,
- #assuming
- S = interp(U,self.Y,self.X)
- return array(S)
- #generate samples from some 'unknown' and weird distribution
- def some_weird_distribution(Nsamples):
- X= []
- n = 0
- while(n<Nsamples):
- x = random.beta(2,5) #see https://www.statisticshowto.datasciencecentral.com/beta-distribution/
- if x<0.3 or x>0.5:
- X.append(x)
- n +=1
- return array(X)
- #get the samples from the unknown distribution we want to 'copy'
- x_unknown1 = some_weird_distribution(N)
- #make a sampler
- sampler1= sampler(x_unknown1)
- #make a nice figure
- fig, ((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(2,3,figsize=(8,4),dpi=150)
- #see more color names: plt.cm.colors.cnames
- c1 = 'tomato'
- c2 = 'steelblue'
- c3 = 'ivory'
- c4 = 'black'
- c5 = 'green'
- #samples
- ax1.scatter(range(N),x_unknown1, c=c1, marker='.',alpha=0.4,)
- ax1.set_title('From unknown distribution',fontsize=10)
- #ax1.set_xlabel('sample nr.')
- ax1.set_ylabel('sample values')
- ax1.text(0,0.05,'N={}'.format(x_unknown1.shape[0]),color=c4)
- ax1.set_facecolor(c3)
- #histogram
- ax2.hist(x_unknown1, bins=50,color=c1,density=True,edgecolor=c4)
- ax2.set_title('Histogram',fontsize=10)
- #ax2.set_xlabel('sample values')
- ax2.set_ylabel('bin values')
- ax2.set_facecolor(c3)
- #ECDF
- x1,y1 = sampler1.get_ecdf()
- ax3.set_title('ECDF',fontsize=10)
- ax3.scatter(x1,y1,s=5,c=c1,marker='.',alpha=0.5)
- ax3.set_facecolor(c3)
- #temp directory for the frames
- if not os.path.exists('./frames'):
- os.mkdir('./frames')
- try:
- os.system('rm ./frames/frame*.png')
- except:
- pass
- #now generate *new* samples via inverse ecdf method
- #and show that these follow the distribution of the given samples of the unknown distribution
- #same histogram, same ecdf
- for k,Ns in enumerate([20,20,50,100,200,500,1000,2000,4000,8000,10000]):
- s = sampler1.get_samples(Ns)
- #show the generated samples
- ax4.scatter(range(Ns),s, c=c2, marker='.',alpha=0.4)
- ax4.set_title('Inverse ECDF samples',fontsize=10)
- ax4.set_xlabel('sample nr.')
- ax4.set_ylabel('sample values')
- ax4.set_ylim(ax1.get_ylim())
- #ax4.set_xlim(ax1.get_xlim())
- #ax4.set_xticks(ax1.get_xticks())
- ax4.text(0,0.05,'N={}'.format(s.shape[0]),color=c4)
- ax4.set_facecolor(c3)
- #show the histogram of generated samples
- ax5.hist(s, bins=50,color=c2,density=True,edgecolor=c4)
- ax5.set_title('Histogram',fontsize=10)
- ax5.set_xlabel('samples values')
- ax5.set_ylabel('bin values')
- ax5.set_xlim(ax2.get_xlim())
- #ax5.set_ylim(ax2.get_ylim())
- ax5.set_facecolor(c3)
- #ecdf of generated samples
- xe,ye = ecdf(s)
- ax6.set_title('ECDF',fontsize=10)
- ax6.scatter(xe,ye,s=5,c=c2,marker='.',alpha=0.5)
- ax6.set_xlabel('sample values')
- ax6.set_xlim(ax3.get_xlim())
- ax6.set_facecolor(c3)
- #ax6.set_xticks(ax3.get_xticks())
- #save a frame
- fig.tight_layout()
- fig.savefig('./frames/frame%05d.png' %k)
- ax4.clear()
- ax5.clear()
- ax6.clear()
- #fig.show()
- #for some reason, there's a 'jump' between 1st and 2nd frame
- #simply removing duplicate of frame solves this...
- os.remove('./frames/frame%05d.png' %0)
- plt.close('all')
- #now assemble all the pictures in a movie
- print('converting to gif!')
- #requires imagemagick
- os.system('convert -delay 100 -loop 0 ./frames/*.png inverse_sampling.gif')
- #or to mp4
- #print('converting to mp4!')
- #os.system("ffmpeg -y -r 5 -i ./frames/frame%05d.png -c:v libx264 -vf fps=10 inverse_sampling.mp4")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement