Inverse CDF sampling

#!/usr/bin/python3
#
#How to generate new samples from an unknown distribution of which you only have a given set of samples?
#
#This script illustrates the Inverse-Empirical-Cumulative-Distribution-Sampling method:
#1. make an ECDF of the given samples x_1...x_n of unknown distribution ECDF_x
#2. sample u_1...u_k from a uniform distribution [0,1]
#3. find the image of u_1...u_k under the inverse of ECDF_x.
#
#larsupilami73

import os
from scipy import *
import matplotlib.pyplot as plt


random.seed(42)
N = 10000 #number of samples to construct the ecdf from

#ecdf from samples
def ecdf(x):
    X = sort(x)
    Y = linspace(0,1,len(X))
    return X,Y

#a simple sampler class
class sampler():
    def __init__(self,x,name='unknown'):
        self.X, self.Y = ecdf(x)
        self.name = name

    def get_ecdf(self):
        return self.X,self.Y

    def get_samples(self,size=1000):
        """Sample the unknown distribution by first sampling a uniform distribution between 0 and 1 and
        then calculating the inverse ecdf on these samples"""
        U = random.uniform(low=0,high=1,size=size)
        #Find the index in Y where a uniform sample u is closest to.
        #Then find the corresponding X value: this is the inverse ecdf and your sample!
        #Note this only gives back samples identical to the original unknown samples...
        #S = [self.X[abs(u-self.Y).argmin()] for u in U] #works!

        #...however better is to interpolate X between corresponding Y values that U falls inbetween,
        #assuming
        S = interp(U,self.Y,self.X)
        return array(S)


#generate samples from some 'unknown' and weird distribution
def some_weird_distribution(Nsamples):
    X= []
    n = 0
    while(n<Nsamples):
        x = random.beta(2,5) #see https://www.statisticshowto.datasciencecentral.com/beta-distribution/
        if x<0.3 or x>0.5:
            X.append(x)
            n +=1

    return array(X)

#get the samples from the unknown distribution we want to 'copy'
x_unknown1 = some_weird_distribution(N)

#make a sampler
sampler1= sampler(x_unknown1)

#make a nice figure
fig, ((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(2,3,figsize=(8,4),dpi=150)

#see more color names: plt.cm.colors.cnames
c1 = 'tomato'
c2 = 'steelblue'
c3 = 'ivory'
c4 = 'black'
c5 = 'green'

#samples
ax1.scatter(range(N),x_unknown1, c=c1, marker='.',alpha=0.4,)
ax1.set_title('From unknown distribution',fontsize=10)
#ax1.set_xlabel('sample nr.')
ax1.set_ylabel('sample values')
ax1.text(0,0.05,'N={}'.format(x_unknown1.shape[0]),color=c4)
ax1.set_facecolor(c3)

#histogram
ax2.hist(x_unknown1, bins=50,color=c1,density=True,edgecolor=c4)
ax2.set_title('Histogram',fontsize=10)
#ax2.set_xlabel('sample values')
ax2.set_ylabel('bin values')
ax2.set_facecolor(c3)

#ECDF
x1,y1 = sampler1.get_ecdf()
ax3.set_title('ECDF',fontsize=10)
ax3.scatter(x1,y1,s=5,c=c1,marker='.',alpha=0.5)
ax3.set_facecolor(c3)

#temp directory for the frames
if not os.path.exists('./frames'):
    os.mkdir('./frames')
try:
    os.system('rm ./frames/frame*.png')
except:
    pass

#now generate *new* samples via inverse ecdf method
#and show that these follow the distribution of the given samples of the unknown distribution
#same histogram, same ecdf
for k,Ns in enumerate([20,20,50,100,200,500,1000,2000,4000,8000,10000]):
    s = sampler1.get_samples(Ns)

    #show the generated samples
    ax4.scatter(range(Ns),s, c=c2, marker='.',alpha=0.4)
    ax4.set_title('Inverse ECDF samples',fontsize=10)
    ax4.set_xlabel('sample nr.')
    ax4.set_ylabel('sample values')
    ax4.set_ylim(ax1.get_ylim())
    #ax4.set_xlim(ax1.get_xlim())
    #ax4.set_xticks(ax1.get_xticks())
    ax4.text(0,0.05,'N={}'.format(s.shape[0]),color=c4)
    ax4.set_facecolor(c3)

    #show the histogram of generated samples
    ax5.hist(s, bins=50,color=c2,density=True,edgecolor=c4)
    ax5.set_title('Histogram',fontsize=10)
    ax5.set_xlabel('samples values')
    ax5.set_ylabel('bin values')
    ax5.set_xlim(ax2.get_xlim())
    #ax5.set_ylim(ax2.get_ylim())
    ax5.set_facecolor(c3)

    #ecdf of generated samples
    xe,ye = ecdf(s)
    ax6.set_title('ECDF',fontsize=10)
    ax6.scatter(xe,ye,s=5,c=c2,marker='.',alpha=0.5)
    ax6.set_xlabel('sample values')
    ax6.set_xlim(ax3.get_xlim())
    ax6.set_facecolor(c3)
    #ax6.set_xticks(ax3.get_xticks())

    #save a frame
    fig.tight_layout()
    fig.savefig('./frames/frame%05d.png' %k)
    ax4.clear()
    ax5.clear()
    ax6.clear()

    #fig.show()

#for some reason, there's a 'jump' between 1st and 2nd frame
#simply removing duplicate of frame solves this...
os.remove('./frames/frame%05d.png' %0)

plt.close('all')
#now assemble all the pictures in a movie
print('converting to gif!')
#requires imagemagick
os.system('convert -delay 100 -loop 0 ./frames/*.png inverse_sampling.gif')
#or to mp4
#print('converting to mp4!')
#os.system("ffmpeg -y -r 5 -i ./frames/frame%05d.png -c:v libx264 -vf fps=10 inverse_sampling.mp4")