Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Generate imitation place names with a character-level Markov chain.
#
# The chain is trained on a list of real names (one per line).  The state is
# the previous two characters and '\n' is used as the padding/boundary symbol,
# so every name is modelled as '\n\n' + name + '\n\n'.  Generated names are
# collected until NECESSARY_LENGTH unique ones exist; after the first few,
# a candidate is kept only if it keeps (or moves) the mean and standard
# deviation of the name lengths close to those of the training list.

from collections import Counter, defaultdict
from math import sqrt
from random import choice
from statistics import mean, stdev
from typing import Callable, Iterable, Optional

INPUT_PATH = r'/home/hellerick/Documents/Data/Geonames/Italian populated places.txt'
OUTPUT_PATH = '/home/hellerick/Documents/Data/Geonames/Italian populated places - Imitation.txt'

# Number of unique imitation names to produce.
NECESSARY_LENGTH = 10000

# Boundary symbol; it can never occur inside a name read line by line.
BOUNDARY = '\n'
START_STATE = BOUNDARY * 2

# Names accepted unconditionally before the length statistics are enforced
# (a sample standard deviation needs at least two values to be defined).
SEED_SIZE = 10


def read_names(path: str) -> list[str]:
    """Return the non-empty lines of the UTF-8 text file *path*.

    The file is expected to hold one name per line.  Blank lines (including
    a trailing one) are skipped, because an empty name cannot be fed to the
    chain.  Names are otherwise returned unchanged (no case folding or
    whitespace stripping).
    """
    # Plain text mode already performs universal-newline translation; the
    # legacy 'U' mode flag was removed in Python 3.11.
    with open(path, mode='r', encoding='utf-8') as source:
        stripped = (line.rstrip('\n') for line in source)
        return [name for name in stripped if name]


def build_successors(names: Iterable[str]) -> dict[str, list[str]]:
    """Build the transition table of the two-character-context chain.

    Returns a dict mapping each two-character state to the list of
    characters observed right after it, each repeated as many times as it
    was observed, so that a uniform ``random.choice`` over the list is a
    frequency-weighted draw.  Empty names are ignored.
    """
    counts: dict[str, Counter] = defaultdict(Counter)
    for name in names:
        if not name:
            continue
        padded = START_STATE + name + START_STATE
        # One transition per character of the name plus the two closing
        # boundary symbols: state padded[k:k+2] is followed by padded[k+2].
        for k in range(len(padded) - 2):
            counts[padded[k:k + 2]][padded[k + 2]] += 1
    # Expand the counters once here instead of on every generated character.
    return {state: list(tally.elements()) for state, tally in counts.items()}


def generate_name(successors: dict[str, list[str]]) -> str:
    """Random-walk the chain from the start state and return one name.

    The walk ends when the state returns to two boundary symbols; the two
    closing boundary symbols are not part of the returned name.
    """
    state = START_STATE
    emitted = []
    while True:
        following = choice(successors[state])
        emitted.append(following)
        state = state[1] + following
        if state == START_STATE:
            break
    return ''.join(emitted[:-2])


def length_stats(count: int, total: int, total_sq: int) -> tuple[float, float]:
    """Return ``(mean, sample standard deviation)`` from running sums.

    *count* is the number of values (must be at least 2), *total* their sum
    and *total_sq* the sum of their squares.  With integer inputs the
    variance numerator is computed exactly, so it can never go negative.
    """
    variance = (count * total_sq - total * total) / (count * (count - 1))
    return total / count, sqrt(variance)


def is_acceptable(current: tuple[float, float],
                  candidate: tuple[float, float],
                  target: tuple[float, float],
                  *,
                  mean_tolerance: float = 0.5,
                  dev_tolerance: float = 0.1) -> bool:
    """Decide whether a candidate name may join the result set.

    Each argument is a ``(mean, stdev)`` pair of name lengths: *current* for
    the set as it is, *candidate* for the set with the new name added, and
    *target* for the training list.  The candidate is acceptable when the
    resulting statistics are within both tolerances of the target, or when
    the summed distance to the target strictly decreases.
    """
    mean_gap = abs(candidate[0] - target[0])
    dev_gap = abs(candidate[1] - target[1])
    if mean_gap < mean_tolerance and dev_gap < dev_tolerance:
        return True
    gain = (abs(current[0] - target[0]) - mean_gap) + (abs(current[1] - target[1]) - dev_gap)
    return gain > 0


def collect_names(successors: dict[str, list[str]],
                  target_count: int,
                  target_mean: float,
                  target_dev: float,
                  *,
                  mean_tolerance: float = 0.5,
                  dev_tolerance: float = 0.1,
                  max_attempts: Optional[int] = None,
                  report: Optional[Callable[[str], object]] = None) -> set[str]:
    """Generate names until *target_count* unique ones are accepted.

    The first ``SEED_SIZE`` unique names are taken as they come; afterwards
    a name is added only if ``is_acceptable`` approves the length statistics
    it would produce.  Already-collected names are skipped, since they can
    never grow the set.

    *report*, if given, is called with every name accepted after the seed
    phase.  *max_attempts*, if given, bounds the number of generated
    candidates; ``RuntimeError`` is raised when it is exhausted.  With the
    default ``None`` the loop runs until the target is reached, which never
    happens if the chain cannot produce enough distinct acceptable names.
    """
    accepted: set[str] = set()
    # Running sum and sum of squares of the accepted name lengths, so the
    # statistics do not have to be recomputed over the whole set each time.
    total = 0
    total_sq = 0
    attempts = 0
    target = (target_mean, target_dev)

    while len(accepted) < target_count:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                'gave up after %d attempts with %d of %d names'
                % (attempts, len(accepted), target_count))
        attempts += 1

        name = generate_name(successors)
        if name in accepted:
            continue

        size = len(name)
        if len(accepted) >= SEED_SIZE:
            current = length_stats(len(accepted), total, total_sq)
            candidate = length_stats(len(accepted) + 1, total + size, total_sq + size * size)
            if not is_acceptable(current, candidate, target,
                                 mean_tolerance=mean_tolerance,
                                 dev_tolerance=dev_tolerance):
                continue
            # NOTE(review): the original paste lost its indentation; the
            # progress print is assumed to belong to this acceptance branch.
            if report is not None:
                report(name)

        accepted.add(name)
        total += size
        total_sq += size * size

    return accepted


def main() -> None:
    """Train on INPUT_PATH, generate NECESSARY_LENGTH names, write OUTPUT_PATH."""
    names = read_names(INPUT_PATH)
    if len(names) < 2:
        raise SystemExit('need at least two names in the input list')

    lengths = [len(name) for name in names]
    mastermean = mean(lengths)
    masterdev = stdev(lengths)
    print(mastermean, masterdev)

    result = collect_names(build_successors(names), NECESSARY_LENGTH,
                           mastermean, masterdev, report=print)

    result_lengths = [len(name) for name in result]
    print(mean(result_lengths), stdev(result_lengths))

    with open(OUTPUT_PATH, mode='wt', encoding='utf-8') as f:
        f.writelines(name + '\n' for name in result)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment