Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Nikola Jovanovic 0081/15
- from string import ascii_lowercase
- from string import ascii_uppercase
# All ASCII letters, lowercase first and then uppercase.
# NOTE(review): the string module already ships an ascii_letters constant.
ascii_letters = ''.join((ascii_lowercase, ascii_uppercase))
def import_standard_etaoin(dat_name=''):
    """Load reference letter frequencies, falling back to built-in values.

    Reads the file ``dat_name + '.txt'``.  Each non-blank line must consist
    of a letter, a tab, and a percentage followed by exactly one trailing
    character (presumably a percent sign); that trailing character is
    dropped and the number is divided by 100.  The first character of the
    line becomes the dictionary key.

    NOTE(review): the default argument '' makes the function look for a
    file literally named '.txt'; the default is kept only for interface
    compatibility.

    Args:
        dat_name: Path of the frequency file without the '.txt' extension.

    Returns:
        A dict mapping letter -> relative frequency (a fraction, not a
        percentage).  If the file cannot be read or any line is malformed,
        the hard-coded table below is returned instead.
    """
    std_freq = {}
    try:
        # The extension used to be '.txt.' (stray trailing dot), which only
        # resolves to the intended file on Windows; elsewhere the open always
        # failed and the defaults were used silently.
        with open(dat_name + '.txt', 'r') as letterfreq:
            for row in letterfreq:
                row = row.replace('\n', '')
                # A blank (e.g. trailing) line must not discard the whole file.
                if not row.strip():
                    continue
                std_freq[row[0]] = float(row.split('\t')[1][:-1]) / 100
    except (IOError, OSError, IndexError, ValueError):
        # Unreadable file or malformed line: use the predefined frequencies.
        print ('Koriscenje predefinisanih ucestalosti.')
        return {'e': 0.12702,
                't': 0.09056,
                'a': 0.08167,
                'o': 0.07507,
                'i': 0.06966,
                'n': 0.06749,
                's': 0.06327,
                'h': 0.06094,
                'r': 0.05987,
                'd': 0.04253,
                'l': 0.04025,
                'u': 0.02782,
                'c': 0.02758,
                'm': 0.02406,
                'w': 0.02361,
                'f': 0.02228,
                'g': 0.02015,
                'y': 0.01974,
                'p': 0.01929,
                'b': 0.01492,
                'v': 0.00978,
                'k': 0.00772,
                'j': 0.00153,
                'x': 0.00150,
                'q': 0.00095,
                'z': 0.00074}
    print ('Fajl ucitan.')
    return std_freq
def m_o_succ(succs):
    """Return the key of ``succs`` that has the largest count.

    Entries are ranked by (count, key), so when several keys share the
    highest count the key that compares greatest is returned.

    Args:
        succs: Mapping of key -> count.

    Returns:
        The winning key.
    """
    best_key, _ = max(succs.items(), key=lambda pair: (pair[1], pair[0]))
    return best_key
def precess_file(dat_name):
    """Compute per-letter statistics for the text file ``dat_name + '.txt'``.

    Only characters found in ``ascii_letters`` are counted, and they are
    lowercased first.  Any other character ends the current run of letters,
    so a successor pair never spans a non-letter.

    Args:
        dat_name: Path of the text file without the '.txt' extension.

    Returns:
        A dict mapping every lowercase ASCII letter to a two-item list:
        ``[relative frequency, successor chosen by m_o_succ]``.  When the
        file contains no ASCII letters every frequency is 0.0 (this used to
        raise ZeroDivisionError).

    Raises:
        SystemExit: if the file cannot be opened (after printing a message).
    """
    dat_lett_count = {c: 0 for c in ascii_lowercase}
    lett_succs = {k: {c: 0 for c in ascii_lowercase} for k in ascii_lowercase}
    try:
        text_dat = open(dat_name + '.txt')
    except (IOError, OSError):
        print ('Greska prilikom ucitavanja fajla')
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit
    # Close the handle even if reading fails.
    with text_dat:
        text = text_dat.read()

    letts_in_dat = 0
    pc = ''  # previous letter, or '' when the previous character was not a letter
    for c in text:
        if c in ascii_letters:
            c = c.lower()
            dat_lett_count[c] += 1
            letts_in_dat += 1
            if pc:
                lett_succs[pc][c] += 1
            pc = c
        else:
            pc = ''

    # With no letters all counts are 0, so any non-zero divisor yields 0.0.
    total = float(letts_in_dat) if letts_in_dat else 1.0
    return {c: [dat_lett_count[c] / total, m_o_succ(lett_succs[c])]
            for c in ascii_lowercase}
def compare_freq(std_etaoin, dat_data):
    """Return the ratio of expected to observed frequency for each letter.

    Args:
        std_etaoin: Mapping letter -> expected relative frequency.
        dat_data: Mapping letter -> sequence whose first item is the
            observed relative frequency.

    Returns:
        A dict mapping every lowercase ASCII letter to
        ``expected / round(observed, 5)``.  When the rounded observed
        frequency is zero (the letter is absent from the text) the ratio is
        reported as ``float('inf')`` instead of raising ZeroDivisionError.
    """
    diff = {}
    for c in ascii_lowercase:
        observed = round(dat_data[c][0], 5)
        if observed:
            diff[c] = std_etaoin[c] / observed
        else:
            # Letter never seen: the ratio is unbounded.
            diff[c] = float('inf')
    return diff
def print_results(std_etaoin, dat_data, comp):
    """Print a table of observed vs. expected letter frequencies.

    Rows are ordered by observed frequency, highest first (ties fall back
    to the letter itself, also descending).

    Args:
        std_etaoin: Mapping letter -> expected relative frequency.
        dat_data: Mapping letter -> sequence of
            (observed relative frequency, successor letter).
        comp: Mapping letter -> expected/observed ratio.
    """
    rows = sorted(
        ((info[0], letter, info[1]) for letter, info in dat_data.items()),
        reverse=True)
    print ('slovo | dobijeni % | ocekivani % | odnos % | sledbenik')
    row_fmt = ' | {:9.3f}% | {:10.3f}% | {:7.3f} | '
    for share, letter, follower in rows:
        middle = row_fmt.format(round(share * 100, 3),
                                std_etaoin[letter] * 100,
                                comp[letter])
        print (letter + middle + follower)
def save_results(dat_name, diff):
    """Write the per-letter ratios to ``dat_name + '-etaoin.txt'``.

    The file gets one line per lowercase ASCII letter, in alphabetical
    order: the letter, a tab, and the ratio with three decimals.  Lines are
    joined with newlines and there is no trailing newline.

    Args:
        dat_name: Base name of the output file (the suffix is appended).
        diff: Mapping letter -> ratio; must contain every lowercase letter.

    Raises:
        KeyError: if ``diff`` lacks a letter.  The content is built before
            the file is opened, so in that case no file is created or
            truncated.
    """
    out = [c + '\t{:.3f}'.format(round(diff[c], 3)) for c in ascii_lowercase]
    # "with" guarantees the handle is closed even if the write fails.
    with open(dat_name + '-etaoin.txt', 'w') as out_dat:
        out_dat.write('\n'.join(out))
# raw_input exists only on Python 2; on Python 3 the equivalent is input.
# Everything else in this script runs on both, so pick whichever is available.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

# Ask for the reference frequency table and load it (or the built-in defaults).
std_etaoin_dat_name = _read_line('Unesite ime datoteke sa ucestanoscu slova (bez .txt): ')
standard_freq = import_standard_etaoin(std_etaoin_dat_name)
# Ask for the text to analyse and compute its letter statistics.
dat_name = _read_line('Unesite ime tekstulane datoteke (bez .txt): ')
file_data = precess_file(dat_name)
comp = compare_freq(standard_freq, file_data)
print_results(standard_freq, file_data, comp)
## Reviewer: the letter occurrence count is not printed, but it is computed, so that is fine (-0)
save_results(dat_name, comp)
## Reviewer: everything is fully correct
## Reviewer: points: 80/80
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement