Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Tested in Google Colab. Code released into public domain.
- # The next few lines are bash commands (indicated by ! in Jupyter)
- # Clone the repository whose aed-tei/files/*.xml files generate_egy_corpus() reads below.
- !git clone https://github.com/simondschweitzer/aed-tei
- # Save the PetScan JSON listing of the en.wiktionary "Egyptian_lemmas" category as wiktionary.json (loaded further down).
- !wget -O wiktionary.json "https://petscan.wmflabs.org/?language=en&project=wiktionary&categories=Egyptian_lemmas&format=json&doit=1"
- import glob
- from lxml import etree
- import collections
- import json
- import itertools
# Reference ordering of the transliteration letters; the frequency orders
# computed below are compared against it.
egy_letters = 'ꜣjꜥwbpfmnrhḥḫẖzsšqkgtṯdḏ'

# Every distinct word string seen so far; filled by extract_egy_text().
all_words = set()

# Shared XML parser, created with recover=True so that lxml tries to read
# through malformed markup instead of aborting.
parser = etree.XMLParser(recover=True)
def normalize(text, alphabet):
    '''Lower-cases text and removes every character that is not in alphabet.

    The text is brought into Unicode NFC form after lower-casing, so a letter
    written as base character + combining mark (e.g. "h" + COMBINING DOT
    BELOW) is recognised as the corresponding precomposed letter instead of
    being counted as the bare base character.

    text: the string to filter.
    alphabet: the characters to keep; a string or any iterable of
        single-character strings.

    Returns the kept characters, in their original order, as one string.
    '''
    import unicodedata

    if isinstance(alphabet, str):
        alphabet = unicodedata.normalize('NFC', alphabet)
    allowed = set(alphabet)
    # Lower-case first, then compose: some capitals (e.g. H with macron
    # below) only exist in decomposed form, and lower-casing them yields a
    # decomposed sequence that NFC then merges into the single letter.
    composed = unicodedata.normalize('NFC', text.lower())
    return ''.join(char for char in composed if char in allowed)
def char_frequency(input_str):
    '''Returns the distinct characters of input_str, most frequent first.

    Characters with equal counts keep the order of their first appearance.
    '''
    tally = collections.Counter(input_str)
    # sorted() is stable, including with reverse=True, so ties stay in
    # first-seen order.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ''.join(ranked)
def extract_egy_text(path):
    '''Given the path to an XML file, returns the Egyptian text contained therein.

    Every <s> element in the TEI namespace becomes one sentence: the text of
    its <w> descendants joined by single spaces and followed by '. '. The
    sentences are concatenated in document order.

    Side effect: every word string found is added to the module-level
    all_words set.
    '''
    tei = '{http://www.tei-c.org/ns/1.0}'
    tree = etree.parse(path, parser)
    sentence_texts = []
    for sentence in tree.findall('.//' + tei + 's'):
        # itertext() also collects text nested in child elements of a <w>.
        words = [''.join(word.itertext())
                 for word in sentence.findall('.//' + tei + 'w')]
        # Updating in place mutates the shared set, so no `global` is needed.
        all_words.update(words)
        sentence_texts.append(' '.join(words) + '. ')
    # Join once at the end instead of growing a string with += per sentence.
    return ''.join(sentence_texts)
def _first_positions(seq):
    '''Maps each item of seq to the index of its first occurrence.'''
    positions = {}
    for index, item in enumerate(seq):
        positions.setdefault(item, index)
    return positions


def kendall_tau_distance(a, b):
    '''Returns the raw and normalized Kendall tau distance between two orderings.

    a, b: sequences (strings or lists) of hashable items; every item of a
        must also occur in b.

    Returns a tuple (dist, normalized): dist is the number of item pairs from
    a that appear in opposite relative order in b, and normalized is dist
    divided by the number of pairs, n*(n-1)/2 with n = len(a). With fewer
    than two items there are no pairs to compare, so (0, 0.0) is returned.

    Raises ValueError if an item of a is missing from b.
    '''
    n = len(a)
    if n < 2:
        # Avoids dividing by a pair count of zero.
        return (0, 0.0)
    # Look positions up once rather than calling .index() for every pair.
    pos_a = _first_positions(a)
    pos_b = _first_positions(b)
    missing = [item for item in pos_a if item not in pos_b]
    if missing:
        raise ValueError(f'items of a missing from b: {missing!r}')
    # A pair is discordant when its position differences have opposite signs.
    dist = sum(
        (pos_a[x] - pos_a[y]) * (pos_b[x] - pos_b[y]) < 0
        for x, y in itertools.combinations(a, 2)
    )
    return (dist, dist / (n * (n - 1) / 2))
def generate_egy_corpus():
    '''Builds the corpus text from the XML files under aed-tei/files/.

    Files whose path contains an underscore are skipped (presumably these
    are not text files -- verify against the repository layout). The texts
    returned by extract_egy_text() are joined with newlines.
    '''
    # glob's result order depends on the file system. Sorting fixes the
    # order of the corpus text, and with it the first-seen order that
    # decides ties between equally frequent letters.
    paths = sorted(glob.glob('aed-tei/files/*.xml'))
    return '\n'.join(extract_egy_text(path) for path in paths if '_' not in path)
# Build the corpus first: extract_egy_text() fills all_words as it goes.
corpus = generate_egy_corpus()
# Set iteration order for strings can differ between interpreter runs;
# sorting makes the lemma string (and any frequency tie-break) reproducible.
lemmas = ''.join(sorted(all_words))
# Read the downloaded listing with an explicit encoding and close the file
# promptly instead of leaving the handle to the garbage collector.
with open('wiktionary.json', 'r', encoding='utf-8') as wikt_file:
    wikt_pages = json.load(wikt_file)['*'][0]['a']['*']
# Only the page titles are needed for letter counting.
wikt_lemmas = ' '.join([page['title'] for page in wikt_pages])
def output(name, text, standard_order=egy_letters):
    '''Prints and returns the letters of text ordered by descending frequency.

    name: label printed in front of the result.
    text: the text whose letters are counted; characters outside
        standard_order are ignored.
    standard_order: the reference ordering of the alphabet.

    Prints name, the frequency order and its Kendall tau distance to
    standard_order, then returns the frequency order as a string.
    '''
    freq_order = char_frequency(normalize(text, standard_order))
    # A letter that never occurs in text is absent from freq_order, so
    # compare against only the reference letters that do occur; otherwise
    # the distance computation fails on the missing letter.
    present = set(freq_order)
    reference = [char for char in standard_order if char in present]
    print(name, freq_order, kendall_tau_distance(reference, freq_order))
    return freq_order
# Show the reference order, then the frequency order for each text source.
print('Standard order', egy_letters)
corpus_order = output('Corpus order', corpus)
corpus_lemma_order = output('Corpus lemma order', lemmas)
wikt_lemma_order = output('Wiktionary lemma order', wikt_lemmas)

# Consonant frequency order taken from Peust (2008):
# https://archiv.ub.uni-heidelberg.de/propylaeumdok/2676/1/Peust_On_consonant_frequency_in_Egyptian_2008.pdf#page=10
peust_order = 'ntrmsꜣḥpfꜥkḫbḏṯdšzqẖhg'
# Peust's list is shorter than egy_letters (it has no j or w), so only the
# letters it contains are kept from our corpus order before comparing.
corpus_in_peust = [c for c in corpus_order if c in peust_order]
peust_distance = kendall_tau_distance(peust_order, corpus_in_peust)
print('Peust, 2008 (vs. our corpus order)', peust_order, peust_distance)
Advertisement
Add Comment
Please, Sign In to add comment