Untitled

# Tested in Google Colab. Code released into public domain.
# The next few lines are bash commands (indicated by ! in Jupyter)

# Fetch the aed-tei repository; its files/*.xml documents are the corpus parsed below.
!git clone https://github.com/simondschweitzer/aed-tei
# Save a PetScan JSON listing of the pages in English Wiktionary's Egyptian_lemmas
# category as wiktionary.json (read back further down with json.load).
!wget -O wiktionary.json "https://petscan.wmflabs.org/?language=en&project=wiktionary&categories=Egyptian_lemmas&format=json&doit=1"

import glob
from lxml import etree
import collections
import json
import itertools

# Shared XML parser; recover=True asks lxml to keep parsing past malformed markup.
parser = etree.XMLParser(recover=True)
# Egyptian transliteration letters. Printed below as the 'Standard order' and used
# as the default alphabet/reference ordering by output().
egy_letters = 'ꜣjꜥwbpfmnrhḥḫẖzsšqkgtṯdḏ'
# Every distinct word seen so far; filled in by extract_egy_text() as files are parsed.
all_words = set()

def normalize(text, alphabet):
  '''Lowercases text and keeps only the characters that occur in alphabet.

  The surviving characters stay in their original order; everything else
  (spaces, punctuation, characters outside the alphabet) is dropped.
  '''
  lowered = text.lower()
  return ''.join(symbol for symbol in lowered if symbol in alphabet)

def char_frequency(input_str):
  '''Returns the distinct characters of input_str, most frequent first.

  Characters with equal counts keep the order in which they first appear.
  '''
  tally = collections.Counter(input_str)
  # sorted() is stable even with reverse=True, so ties stay in first-seen order.
  ranked = sorted(tally, key=tally.get, reverse=True)
  return ''.join(ranked)

def extract_egy_text(path):
  '''Parses the XML file at path and returns its words as sentence-separated text.

  Each TEI <s> element becomes one sentence: the text content of its <w>
  descendants joined by single spaces and terminated with '. '. As a side
  effect, every word encountered is added to the module-level all_words set.
  '''
  tei = '{http://www.tei-c.org/ns/1.0}'
  document = etree.parse(path, parser)
  sentence_texts = []
  for sentence in document.findall('.//' + tei + 's'):
    tokens = [''.join(w.itertext()) for w in sentence.findall('.//' + tei + 'w')]
    # Record the vocabulary in the shared set (mutated in place).
    all_words.update(tokens)
    sentence_texts.append(' '.join(tokens) + '. ')
  return ''.join(sentence_texts)

def kendall_tau_distance(a, b):
    '''Returns the raw and normalized Kendall tau distance between two orderings.

    The raw distance is the number of item pairs from a whose relative order
    differs in b. The normalized distance divides that count by the number of
    pairs, n*(n-1)/2, giving 0.0 for identical orderings and 1.0 for exactly
    reversed ones.

    Args:
      a: Sequence of hashable items (e.g. a string or a list of characters).
      b: Sequence holding the same items as a, in a possibly different order.

    Returns:
      A (raw_distance, normalized_distance) tuple. A sequence with fewer than
      two items has no pairs to compare, so (0, 0.0) is returned for it.

    Raises:
      ValueError: If a has at least two items and one of them is not in b.
    '''
    n = len(a)
    if n < 2:
        # No pairs exist; avoid dividing by zero in the normalization below.
        return (0, 0.0)

    # Look each position up once instead of rescanning with .index() per pair.
    pos_a, pos_b = {}, {}
    for positions, sequence in ((pos_a, a), (pos_b, b)):
        for index, item in enumerate(sequence):
            # setdefault keeps the first occurrence, matching sequence.index().
            positions.setdefault(item, index)

    missing = [item for item in pos_a if item not in pos_b]
    if missing:
        raise ValueError('items of a missing from b: %r' % (missing,))

    # A pair is discordant when its position differences have opposite signs.
    dist = sum(
        (pos_a[x] - pos_a[y]) * (pos_b[x] - pos_b[y]) < 0
        for x, y in itertools.combinations(a, 2)
    )
    return (dist, dist / (n * (n - 1) / 2))

def generate_egy_corpus():
  '''Builds the corpus text from the XML files matching aed-tei/files/*.xml.

  Files whose path contains an underscore are skipped. The text extracted
  from each remaining file is joined with newlines.
  '''
  texts = []
  for path in glob.glob('aed-tei/files/*.xml'):
    if '_' in path:
      continue
    texts.append(extract_egy_text(path))
  return '\n'.join(texts)

# Build the corpus first: extract_egy_text() fills all_words as a side effect,
# so all_words is only complete once this call has returned.
corpus = generate_egy_corpus()
lemmas = ''.join(all_words)
# Read the downloaded PetScan result with an explicit encoding, and close the
# file deterministically instead of leaving the handle open.
with open('wiktionary.json', 'r', encoding='utf-8') as wikt_file:
  wikt_pages = json.load(wikt_file)['*'][0]['a']['*']
# Only the page titles are used; join them into one string for counting.
wikt_lemmas = ' '.join(page['title'] for page in wikt_pages)

def output(name, text, standard_order=egy_letters):
  '''Prints and returns the frequency order of the letters found in text.

  text is reduced to the characters of standard_order, those characters are
  ranked by descending frequency, and the ranking is printed after name
  together with its Kendall tau distance from standard_order.
  '''
  filtered = normalize(text, standard_order)
  freq_order = char_frequency(filtered)
  distance = kendall_tau_distance(standard_order, freq_order)
  print(name, freq_order, distance)
  return freq_order

# Report each frequency ordering together with its Kendall tau distance from
# the standard letter order.
print('Standard order', egy_letters)
corpus_order = output('Corpus order', corpus)
corpus_lemma_order = output('Corpus lemma order', lemmas)
wikt_lemma_order = output('Wiktionary lemma order', wikt_lemmas)

# Letter order attributed to Peust (2008); see
# https://archiv.ub.uni-heidelberg.de/propylaeumdok/2676/1/Peust_On_consonant_frequency_in_Egyptian_2008.pdf#page=10
peust_order = 'ntrmsꜣḥpfꜥkḫbḏṯdšzqẖhg'
# Keep only the letters that also occur in peust_order so that both sequences
# passed to kendall_tau_distance contain the same items.
corpus_order_in_peust = [letter for letter in corpus_order if letter in peust_order]
peust_distance = kendall_tau_distance(peust_order, corpus_order_in_peust)
print('Peust, 2008 (vs. our corpus order)', peust_order, peust_distance)