Guest User

Untitled

a guest
May 15th, 2022
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.78 KB | None | 0 0
  1. # Tested in Google Colab. Code released into public domain.
  2. # The next few lines are bash commands (indicated by ! in Jupyter)
  3.  
  4. !git clone https://github.com/simondschweitzer/aed-tei
  5. !wget -O wiktionary.json "https://petscan.wmflabs.org/?language=en&project=wiktionary&categories=Egyptian_lemmas&format=json&doit=1"
  6.  
  7. import glob
  8. from lxml import etree
  9. import collections
  10. import json
  11. import itertools
  12.  
  13. parser = etree.XMLParser(recover=True)
  14. egy_letters = 'ꜣjꜥwbpfmnrhḥḫẖzsšqkgtṯdḏ'
  15. all_words = set()
  16.  
  17. def normalize(text, alphabet):
  18.   '''Removes characters other than letters in the given alphabet.'''
  19.   return ''.join([char for char in text.lower() if char in alphabet])
  20.  
  21. def char_frequency(input_str):
  22.   '''Orders characters in a string by descending frequency'''
  23.   counter = collections.Counter(input_str)
  24.   return ''.join([item for item, count in counter.most_common()])
  25.  
  26. def extract_egy_text(path):
  27.   '''Given the path to an XML file, returns the Egyptian text contained therein.'''
  28.   global all_words
  29.   tree = etree.parse(path, parser)
  30.   full_text = ''
  31.   sentences = tree.findall('.//{http://www.tei-c.org/ns/1.0}s')
  32.   for s in sentences:
  33.     words = s.findall('.//{http://www.tei-c.org/ns/1.0}w')
  34.     words = [''.join(word.itertext()) for word in words]
  35.     all_words |= set(words)
  36.     full_text += ' '.join(words) + '. '
  37.   return full_text
  38.  
  39. def kendall_tau_distance(a, b):
  40.     '''Returns the raw and normalized Kendall tau distance between two lists (assumed to have the same items)'''
  41.     n = len(a)
  42.     dist = sum([(a.index(x)-a.index(y))*(b.index(x)-b.index(y))<0 for x, y in itertools.combinations(a, 2)])
  43.     return (dist, dist/(n*(n-1)/2))
  44.  
  45. def generate_egy_corpus():
  46.   files = glob.glob('aed-tei/files/*.xml')
  47.   files = [f for f in files if '_' not in f]
  48.   return '\n'.join([extract_egy_text(path) for path in files])
  49.  
  50. corpus = generate_egy_corpus()
  51. lemmas = ''.join(list(all_words))
  52. wikt_lemmas = json.load(open('wiktionary.json', 'r'))['*'][0]['a']['*']
  53. wikt_lemmas = ' '.join([lemma['title'] for lemma in wikt_lemmas])
  54.  
  55. def output(name, text, standard_order=egy_letters):
  56.   freq_order = char_frequency(normalize(text, standard_order))
  57.   print(name, freq_order, kendall_tau_distance(standard_order, freq_order))
  58.   return freq_order
  59.  
  60. print('Standard order', egy_letters)
  61. corpus_order = output('Corpus order', corpus)
  62. corpus_lemma_order = output('Corpus lemma order', lemmas)
  63. wikt_lemma_order = output('Wiktionary lemma order', wikt_lemmas)
  64. # https://archiv.ub.uni-heidelberg.de/propylaeumdok/2676/1/Peust_On_consonant_frequency_in_Egyptian_2008.pdf#page=10
  65. peust_order = 'ntrmsꜣḥpfꜥkḫbḏṯdšzqẖhg'
  66. print('Peust, 2008 (vs. our corpus order)', peust_order, kendall_tau_distance(peust_order, [c for c in corpus_order if c in peust_order]))
Advertisement
Add Comment
Please, Sign In to add comment