Advertisement
Guest User

Untitled

a guest
Sep 21st, 2017
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.08 KB | None | 0 0
  1. #!/usr/bin/python2.5
  2. # coding=utf-8
  3. # -*- encoding: utf-8 -*-
  4.  
  5. import sys, codecs, copy, Ft, os;
  6. from Ft.Xml.Domlette import NonvalidatingReader;
  7. from Ft.Xml.XPath import Evaluate;
  8.  
  9. sys.stdout = codecs.getwriter('utf-8')(sys.stdout);
  10. sys.stderr = codecs.getwriter('utf-8')(sys.stderr);
  11.  
  12. if len(sys.argv) < 2: #{
  13.     print 'python apertium2extract.py <dix file>';
  14.     sys.exit(-1);
  15. #}
  16.  
  17. dictionary = sys.argv[1];
  18.  
  19. if dictionary == os.path.basename(dictionary): #{
  20.     dictionary = os.getcwd() + '/' + dictionary;
  21. #}
  22.  
  23. doc = NonvalidatingReader.parseUri('file:///' + dictionary);
  24. path = '/dictionary/pardefs/pardef';
  25.  
  26. paradigms = {};
  27. #categories = ['__n', '__adj', '__vblex'];
  28. categories = ['__n', '__adj'];
  29.  
  30. for node in Ft.Xml.XPath.Evaluate(path, contextNode=doc): #{
  31.         pardef = node.getAttributeNS(None, 'n');
  32.  
  33.     if pardef not in paradigms: #{
  34.         paradigms[pardef] = [];
  35.     #}
  36.     selected = 0;
  37.     for category in categories: #{
  38.         if pardef.count(category) > 0: #{
  39.             selected = 1;
  40.         #}
  41.     #}
  42.  
  43.     if selected < 1: #{
  44.         continue;
  45.     #}
  46.  
  47.     for child in Ft.Xml.XPath.Evaluate('.//e', contextNode=node): #{
  48.         for pair in Ft.Xml.XPath.Evaluate('.//p', contextNode=child): #{
  49.             suffix = '';
  50.             left = Ft.Xml.XPath.Evaluate('.//l', contextNode=pair)[0].firstChild;
  51.  
  52.             if type(left) != type(None): #{
  53.                 suffix = left.nodeValue;
  54.             else: #{
  55.                 suffix = ''
  56.             #}
  57.  
  58.  
  59.             symbols = '';
  60.             right =  Ft.Xml.XPath.Evaluate('.//r', contextNode=pair)[0];
  61.             for sym in Ft.Xml.XPath.Evaluate('.//s', contextNode=right): #{
  62.                 symbol = '';
  63.                 if type(sym) != type(None): #{
  64.                     symbol = sym.getAttributeNS(None, 'n');
  65.                 #}
  66.                 symbols = symbols + '.' + symbol;
  67.             #}
  68.  
  69.             paradigms[pardef].append(suffix);
  70.         #}
  71.     #}
  72. #}
  73.  
  74. universal_set = []
  75.  
  76. for paradigm in paradigms.keys(): #{
  77.     for p in paradigms[paradigm]: #{
  78.         universal_set.append(p);
  79.     #}
  80. #}
  81.  
  82. universal_set = set(universal_set);
  83.  
  84. for paradigm in paradigms.keys(): #{
  85.     #paradigm rna__vblex =
  86.     #        x
  87.     #        { x+"ð" & x+"ði" & x+"ðu" & x+"ður" & x+"r" & ~(x+"dur" | x+"t")} ;
  88.  
  89. #   if paradigm.count('/') > 0: #{
  90.         #continue;
  91.     #}
  92.     lset = len(set(paradigms[paradigm]));
  93.     if lset < 2: #{
  94.         continue;
  95.     #}
  96.  
  97.     print '-- ' + paradigm;
  98. #   print 'paradigm ' + paradigm.encode('ascii', 'ignore') + ' ='; 
  99.     print 'paradigm ' + paradigm + ' =';
  100.     print '\t' + 'x {';
  101.  
  102.     stems = '\t\t';
  103.     count = 0;
  104.     idx = 0;
  105.     for pair in set(paradigms[paradigm]): #{
  106.         if len(pair) >= 1: #{
  107.             add = 'x+"' + pair + '" ';
  108.             if idx != lset-1: #{
  109.                 add = 'x+"' + pair + '" & ';
  110.             #}
  111.             stems = stems + add;
  112.         #}
  113.  
  114.         if count == 6 and count != lset -1: #{
  115.             stems = stems + '\n\t\t';
  116.             count = 0;
  117.         #}
  118.         count = count + 1;
  119.         idx = idx + 1;
  120.     #}
  121.     complement = universal_set - set(paradigms[paradigm]);
  122.     stems = stems + ' & \n\t\t~(';
  123.     count = 0;
  124.     for s in complement: #{
  125.         stems = stems + 'x+"' + s + '" | ';
  126.         if count == 6 and count != lset -1: #{
  127.             stems = stems + '\n\t\t';
  128.             count = 0;
  129.         #}
  130.         count = count + 1;
  131.     #}
  132.     stems = stems + ')\t' + '};';  
  133.     print stems.replace('| )', ')').replace('| \n\t\t)', ')');
  134.     print '';
  135. #}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement