Guest User

Morphman mecab_wrapper.py Mecab3 update

a guest
Aug 16th, 2020
148
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.71 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import importlib
  3. import importlib.util
  4. import re
  5. import subprocess
  6. import sys
  7.  
  8. from .morphemes import Morpheme
  9. from .util_external import memoize
  10.  
  11.  
  12. ####################################################################################################
  13. # Mecab Morphemizer
  14. ####################################################################################################
  15.  
  16.  
# Dictionary detection and output layout.
#
# *_BOS:   the value spawnMecab() looks for on the `bos-feature:` line of MeCab's
#          `-P` config dump in order to identify the installed dictionary.
# *_PARTS: the output-format fields requested from MeCab; spawnMecab() joins them
#          with tabs to build `--node-format`.
# *_LENGTH_*: number of fields a well-formed node must therefore contain.
#
# getMorpheme() reads the resulting fields by position:
#   UniDic layouts: norm, base, inflected, reading, pos, subPos
#   IPADIC layout:  base (also used as norm), inflected, reading, pos, subPos
MECAB_NODE_UNIDIC3_BOS = 'BOS/EOS,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*'
MECAB_NODE_UNIDIC3_PARTS = ['%f[7]', '%f[10]','%m','%f[6]','%f[0]','%f[1]']
MECAB_NODE_LENGTH_UNIDIC3 = len( MECAB_NODE_UNIDIC3_PARTS )
MECAB_NODE_UNIDIC_BOS = 'BOS/EOS,*,*,*,*,*,*,*,*,*,*,*,*,*'
MECAB_NODE_UNIDIC_PARTS = ['%f[7]', '%f[12]', '%m', '%f[6]', '%f[0]', '%f[1]']
MECAB_NODE_LENGTH_UNIDIC = len(MECAB_NODE_UNIDIC_PARTS)
MECAB_NODE_IPADIC_BOS = 'BOS/EOS,*,*,*,*,*,*,*,*'
MECAB_NODE_IPADIC_PARTS = ['%f[6]', '%m', '%f[7]', '%f[0]', '%f[1]']
MECAB_NODE_LENGTH_IPADIC = len(MECAB_NODE_IPADIC_PARTS)
# Position of the reading field inside an IPADIC node; used by fixReading().
MECAB_NODE_READING_INDEX = 2

# Charset of the MeCab dictionary. Stays None until spawnMecab() fills it in from
# the `charset:` line of the `-D` dictionary-info dump; interact() uses it to
# encode input and decode output.
MECAB_ENCODING = None
# Nodes whose part of speech is listed here are dropped by getMorpheme().
MECAB_POS_BLACKLIST = [
    '記号',  # "symbol", generally punctuation
    '補助記号',  # "symbol", generally punctuation
    '空白',  # Empty space
]
# Nodes whose sub part of speech is listed here are dropped by getMorpheme().
MECAB_SUBPOS_BLACKLIST = [
    '数詞',  # Numbers
]

# Selects the node layout getMorpheme() expects (UniDic when True, IPADIC when
# False). This is only the initial value: spawnMecab() overwrites it once the
# dictionary has been identified.
is_unidic = True

# Regex character class (three code-point ranges) for kanji, in the pattern form
# that extract_unicode_block() accepts.
kanji = r'[㐀-䶵一-鿋豈-頻]'
  41.  
  42.  
  43. def extract_unicode_block(unicode_block, string):
  44.     """ extracts and returns all texts from a unicode block from string argument.
  45.        Note that you must use the unicode blocks defined above, or patterns of similar form """
  46.     return re.findall(unicode_block, string)
  47.  
  48.  
  49. def getMorpheme(parts):
  50.     global is_unidic
  51.  
  52.     if is_unidic:
  53.         if len(parts) != MECAB_NODE_LENGTH_UNIDIC or len(parts) != MECAB_NODE_LENGTH_UNIDIC3:
  54.             return None
  55.  
  56.         pos = parts[4] if parts[4] != '' else '*'
  57.         subPos = parts[5] if parts[5] != '' else '*'
  58.  
  59.         if (pos in MECAB_POS_BLACKLIST) or (subPos in MECAB_SUBPOS_BLACKLIST):
  60.             return None
  61.  
  62.         norm = parts[0]
  63.         base = parts[1]
  64.         inflected = parts[2]
  65.         reading = parts[3]
  66.  
  67.         return Morpheme(norm, base, inflected, reading, pos, subPos)
  68.     else:
  69.         if len(parts) != MECAB_NODE_LENGTH_IPADIC:
  70.             return None
  71.  
  72.         pos = parts[3] if parts[3] != '' else '*'
  73.         subPos = parts[4] if parts[4] != '' else '*'
  74.  
  75.         if (pos in MECAB_POS_BLACKLIST) or (subPos in MECAB_SUBPOS_BLACKLIST):
  76.             return None
  77.  
  78.         norm = parts[0]
  79.         base = parts[0]
  80.         inflected = parts[1]
  81.         reading = parts[2]
  82.  
  83.         m = fixReading(Morpheme(norm, base, inflected, reading, pos, subPos))
  84.         return m
  85.  
  86.  
  87. @memoize
  88. def getMorphemesMecab(e):
  89.     ms = [getMorpheme(m.split('\t')) for m in interact(e).split('\r')]
  90.     ms = [m for m in ms if m is not None]
  91.     return ms
  92.  
  93.  
  94. # [Str] -> subprocess.STARTUPINFO -> IO MecabProc
  95. def spawnMecab(base_cmd, startupinfo):
  96.     """Try to start a MeCab subprocess in the given way, or fail.
  97.  
  98.    Raises OSError if the given base_cmd and startupinfo don't work
  99.    for starting up MeCab, or the MeCab they produce has a dictionary
  100.    incompatible with our assumptions.
  101.    """
  102.     global MECAB_ENCODING
  103.     global is_unidic
  104.  
  105.     # [Str] -> subprocess.STARTUPINFO -> IO subprocess.Popen
  106.     def spawnCmd(cmd, startupinfo):
  107.         return subprocess.Popen(cmd, startupinfo=startupinfo, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
  108.                                 stderr=subprocess.STDOUT)
  109.  
  110.     config_dump = spawnCmd(base_cmd + ['-P'], startupinfo).stdout.read()
  111.     bos_feature_match = re.search(
  112.         '^bos-feature: (.*)$', str(config_dump, 'utf-8'), flags=re.M)
  113.  
  114.     if bos_feature_match is not None and bos_feature_match.group(1).strip() == MECAB_NODE_UNIDIC3_BOS:
  115.         node_parts = MECAB_NODE_UNIDIC3_PARTS
  116.         is_unidic = True
  117.     elif bos_feature_match is not None and bos_feature_match.group(1).strip() == MECAB_NODE_UNIDIC_BOS:
  118.         node_parts = MECAB_NODE_UNIDIC_PARTS
  119.         is_unidic = True
  120.     elif bos_feature_match is not None and bos_feature_match.group(1).strip() == MECAB_NODE_IPADIC_BOS:
  121.         node_parts = MECAB_NODE_IPADIC_PARTS
  122.         is_unidic = False
  123.     else:
  124.         raise OSError(
  125.             "Unexpected MeCab dictionary format; unidic or ipadic required.\n"
  126.             "Try installing the 'Mecab Unidic' or 'Japanese Support' addons,\n"
  127.             "or if using your system's `mecab` try installing a package\n"
  128.             "like `mecab-ipadic`\n")
  129.  
  130.     dicinfo_dump = spawnCmd(base_cmd + ['-D'], startupinfo).stdout.read()
  131.     charset_match = re.search(
  132.         '^charset:\t(.*)$', str(dicinfo_dump, 'utf-8'), flags=re.M)
  133.     if charset_match is None:
  134.         raise OSError('Can\'t find charset in MeCab dictionary info (`$MECAB -D`):\n\n'
  135.                       + dicinfo_dump)
  136.     MECAB_ENCODING = charset_match.group(1)
  137.  
  138.     args = ['--node-format=%s\r' % ('\t'.join(node_parts),),
  139.             '--eos-format=\n',
  140.             '--unk-format=']
  141.     return spawnCmd(base_cmd + args, startupinfo)
  142.  
  143.  
  144. @memoize
  145. def mecab():  # IO MecabProc
  146.     """Start a MeCab subprocess and return it.
  147.    `mecab` reads expressions from stdin at runtime, so only one
  148.    instance is needed.  That's why this function is memoized.
  149.    """
  150.  
  151.     if sys.platform.startswith('win'):
  152.         si = subprocess.STARTUPINFO()
  153.         try:
  154.             si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
  155.         except:
  156.             # pylint: disable=no-member
  157.             si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
  158.     else:
  159.         si = None
  160.  
  161.     # Search for mecab
  162.     reading = None
  163.  
  164.     # 1st priority - MecabUnidic
  165.     if importlib.util.find_spec('13462835'):
  166.         try:
  167.             reading = importlib.import_module('13462835.reading')
  168.             mecab_source = 'MecabUnidic'
  169.         except ModuleNotFoundError:
  170.             pass
  171.  
  172.     # 2nd priority - Japanese Support
  173.     if (not reading) and importlib.util.find_spec('3918629684'):
  174.         try:
  175.             reading = importlib.import_module('3918629684.reading')
  176.             mecab_source = 'Japanese Support'
  177.         except ModuleNotFoundError:
  178.             pass
  179.  
  180.     # 3nd priority - MIAJapaneseSupport
  181.     if (not reading) and importlib.util.find_spec('MIAJapaneseSupport'):
  182.         try:
  183.             reading = importlib.import_module('MIAJapaneseSupport.reading')
  184.             mecab_source = 'MIAJapaneseSupport'
  185.         except ModuleNotFoundError:
  186.             pass
  187.     # 4nd priority - MIAJapaneseSupport via Anki code (278530045)
  188.     if (not reading) and importlib.util.find_spec('278530045'):
  189.         try:
  190.             reading = importlib.import_module('278530045.reading')
  191.             mecab_source = '278530045'
  192.         except ModuleNotFoundError:
  193.             pass
  194.  
  195.     # 5th priority - From Morphman
  196.     if (not reading) and importlib.util.find_spec('morph') and importlib.util.find_spec('morph.deps.mecab.reading'):
  197.         try:
  198.             reading = importlib.import_module('morph.deps.mecab.reading')
  199.             mecab_source = 'MorphMan'
  200.         except ModuleNotFoundError:
  201.             pass
  202.  
  203.     # 6th priority - system mecab
  204.     if not reading:
  205.         try:
  206.             return spawnMecab(['mecab'], si), 'System'
  207.         except:
  208.             raise OSError('''
  209.            Mecab Japanese analyzer could not be found.
  210.            Please install one of the following Anki add-ons:
  211.                 https://ankiweb.net/shared/info/3918629684
  212.                 https://ankiweb.net/shared/info/13462835
  213.                 https://ankiweb.net/shared/info/278530045''')
  214.  
  215.     m = reading.MecabController()
  216.     m.setup()
  217.     # m.mecabCmd[1:4] are assumed to be the format arguments.
  218.  
  219.     return spawnMecab(m.mecabCmd[:1] + m.mecabCmd[4:], si), mecab_source
  220.  
  221.  
  222. @memoize
  223. def interact(expr):  # Str -> IO Str
  224.     """ "interacts" with 'mecab' command: writes expression to stdin of 'mecab' process and gets all the morpheme
  225.    info from its stdout. """
  226.     p, _ = mecab()
  227.     expr = expr.encode(MECAB_ENCODING, 'ignore')
  228.     p.stdin.write(expr + b'\n')
  229.     p.stdin.flush()
  230.  
  231.     return '\r'.join([str(p.stdout.readline().rstrip(b'\r\n'), MECAB_ENCODING) for l in expr.split(b'\n')])
  232.  
  233.  
  234. @memoize
  235. def fixReading(m):  # Morpheme -> IO Morpheme
  236.     """
  237.    'mecab' prints the reading of the kanji in inflected forms (and strangely in katakana). So 歩い[て] will have アルイ as
  238.    reading. This function sets the reading to the reading of the base form (in the example it will be 'アルク').
  239.    """
  240.     if m.pos in ['動詞', '助動詞', '形容詞']:  # verb, aux verb, i-adj
  241.         n = interact(m.base).split('\t')
  242.         if len(n) == MECAB_NODE_LENGTH_IPADIC:
  243.             m.read = n[MECAB_NODE_READING_INDEX].strip()
  244.     return m
  245.  
  246.  
Advertisement
Add Comment
Please, Sign In to add comment