Source code for pyknp.juman.morpheme

# -*- encoding: utf-8 -*-

from __future__ import absolute_import, unicode_literals
import re
import unittest
import six

from pyknp.knp.features import Features


class JUMAN_FORMAT(object):
    """ JUMANのラティスオプション

    Attributes:
        DEFAULT : 通常のJUMAN出力形式 (ラティスオプションなし)
        LATTICE_TOP_ONE: ラティス出力形式から、TOP1のビームだけを読む
        LATTICE_ALL: ラティス出力形式から、すべてのビームを読む 
    """

    DEFAULT = 0  # default
    LATTICE_TOP_ONE = 1
    LATTICE_ALL = 2


[docs]class Morpheme(object): """ 形態素の各種情報を保持するオブジェクト. Args: spec (str): JUMAN/KNP出力 mrph_id (int): 形態素ID juman_format (JUMAN_FORMAT): Jumanのlattice出力形式 Attributes: mrph_id (int): 形態素ID mrph_index (int): mrph_idに同じ doukei (list): midasi (str): 見出し yomi (str): 読み genkei (str): 原形 hinsi (str): 品詞 hinsi_id (int): 品詞ID bunrui (str): 品詞細分類 bunrui_id (int): 品詞細分類ID katuyou1 (str): 活用型 katuyou1_id (int): 活用型ID katuyou2 (str): 活用形 katuyou2_id (int): 活用形ID imis (str): 意味情報 fstring (str): 素性情報 repname (str): 代表表記 ranks (set[int]): ラティスでのランク span (tuple): 形態素の位置 (開始位置, 終了位置), JUMAN出力形式がラティス形式の場合のみ """
[docs] def __init__(self, spec, mrph_id=None, juman_format=JUMAN_FORMAT.DEFAULT): assert isinstance(spec, six.text_type) assert mrph_id is None or isinstance(mrph_id, int) if juman_format != JUMAN_FORMAT.DEFAULT and mrph_id is None: raise KeyError self.mrph_index = mrph_id self.mrph_id = mrph_id self.prev_mrph_id = 0 self.span = (0, 0) self.doukei = [] self.midasi = '' self.yomi = '' self.genkei = '' self.hinsi = '' self.hinsi_id = 0 self.bunrui = '' self.bunrui_id = 0 self.katuyou1 = '' self.katuyou1_id = 0 self.katuyou2 = '' self.katuyou2_id = 0 self.imis = '' self.fstring = '' self.repname = '' self.ranks = {1} if juman_format == JUMAN_FORMAT.DEFAULT: self._parse_spec(spec.strip("\n")) else: self._parse_new_spec(spec.strip("\n"))
def _parse_new_spec(self, spec): try: # FIXME KNPの場合と同様、EOSをきちんと判定する parts = spec.split("\t") self.mrph_id = int(parts[1]) self.prev_mrph_id = [int(mid) for mid in parts[2].split(";")] self.span = (int(parts[3]), int(parts[4])) self.midasi = parts[5] self.yomi = parts[7] self.genkei = parts[8] self.hinsi = parts[9] self.hinsi_id = int(parts[10]) self.bunrui = parts[11] self.bunrui_id = int(parts[12]) self.katuyou1 = parts[13] self.katuyou1_id = int(parts[14]) self.katuyou2 = parts[15] self.katuyou2_id = int(parts[16]) self.fstring = parts[17] self.features = self._parse_fstring(self.fstring) self.feature = self.features # backward-compatibility self.repname = parts[6] ranks = self.features.get('ランク', None) if ranks is not None: self.ranks = set(int(x) for x in ranks) except IndexError: pass def _parse_spec(self, spec): parts = [] part = '' inside_quotes = False if spec.startswith(' '): spec = '\\%s' % spec if spec.startswith('\ \ \ 特殊 1 空白 6 * 0 * 0'): parts = ['\ ', '\ ', '\ ', '特殊', '1', '空白', '6', '*', '0', '*', '0', 'NIL'] else: for char in spec: if char == '"': if not inside_quotes: inside_quotes = True else: inside_quotes = False # If "\"" proceeds " ", it would be not inside_quotes, but "\"". if inside_quotes and char == ' ' and part == '"': inside_quotes = False if part != "" and char == ' ' and not inside_quotes: if part.startswith('"') and part.endswith('"') and len(part) > 1: parts.append(part[1:-1]) else: parts.append(part) part = '' else: part += char parts.append(part) try: # FIXME KNPの場合と同様、EOSをきちんと判定する self.midasi = parts[0] self.yomi = parts[1] self.genkei = parts[2] self.hinsi = parts[3] self.hinsi_id = int(parts[4]) self.bunrui = parts[5] self.bunrui_id = int(parts[6]) self.katuyou1 = parts[7] self.katuyou1_id = int(parts[8]) self.katuyou2 = parts[9] self.katuyou2_id = int(parts[10]) self.imis = parts[11].lstrip("\"").rstrip("\"") self.fstring = parts[12] self.features = Features(self.fstring) self.feature = self.features # backward-compatibility except IndexError: pass # Extract 代表表記 match = re.search(r"代表表記:([^\"\s]+)", self.imis) if match: self.repname = match.group(1)
[docs] def push_doukei(self, mrph): self.doukei.append(mrph)
[docs] def repnames(self): """ 形態素の代表表記(曖昧性がある場合は「?」で連結)を返す. Returns: str: 形態素の代表表記文字列 """ repnames = [] if self.repname: repnames.append(self.repname) for doukei in self.doukei: if doukei.repname: repnames.append(doukei.repname) # 重複を削除 return "?".join(sorted(set(repnames), key=repnames.index))
[docs] def spec(self): imis = self.imis if imis != "NIL" and len(imis) != 0: imis = '"%s"' % imis spec = "%s %s %s %s %s %s %s %s %s %s %s %s %s" % \ (self.midasi, self.yomi, self.genkei, self.hinsi, self.hinsi_id, self.bunrui, self.bunrui_id, self.katuyou1, self.katuyou1_id, self.katuyou2, self.katuyou2_id, imis, self.fstring) return "%s\n" % spec.rstrip()
[docs] def new_spec(self, prev_mrph_id=None, span=None): assert isinstance(prev_mrph_id, int) or \ isinstance(prev_mrph_id, six.text_type) or \ isinstance(prev_mrph_id, list) or \ prev_mrph_id is None if prev_mrph_id is None: prev_mrph_id = self.prev_mrph_id # This method accepts character position instead of morpheme span for backward comatibility. assert isinstance(span, tuple) or \ isinstance(span, list) or \ isinstance(span, int) or \ isinstance(span, six.text_type) or \ span is None if span is None: span = self.span elif isinstance(span, tuple) or isinstance(span, list): span = (span[0], span[1]) elif span is six.text_type: span = (int(span), int(span) + len(self.midasi) - 1) elif isinstance(span, int): span = (span, span + len(self.midasi) - 1) if self.mrph_id is None: raise NotImplementedError out = ["-\t%s" % self.mrph_id] if isinstance(prev_mrph_id, list): out.append("\t%s" % ";".join(["%s" % pm for pm in prev_mrph_id])) else: out.append("\t%s" % prev_mrph_id) out.append("\t%d\t%d" % span) out.append("\t%s" % self.midasi) if len(self.repname) == 0: # out.append("\t%s/%s" % (self.midasi, self.yomi)) out.append("\t%s/%s" % (self.genkei, self.genkei)) else: out.append("\t%s" % self.repname) out.append("\t%s\t%s\t%s\t%s" % (self.yomi, self.genkei, self.hinsi, self.hinsi_id)) out.append("\t%s\t%s\t%s\t%s\t%s\t%s" % (self.bunrui, self.bunrui_id, self.katuyou1, self.katuyou1_id, self.katuyou2, self.katuyou2_id)) out.append("\t") if len(self.fstring) == 0: fs = [] for im in self.imis.split(" "): if im.startswith("代表表記:"): continue elif im == "NIL": continue fs.append(im) out.append("|".join(fs)) else: out.append(self.fstring) out.append("\n") return "".join(out)
def _parse_fstring(self, fstring): """ 素性情報をパースする """ rvalue = {} for feature in fstring.split("|"): fs = feature.rstrip().lstrip().split(":") key = ":".join(fs[:-1]) val = fs[-1] rvalue[key] = val.split(";") return rvalue def __repr__(self): return "Morpheme(%s)" % repr(self.spec())
class MorphemeTest(unittest.TestCase): def test_simple(self): spec = "であり であり だ 判定詞 4 * 0 判定詞 25 デアル列基本連用形 18\n" mrph = Morpheme(spec, 123) self.assertEqual(mrph.midasi, 'であり') self.assertEqual(mrph.yomi, 'であり') self.assertEqual(mrph.genkei, 'だ') self.assertEqual(mrph.hinsi, '判定詞') self.assertEqual(mrph.hinsi_id, 4) self.assertEqual(mrph.bunrui, '*') self.assertEqual(mrph.bunrui_id, 0) self.assertEqual(mrph.katuyou1, '判定詞') self.assertEqual(mrph.katuyou1_id, 25) self.assertEqual(mrph.katuyou2, 'デアル列基本連用形') self.assertEqual(mrph.katuyou2_id, 18) self.assertEqual(mrph.fstring, "") self.assertEqual(mrph.spec(), spec) self.assertEqual(mrph.new_spec(8, 9), "-\t123\t8\t9\t11\tであり\tだ/だ\tであり\t\t判定詞\t4\t*\t0\t判定詞\t25\tデアル列基本連用形\t18\t\n") def test_imis(self): spec = """解析 かいせき 解析 名詞 6 サ変名詞 2 * 0 * 0 "代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;科学・技術"\n""" mrph = Morpheme(spec) self.assertEqual(mrph.spec(), spec) self.assertEqual(mrph.imis, "代表表記:解析/かいせき カテゴリ:抽象物 ドメイン:教育・学習;科学・技術") def test_nil(self): spec = "であり であり だ 判定詞 4 * 0 判定詞 25 デアル列基本連用形 18 NIL\n" mrph = Morpheme(spec) self.assertEqual(mrph.imis, "NIL") self.assertEqual(mrph.spec(), spec) def test_at(self): spec = "@ @ @ 未定義語 15 その他 1 * 0 * 0" mrph = Morpheme(spec) self.assertEqual(mrph.midasi, '@') def test_knp(self): spec = "構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 NIL <漢字><かな漢字><自立><←複合><名詞相当語>\n" mrph = Morpheme(spec) self.assertEqual(mrph.midasi, '構文') self.assertEqual(mrph.yomi, 'こうぶん') self.assertEqual(mrph.genkei, '構文') self.assertEqual(mrph.hinsi, '名詞') self.assertEqual(mrph.hinsi_id, 6) self.assertEqual(mrph.bunrui, '普通名詞') self.assertEqual(mrph.bunrui_id, 1) self.assertEqual(mrph.katuyou1, '*') self.assertEqual(mrph.katuyou1_id, 0) self.assertEqual(mrph.katuyou2, '*') self.assertEqual(mrph.katuyou2_id, 0) self.assertEqual(mrph.imis, 'NIL') self.assertEqual(mrph.fstring, '<漢字><かな漢字><自立><←複合><名詞相当語>') self.assertEqual(mrph.spec(), spec) def test_repr(self): spec = "構文 こうぶん 構文 名詞 6 普通名詞 1 * 0 * 0 NIL <漢字><かな漢字><自立><←複合><名詞相当語>\n" mrph = Morpheme(spec) new_mrph = eval(repr(mrph)) self.assertEqual(mrph.spec(), new_mrph.spec()) class MorphemeTest2(unittest.TestCase): def test_simple(self): spec = """- 36 2 2 4 貰った 貰う/もらう もらった もらう 動詞 2 * 0 子音動詞ワ行 12 タ形 10 付属動詞候補(タ系)\n""" mrph = Morpheme(spec, 36, juman_format=JUMAN_FORMAT.LATTICE_ALL) self.assertEqual(mrph.midasi, '貰った') self.assertEqual(mrph.yomi, 'もらった') self.assertEqual(mrph.genkei, 'もらう') self.assertEqual(mrph.hinsi, '動詞') self.assertEqual(mrph.hinsi_id, 2) self.assertEqual(mrph.bunrui, '*') self.assertEqual(mrph.bunrui_id, 0) self.assertEqual(mrph.katuyou1, '子音動詞ワ行') self.assertEqual(mrph.katuyou1_id, 12) self.assertEqual(mrph.katuyou2, 'タ形') self.assertEqual(mrph.katuyou2_id, 10) self.assertEqual(mrph.imis, '') self.assertEqual(mrph.fstring, "付属動詞候補(タ系)") self.assertEqual(mrph.spec(), "貰った もらった もらう 動詞 2 * 0 子音動詞ワ行 12 タ形 10 付属動詞候補(タ系)\n") self.assertEqual(mrph.new_spec(2, 2), spec) def test_doukei(self): spec1 = """- 1 0 0 0 母 母/ぼ ぼ 母 名詞 6 普通名詞 1 * 0 * 0 漢字読み:音|漢字\n""" spec2 = """- 2 0 0 0 母 母/はは はは 母 名詞 6 普通名詞 1 * 0 * 0 漢字読み:訓|カテゴリ:人|漢字\n""" m1 = Morpheme(spec1, 1, juman_format=JUMAN_FORMAT.LATTICE_ALL) m2 = Morpheme(spec2, 1, juman_format=JUMAN_FORMAT.LATTICE_ALL) m1.push_doukei(m2) self.assertEqual(m1.repnames(), "母/ぼ?母/はは") def test_ranks(self): spec1 = """- 1 0 0 0 母 母/ぼ ぼ 母 名詞 6 普通名詞 1 * 0 * 0 漢字読み:音|漢字\n""" spec2 = """- 2 0 0 0 母 母/はは はは 母 名詞 6 普通名詞 1 * 0 * 0 漢字読み:訓|カテゴリ:人|漢字|ランク:1;2;3\n""" m1 = Morpheme(spec1, 1, juman_format=JUMAN_FORMAT.LATTICE_ALL) m2 = Morpheme(spec2, 1, juman_format=JUMAN_FORMAT.LATTICE_ALL) self.assertEqual(1, len(m1.ranks)) self.assertIn(1, m1.ranks) self.assertNotIn(2, m1.ranks) self.assertEqual(3, len(m2.ranks)) self.assertIn(1, m2.ranks) self.assertIn(2, m2.ranks) self.assertIn(3, m2.ranks) self.assertNotIn(4, m2.ranks) if __name__ == '__main__': unittest.main()