Source code for ckipnlp.container.util.wspos

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides containers for word-segmented sentences with part-of-speech-tags.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'

from abc import (
    ABCMeta as _ABCMeta,
    abstractmethod as _abstractmethod,
)

from typing import (
    NamedTuple as _NamedTuple,
)

from ..base import (
    BaseTuple as _BaseTuple,
)

from ..seg import (
    SegSentence as _SegSentence,
    SegParagraph as _SegParagraph,
)

################################################################################################################################

def _token_from_text(data):
    """str -> Tuple[str, str]"""
    return data.strip().strip(')').rsplit('(', 1)

def _token_to_text(data):
    """Tuple[str, str] -> str"""
    return '{}({})'.format(*data)

def _sentence_from_text(data):
    """str -> Tuple[Iterable[str], Iterable[str]]"""
    return zip(*map(_token_from_text, data.split('\u3000')))

def _sentence_to_text(data):
    """Tuple[Iterable[str], Iterable[str]] -> str"""
    return '\u3000'.join(map(_token_to_text, zip(*data)))

def _paragraph_from_text(data):
    """Iterable[str] -> Tuple[Iterable[Iterable[str]], Iterable[Iterable[str]]]"""
    return zip(*map(_sentence_from_text, data))

def _paragraph_to_text(data):
    """Tuple[Iterable[Iterable[str]], Iterable[Iterable[str]]] -> Iterable[str]"""
    return map(_sentence_to_text, zip(*data))

################################################################################################################################

class _WsPosToken(_NamedTuple):
    word: str = None
    pos: str = None

[docs]class WsPosToken(_BaseTuple, _WsPosToken): """A word with POS-tag. Attributes ---------- word : str the word. pos : str the POS-tag. Note ---- This class is an subclass of *tuple*. To change the attribute, please create a new instance instead. .. admonition:: Data Structure Examples Text format Used for :meth:`from_text` and :meth:`to_text`. .. code-block:: python '中文字(Na)' # word / POS-tag Dict format Used for :meth:`from_dict` and :meth:`to_dict`. .. code-block:: python { 'word': '中文字', # word 'pos': 'Na', # POS-tag } List format Used for :meth:`from_list` and :meth:`to_list`. .. code-block:: python [ '中文字', # word 'Na', # POS-tag ] """ def __str__(self): return str(self.to_text()) ########################################################################################################################
[docs] @classmethod def from_text(cls, data): """Construct an instance from text format. Parameters ---------- data : str text such as ``'中文字(Na)'``. .. note:: - ``'中文字(Na)'`` -> word = ``'中文字'``, pos = ``'Na'`` - ``'中文字'`` -> word = ``'中文字'``, pos = ``None`` """ return cls(*_token_from_text(data))
[docs] def to_text(self): return _token_to_text(self)
################################################################################################################################
[docs]class WsPosSentence: """A helper class for data conversion of word-segmented and part-of-speech sentences.""" @_abstractmethod def __init__(self): pass ########################################################################################################################
[docs] @classmethod def from_text(cls, data): """Convert text format to word-segmented and part-of-speech sentences. Parameters ---------- data : str text such as ``'中文字(Na)\\u3000喔(T)'``. Returns ------- :class:`SegSentence <.seg.SegSentence>`: the word sentence :class:`SegSentence <.seg.SegSentence>`: the POS-tag sentence. """ return tuple(map(_SegSentence.from_list, _sentence_from_text(data)))
[docs] @staticmethod def to_text(word, pos): """Convert text format to word-segmented and part-of-speech sentences. Parameters ---------- word : :class:`SegSentence <.seg.SegSentence>` the word sentence pos : :class:`SegSentence <.seg.SegSentence>` the POS-tag sentence. Returns ------- str text such as ``'中文字(Na)\\u3000喔(T)'``. """ return _sentence_to_text((word, pos,))
################################################################################################################################
[docs]class WsPosParagraph(metaclass=_ABCMeta): """A helper class for data conversion of word-segmented and part-of-speech sentence lists.""" @_abstractmethod def __init__(self): pass ########################################################################################################################
[docs] @classmethod def from_text(cls, data): """Convert text format to word-segmented and part-of-speech sentence lists. Parameters ---------- data : Sequence[str] list of sentences such as ``'中文字(Na)\\u3000喔(T)'``. Returns ------- :class:`SegParagraph <.seg.SegParagraph>`: the word sentence list :class:`SegParagraph <.seg.SegParagraph>`: the POS-tag sentence list. """ return tuple(map(_SegParagraph.from_list, _paragraph_from_text(data)))
[docs] @staticmethod def to_text(word, pos): """Convert text format to word-segmented and part-of-speech sentence lists. Parameters ---------- word : :class:`SegParagraph <.seg.SegParagraph>` the word sentence list pos : :class:`SegParagraph <.seg.SegParagraph>` the POS-tag sentence list. Returns ------- List[str] list of sentences such as ``'中文字(Na)\\u3000喔(T)'``. """ return list(_paragraph_to_text((word, pos,)))