#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
This module provides containers for word-segmented sentences with part-of-speech tags.
"""
__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'
from abc import (
ABCMeta as _ABCMeta,
abstractmethod as _abstractmethod,
)
from typing import (
NamedTuple as _NamedTuple,
)
from ..base import (
BaseTuple as _BaseTuple,
)
from ..seg import (
SegSentence as _SegSentence,
SegParagraph as _SegParagraph,
)
################################################################################################################################
def _token_from_text(data):
    """Split the text form of a single token into its word and POS-tag.

    Parameters
    ----------
        data : str
            text such as ``'中文字(Na)'``; surrounding whitespace is ignored.

    Returns
    -------
        List[str]
            ``[word, pos]``, or ``[word]`` when the text contains no ``'('``.
    """
    text = data.strip()
    # Remove only the single ')' that closes the POS-tag.  ``str.strip(')')``
    # would also eat leading or repeated parentheses, so a token whose word is
    # itself ')' (e.g. ``')(PARENTHESISCATEGORY)'``) would lose its word.
    if text.endswith(')'):
        text = text[:-1]
    # Split on the last '(' so that a word containing '(' stays intact.
    return text.rsplit('(', 1)
def _token_to_text(data):
    """Render a ``(word, pos)`` pair in text format.

    Parameters
    ----------
        data : Tuple[str, str]
            the word and its POS-tag; the POS-tag may be ``None``.

    Returns
    -------
        str
            ``'word(pos)'``, or just ``'word'`` when the POS-tag is ``None``.
    """
    fields = tuple(data)
    # A missing POS-tag must not be written as the literal "(None)": parsing that
    # text back would yield the string 'None' instead of a missing tag.
    if len(fields) > 1 and fields[1] is None:
        return '{}'.format(fields[0])
    return '{}({})'.format(*fields)
def _sentence_from_text(data):
    """Split a sentence text into a row of words and a row of POS-tags.

    Parameters
    ----------
        data : str
            tokens joined by the ideographic space ``'\\u3000'``,
            such as ``'中文字(Na)\\u3000喔(T)'``.

    Returns
    -------
        Iterator[Tuple[str, ...]]
            yields exactly two rows: the words, then the POS-tags. The POS-tag of
            a token written without one is ``None``.
    """
    if not data:
        # An empty sentence has no tokens; still hand back both (empty) rows so
        # that callers can always unpack ``word, pos``.
        return iter(((), ()))
    tokens = [_token_from_text(chunk) for chunk in data.split('\u3000')]
    # Pad tokens that carry no POS-tag.  Without this, zip() truncates to the
    # shortest token and silently drops the POS row of the whole sentence.
    pairs = [(fields[0], fields[1] if len(fields) > 1 else None) for fields in tokens]
    return zip(*pairs)
def _sentence_to_text(data):
    """Join parallel rows of words and POS-tags into one sentence text.

    Parameters
    ----------
        data : Tuple[Iterable[str], Iterable[str]]
            the word row and the POS-tag row.

    Returns
    -------
        str
            the tokens in text format, separated by the ideographic space ``'\\u3000'``.
    """
    # Transpose the rows so that each item groups the i-th word with the i-th POS-tag.
    tokens = zip(*data)
    return '\u3000'.join(_token_to_text(token) for token in tokens)
def _paragraph_from_text(data):
    """Split a list of sentence texts into word rows and POS-tag rows.

    Parameters
    ----------
        data : Iterable[str]
            sentence texts such as ``'中文字(Na)\\u3000喔(T)'``.

    Returns
    -------
        Iterator[Tuple[Iterable[str], ...]]
            yields two items: the word rows of all sentences, then the POS-tag
            rows of all sentences.
    """
    sentences = [_sentence_from_text(line) for line in data]
    if not sentences:
        # zip() of nothing yields no rows at all, which breaks ``word, pos = ...``
        # unpacking in callers; an empty paragraph still has two (empty) rows.
        return iter(((), ()))
    return zip(*sentences)
def _paragraph_to_text(data):
    """Render parallel lists of word rows and POS-tag rows as sentence texts.

    Parameters
    ----------
        data : Tuple[Iterable[Iterable[str]], Iterable[Iterable[str]]]
            the word rows and the POS-tag rows, one row per sentence.

    Returns
    -------
        Iterator[str]
            a lazy iterator with one text per sentence.
    """
    # Regroup the rows sentence by sentence before rendering each one.
    per_sentence = zip(*data)
    return map(_sentence_to_text, per_sentence)
################################################################################################################################
class _WsPosToken(_NamedTuple):
    """Named-tuple record with the fields ``word`` and ``pos``, both defaulting to ``None``."""

    word: str = None  # the word
    pos: str = None  # the POS-tag
class WsPosToken(_BaseTuple, _WsPosToken):
    """A word paired with its POS-tag.

    Attributes
    ----------
        word : str
            the word.
        pos : str
            the POS-tag.

    Note
    ----
        This class is a subclass of *tuple*. To change an attribute, please create a new instance instead.

    .. admonition:: Data Structure Examples

        Text format
            Used for :meth:`from_text` and :meth:`to_text`.

            .. code-block:: python

                '中文字(Na)'  # word / POS-tag

        Dict format
            Used for :meth:`from_dict` and :meth:`to_dict`.

            .. code-block:: python

                {
                    'word': '中文字',  # word
                    'pos': 'Na',  # POS-tag
                }

        List format
            Used for :meth:`from_list` and :meth:`to_list`.

            .. code-block:: python

                [
                    '中文字',  # word
                    'Na',  # POS-tag
                ]
    """

    def __str__(self):
        # The string form of a token is its text format.
        text = self.to_text()
        return str(text)

    ########################################################################################################################

    @classmethod
    def from_text(cls, data):
        """Construct an instance from text format.

        Parameters
        ----------
            data : str
                text such as ``'中文字(Na)'``.

        .. note::

            - ``'中文字(Na)'`` -> word = ``'中文字'``, pos = ``'Na'``
            - ``'中文字'`` -> word = ``'中文字'``, pos = ``None``
        """
        fields = _token_from_text(data)
        return cls(*fields)

    def to_text(self):
        """Render this token in text format.

        Returns
        -------
            str
                text such as ``'中文字(Na)'``.
        """
        return _token_to_text(self)
################################################################################################################################
class WsPosSentence(metaclass=_ABCMeta):
    """A helper class for data conversion of word-segmented and part-of-speech sentences.

    The class only groups conversion functions; it cannot be instantiated.
    """

    # ``@_abstractmethod`` is only enforced under ``_ABCMeta``, so the metaclass
    # above is what actually forbids instantiation.
    @_abstractmethod
    def __init__(self):
        pass

    ########################################################################################################################

    @classmethod
    def from_text(cls, data):
        """Convert text format to word-segmented and part-of-speech sentences.

        Parameters
        ----------
            data : str
                text such as ``'中文字(Na)\\u3000喔(T)'``.

        Returns
        -------
            :class:`SegSentence <.seg.SegSentence>`
                the word sentence
            :class:`SegSentence <.seg.SegSentence>`
                the POS-tag sentence.
        """
        rows = _sentence_from_text(data)
        return tuple(_SegSentence.from_list(row) for row in rows)

    @staticmethod
    def to_text(word, pos):
        """Convert word-segmented and part-of-speech sentences to text format.

        Parameters
        ----------
            word : :class:`SegSentence <.seg.SegSentence>`
                the word sentence
            pos : :class:`SegSentence <.seg.SegSentence>`
                the POS-tag sentence.

        Returns
        -------
            str
                text such as ``'中文字(Na)\\u3000喔(T)'``.
        """
        return _sentence_to_text((word, pos))
################################################################################################################################
class WsPosParagraph(metaclass=_ABCMeta):
    """A helper class for data conversion of word-segmented and part-of-speech sentence lists.

    The class only groups conversion functions; its abstract ``__init__`` prevents instantiation.
    """

    @_abstractmethod
    def __init__(self):
        pass

    ########################################################################################################################

    @classmethod
    def from_text(cls, data):
        """Convert text format to word-segmented and part-of-speech sentence lists.

        Parameters
        ----------
            data : Sequence[str]
                list of sentences such as ``'中文字(Na)\\u3000喔(T)'``.

        Returns
        -------
            :class:`SegParagraph <.seg.SegParagraph>`
                the word sentence list
            :class:`SegParagraph <.seg.SegParagraph>`
                the POS-tag sentence list.
        """
        columns = _paragraph_from_text(data)
        return tuple(_SegParagraph.from_list(column) for column in columns)

    @staticmethod
    def to_text(word, pos):
        """Convert word-segmented and part-of-speech sentence lists to text format.

        Parameters
        ----------
            word : :class:`SegParagraph <.seg.SegParagraph>`
                the word sentence list
            pos : :class:`SegParagraph <.seg.SegParagraph>`
                the POS-tag sentence list.

        Returns
        -------
            List[str]
                list of sentences such as ``'中文字(Na)\\u3000喔(T)'``.
        """
        sentences = _paragraph_to_text((word, pos))
        return list(sentences)