#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
This module provides kernel CKIPNLP pipeline.
"""
__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'
from collections.abc import (
Mapping as _Mapping,
)
from ckipnlp.driver.base import (
DriverRegister as _DriverRegister,
)
###############################################################################################################################)
[docs]class CkipDocument(_Mapping):
"""The kernel document.
Attributes
----------
raw : str
The unsegmented text input.
text : :class:`~ckipnlp.container.text.TextParagraph`
The sentences.
ws : :class:`~ckipnlp.container.seg.SegParagraph`
The word-segmented sentences.
pos : :class:`~ckipnlp.container.seg.SegParagraph`
The part-of-speech sentences.
ner : :class:`~ckipnlp.container.ner.NerParagraph`
The named-entity recognition results.
conparse : :class:`~ckipnlp.container.parse.ParseParagraph`
The constituency-parsing sentences.
"""
__keys = ('raw', 'text', 'ws', 'pos', 'ner', 'conparse',)
def __init__(self, *, raw=None, text=None, ws=None, pos=None, ner=None, conparse=None):
self.raw = raw
self.text = text
self.ws = ws
self.pos = pos
self.ner = ner
self.conparse = conparse
self._wspos = None
def __len__(self):
return len(self.__keys)
def __iter__(self):
yield from self.__keys
def __getitem__(self, key):
return getattr(self, key)
################################################################################################################################
[docs]class CkipPipeline:
"""The kernel pipeline.
Arguments
---------
sentence_segmenter : str
The type of sentence segmenter.
word_segmenter : str
The type of word segmenter.
pos_tagger : str
The type of part-of-speech tagger.
ner_chunker : str
The type of named-entity recognition chunker.
con_parser : str
The type of constituency parser.
Other Parameters
----------------
lazy : bool
Lazy initialize the drivers.
opts : Dict[str, Dict]
The driver options. Key: driver name (e.g. `'sentence_segmenter'`); Value: a dictionary of options.
"""
def __init__(self, *,
sentence_segmenter='default',
word_segmenter='tagger',
pos_tagger='tagger',
con_parser='classic',
ner_chunker='tagger',
lazy=True,
opts={},
):
if word_segmenter == '_classic':
word_segmenter = 'classic'
if pos_tagger == '_classic':
pos_tagger = 'classic'
# WS & POS
if pos_tagger == 'classic':
assert word_segmenter == 'classic', 'CkipClassicPosTagger must be used with CkipClassicWordSegmenter together!'
self._wspos_driver = _DriverRegister.get('_wspos', '_classic')(
lazy=lazy, **opts.get('word_segmenter', {}), **opts.get('pos_tagger', {}),
)
word_segmenter = '_classic'
pos_tagger = '_classic'
else:
self._wspos_driver = _DriverRegister.get(None, None)()
self._sentence_segmenter = _DriverRegister.get('sentence_segmenter', sentence_segmenter)(
lazy=lazy, **opts.get('sentence_segmenter', {}),
)
self._word_segmenter = _DriverRegister.get('word_segmenter', word_segmenter)(
lazy=lazy, **opts.get('word_segmenter', {}),
)
self._pos_tagger = _DriverRegister.get('pos_tagger', pos_tagger)(
lazy=lazy, **opts.get('pos_tagger', {}),
)
self._con_parser = _DriverRegister.get('con_parser', con_parser)(
lazy=lazy, **opts.get('con_parser', {}),
)
self._ner_chunker = _DriverRegister.get('ner_tagger', ner_chunker)(
lazy=lazy, **opts.get('ner_chunker', {}),
)
########################################################################################################################
def _get(self, key, doc):
driver, name = {
'raw': (
None, None,
),
'_wspos': (
self._wspos_driver, 'classic word segmentation',
),
'text': (
self._sentence_segmenter, 'sentence segmentation',
),
'ws': (
self._word_segmenter, 'word segmentation',
),
'pos': (
self._pos_tagger, 'part-of-speech tagging',
),
'conparse': (
self._con_parser, 'constituency parsing',
),
'ner': (
self._ner_chunker, 'named-entity recognition',
),
}[key]
if doc[key] is NotImplemented:
raise RecursionError('Loop dependence detected!')
if doc[key] is None:
setattr(doc, key, NotImplemented)
if key == 'raw':
raise AttributeError('No raw text!')
elif not driver.is_dummy:
ret = driver._call_from_pipeline(self, doc) # pylint: disable=protected-access
setattr(doc, key, ret)
else:
raise AttributeError(f'No {name} driver / no {name} as input!')
return doc[key]
########################################################################################################################
[docs] def get_text(self, doc):
"""Apply sentence segmentation.
Arguments
---------
doc : :class:`CkipDocument`
The input document.
Returns
-------
doc.text : :class:`~ckipnlp.container.text.TextParagraph`
The sentences.
.. note::
This routine modify **doc** inplace.
"""
return self._get('text', doc)
########################################################################################################################
[docs] def get_ws(self, doc):
"""Apply word segmentation.
Arguments
---------
doc : :class:`CkipDocument`
The input document.
Returns
-------
doc.ws : :class:`~ckipnlp.container.seg.SegParagraph`
The word-segmented sentences.
.. note::
This routine modify **doc** inplace.
"""
return self._get('ws', doc)
########################################################################################################################
[docs] def get_pos(self, doc):
"""Apply part-of-speech tagging.
Arguments
---------
doc : :class:`CkipDocument`
The input document.
Returns
-------
doc.pos : :class:`~ckipnlp.container.seg.SegParagraph`
The part-of-speech sentences.
.. note::
This routine modify **doc** inplace.
"""
return self._get('pos', doc)
########################################################################################################################
[docs] def get_ner(self, doc):
"""Apply named-entity recognition.
Arguments
---------
doc : :class:`CkipDocument`
The input document.
Returns
-------
doc.ner : :class:`~ckipnlp.container.ner.NerParagraph`
The named-entity recognition results.
.. note::
This routine modify **doc** inplace.
"""
return self._get('ner', doc)
########################################################################################################################
[docs] def get_conparse(self, doc):
"""Apply constituency parsing.
Arguments
---------
doc : :class:`CkipDocument`
The input document.
Returns
-------
doc.conparse : :class:`~ckipnlp.container.parse.ParseParagraph`
The constituency parsing sentences.
.. note::
This routine modify **doc** inplace.
"""
return self._get('conparse', doc)