Source code for ckipnlp.driver.tagger

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides drivers with CkipTagger backend.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'

from ckipnlp.container import (
    TextParagraph as _TextParagraph,
    SegParagraph as _SegParagraph,
    NerParagraph as _NerParagraph,
)

from ckipnlp.util.data import (
    get_tagger_data as _get_tagger_data,
)

from .base import (
    BaseDriver as _BaseDriver,
    DriverType as _DriverType,
    DriverFamily as _DriverFamily,
)

################################################################################################################################

class CkipTaggerWordSegmenter(_BaseDriver):
    """The CKIP word segmentation driver with CkipTagger backend.

    Arguments
    ---------
        lazy : bool
            Lazy initialize underlay object.
        disable_cuda : bool
            Disable GPU usage.
        recommend_lexicons : Mapping[str, float]
            A mapping of lexicon words to their relative weights.
            When non-empty, it is converted with :func:`ckiptagger.construct_dictionary`
            and passed to the backend as the ``recommend_dictionary`` option.
        coerce_lexicons : Mapping[str, float]
            A mapping of lexicon words to their relative weights.
            When non-empty, it is converted with :func:`ckiptagger.construct_dictionary`
            and passed to the backend as the ``coerce_dictionary`` option.

    Other Parameters
    ----------------
        **opts
            Extra options for :meth:`ckiptagger.WS.__call__`.
            (Please refer https://github.com/ckiplab/ckiptagger#4-run-the-ws-pos-ner-pipeline for details.)


    .. py:method:: __call__(*, text)

        Apply word segmentation.

        Parameters
            **text** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The sentences.

        Returns
            **ws** (:class:`SegParagraph <ckipnlp.container.seg.SegParagraph>`) — The word-segmented sentences.
    """

    driver_type = _DriverType.WORD_SEGMENTER
    driver_family = _DriverFamily.TAGGER

    def __init__(self, *, lazy=False, disable_cuda=True, recommend_lexicons=None, coerce_lexicons=None, **opts):
        super().__init__(lazy=lazy)
        self._disable_cuda = disable_cuda
        # ``None`` stands for "no lexicon"; a fresh dict is created per instance so
        # that instances never share one mutable default object.
        self._recommend_lexicons = {} if recommend_lexicons is None else recommend_lexicons
        self._coerce_lexicons = {} if coerce_lexicons is None else coerce_lexicons
        self._opts = opts

    def _init(self):
        """Create the underlying :class:`ckiptagger.WS` object and prepare the call options."""
        # Imported here rather than at module level, so importing this module
        # does not itself import ckiptagger.
        import ckiptagger

        # Empty lexicons are skipped, leaving the corresponding backend option unset.
        if self._recommend_lexicons:
            self._opts['recommend_dictionary'] = ckiptagger.construct_dictionary(self._recommend_lexicons)
        if self._coerce_lexicons:
            self._opts['coerce_dictionary'] = ckiptagger.construct_dictionary(self._coerce_lexicons)

        self._core = ckiptagger.WS(_get_tagger_data(), disable_cuda=self._disable_cuda)

    def _call(self, *, text):
        """Segment ``text`` (a ``TextParagraph``) and return the result as a ``SegParagraph``."""
        assert isinstance(text, _TextParagraph)

        ws_list = self._core(text, **self._opts)
        ws = _SegParagraph.from_list(ws_list)

        return ws

class CkipTaggerPosTagger(_BaseDriver):
    """The CKIP part-of-speech tagging driver with CkipTagger backend.

    Arguments
    ---------
        lazy : bool
            Lazy initialize underlay object.
        disable_cuda : bool
            Disable GPU usage.

    Other Parameters
    ----------------
        **opts
            Extra options for :meth:`ckiptagger.POS.__call__`.
            (Please refer https://github.com/ckiplab/ckiptagger#4-run-the-ws-pos-ner-pipeline for details.)

    .. py:method:: __call__(*, ws)

        Apply part-of-speech tagging.

        Parameters
            **ws** (:class:`SegParagraph <ckipnlp.container.seg.SegParagraph>`) — The word-segmented sentences.

        Returns
            **pos** (:class:`SegParagraph <ckipnlp.container.seg.SegParagraph>`) — The part-of-speech sentences.
    """

    driver_type = _DriverType.POS_TAGGER
    driver_family = _DriverFamily.TAGGER

    def __init__(self, *, lazy=False, disable_cuda=True, **opts):
        super().__init__(lazy=lazy)
        self._disable_cuda = disable_cuda
        # Forwarded unchanged to the backend on every call.
        self._opts = opts

    def _init(self):
        """Create the underlying :class:`ckiptagger.POS` object."""
        # Imported here rather than at module level, so importing this module
        # does not itself import ckiptagger.
        import ckiptagger
        self._core = ckiptagger.POS(_get_tagger_data(), disable_cuda=self._disable_cuda)

    def _call(self, *, ws):
        """Tag the word-segmented ``ws`` (a ``SegParagraph``) and return the tags as a ``SegParagraph``."""
        assert isinstance(ws, _SegParagraph)

        pos_list = self._core(ws, **self._opts)
        pos = _SegParagraph.from_list(pos_list)

        return pos

class CkipTaggerNerChunker(_BaseDriver):
    """The CKIP named-entity recognition driver with CkipTagger backend.

    Arguments
    ---------
        lazy : bool
            Lazy initialize underlay object.
        disable_cuda : bool
            Disable GPU usage.

    Other Parameters
    ----------------
        **opts
            Extra options for :meth:`ckiptagger.NER.__call__`.
            (Please refer https://github.com/ckiplab/ckiptagger#4-run-the-ws-pos-ner-pipeline for details.)

    .. py:method:: __call__(*, ws, pos)

        Apply named-entity recognition.

        Parameters
            - **ws** (:class:`SegParagraph <ckipnlp.container.seg.SegParagraph>`) — The word-segmented sentences.
            - **pos** (:class:`SegParagraph <ckipnlp.container.seg.SegParagraph>`) — The part-of-speech sentences.

        Returns
            **ner** (:class:`NerParagraph <ckipnlp.container.ner.NerParagraph>`) — The named-entity recognition results.
    """

    driver_type = _DriverType.NER_CHUNKER
    driver_family = _DriverFamily.TAGGER

    def __init__(self, *, lazy=False, disable_cuda=True, **opts):
        super().__init__(lazy=lazy)
        self._disable_cuda = disable_cuda
        # Forwarded unchanged to the backend on every call.
        self._opts = opts

    def _init(self):
        """Create the underlying :class:`ckiptagger.NER` object."""
        # Imported here rather than at module level, so importing this module
        # does not itself import ckiptagger.
        import ckiptagger
        self._core = ckiptagger.NER(_get_tagger_data(), disable_cuda=self._disable_cuda)

    def _call(self, *, ws, pos):
        """Run NER on ``ws`` and ``pos`` (both ``SegParagraph``) and return a ``NerParagraph``."""
        assert isinstance(ws, _SegParagraph)
        assert isinstance(pos, _SegParagraph)

        ner_list = self._core(ws, pos, **self._opts)
        ner = _NerParagraph.from_tagger(ner_list)

        return ner