Source code for ckipnlp.driver.classic

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides drivers with CkipClassic backend.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'

from ckipnlp.container import (
    TextParagraph as _TextParagraph,
    SegParagraph as _SegParagraph,
    WsPosParagraph as _WsPosParagraph,
    ParsedParagraph as _ParsedParagraph,
)

from .base import (
    BaseDriver as _BaseDriver,
    DriverType as _DriverType,
    DriverFamily as _DriverFamily,
)

################################################################################################################################

[docs]class CkipClassicWordSegmenter(_BaseDriver): """The CKIP word segmentation driver with CkipClassic backend. Arguments --------- lazy : bool Lazy initialize underlay object. do_pos : bool Returns POS-tag or not lexicons: Iterable[Tuple[str, str]] A list of the lexicon words and their POS-tags. .. py:method:: __call__(*, text) Apply word segmentation. Parameters **text** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The sentences. Returns - **ws** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The word-segmented sentences. - **pos** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The part-of-speech sentences. (returns if **do_pos** is set.) """ driver_type = _DriverType.WORD_SEGMENTER driver_family = _DriverFamily.CLASSIC _count = 0 def __init__(self, *, lazy=False, do_pos=False, lexicons=None): super().__init__(lazy=lazy) self._do_pos = do_pos self._lexicons = lexicons def _init(self): self.__class__._count += 1 # pylint: disable=protected-access if self.__class__._count > 1: # pylint: disable=protected-access raise RuntimeError(f'Never instance more than one {self.__class__.__name__}!') import ckip_classic.ws self._core = ckip_classic.ws.CkipWs(lex_list=self._lexicons) def _call(self, *, text): assert isinstance(text, _TextParagraph) wspos_text = self._core.apply_list(text.to_text()) ws, pos = _WsPosParagraph.from_text(wspos_text) return (ws, pos,) if self._do_pos else ws
[docs]class CkipClassicSentenceParser(_BaseDriver): """The CKIP sentence parsing driver with CkipClassic backend. Arguments --------- lazy : bool Lazy initialize underlay object. .. py:method:: __call__(*, ws, pos) Apply sentence parsing. Parameters - **ws** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The word-segmented sentences. - **pos** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The part-of-speech sentences. Returns **parsed** (:class:`ParsedParagraph <ckipnlp.container.parsed.ParsedParagraph>`) — The parsed-sentences. """ driver_type = _DriverType.SENTENCE_PARSER driver_family = _DriverFamily.CLASSIC _count = 0 def _init(self): self.__class__._count += 1 # pylint: disable=protected-access if self.__class__._count > 1: # pylint: disable=protected-access raise RuntimeError(f'Never instance more than one {self.__class__.__name__}!') import ckip_classic.parser self._core = ckip_classic.parser.CkipParser(do_ws=False) def _call(self, *, ws, pos): assert isinstance(ws, _SegParagraph) assert isinstance(pos, _SegParagraph) ws = _SegParagraph.from_list([map(self._half2full, line) for line in ws]) wspos_text = _WsPosParagraph.to_text(ws, pos) parsed_text = self._core.apply_list(wspos_text) parsed = _ParsedParagraph.from_text(parsed_text) return parsed @staticmethod def _half2full(text): return text \ .replace('(', '(') \ .replace(')', ')') \ .replace('+', '+') \ .replace('-', '-') \ .replace(':', ':') \ .replace('|', '|') \ .replace('&', '&') \ .replace('#', '#')