Source code for ckipnlp.driver.classic

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides drivers with CkipClassic backend.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2023 CKIP Lab'
__license__ = 'GPL-3.0'

from abc import (
    abstractmethod as _abstractmethod,
)

from itertools import (
    chain as _chain,
)

from ckipnlp.container import (
    TextParagraph as _TextParagraph,
    SegParagraph as _SegParagraph,
    WsPosSentence as _WsPosSentence,
    WsPosParagraph as _WsPosParagraph,
    ParseParagraph as _ParseParagraph,
)

from .base import (
    BaseDriver as _BaseDriver,
)

################################################################################################################################

class CkipClassicWordSegmenter(_BaseDriver):
    """The CKIP word segmentation driver with CkipClassic backend.

    Arguments
    ---------
        lazy : bool
            Lazy initialize the driver.
        do_pos : bool
            Whether to return the POS-tags together with the segmentation.
        lexicons : Iterable[Tuple[str, str]]
            A list of the lexicon words and their POS-tags.

    Notes
    -----
        At most one backend may be initialized per class; initializing a
        second one raises :class:`RuntimeError`.

    .. method:: __call__(*, text)

        Apply word segmentation.

        Parameters
            **text** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The sentences.

        Returns
            - **ws** — The word-segmented sentences
              (first item produced by ``WsPosParagraph.from_text``).
            - **pos** — The part-of-speech sentences
              (second item produced by ``WsPosParagraph.from_text``;
              returned only if **do_pos** is set).
    """

    driver_type = None
    driver_family = 'classic'
    driver_inputs = None

    # Number of successfully initialized backends for this class.
    _count = 0

    def __init__(self, *, lazy=False, do_pos=False, lexicons=None):
        # Store the options BEFORE delegating to the base constructor: _init()
        # reads self._lexicons, and with lazy=False the base class is expected
        # to initialize the driver right away (base class not visible here —
        # confirm). Assigning first is harmless either way.
        self._do_pos = do_pos
        self._lexicons = lexicons
        super().__init__(lazy=lazy)

    def _init(self):
        """Create the CkipWs backend.

        Raises
        ------
            RuntimeError
                If a backend of this class has already been initialized.
        """
        cls = self.__class__
        if cls._count >= 1:  # pylint: disable=protected-access
            raise RuntimeError(f'Never instance more than one {cls.__name__}!')

        import ckip_classic.ws
        self._core = ckip_classic.ws.CkipWs(
            new_style_format=True,
            lex_list=self._lexicons,
        )

        # Count only after the backend was really created, so a failed import
        # or constructor does not permanently block later attempts.
        cls._count += 1  # pylint: disable=protected-access

    def _call(self, *, text):
        """Segment *text*; return ``ws`` or ``(ws, pos)`` depending on ``do_pos``."""
        assert isinstance(text, _TextParagraph)

        wspos_text = self._core.apply_list(text.to_text())
        ws, pos = _WsPosParagraph.from_text(wspos_text)

        return (ws, pos,) if self._do_pos else ws
class _CkipClassicWordSegmenter(CkipClassicWordSegmenter):
    """Pipeline variant of :class:`CkipClassicWordSegmenter` returning only the segmentation."""

    driver_type = 'word_segmenter'
    driver_family = 'classic'
    driver_inputs = ('text',)

    def __init__(self, *, lazy=False, lexicons=None):
        super().__init__(lazy=lazy, do_pos=False, lexicons=lexicons)

class _CkipClassic2WsPos(CkipClassicWordSegmenter):
    """Pipeline variant of :class:`CkipClassicWordSegmenter` returning the ``(ws, pos)`` pair."""

    driver_type = '_wspos'
    driver_family = '_classic'
    driver_inputs = ('text',)

    def __init__(self, *, lazy=False, lexicons=None):
        super().__init__(lazy=lazy, do_pos=True, lexicons=lexicons)

class _CkipClassic2WordSegmenter(_BaseDriver):
    """Pipeline helper that picks the word segmentation (item 0) out of a ``_wspos`` result."""

    driver_type = 'word_segmenter'
    driver_family = '_classic'
    driver_inputs = ('_wspos',)

    def _init(self):
        # Nothing to initialize: this driver only unpacks an existing result.
        pass

    def _call(self, *, _wspos):
        return _wspos[0]

class _CkipClassic2PosTagger(_BaseDriver):
    """Pipeline helper that picks the POS-tags (item 1) out of a ``_wspos`` result."""

    driver_type = 'pos_tagger'
    driver_family = '_classic'
    driver_inputs = ('_wspos',)

    def _init(self):
        # Nothing to initialize: this driver only unpacks an existing result.
        pass

    def _call(self, *, _wspos):
        return _wspos[1]

################################################################################################################################

class _CkipClassicConParser(_BaseDriver):
    """Shared implementation of the CkipClassic constituency parsing drivers.

    Subclasses provide ``driver_family`` and an ``_init`` that sets
    ``self._core`` to an object offering ``apply_list``.
    """

    driver_type = 'con_parser'
    driver_inputs = ('ws', 'pos',)

    @_abstractmethod
    def driver_family(self):  # pylint: disable=missing-docstring
        return NotImplemented

    @_abstractmethod
    def _init(self):
        return NotImplemented

    def _call(self, *, ws, pos):
        """Parse each sentence clause by clause.

        Every sentence is cut into clauses at tokens whose POS-tag ends with
        ``CATEGORY`` (except ``PAUSECATEGORY``); ``WHITESPACE`` tokens are
        dropped.  Each clause is parsed on its own and stored as a
        ``[parse_text, delimiter_text]`` pair, the delimiter tokens being
        appended to the pair of the clause preceding them.

        Parameters
        ----------
            ws : SegParagraph
                The word-segmented sentences.
            pos : SegParagraph
                The part-of-speech sentences, aligned with **ws**.

        Returns
        -------
            The result of ``ParseParagraph.from_list`` on the collected pairs.
        """
        assert isinstance(ws, _SegParagraph)
        assert isinstance(pos, _SegParagraph)

        conparse_text = []
        for ws_sent, pos_sent in zip(ws, pos):
            conparse_sent_text = []
            ws_clause = []
            pos_clause = []

            # The trailing (None, None) sentinel flushes the last clause.
            for ws_token, pos_token in _chain(zip(ws_sent, pos_sent), [(None, None),]):

                # Skip WHITESPACE
                if pos_token == 'WHITESPACE':
                    continue

                # Segment clauses by punctuations
                if pos_token is None or (
                        pos_token.endswith('CATEGORY') and pos_token != 'PAUSECATEGORY'):
                    if ws_clause:
                        wspos_clause_text = _WsPosSentence.to_text(ws_clause, pos_clause)
                        for conparse_clause_text in self._core.apply_list([wspos_clause_text]):
                            conparse_sent_text.append([self._normalize(conparse_clause_text), '',])

                    if ws_token:
                        # A delimiter before any clause gets a pair without parse text.
                        if not conparse_sent_text:
                            conparse_sent_text.append([None, '',])
                        conparse_sent_text[-1][1] += ws_token

                    ws_clause = []
                    pos_clause = []

                else:
                    ws_clause.append(self._half2full(ws_token))
                    pos_clause.append(pos_token)

            conparse_text.append(conparse_sent_text)
        conparse = _ParseParagraph.from_list(conparse_text)

        return conparse

    @staticmethod
    def _half2full(text):
        """Replace the ASCII characters ``( ) + - : |`` by their full-width forms.

        Presumably needed because these ASCII characters act as delimiters in
        the parser's textual format — confirm against the backend.  The
        targets are written as escapes so that the mapping survives
        width-normalizing tools (which would silently turn it into a no-op).
        """
        return text \
            .replace('(', '\uff08') \
            .replace(')', '\uff09') \
            .replace('+', '\uff0b') \
            .replace('-', '\uff0d') \
            .replace(':', '\uff1a') \
            .replace('|', '\uff5c')

    @staticmethod
    def _normalize(text):
        """Keep what follows the (at most second) ``'] '`` and strip trailing ``#``."""
        return text.split('] ', 2)[-1].rstrip('#')

################################################################################################################################
class CkipClassicConParser(_CkipClassicConParser):
    """The CKIP constituency parsing driver with CkipClassic backend.

    Arguments
    ---------
        lazy : bool
            Lazy initialize the driver.

    Notes
    -----
        At most one backend may be initialized per class; initializing a
        second one raises :class:`RuntimeError`.

    .. method:: __call__(*, ws, pos)

        Apply constituency parsing.

        Parameters
            - **ws** (:class:`~ckipnlp.container.SegParagraph`) — The word-segmented sentences.
            - **pos** (:class:`~ckipnlp.container.SegParagraph`) — The part-of-speech sentences.

        Returns
            **conparse** (:class:`~ckipnlp.container.ParseParagraph`) — The constituency-parsing sentences.
    """

    driver_family = 'classic'

    # Number of successfully initialized backends for this class.
    _count = 0

    def _init(self):
        """Create the CkipParser backend.

        Raises
        ------
            RuntimeError
                If a backend of this class has already been initialized.
        """
        cls = self.__class__
        if cls._count >= 1:  # pylint: disable=protected-access
            raise RuntimeError(f'Never instance more than one {cls.__name__}!')

        import ckip_classic.parser
        self._core = ckip_classic.parser.CkipParser(do_ws=False)

        # Count only after the backend was really created, so a failed import
        # or constructor does not permanently block later attempts.
        cls._count += 1  # pylint: disable=protected-access
class CkipClassicConParserClient(_CkipClassicConParser):
    """The CKIP constituency parsing driver with CkipClassic client backend.

    Arguments
    ---------
        lazy : bool
            Lazy initialize the driver.
        username : string
            (*optional*) The username of CkipClassicParserClient.
        password : string
            (*optional*) The password of CkipClassicParserClient.

    Notes
    -----
        Please register an account at http://parser.iis.sinica.edu.tw/v1/reg.php and
        set the environment variables ``$CKIPPARSER_USERNAME`` and ``$CKIPPARSER_PASSWORD``.

    .. method:: __call__(*, ws, pos)

        Apply constituency parsing.

        Parameters
            - **ws** (:class:`~ckipnlp.container.SegParagraph`) — The word-segmented sentences.
            - **pos** (:class:`~ckipnlp.container.SegParagraph`) — The part-of-speech sentences.

        Returns
            **conparse** (:class:`~ckipnlp.container.ParseParagraph`) — The constituency-parsing sentences.
    """

    driver_family = 'classic-client'

    def __init__(self, *, lazy=False, **opts):
        # Store the options BEFORE delegating to the base constructor: _init()
        # reads self._opts, and with lazy=False the base class is expected to
        # initialize the driver right away (base class not visible here —
        # confirm). Assigning first is harmless either way.
        self._opts = opts
        super().__init__(lazy=lazy)

    def _init(self):
        """Create the CkipParserClient backend from the stored keyword options."""
        import ckip_classic.client
        self._core = ckip_classic.client.CkipParserClient(**self._opts)