Source code for ckipnlp.driver.ss

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
This module provides the built-in sentence segmentation driver.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2023 CKIP Lab'
__license__ = 'GPL-3.0'

import re as _re

from ckipnlp.container import (
    TextParagraph as _TextParagraph,
)

from .base import (
    BaseDriver as _BaseDriver,
)

################################################################################################################################

class CkipSentenceSegmenter(_BaseDriver):  # pylint: disable=too-few-public-methods
    """The CKIP sentence segmentation driver.

    Splits raw text into sentences at runs of delimiter characters.

    Arguments
    ---------
        lazy : bool
            Lazy initialize the driver.
        delims : str
            The delimiters. Every character is treated as a literal delimiter
            (regular-expression metacharacters are escaped).
        keep_delims : bool
            Keep the delimiters (attached to the end of the preceding sentence).

    .. method:: __call__(*, raw)

        Apply sentence segmentation.

        Parameters
            **raw** (*str*) — The raw text.

        Returns
            **text** (:class:`TextParagraph <ckipnlp.container.text.TextParagraph>`) — The sentences.
    """

    driver_type = 'sentence_segmenter'
    driver_family = 'default'
    driver_inputs = ('raw',)

    def __init__(self, *, lazy=False, delims='\n', keep_delims=False):
        super().__init__(lazy=lazy)

        self.delims = delims
        self._keep_delims = keep_delims

    def _init(self):
        # Nothing to load: the segmenter only relies on regular expressions.
        pass

    def _call(self, *, raw):
        """Segment ``raw`` into sentences.

        When delimiters are dropped, every whitespace character that is not
        itself a delimiter is removed first, the text is split at runs of
        delimiters, and empty sentences are discarded.

        When delimiters are kept, the text is split at runs of delimiters and
        each run is appended to the sentence that precedes it; whitespace is
        left untouched.
        """
        assert isinstance(raw, str)

        # Escape the delimiters so that characters such as ``]``, ``^``, ``-``
        # or ``\`` are matched literally inside the character classes below.
        # ``self.delims`` is read on every call since it is a public attribute.
        delims = _re.escape(self.delims)

        if not self._keep_delims:
            # Remove whitespace characters that are not delimiters
            text = _re.sub(rf'[^\S{delims}]', '', raw)

            # Segment (an empty delimiter set would form the invalid pattern
            # ``[]+``, so the whole text is then a single sentence)
            text = _re.split(rf'[{delims}]+', text) if delims else [text]

            # Remove empty lines
            text = [sent for sent in text if sent]

        else:
            # The capturing group makes ``split`` return the delimiter runs
            # too: [sentence, delims, sentence, delims, ..., sentence]
            text = _re.split(rf'([{delims}]+)', raw) if delims else [raw]

            # Drop the empty tail produced when the text ends with a delimiter
            if text[-1] == '':
                del text[-1]

            # Pad with an empty delimiter run so sentences and runs pair up
            if len(text) % 2:
                text.append('')
            text = [word+punct for word, punct in zip(text[::2], text[1::2])]

        return _TextParagraph.from_text(text)