#!/usr/bin/env python3
# -*- coding:utf-8 -*-
__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'
import collections as _collections
import json as _json
from typing import (
NamedTuple,
)
import treelib as _treelib
################################################################################################################################
class ParserNodeData(NamedTuple):
    """A parser node.

    Every field defaults to ``None``. The plain-text form joins the non-empty
    fields with ``':'`` in the order ``role:pos:term``.
    """

    role: str = None  #: *str* – the role.
    pos: str = None  #: *str* – the part-of-speech (POS) tag.
    term: str = None  #: *str* – the text term.

    def __str__(self):
        return self.to_text()

    @classmethod
    def from_text(cls, text):
        """Construct an instance from :class:`ckipnlp.parser.CkipParser` output.

        Parameters
        ----------
            text : str
                text such as ``'Head:Na:中文字'``.

        Notes
        -----
            - ``'Head:Na:中文字'`` -> role = ``'Head'``, pos = ``'Na'``, term = ``'中文字'``
            - ``'Head:Na'``        -> role = ``'Head'``, pos = ``'Na'``, term = ``None``
            - ``'Na'``             -> role = ``None``,   pos = ``'Na'``, term = ``None``
            - ``'Head:COLONCATEGORY::'`` -> role = ``'Head'``, pos = ``'COLONCATEGORY'``, term = ``':'``
        """
        if ':' in text:
            # Split at most twice: anything after the second ':' belongs to the
            # term, so a term that itself contains ':' no longer overflows the
            # three fields (which used to raise TypeError).
            return cls(*text.split(':', 2))
        return cls(pos=text)

    def to_text(self):
        """Transform to plain text.

        Empty or ``None`` fields are skipped; the remaining ones are joined with ``':'``.

        Return
        ------
            str
        """
        return ':'.join(filter(None, self))

    @classmethod
    def from_dict(cls, data):
        """Construct an instance from python built-in containers.

        Parameters
        ----------
            data : dict
                dictionary such as ``{ 'role': 'Head', 'pos': 'Na', 'term': '中文字' }``
        """
        return cls(**data)

    def to_dict(self):
        """Transform to python built-in containers.

        Return
        ------
            dict
                mapping of field name to value, in field order.
        """
        return self._asdict()  # pylint: disable=no-member

    @classmethod
    def from_json(cls, data, **kwargs):
        """Construct an instance from JSON format.

        Parameters
        ----------
            data : str
                please refer :meth:`from_dict` for format details.
            **kwargs
                forwarded to :func:`json.loads`.
        """
        return cls.from_dict(_json.loads(data, **kwargs))

    def to_json(self, **kwargs):
        """Transform to JSON format.

        Parameters
        ----------
            **kwargs
                forwarded to :func:`json.dumps`.

        Return
        ------
            str
        """
        return _json.dumps(self.to_dict(), **kwargs)
class ParserNode(_treelib.Node):
    """A parser node for tree.

    Attributes
    ----------
        data : :class:`ParserNodeData`

    See Also
    --------
        treelib.tree.Node: Please refer `<https://treelib.readthedocs.io/>`_ for built-in usages.
    """

    # Class used to build the ``data`` payload of each node.
    data_class = ParserNodeData

    def __repr__(self):
        return f'{type(self).__name__}(tag={self.tag}, identifier={self.identifier})'

    def to_dict(self):
        """Transform to python built-in containers.

        Return
        ------
            dict
                ordered mapping with the node identifier under ``'id'`` and the
                node data (see :meth:`ParserNodeData.to_dict`) under ``'data'``.
        """
        result = _collections.OrderedDict()
        result['id'] = self.identifier
        result['data'] = self.data.to_dict()
        return result

    def to_json(self, **kwargs):
        """Transform to JSON format.

        Parameters
        ----------
            **kwargs
                forwarded to :func:`json.dumps`.

        Return
        ------
            str
        """
        payload = self.to_dict()
        return _json.dumps(payload, **kwargs)
class ParserRelation(NamedTuple):
    """A parser relation."""

    head: ParserNode  #: :class:`ParserNode` – the head node.
    tail: ParserNode  #: :class:`ParserNode` – the tail node.
    relation: str  #: *str* – the relation.

    def __repr__(self):
        # Print whichever node has the smaller identifier first.
        if self._head_first:
            template = '{name}(head={head}, tail={tail}, relation={relation})'
        else:
            template = '{name}(tail={tail}, head={head}, relation={relation})'
        return template.format(name=type(self).__name__, head=self.head, tail=self.tail, relation=self.relation)

    @property
    def _head_first(self):
        """Whether the head node's identifier is not greater than the tail node's."""
        return self.head.identifier <= self.tail.identifier

    def to_dict(self):
        """Transform to python built-in containers.

        Return
        ------
            dict
                ordered mapping with keys ``'head'``, ``'tail'`` (each a node
                dictionary as produced by the node's ``to_dict``) and ``'relation'``.
        """
        return _collections.OrderedDict(
            head=self.head.to_dict(),
            # Serialize the tail node itself (previously the head was emitted here by mistake).
            tail=self.tail.to_dict(),
            relation=self.relation,
        )

    def to_json(self, **kwargs):
        """Transform to JSON format.

        Parameters
        ----------
            **kwargs
                forwarded to :func:`json.dumps`.

        Return
        ------
            str
        """
        return _json.dumps(self.to_dict(), **kwargs)
################################################################################################################################
class ParserTree(_treelib.Tree):
    """A parsed tree.

    See Also
    --------
        treelib.tree.Tree: Please refer `<https://treelib.readthedocs.io/>`_ for built-in usages.
    """

    node_class = ParserNode

    @staticmethod
    def normalize_text(tree_text):
        """Text normalization for :class:`ckipnlp.parser.CkipParser` output.

        Remove leading number and trailing ``#``. Text without any ``#`` is returned unchanged.

        Parameters
        ----------
            tree_text : str
                the raw tree text.

        Return
        ------
            str
        """
        if '#' in tree_text:
            # Keep what follows the '] ' separator, then drop trailing '#' characters.
            tree_text = tree_text.split('] ', 2)[-1].rstrip('#')
        return tree_text

    def __str__(self):
        # Must return the text; falling through would return None and make str() raise TypeError.
        return self.to_text()

    @classmethod
    def from_text(cls, tree_text, *, normalize=True):
        """Create a :class:`ParserTree` object from :class:`ckipnlp.parser.CkipParser` output.

        Nodes receive consecutive integer identifiers starting at ``0`` in the
        order they appear in the text. ``(`` opens the children of the preceding
        node, ``|`` separates siblings and ``)`` closes the current children list.

        Parameters
        ----------
            tree_text : str
                A parsed tree from :class:`ckipnlp.parser.CkipParser` output.
            normalize : bool
                Do text normalization using :meth:`normalize_text`.

        Return
        ------
            :class:`ParserTree`
        """
        if normalize:
            tree_text = cls.normalize_text(tree_text)

        data_class = cls.node_class.data_class
        tree = cls()
        next_id = 0
        ancestors = [None]  # stack of open parent identifiers; None stands for "no parent" (root level)
        text = ''
        ending = True  # True when no label text is pending since the last ')' or '|'

        def add_node(label):
            """Attach a node labelled ``label`` under the innermost open parent; return its identifier."""
            nonlocal next_id
            node_id = next_id
            tree.create_node(tag=label, identifier=node_id, parent=ancestors[-1], data=data_class.from_text(label))
            next_id += 1
            return node_id

        for char in tree_text:
            if char == '(':
                # The pending label is an inner node; it becomes the parent of what follows.
                ancestors.append(add_node(text))
                text = ''
            elif char == ')':
                if not ending:
                    add_node(text)
                ancestors.pop()
                text = ''
                ending = True
            elif char == '|':
                if not ending:
                    add_node(text)
                text = ''
                ending = True
            else:
                ending = False
                text += char

        # A bare label without any parentheses (e.g. ``'Na'``) describes a single-node tree,
        # which is exactly what :meth:`to_text` emits for such a tree.
        # Only do this when nothing was created yet, so trailing text after a complete tree is still ignored.
        if not ending and next_id == 0 and text.strip():
            add_node(text)

        return tree

    def to_text(self, node_id=0):
        """Transform to plain text.

        Parameters
        ----------
            node_id : int
                ID of the root node of the subtree to be transformed.

        Return
        ------
            str
        """
        tree_text = self[node_id].data.to_text()
        children_text = '|'.join(self.to_text(child.identifier) for child in self.children(node_id))
        if children_text:
            tree_text = '{}({})'.format(tree_text, children_text)
        return tree_text

    @classmethod
    def from_dict(cls, data):
        """Construct an instance from python built-in containers.

        Parameters
        ----------
            data : dict
                dictionary such as ``{ 'id': 0, 'data': { ... }, 'children': [ ... ] }``,
                where ``'data'`` is a dictionary with the same format as :meth:`ParserNodeData.to_dict`,
                and ``'children'`` is a list of dictionaries of subtrees with the same format as this tree.
                A missing ``'children'`` entry is treated as an empty list.
        """
        tree = cls()

        # Breadth-first traversal: a parent is always created before its children.
        queue = _collections.deque([(data, None)])
        while queue:
            node_dict, parent_id = queue.popleft()
            node_id = node_dict['id']
            node_data = cls.node_class.data_class.from_dict(node_dict['data'])
            tree.create_node(tag=node_data.to_text(), identifier=node_id, parent=parent_id, data=node_data)
            queue.extend((child, node_id) for child in node_dict.get('children', ()))

        return tree

    def to_dict(self, node_id=0):  # pylint: disable=arguments-differ
        """Transform to python built-in containers.

        Parameters
        ----------
            node_id : int
                ID of the root node of the subtree to be transformed.

        Return
        ------
            dict
                the node dictionary extended with a ``'children'`` list of subtree dictionaries.
        """
        tree_dict = self[node_id].to_dict()
        tree_dict['children'] = [self.to_dict(child.identifier) for child in self.children(node_id)]
        return tree_dict

    @classmethod
    def from_json(cls, data, **kwargs):
        """Construct an instance from JSON format.

        Parameters
        ----------
            data : str
                please refer :meth:`from_dict` for format details.
            **kwargs
                forwarded to :func:`json.loads`.
        """
        return cls.from_dict(_json.loads(data, **kwargs))

    def to_json(self, node_id=0, **kwargs):  # pylint: disable=arguments-differ
        """Transform to JSON format.

        Parameters
        ----------
            node_id : int
                ID of the root node of the subtree to be transformed.
            **kwargs
                forwarded to :func:`json.dumps`.

        Return
        ------
            str
        """
        return _json.dumps(self.to_dict(node_id=node_id), **kwargs)

    def show(self, *,  # pylint: disable=arguments-differ
            key=lambda node: node.identifier,
            idhidden=False,
            **kwargs,
        ):
        """Show pretty tree.

        Delegates to the parent class ``show`` with ``key`` defaulting to the node
        identifier and ``idhidden`` defaulting to ``False``; other keyword
        arguments are passed through unchanged.
        """
        super().show(key=key, idhidden=idhidden, **kwargs)

    def get_children(self, node_id, *, role):
        """Get children of a node with given role.

        Parameters
        ----------
            node_id : int
                ID of target node.
            role : str
                the target role.

        Yields
        ------
            :class:`ParserNode`
                the children nodes with given role.
        """
        for child in self.children(node_id):
            if child.data.role == role:
                yield child

    def get_heads(self, root_id=0, *, semantic=True, deep=True):
        """Get all head nodes of a subtree.

        Parameters
        ----------
            root_id : int
                ID of the root node of target subtree.
            semantic : bool
                use semantic/syntactic policy. For semantic mode, return ``DUMMY`` or ``head`` instead of syntactic ``Head``.
            deep : bool
                find heads recursively.

        Yields
        ------
            :class:`ParserNode`
                the head nodes.
        """
        children = list(self.children(root_id))

        if not children:
            # No child, choose the root node instead
            head_nodes = [self[root_id]]
        else:
            # Role groups in decreasing priority; the first group matching any child wins.
            role_groups = []
            if semantic:
                role_groups.append(('DUMMY', 'DUMMY1', 'DUMMY2',))
                role_groups.append(('head',))
            role_groups.append(('Head',))

            for roles in role_groups:
                head_nodes = [child for child in children if child.data.role in roles]
                if head_nodes:
                    break
            else:
                # Found no head, choose the last child instead
                head_nodes = [children[-1]]

        # Recursion
        for node in head_nodes:
            if deep and not node.is_leaf():
                yield from self.get_heads(node.identifier, semantic=semantic)
            else:
                yield node

    def get_relations(self, root_id=0, *, semantic=True):
        """Get all relations of a subtree.

        For every head of the subtree, each non-head child contributes relations
        labelled with that child's role: the child itself when it is a leaf,
        otherwise the heads of the child's own subtree. The same procedure is
        then applied recursively to every child.

        Parameters
        ----------
            root_id : int
                ID of the subtree root node.
            semantic : bool
                please refer :meth:`get_heads` for policy detail.

        Yields
        ------
            :class:`ParserRelation`
                the relations.
        """
        children = list(self.children(root_id))
        # Direct head children of the root; they never act as tails of this level.
        head_children = list(self.get_heads(root_id, semantic=semantic, deep=False))

        # Get heads
        for head_node in self.get_heads(root_id, semantic=semantic):
            # Get tails
            for tail in children:
                if tail.data.role == 'Head' or tail in head_children:
                    continue
                if tail.is_leaf():
                    yield ParserRelation(head=head_node, tail=tail, relation=tail.data.role)
                else:
                    for node in self.get_heads(tail.identifier, semantic=semantic):
                        yield ParserRelation(head=head_node, tail=node, relation=tail.data.role)

        # Recursion
        for child in children:
            yield from self.get_relations(child.identifier, semantic=semantic)