Source code for ckipnlp.util.parser

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2019 CKIP Lab'
__license__ = 'CC-BY-NC-SA 4.0'

import collections as _collections
import itertools as _itertools
import json as _json

import treelib as _treelib

################################################################################################################################

_ParserNodeData = _collections.namedtuple('_ParserNodeData', ('role', 'pos', 'term',))
_ParserNodeData.__new__.__defaults__ = (None, None, None,)
[docs]class ParserNodeData(_ParserNodeData): """A parser node. Fields: * **role** (*str*): the role. * **pos** (*str*): the post-tag. * **term** (*str*): the text term. """
[docs] @classmethod def from_text(cls, text): """Create :class:`ParserNodeData` object from :class:`ckipnlp.parser.CkipParser` output.""" fields = text.split(':') return cls(*fields)
[docs] def to_dict(self): return self._asdict()
[docs] def to_json(self, **kwargs): return _json.dumps(self.to_dict(), **kwargs)
[docs]class ParserNode(_treelib.Node): """A parser node for tree."""
[docs] def to_dict(self): return _collections.OrderedDict(id=self.identifier, **self.data.to_dict())
[docs] def to_json(self, **kwargs): return _json.dumps(self.to_dict(), **kwargs)
_ParserRelation = _collections.namedtuple('_ParserRelation', ('head', 'tail', 'relation'))
[docs]class ParserRelation(_ParserRelation): """A parser relation. Fields: * **head** (:class:`ParserNode`): the head node. * **tail** (:class:`ParserNode`): the tail node. * **relation** (str): the relation. """ def __str__(self): ret = '{name}(head={head}, tail={tail}, relation={relation})' if self.head.identifier <= self.tail.identifier \ else '{name}(tail={tail}, head={head}, relation={relation})' return ret.format(name=type(self).__name__, head=self.head, tail=self.tail, relation=self.relation) def __repr__(self): return str(self)
[docs] def to_dict(self): return _collections.OrderedDict(head=self.head.to_dict(), tail=self.head.to_dict(), relation=self.relation)
[docs] def to_json(self, **kwargs): return _json.dumps(self.to_dict(), **kwargs)
################################################################################################################################
[docs]class ParserTree(_treelib.Tree): """A parsed tree."""
[docs] @classmethod def from_text(cls, tree_text): """Create :class:`ParserTree` object from :class:`ckipnlp.parser.CkipParser` output.""" tree = cls(node_class=ParserNode) if '#' in tree_text: tree_text = tree_text.split(' ', 2)[-1].split('#')[0] node_id = 0 node_queue = [None] text = 'root:' ending = True for char in tree_text: if char == '(': node_data = ParserNodeData.from_text(text) tree.create_node(tag=text, identifier=node_id, parent=node_queue[-1], data=node_data) node_queue.append(node_id) node_id += 1 text = '' elif char == ')': if not ending: node_data = ParserNodeData.from_text(text) tree.create_node(tag=text, identifier=node_id, parent=node_queue[-1], data=node_data) node_id += 1 node_queue.pop() text = '' ending = True elif char == '|': if not ending: node_data = ParserNodeData.from_text(text) tree.create_node(tag=text, identifier=node_id, parent=node_queue[-1], data=node_data) node_id += 1 text = '' ending = True else: ending = False text += char return tree
[docs] def to_dict(self, node_id=0): # pylint: disable=arguments-differ node = self[node_id] tree_dict = node.to_dict() for child in self.children(node_id): tree_dict.setdefault('children', list()).append(self.to_dict(child.identifier)) return tree_dict
[docs] def to_json(self, **kwargs): # pylint: disable=arguments-differ return _json.dumps(self.to_dict(), **kwargs)
[docs] def show(self, *, key=lambda node: node.identifier, idhidden=False, **kwargs): # pylint: disable=arguments-differ """Show pretty tree.""" super().show(key=key, idhidden=idhidden, **kwargs)
[docs] def has_dummies(self, node_id): """Determine if a node has dummies. Parameters ---------- node_id : int ID of target node. Returns ------- bool whether or not target node has dummies. """ roles = [node.data.role for node in self.children(node_id)] return 'DUMMY1' in roles and 'DUMMY2' in roles
[docs] def get_dummies(self, node_id, deep=True, _check=True): """Get dummies of a node. Parameters ---------- node_id : int ID of target node. deep : bool find dummies recursively. Returns ------- tuple the dummies (:class:`ParserNode`). Raises ------ LookupError when target node has no dummy (only when **_check** is set). """ if _check and not self.has_dummies(node_id): raise LookupError('Node ({node_id}) does not have dummies!'.format(node_id=node_id)) dummy1 = () dummy2 = () for child in self.children(node_id): if child.data.role == 'DUMMY1': if deep and self.has_dummies(child.identifier): dummy1 = self.get_dummies(child.identifier, deep=True, _check=False) else: dummy1 = (child,) if child.data.role == 'DUMMY2': if deep and self.has_dummies(child.identifier): dummy2 = self.get_dummies(child.identifier, deep=True, _check=False) else: dummy2 = (child,) return (*dummy1, *dummy2,)
[docs] def get_heads(self, root_id=0, deep=True): # pylint: disable=too-many-branches """Get all head nodes of a subtree. Parameters ---------- node_id : int ID of the root node of target subtree. deep : bool find heads recursively. Returns ------- list the head nodes (:class:`ParserNode`). :class:`ParserNode` the head node (when **deep** is set). Todo ---- Get information of nodes with pos type PP or GP. """ head_nodes = None children = list(self.children(root_id)) # No child, choose the root node instead if not children: head_nodes = (self[root_id],) # Find head if head_nodes is None: for child in children: if child.data.role == 'head': if not deep: head_nodes = (child,) else: if child.data.pos == 'Caa': # Found Caa, choose dummies of root instead head_nodes = tuple(_itertools.chain.from_iterable( self.get_heads(node.identifier) for node in self.get_dummies(root_id, _check=False) )) else: head_nodes = self.get_heads(child.identifier) break # Find Head if head_nodes is None: for child in children: if child.data.role == 'Head': if not deep: head_nodes = (child,) else: if child.data.pos == 'Caa': # Found Caa, choose dummies of root instead head_nodes = tuple(_itertools.chain.from_iterable( self.get_heads(node.identifier) for node in self.get_dummies(root_id, _check=False) )) else: head_nodes = self.get_heads(child.identifier) break # Found no head, choose the last child instead if head_nodes is None: head_nodes = (children[-1],) return head_nodes[0] if not deep else head_nodes
[docs] def get_relations(self, root_id=0): """Get all relations of a subtree. Parameters ---------- node_id : int ID of the subtree root node. Yields ------ :class:`ParserRelation` the relation. """ head_root_node = self.get_heads(root_id, deep=False) # Skip Caa if head_root_node.data.pos == 'Caa': return # Get heads for head_node in self.get_heads(root_id): # Get tails for tail in self.children(root_id): if tail.identifier != head_root_node.identifier: if tail.data.term: # if tail is a leaf node yield ParserRelation(head=head_node, tail=tail, relation=tail.data.role) else: for node in self.get_heads(tail.identifier): yield ParserRelation(head=head_node, tail=node, relation=tail.data.role) # Recursion for child in self.children(root_id): yield from self.get_relations(child.identifier)