Source code for ctrie

# -*- coding: UTF-8 -*-

This module provides a compact trie (``CTrie``) implementation for Python.

.. moduleauthor:: Baptiste Fontaine <[email protected]>

__version__ = '0.1.1'

from io import StringIO

def _cut_prefix(prefix, word):
    test if ``prefix`` is a prefix of ``word``. If so, return the trailing part
    of ``word``, or ``None`` if not.
    if word.startswith(prefix):
        return word[len(prefix):]

def _longuest_common_prefix(word1, word2):
    for i in range(min(len(word1), len(word2))):
        if word1[i] != word2[i]:
            return word1[:i]

    return word1 if len(word1) < len(word2) else word2

[docs]class CTrie(object): """ A compact trie. A trie, or prefix tree, is a data structure to efficiently store a set of strings with a lot of shared prefixes, like URLs. Compact tries are recursively defined, a trie has zero or more other child tries. A trie is said to be terminal if it contains the empty string. Each child is labelled with a prefix for its own children. For example, one compact trie that contains words "foo", "bar" and "foo42" has two children, one labelled "foo" and the other labelled "bar". The last one is terminal because once we read "bar" we can end our walk in the trie. The first one is terminal for the same reason, but have one terminal child labelled "42". In the following symbolic representation, a terminated child is represented with a dot at the end of its label: :: <empty> | +-- bar. | +-- foo. | +-- 42. Each possible walk in the trie, starting from the root and ending on a terminal node is a contained word. In the previous example, the empty string is not present in the trie because the root node is not terminal. A non-compact trie contains only one char per node. >>> from ctrie import CTrie >>> ct = CTrie() >>> c.add("foo", "bar", "qux") >>> "foo" in c True >>> "Bar" in c False >>> c.remove("foo") >>> "foo" in c False >>> len(c) 2 Keyword arguments: - ``terminal`` (``bool``, default: ``False``): specifies if the trie contains the empty string. This is the same as: >>> ct = CTrie() >>> ct.add('') # empty string """ __slots__ = ['_children', 'terminal'] def __init__(self, terminal=False): self._children = {} self.terminal = terminal def _add(self, word): # empty string if word == '': if not self.terminal: self.terminal = True return True return False # no child if not self._children: self._children[word] = CTrie(terminal=True) return True # one child with a common prefix for prefix, child in self._children.items(): # one child with the word as a prefix if prefix == word: if not child.terminal: child.terminal = True return True return False trailing = _cut_prefix(prefix, word) if trailing is not None: return child.add(trailing) # split a child prefix """ The goal of the code below is to (if possible) go from the following trie: <root> | ... +-- foobar | +-- qux to the following one, after insertion of the word 'foo123': <root> | ... +-- foo | +-- bar | | | +-- qux | +-- 123 We're essentially creating an intermediate node (called ``middle`` in the following code) with the good prefix, which is the longest common prefix between the original node prefix and the inserted word. """ for prefix, child in self._children.items(): lcp = _longuest_common_prefix(prefix, word) if lcp: middle = CTrie() origin = prefix[len(lcp):] new_branch = word[len(lcp):] middle._children[origin] = child middle._children[new_branch] = CTrie(terminal=True) del self._children[prefix] self._children[lcp] = middle return True self._children[word] = CTrie(terminal=True) return True def _remove(self, word): """ Remove a word from the trie. """ if word == '': self.terminal = False return True for prefix, child in self._children.items(): trailing = _cut_prefix(prefix, word) if trailing is not None: ret = child._remove(trailing) if child.is_empty(): del self._children[prefix] return ret return False
[docs] def add(self, *words): """ Add one or more words in the trie. The function returns ``True`` if all words were successfully added. It'll return ``False`` if one or more words were already present in the trie. >>> ct = CTrie() >>> ct.add('foo', 'bar') True >>> 'foo' in ct True >>> ct.add('bar') False >>> ct.add('qux') True """ ret = True for word in words: ret &= self._add(word) return ret
[docs] def remove(self, *words): """ Remove one or more words from the trie. The function returns ``True`` if all the words were successfully removed (i.e. they were in the trie before) or ``False`` if one or more words couldn't be found in the trie. >>> ct = CTrie() >>> ct.add('foo', 'qux') True >>> 'foo' in ct True >>> ct.remove('foo', 'bar') False >>> 'foo' in ct False >>> ct.remove('qux') >>> True """ ret = True for word in words: ret &= self._remove(word) return ret
[docs] def is_empty(self): """ Return ``True`` if the trie is empty. >>> ct = CTrie() >>> ct.is_empty() True >>> ct.add('') True >>> ct.is_empty() False """ return not (self._children or self.terminal)
[docs] def height(self): """ Compute the height of the trie. An empty trie has a zero height, and the maximum height of a compacted trie is the length of its longuest word. >>> ct = CTrie() >>> ct.height() 0 >>> ct.add(''); ct.height() 0 >>> ct.add('foo'); ct.height() 1 >>> ct.add('bar'); ct.height() 1 >>> ct.add('boo'); ct.height() 2 .. versionadded:: 0.1.0 """ if not self._children: return 0 return 1 + max(map(lambda c: c.height(), self._children.values()))
[docs] def values(self): """ Yield all values from this trie in arbitrary order. The order in which they're yielded doesn't depend on the order in which they're inserted. >>> ct = CTrie() >>> ct.add('foo', 'bar', 'qux') True >>> for x in ct.values(): print x ... qux foo bar .. versionadded:: 0.1.0 """ if self.is_empty(): return if self.terminal: yield '' for prefix, child in self._children.items(): for value in child.values(): yield prefix + value
[docs] def subtree(self, prefix): """ Return a subtree that's equivalent to the current one with all values stripped from the given prefix. Values that don't start with this prefix are not included. An empty trie is returned if the prefix doesn't match any value. Note that the returned subtree may share its nodes with the current one. >>> ct = CTrie() >>> ct.add('foo', 'bar', 'foobar', 'fooo', 'qux') True >>> foo = ct.subtree('foo') >>> list(foo) # actual order may vary ['', 'bar', 'o'] >>> ct.subtree('nope').is_empty() True """ if prefix == '': return self if prefix in self._children: return self._children[prefix] for node_prefix, node in self._children.items(): lcp = _longuest_common_prefix(prefix, node_prefix) if not lcp: continue root = CTrie() child = CTrie() suffix = node_prefix[len(lcp):] root._children[suffix] = child child._children = node._children return root return CTrie()
def _write_pretty_string(self, writer, indent_string): if self.terminal: writer.write("\n") for prefix, ch in self._children.items(): writer.write("%s%s" % (indent_string, prefix)) ch._write_pretty_string(writer, indent_string + " " * len(prefix)) return writer
[docs] def write_pretty_string(self, writer): """ Equivalent of ``pretty_string`` that uses the given writer's ``.write`` method. """ return self._write_pretty_string(writer, "")
[docs] def pretty_string(self): """ Return a pretty-printed version of the internal state of the tree. It helps understand how words are split to fit in a tree. """ writer = self.write_pretty_string(StringIO()) return
# Export # ======
[docs] def to_dict(self): """ Recursively convert the tree to a dictionary. """ return { "terminal": self.terminal, "children": {prefix: ch.to_dict() for prefix, ch in self._children.items()}, }
[docs] def from_dict(cls, d): """ Take a dictionary in the format returned by ``to_dict`` and build a new instance. """ instance = CTrie(terminal=bool(d.get("terminal"))) children = d.get("children", {}) if children: for prefix, ch in children.items(): instance._children[prefix] = cls.from_dict(ch) return instance
# Magic Methods # ============= def __contains__(self, word): """ Check if the trie contains a given word. """ if word == '' and self.terminal: return True for prefix, child in self._children.items(): trailing = _cut_prefix(prefix, word) if trailing is not None and trailing in child: return True return False def __len__(self): """ Compute the number of words in the trie. This result should be cached because we're recursively checking every node in the trie. """ total = 1 if self.terminal else 0 for ch in self._children.values(): total += len(ch) return total def __eq__(self, other): """ Two compact tries are equal if they contain the same set of words. This doesn't guarantee the internal structures are equal. """ if not isinstance(other, CTrie): return False # Bad complexity but there may be multiple compact tries that recognize # the same set of words because we don't re-compact a trie when we # remove a bunch of words from it. for word in self.values(): if word not in other: return False for word in other.values(): if word not in self: return False return True def __iter__(self): return self.values() def __nonzero__(self): return not self.empty() def __iadd__(self, other): for word in other: self._add(word) return self def __ior__(self, other): return self.__iadd__(other) def __isub__(self, other): for word in other: self._remove(word) return self def __iand__(self, other): words_to_remove = [] for word in self.values(): if word not in other: words_to_remove.append(word) for word in words_to_remove: self._remove(word) return self