Source code for ctrie

# -*- coding: UTF-8 -*-

"""
This module provides a compact trie (``CTrie``) implementation for Python.

.. moduleauthor:: Baptiste Fontaine <b@ptistefontaine.fr>
"""

__version__ = '0.1.1'


def _cut_prefix(prefix, word):
    """
    test if ``prefix`` is a prefix of ``word``. If so, return the trailing part
    of ``word``, or ``None`` if not.
    """
    if word.startswith(prefix):
        return word[len(prefix):]


# longuest common prefix
def _lcp(word1, word2):
    for i, (c1, c2) in enumerate(zip(word1, word2)):
        if c1 != c2:
            return word1[:i]

    return word1 if len(word1) < len(word2) else word2


[docs]class CTrie(object): """ A compact trie. A trie, or prefix tree, is a data structure to efficiently store a set of strings with a lot of shared prefixes, like URLs. Compact tries are recursively defined, a trie has zero or more other child tries. A trie is said to be terminal if it contains the empty string. Each child is labelled with a prefix for its own children. For example, one compact trie that contains words "foo", "bar" and "foo42" has two children, one labelled "foo" and the other labelled "bar". The last one is terminal because once we read "bar" we can end our walk in the trie. The first one is terminal for the same reason, but have one terminal child labelled "42". In the following symbolic representation, a terminated child is represented with a dot at the end of its label: :: <empty> | +-- bar. | +-- foo. | +-- 42. Each possible walk in the trie, starting from the root and ending on a terminal node is a contained word. In the previous example, the empty string is not present in the trie because the root node is not terminal. A non-compact trie contains only one char per node. >>> from ctrie import CTrie >>> ct = CTrie() >>> c.add("foo", "bar", "qux") >>> "foo" in c True >>> "Bar" in c False >>> c.remove("foo") >>> "foo" in c False >>> len(c) 2 Keyword arguments: - ``terminal`` (``bool``, default: ``False``): specifies if the trie contains the empty string. This is the same as: >>> ct = CTrie() >>> ct.add('') # empty string """ __slots__ = ['_children', 'terminal'] def __init__(self, terminal=False): self._children = {} self.terminal = terminal def _add(self, word): # empty string if word == '': if not self.terminal: self.terminal = True return True return False # no child if not self._children: self._children[word] = CTrie(terminal=True) return True # one child with a common prefix for prefix, child in self._children.items(): # one child with the word as a prefix if prefix == word: if not child.terminal: child.terminal = True return True return False trailing = _cut_prefix(prefix, word) if trailing is not None: return child.add(trailing) # split a child prefix """ The goal of the code below is to (if possible) go from the following trie: <root> | ... +-- foobar | +-- qux to the following one, after insertion of the word 'foo123': <root> | ... +-- foo | +-- bar | | | +-- qux | +-- 123 We're essentially creating an intermediate node (called ``middle`` in the following code) with the good prefix, which is the longest common prefix between the original node prefix and the inserted word. """ for prefix, child in self._children.items(): lcp = _lcp(prefix, word) if lcp: middle = CTrie() origin = prefix[len(lcp):] new_branch = word[len(lcp):] middle._children[origin] = child middle._children[new_branch] = CTrie(terminal=True) del self._children[prefix] self._children[lcp] = middle return True self._children[word] = CTrie(terminal=True) return True def _remove(self, word): """ Remove a word from the trie. """ if word == '': self.terminal = False return True for prefix, child in self._children.items(): trailing = _cut_prefix(prefix, word) if trailing is not None: ret = child._remove(trailing) if child.is_empty(): del self._children[prefix] return ret return False
[docs] def add(self, *words): """ Add one or more words in the trie. The function returns ``True`` if all words were successfully added. It'll return ``False`` if one or more words were already present in the trie. >>> ct = CTrie() >>> ct.add('foo', 'bar') True >>> 'foo' in ct True >>> ct.add('bar') False >>> ct.add('qux') True """ ret = True for word in words: ret &= self._add(word) return ret
[docs] def remove(self, *words): """ Remove one or more words from the trie. The function returns ``True`` if all the words were successfully removed (i.e. they were in the trie before) or ``False`` if one or more words couldn't be found in the trie. >>> ct = CTrie() >>> ct.add('foo', 'qux') True >>> 'foo' in ct True >>> ct.remove('foo', 'bar') False >>> 'foo' in ct False >>> ct.remove('qux') >>> True """ ret = True for word in words: ret &= self._remove(word) return ret
[docs] def is_empty(self): """ Return ``True`` if the trie is empty. >>> ct = CTrie() >>> ct.is_empty() True >>> ct.add('') True >>> ct.is_empty() False """ return not (self._children or self.terminal)
[docs] def height(self): """ Compute the height of the trie. An empty trie has a zero height, and the maximum height of a compacted trie is the length of its longuest word. >>> ct = CTrie() >>> ct.height() 0 >>> ct.add(''); ct.height() 0 >>> ct.add('foo'); ct.height() 1 >>> ct.add('bar'); ct.height() 1 >>> ct.add('boo'); ct.height() 2 .. versionadded:: 0.1.0 """ if not self._children: return 0 return 1 + max(map(lambda c: c.height(), self._children.values()))
[docs] def values(self): """ Yield all values from this trie in arbitrary order. The order in which they're yielded doesn't depend on the order in which they're inserted. >>> ct = CTrie() >>> ct.add('foo', 'bar', 'qux') True >>> for x in ct.values(): print x ... qux foo bar .. versionadded:: 0.1.0 """ if self.is_empty(): return for prefix, child in self._children.items(): if child.terminal: yield prefix for value in child.values(): yield prefix + value
def __contains__(self, word): """ Check if the trie contains a given word. """ if word == '' and self.terminal: return True for prefix, child in self._children.items(): trailing = _cut_prefix(prefix, word) if trailing is not None and trailing in child: return True return False def __len__(self): """ Compute the number of words in the trie. This result should be cached because we're recursively checking every node in the trie. """ init = 1 if self.terminal else 0 return init + sum(map(len, self._children.values())) def __iter__(self): return self.values() def __iadd__(self, other): for word in other: self._add(word) return self