python/koebi/potypo/potypo/chunkers.py

chunkers.py
from enchant.tokenize import Chunker

def make_PhraseChunker(phrases):
    """Build a ``Chunker`` class that cuts the given phrases out of the text.

    Args:
        phrases: Iterable of strings that must never reach the spell
            checker. It is copied once when the factory is called, so a
            one-shot iterable (e.g. a generator) is fine. Empty strings are
            ignored because they would match everywhere without consuming
            any text.

    Returns:
        A ``Chunker`` subclass (the class itself, not an instance).
    """
    # Snapshot once: next() runs for every chunk and must see the same,
    # re-iterable set of phrases each time.
    needles = tuple(p for p in phrases if p)

    class PhraseChunker(Chunker):
        """
        It may happen that certain phrases are used in a context that would
        mark them as being incorrectly spelled. We chunk everything but those
        phrases to ignore them.

        The algorithm finds the first index at which a phrase starts. In the
        case of multiple phrases, that is the first occurrence of any phrase
        (if two phrases start at the same index, the one listed first wins).
        It returns everything before that phrase and continues with the text
        after that phrase.
        """

        def next(self):
            """Return the next ``(chunk, offset)`` pair.

            ``chunk`` is the slice of the text from the current offset up to
            the next phrase (or to the end of the text when no phrase is
            left); ``offset`` is the absolute position where the chunk
            starts. The chunk is empty when a phrase starts exactly at the
            current offset.

            Raises:
                StopIteration: when the current offset is at or past the end
                    of the text.
            """
            text = self._text
            offset = self.offset

            if offset >= len(text):
                raise StopIteration

            remainder = text[offset:]
            # The text is expected to offer tounicode() (as array('u') does);
            # a plain str has no such method and can be searched directly.
            if hasattr(remainder, "tounicode"):
                haystack = remainder.tounicode()
            else:
                haystack = remainder

            # Earliest phrase occurrence, as a position inside `haystack`
            # (i.e. relative to `offset`, not to the start of the text).
            nearest = -1
            needle = None
            for phrase in needles:
                found = haystack.find(phrase)
                if found != -1 and (nearest == -1 or found < nearest):
                    nearest = found
                    needle = phrase

            if needle is None:
                # No phrase left: the rest of the text is the final chunk.
                self.set_offset(len(text))
                return (text[offset:], offset)

            # Translate the haystack-relative hit back to an absolute text
            # position before slicing and before moving the offset.
            phrase_start = offset + nearest
            self.set_offset(phrase_start + len(needle))
            return (text[offset:phrase_start], offset)

    return PhraseChunker