# Source code for textblob.tokenizers

"""Various tokenizer implementations.

.. versionadded:: 0.4.0
"""

from itertools import chain

import nltk

from textblob.base import BaseTokenizer
from textblob.decorators import requires_nltk_corpus
from textblob.utils import strip_punc



class WordTokenizer(BaseTokenizer):
    """NLTK's recommended word tokenizer (currently the TreeBankTokenizer).
    Uses regular expressions to tokenize text. Assumes text has already been
    segmented into sentences.

    Performs the following steps:

    * split standard contractions, e.g. don't -> do n't
    * split commas and single quotes
    * separate periods that appear at the end of line
    """

    def tokenize(self, text, include_punc=True):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to
            include punctuation as separate tokens. Defaults to True.
        :returns: the tokens produced by ``nltk.tokenize.word_tokenize``
            unchanged when ``include_punc`` is true; otherwise a new list in
            which tokens have been passed through ``strip_punc`` and tokens
            that end up empty are dropped.
        """
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens

        # Strip punctuation unless the word comes from a contraction:
        # e.g. "Let's" => ["Let", "'s"]
        # e.g. "Can't" => ["Ca", "n't"]
        # e.g. "home." => ['home']
        # Each token is stripped exactly once; the stripped form is used both
        # to decide whether the token survives and as its replacement.
        stripped_pairs = (
            (word, strip_punc(word, all=False)) for word in tokens
        )
        return [
            # Tokens starting with an apostrophe (contraction suffixes such
            # as "'s") are kept verbatim rather than in stripped form.
            word if word.startswith("'") else bare
            for word, bare in stripped_pairs
            if bare  # drop tokens that are empty after stripping
        ]





class SentenceTokenizer(BaseTokenizer):
    """NLTK's sentence tokenizer (currently PunktSentenceTokenizer).
    Uses an unsupervised algorithm to build a model for abbreviation words,
    collocations, and words that start sentences,
    then uses that to find sentence boundaries.
    """

    @requires_nltk_corpus
    def tokenize(self, text):
        """Return a list of sentences.

        :param text: string of text to split into sentences.
        :returns: the result of ``nltk.tokenize.sent_tokenize(text)``.
        """
        # NOTE(review): ``requires_nltk_corpus`` is defined in
        # textblob.decorators (not shown here); presumably it reports a
        # missing NLTK corpus in a friendlier way -- confirm there.
        return nltk.tokenize.sent_tokenize(text)




# Shared WordTokenizer instance, created once at import time and reused by
# the module-level word tokenizing helper.
_word_tokenizer = WordTokenizer()

#: Convenience function for tokenizing sentences
#: (the bound ``itokenize`` method of a module-level SentenceTokenizer).
sent_tokenize = SentenceTokenizer().itokenize



def word_tokenize(text, include_punc=True, *args, **kwargs):
    """Tokenize text into words (convenience wrapper).

    NOTE: NLTK's word tokenizer expects sentences as input, so the text is
    first split into sentences, and each sentence is then split into words.

    :param text: string of text.
    :param include_punc: (optional) forwarded to the word tokenizer for
        every sentence. Defaults to True.
    :param args: extra positional arguments forwarded to the word tokenizer.
    :param kwargs: extra keyword arguments forwarded to the word tokenizer.
    :returns: an ``itertools.chain`` iterator over the word tokens of all
        sentences, in sentence order.
    """
    sentences = sent_tokenize(text)
    # One token iterable per sentence; built lazily as the chain is consumed.
    per_sentence_tokens = (
        _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs)
        for sentence in sentences
    )
    return chain.from_iterable(per_sentence_tokens)
# Source code for textblob.tokenizers
#
# Useful Links
#
# Related Topics