Source code for rosetta.text.nlp

import re

from itertools import izip, chain


###############################################################################
# Globals
###############################################################################
stopwords_eng = set(
    'a,able,about,across,after,all,almost,also,am,among,an,'
    'and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,'
    'do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,'
    'hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,'
    'likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,'
    'only,or,other,our,own,rather,said,say,says,she,should,since,so,some,'
    'than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,'
    'us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,'
    'with,would,yet,you,your'.split(','))


[docs]def word_tokenize(text, L=1, numeric=True):
    """
    Word tokenizer to replace the nltk.word_tokenize()

    Paramters
    ---------
    text : string
    L : int, min length of word to return
    numeric : bool, True if you want to include numerics
    """
    text = re.sub(
        r'(?:\s|\[|\]|\(|\)|\{|\}|;|,|\.(\s|$)|:|\n|\r|\?|\!|\"|\-)', r'  ', 
        text)
    if numeric:
        word_list = re.findall(
            r'(?:\s|^)([A-Za-z\.\'&]{%s,}|[0-9]{%s,}|\
            (?:(?<=.|\s)[A-Z]\.)+)(?:\s|$)' % (L, L), text)
    else:
        word_list = re.findall(
            r'(?:\s|^)([A-Za-z\.\'&]{%s,}|(?:(?<=.|\s)[A-Z]\.)+)(?:\s|$)' % L, 
            text)

    return word_list


def is_stopword(string):
    return string.lower() in stopwords_eng


def is_letter(s):
    try:
        return len(s) == 1 and s.isalpha()
    except:
        return False


[docs]def bigram_tokenize_iter(
    text, word_tok=word_tokenize, skip_regex=r'\.|,|:|;|\?|!',
    **word_tok_kwargs):
    """
    Bigram tokenizer generator function.

    Paramters
    ---------
    text : string
    word_tok : function
        a word tokenizer function that takes in a string and returns a list
        of strings (tokens)
    skip_regex : string, or raw string, regular expression
        if a word pair is seperated by a match of the regular expression the
        pair will be ingored; for example, r'\.|,|:|;|\?|!' makes sure no word
        pairs separated by basic punctation are included; to inlcude all word
        pairs let skip_regex=''
    word_tok_kwargs : kwargs dict
        kwargs compatible with the work_tok api
    Returns
    -------
    bigram_iter : iterator
    """
    text_frags = re.split(skip_regex, text)
    word_lists = [word_tok(frag, **word_tok_kwargs) for frag in text_frags]
    bigram_iter = izip(word_lists[0], word_lists[0][1:])
    for words in word_lists[1:]:
        bigram_iter = chain(bigram_iter, izip(words, words[1:]))
    return bigram_iter


[docs]def bigram_tokenize(text, word_tok=word_tokenize,
        skip_regex=r'\.|,|:|;|\?|!', **word_tok_kwargs):
    """
    Same as bigram_tokenize_iter, except returns a list.
    """
    bigram_iter = bigram_tokenize_iter(text, word_tok, skip_regex,
            **word_tok_kwargs)
    return [bg for bg in bigram_iter]

bigram_tokenize.__doc__ += bigram_tokenize_iter.__doc__