Source code for rosetta.text.nlp

import re

from itertools import izip, chain


###############################################################################
# Globals
###############################################################################
stopwords_eng = set(
    'a,able,about,across,after,all,almost,also,am,among,an,'
    'and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,'
    'do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,'
    'hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,'
    'likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,'
    'only,or,other,our,own,rather,said,say,says,she,should,since,so,some,'
    'than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,'
    'us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,'
    'with,would,yet,you,your'.split(','))


def word_tokenize(text, L=1, numeric=True):
    """
    Word tokenizer to replace nltk.word_tokenize().

    Parameters
    ----------
    text : string
    L : int
        Minimum length of word to return.
    numeric : bool
        True if you want to include numeric tokens.

    Returns
    -------
    word_list : list of strings
    """
    # Replace punctuation and whitespace with single spaces.  A period is
    # only removed when followed by whitespace or end-of-string, so interior
    # periods in acronyms such as "U.S." survive.
    text = re.sub(
        r'(?:\s|\[|\]|\(|\)|\{|\}|;|,|\.(\s|$)|:|\n|\r|\?|\!|\"|\-)',
        r' ', text)
    # The trailing (?=\s|$) is a lookahead so the separator between tokens
    # is not consumed; a consuming group here would skip every other
    # space-separated token.
    if numeric:
        word_list = re.findall(
            r'(?:\s|^)([A-Za-z\.\'&]{%s,}|[0-9]{%s,}|'
            r'(?:(?<=.|\s)[A-Z]\.)+)(?=\s|$)' % (L, L), text)
    else:
        word_list = re.findall(
            r'(?:\s|^)([A-Za-z\.\'&]{%s,}|(?:(?<=.|\s)[A-Z]\.)+)(?=\s|$)'
            % L, text)
    return word_list
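
# Example usage (an illustrative sketch, not part of the original module;
# the sample text is made up):
#
#     word_tokenize("The U.S. economy grew in 2014", L=2)
#
# With the defaults, digit runs such as "2014" come back as tokens; pass
# numeric=False to drop them, and raise L to filter out short words.
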
def is_stopword(string):
    """True if the string, lowercased, is a common English stopword."""
    return string.lower() in stopwords_eng


def is_letter(s):
    """True if s is a single alphabetic character."""
    try:
        return len(s) == 1 and s.isalpha()
    except (TypeError, AttributeError):
        # Non-string input, e.g. an int, has no len() or .isalpha().
        return False
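
# Example usage (illustrative, not part of the original module): the two
# predicates compose naturally with word_tokenize for a quick
# stopword-filtering pass:
#
#     tokens = word_tokenize("the cat sat on the mat")
#     content_words = [w for w in tokens if not is_stopword(w)]
#
# is_letter is a convenient guard when a token stream may contain non-string
# junk, since it returns False rather than raising.
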
def bigram_tokenize_iter(
        text, word_tok=word_tokenize, skip_regex=r'\.|,|:|;|\?|!',
        **word_tok_kwargs):
    r"""
    Bigram tokenizer; returns an iterator over adjacent word pairs.

    Parameters
    ----------
    text : string
    word_tok : function
        A word tokenizer that takes in a string and returns a list of
        strings (tokens).
    skip_regex : string, or raw string, regular expression
        If a word pair is separated by a match of this regular expression,
        the pair is ignored; for example, r'\.|,|:|;|\?|!' ensures that no
        word pairs separated by basic punctuation are included.  To include
        all word pairs, let skip_regex=''.
    word_tok_kwargs : kwargs dict
        kwargs compatible with the word_tok API.

    Returns
    -------
    bigram_iter : iterator
    """
    # Split the text on the skip pattern, tokenize each fragment separately,
    # and chain together the within-fragment bigrams; pairs that straddle a
    # fragment boundary are never produced.
    text_frags = re.split(skip_regex, text)
    word_lists = [word_tok(frag, **word_tok_kwargs) for frag in text_frags]

    bigram_iter = izip(word_lists[0], word_lists[0][1:])
    for words in word_lists[1:]:
        bigram_iter = chain(bigram_iter, izip(words, words[1:]))

    return bigram_iter
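
# Example usage (illustrative, not part of the original module): with the
# default skip_regex, no bigram spans the comma below, so the pair
# ("tired", "she") is never produced:
#
#     for w1, w2 in bigram_tokenize_iter("she was tired, she slept"):
#         print w1, w2
#
# Because the result is a lazy iterator, it can feed a counter or a stream
# without materializing every pair at once.
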
def bigram_tokenize(text, word_tok=word_tokenize,
                    skip_regex=r'\.|,|:|;|\?|!', **word_tok_kwargs):
    """
    Same as bigram_tokenize_iter, except returns a list.
    """
    bigram_iter = bigram_tokenize_iter(
        text, word_tok, skip_regex, **word_tok_kwargs)

    return list(bigram_iter)


bigram_tokenize.__doc__ += bigram_tokenize_iter.__doc__
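
# Example usage (illustrative, not part of the original module):
#
#     bigram_tokenize("rosetta makes tokenizing easy")
#
# returns the same pairs as bigram_tokenize_iter, materialized as a list of
# 2-tuples.  The __doc__ concatenation above lets the list version inherit
# the full parameter documentation from the iterator version.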