from .features import Dictionary, RegexMatches, Stemmed, Stopwords
from .features.dictionary import utf16_cleanup
# Prefix for the feature names built in this module (e.g. name + ".dictionary").
name = "english"

# Import *outside* the ``try`` below.  The ``except`` clause references
# ``enchant.errors``; if the import itself failed inside the ``try``, matching
# the exception would look up the unbound name ``enchant`` and raise a
# NameError that hides the real ImportError.
import enchant

try:
    enchant_dict = enchant.Dict("en")
except enchant.errors.DictNotFoundError:
    # Reported as ImportError so a missing dictionary looks like a missing
    # dependency to whoever imports this module.
    raise ImportError("No enchant-compatible dictionary found for 'en'. " +
                      "Consider installing 'myspell-en-au', " +
                      "'myspell-en-gb', 'myspell-en-us' and/or " +
                      "'myspell-en-za'.")
def safe_dictionary_check(word):
    """
    Look `word` up in `enchant_dict`.

    The word is first passed through :func:`utf16_cleanup`; the cleaned value
    is handed to ``enchant_dict.check`` and that call's result is returned.
    """
    cleaned = utf16_cleanup(word)
    return enchant_dict.check(cleaned)
dictionary = Dictionary(
    name + ".dictionary",
    safe_dictionary_check,
)
"""
:class:`~revscoring.languages.features.Dictionary` features backed by the
:class:`enchant.Dict` for "en". The dictionary data is provided by
`myspell-en-au`, `myspell-en-gb`, `myspell-en-us`, and `myspell-en-za`.
"""
try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words("english"))
except LookupError:
    # Reported as ImportError, with a hint about installing the corpus data.
    raise ImportError(
        ("Could not load stopwords for {0}. "
         "You may need to install the nltk 'stopwords' "
         "corpora. See http://www.nltk.org/data.html").format(__name__)
    )

# Rebind `stopwords` from the plain word set to the feature built on top of it.
stopwords = Stopwords(
    name + ".stopwords",
    stopwords,
)
"""
:class:`~revscoring.languages.features.Stopwords` features built from the
"english" word list of :func:`nltk.corpus.stopwords`
"""
try:
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
except ValueError:
    # Reported as ImportError so a stemmer that cannot be constructed looks
    # like a missing dependency to whoever imports this module.
    stemmer_error = "Could not load stemmer for {0}. ".format(__name__)
    raise ImportError(stemmer_error)

stemmed = Stemmed(
    name + ".stemmed",
    stemmer.stem,
)
"""
:class:`~revscoring.languages.features.Stemmed` word features that stem with
the "english" :class:`nltk.stem.snowball.SnowballStemmer`
"""
# Patterns for offensive / vandalism-prone words, one per entry.  The order of
# the entries is kept exactly as it was, in case the consumer is
# order-sensitive (e.g. joins them into a single alternation).
badword_regexes = [
    # NOTE(review): inside the trailing group, the optional separator
    # ``[-_ ]?`` binds only to the first alternative ("butt"), not to the
    # whole group -- confirm this is intended.
    r"(fat|stupid|lazy)?a+[sr]+s+e*([-_ ]?butt|clown|face|hole|hat|e?s)?",
    r"autofel+at(e|io|ing|ion)s?",
    r"b+i+o?t+c+h+\w*",
    r"bootlip",
    r"blow(job|me)\w*",
    r"bollock\w*",
    r"boo+ger\w*",
    # NOTE(review): same precedence caveat as above -- ``[-_ ]?`` applies only
    # to "clown".
    r"(ass|arse)?b+u+t+t+([-_ ]?clown|face|hole|hat|es)?",
    r"bugg(er|ing)\w*",
    r"butthead",
    r"buttface",
    r"buttsex",
    r"buttf+u+c*k+\w*",
    r"chlamydia",
    r"cholo",
    r"chug",
    r"clunge\w*",
    r"cock\w*",
    r"coo+n\w*",
    r"[ck]racker\w*",
    r"c+u+n+t\w*",
    r"crack[-_ ]?head\w*",
    r"crooks?",
    r"defraud",
    r"(limp)?dick\w*",
    r"d+i+l+d+o+\w*",
    r"dishonest\w*",
    r"dot[-_ ]?head\w*",
    r"dyk(e|ing)\w*",
    r"(f|ph)a+g+(ot)?\w*",
    r"fart\w*",
    r"fraud",
    r"f+u+c*k+\w*",
    r"gh?[ea]+y+\w*",
    r"g[yi]p+(o|y|ie?)?",
    r"gyppie",
    r"goo+k",
    r"gringo",
    r"he+rpe+s",
    r"hill-?billy",
    r"hom(a|o|er)(sexual)?\w*",
    r"hooker\w*",
    r"injun\w*",
    r"j+a+p+o?",
    r"k[iy]+ke",
    r"kwash(i|ee)",
    r"\w*l+e+s+b+i+a+n+\w*",
    r"liar",
    r"lick(er)?s?",
    r"meth",
    r"meth[-_ ]?head\w*",
    r"naz+i",
    r"nig",
    r"\w*n+i+gg+[aeious]+\w*",
    r"niglet",
    r"nigor",
    r"nigr",
    r"nigra",
    r"nonc(e|ing)\w*",
    r"overdose[sd]",
    r"peckerwood\w*",
    r"p(a?e|æ)do((f|ph)[iy]le)?s?",
    r"peni(s)?\w*",
    r"piss\w*",
    r"prostitute\w*",
    r"pot[-_ ]?head\w*",
    r"q(w|u)ash(i|ee)",
    r"rag[-_ ]?head",
    r"red[-_ ]?(neck|skin)",
    r"round[-_ ]?eye",
    r"satan(ic|ism|ist)s?",
    r"scabies",
    r"s+h+[ia]+t+\w*",
    r"s+l+u+t+\w*",
    r"spi(g|c|k)+",
    r"spigotty",
    r"spik",
    r"spook",
    r"squarehead",
    r"stupid(s+h+[ia]+t+|c+u+n+t+|f+u+c*k+|t+w+a+t+|w+h+o+r+e+)\w*",
    r"subnormal",
    r"su+c*k+(er|iest|a)",
    r"syphil+is",
    r"terror(ist|ism|i[zs](e|ing|ed))s?",
    r"thei[fv](e?s)?",
    r"tran(ny|sexual)",
    r"t+w+a+t+\w*",
    r"ti+t+((s|ies|y)[\w]*)?",
    r"v+a+g+(i+n+a+)?",
    r"vajaja\w*",
    r"wank\w*",
    r"wetback\w*",
    r"w+h+o+r+(e+|ing)\w*",
    r"w+o+g+",
    r"w+o+p+",
    r"yank(e+)?",
    r"yid",
    r"zipperhead",
]

badwords = RegexMatches(
    name + ".badwords",
    badword_regexes,
)
"""
:class:`~revscoring.languages.features.RegexMatches` features built from
`badword_regexes`, a list of badword detecting regexes.
"""
# Patterns for informal / colloquial words, one per entry.  The order of the
# entries is kept exactly as it was, in case the consumer is order-sensitive.
# Every pattern is written as a raw string; none of them relies on Python
# string escapes.
informal_regexes = [
    r"ain'?t",
    r"a+we?some?(r|st)?",
    r"(b+l+a+h*)+",
    r"b+o+n+e+r+",
    r"boobs?",
    r"bullshit",
    r"bro",
    r"(bye)+",
    r"can'?t",
    r"[ck](oo+|e+w+)l+\w*",
    r"[ck]+r+a+p+(s|ier|iest)?",
    r"chu+g+",
    r"dad+(y|a)?",
    r"don'?t",
    r"dum+b*(y|ies|er|est)?(ass)?",
    r"d+u+d+e+\w*",
    r"good[-_]?bye",
    r"(mw?[au]+)?h+[aiou]+(h+[aeiou]*)*",
    r"h+[e]+(h+[aeiou]*)+",
    r"hel+o+",
    r"h(aa+|e+)y+",
    r"h+m+",
    r"i",
    r"i+d+i+o+t+",
    r"(la)+",
    r"loser",
    r"(l+[uo]+l+)([uo]+l+)*",
    r"l+m+a+o+",
    r"l[ou]+ve?",
    r"m+e+o+w+",
    r"munch\w*",
    r"mom+(y|a)?",
    r"moron",
    r"nerds?",
    r"noo+b(y|ie|s)?\w*",
    r"no+pe",
    r"o+k+(a+y+)?",
    r"o+m+g+\w*",
    r"poo+p\w*",
    r"retard\w*",
    r"tard",
    r"r+o+f+l+(mao)?",
    r"s+e+x+y+",
    r"so+rry",
    r"shove",
    r"smelly",
    r"soo+",
    r"stink(s|y)?",
    r"s+t+[uo]+p+i+d+\w*",
    r"suck(s|ing|er)?",
    r"sux",
    r"shouldn'?t",
    r"test +edit",
    r"t+u+r+d+s?\w*",
    r"wasn'?t",
    r"w+[oua]+t+",
    r"wtf\w*",
    r"wh?[ua]+t?[sz]+[ua]+p",
    r"s+u+p+",
    r"wu+z+",
    r"won'?t",
    r"w+o+o+f+",
    r"ya'?ll",
    r"y+a+y+",
    r"y+e+a+h?",
    r"you('?(ve|re|ll))?",
    r"y+o+l+o+",
]

informals = RegexMatches(
    name + ".informals",
    informal_regexes,
)
"""
:class:`~revscoring.languages.features.RegexMatches` features built from
`informal_regexes`, a list of informal word detecting regexes.
"""