Source code for revscoring.languages.dutch

from .features import Dictionary, RegexMatches, Stemmed, Stopwords

# Language name; prepended to the name of every feature defined in this
# module (e.g. name + ".dictionary", name + ".badwords").
name = "dutch"

# Import outside the ``try``: the ``except`` clause below names
# ``enchant.errors.DictNotFoundError``, so if the import itself failed inside
# the ``try`` the name ``enchant`` would be unbound and evaluating the clause
# would raise NameError, hiding the real ImportError.
import enchant

try:
    dictionary = enchant.Dict("nl")
except enchant.errors.DictNotFoundError:
    # Report a missing dictionary as an ImportError for this language module.
    raise ImportError("No enchant-compatible dictionary found for 'nl'.  " +
                      "Consider installing 'myspell-nl'.")

# Rebind ``dictionary`` from the raw enchant dictionary to the feature set
# built around its ``check`` method.
dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
:class:`enchant.Dict` "nl".  Provided by `myspell-nl`
"""

try:
    from nltk.corpus import stopwords as nltk_stopwords
    # Accessing the corpus is what raises LookupError when it is missing.
    stopwords = set(nltk_stopwords.words('dutch'))
except LookupError:
    raise ImportError(
        "Could not load stopwords for {0}. "
        "You may need to install the nltk 'stopwords' "
        "corpora.  See http://www.nltk.org/data.html".format(__name__)
    )

# Rebind ``stopwords`` from the plain word set to the feature set.
stopwords = Stopwords(
    name + ".stopwords",
    stopwords
)
"""
:class:`~revscoring.languages.features.Stopwords` features built from the
"dutch" word list of :func:`nltk.corpus.stopwords`
"""

try:
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("dutch")
except ValueError:
    # Surface a stemmer construction failure as an ImportError of this module.
    raise ImportError(
        "Could not load stemmer for {0}. ".format(__name__)
    )

stemmed = Stemmed(
    name + ".stemmed",
    stemmer.stem
)
"""
:class:`~revscoring.languages.features.Stemmed` word features built on the
``stem`` method of :class:`nltk.stem.snowball.SnowballStemmer` "dutch"
"""

# Regex fragments for Dutch badwords.  One pattern per line, every literal a
# raw string; the order of the entries is significant only in that it is kept
# exactly as it was.
badword_regexes = [
    r"aars",
    r"an(aal|us)\w*",
    r"balhaar",
    r"drol(len)?",
    r"fack(en|ing|s)?",
    r"facking",
    r"flikkers?",
    r"focking",
    r"ge(ile?|lul)",
    r"geneukt",
    r"hoer(en?)?",
    r"homos?",
    r"kaka?",
    r"kak(hoofd|ken)",
    r"k[ae]nker",
    r"klootzak(ken)?",
    r"klote",
    r"kont(gat|je)?",
    r"pedo",
    r"penis(sen)?",
    r"peop",
    r"piemels?",
    r"pijpen",
    r"pik",
    r"pimel",
    r"pipi",
    r"poep(chinees?|en|hoofd)?",
    r"poep(ie|je|sex|te?)s?",
    r"porno?",
    r"neuke?",
    r"neuken(de)?",
    r"neukt(en?)?",
    r"stron(d|t)",
    r"suck(s|t)?",
    r"zuigt",
    r"sukkels?",
    r"ter(ing|ten)",
    r"tetten",
    r"tieten",
    r"vagina",
    r"verekte",
    r"verkracht",
    r"dikzak",
    r"dildo",
    r"mon?g(olen|ool)?",
    r"mooiboy",
    r"negers?",
    r"shit",
    r"sperma",
    r"kut(jes?)?",
    r"stelletje",
    r"losers?",
    r"lul(len)?",
    r"reet",
    r"scheet",
    r"scheten",
    r"schijt",
    r"diaree",
    r"slet",
    r"lekkerding",
    r"likken",
]

badwords = RegexMatches(
    name + ".badwords",
    badword_regexes
)
"""
:class:`~revscoring.languages.features.RegexMatches` features built from the
badword detecting patterns in ``badword_regexes``.
"""

# Regex fragments for informal Dutch words.  One pattern per line, each
# followed by a comma: adjacent string literals without a comma are silently
# concatenated by Python, which previously merged "doei" and "dombo" into the
# single pattern "doeidombo" so that neither word was listed.
informal_regexes = [
    r"aap(jes)?",
    r"banaan",
    r"bent",
    r"boe(it)?",
    r"doei",
    r"dombo",
    r"domme",
    r"eigelijk",
    r"godverdomme",
    r"groetjes",
    r"gwn",
    r"hoi",
    r"hal+o+",
    r"heb",
    r"hee+[jyl]",
    r"heee+l",
    r"houd?",
    r"(hoi+)+",
    r"hoor",
    r"izan",
    r"jij",
    r"jou",
    r"jullie",
    r"kaas",
    r"klopt",
    r"kots",
    r"kusjes",
    r"le?kke?re?",
    r"maarja",
    r"mama",
    r"nou",
    r"oma",
    r"ofzo",
    r"oke",
    r"sexy?",
    r"snap",
    r"stink(en|t)",
    r"stoer",
    r"swag",
    r"swek",
    r"vies",
    r"vieze",
    r"vind",
    r"vuile",
    r"xxx",
    r"yeah",
    r"zielig",
    r"zooi",
    r"yolo",
    r"zeg",
]

informals = RegexMatches(
    name + ".informals",
    informal_regexes
)
"""
:class:`~revscoring.languages.features.RegexMatches` features built from the
informal word detecting patterns in ``informal_regexes``.
"""

Revision Scoring

Navigation