# Source code for revscoring.languages.french

from .features import Dictionary, RegexMatches, Stemmed, Stopwords

name = "french"

# Import enchant *before* the ``try`` block.  The ``except`` clause below
# refers to ``enchant.errors``; if the import lived inside the ``try`` and
# failed, evaluating that clause would raise a NameError (``enchant`` would
# still be unbound) and hide the real ImportError from the caller.
import enchant

try:
    dictionary = enchant.Dict("fr")
except enchant.errors.DictNotFoundError:
    # Surface a missing "fr" dictionary as an ImportError with an install hint.
    raise ImportError("No enchant-compatible dictionary found for 'fr'.  " +
                      "Consider installing 'myspell-fr'.")

# ``dictionary`` is rebound here: the enchant dictionary's ``check`` method is
# captured by the feature set, which then takes over the module-level name.
dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
:class:`enchant.Dict` "fr".  Provided by `myspell-fr`
"""

try:
    from nltk.corpus import stopwords as nltk_stopwords
    # The nltk 'french' word list, extended with "a".
    stopwords = set(nltk_stopwords.words('french') + ["a"])
except LookupError:
    # A LookupError from the lines above is re-raised as an ImportError that
    # carries an installation hint.
    raise ImportError(
        ("Could not load stopwords for {0}. "
         "You may need to install the nltk 'stopwords' "
         "corpora.  See http://www.nltk.org/data.html").format(__name__))

# Rebind the module-level name from the plain set to the feature collection.
stopwords = Stopwords(name + ".stopwords", stopwords)
"""
:class:`~revscoring.languages.features.Stopwords` features built from the
:func:`nltk.corpus.stopwords` "french" word list (plus "a")
"""

try:
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("french")
except ValueError:
    # A ValueError from the lines above is reported as an ImportError.
    message = "Could not load stemmer for {0}. ".format(__name__)
    raise ImportError(message)

# The feature collection is built around the stemmer's ``stem`` method.
stemmed = Stemmed(name + ".stemmed", stemmer.stem)
"""
:class:`~revscoring.languages.features.Stemmed` word features built on
:class:`nltk.stem.snowball.SnowballStemmer` "french"
"""

# Regex fragments for French bad words.  Every pattern is written as a raw
# string and the original ordering is kept; entries are merely grouped by
# initial letter for readability.
badword_regexes = [
    # a - b
    r"anus", r"bais[eé]", r"baiz", r"batard?", r"bit+es?",
    r"branle(r|tte|ur)",
    # c
    r"cacas?", r"caliss", r"chiante?", r"chiasse",
    r"chi[eé](nne|r)?",
    r"chiot+e",
    r"con(ard?|nard?)?s?",
    r"conn(asse|e|erie)s?",
    r"couill(es?|on)", r"cul",
    # d - l
    r"d[ée]bile", r"ducon",
    r"encul[eé][rs]?",
    r"fesses?", r"fion", r"foutre",
    r"homosexuel",
    r"lesbien",
    # m - n
    r"(e[mn])?m[ae]rd(es?|ique)",
    r"ni(ke|gue)r?",
    r"niker",
    r"nique",
    r"niquer",
    # p
    r"pd",
    r"p[eé]dophile",
    r"p[eé]d[eé]",
    r"petasse", r"pipi", r"piss+e", r"poop", r"pour+i",
    r"prostitu[eé]+",
    r"proute?", r"pues?",
    r"put(a|ain|e|in)s?",
    r"pénis", r"pétasse",
    # q
    r"quequette",  # pecker
    r"queu",  # tail
    r"queue",  # tail
    # s
    r"salaud",
    r"salo(p|pe?|pes?)?",
    r"sodom(ie|iser)",
    r"stupide",
    r"suc[eé](u?r|use)?",
    # t - z
    r"tapette", r"teub",
    r"vagin",
    r"zboub", r"zizi",
]

badwords = RegexMatches(name + ".badwords", badword_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features built from the
list of badword detecting regexes above.
"""

# Regex fragments for informal French words.  The original ordering is kept;
# entries are merely packed several to a line.
informal_regexes = [
    r"ahah", r"allez", r"allo", r"bisous",
    r"(bla)+",
    r"bonjour", r"coucou",
    r"etais", r"etes",  # unaccented spellings
    r"ha(ha)+", r"hi(hi)+",
    r"insérez",
    r"jadore", r"jai",  # written without the apostrophe
    r"kikoo",
    r"lo+l", r"mdr+",
    r"moche",
    r"ouais?",
    r"ptdr",
    r"truc", r"voila", r"voulez",
]

informals = RegexMatches(name + ".informals", informal_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features built from the
list of informal word detecting regexes above.
"""

# Revision Scoring
#
# Navigation