Source code for revscoring.languages.spanish

from .features import Dictionary, RegexMatches, Stemmed, Stopwords

name = "spanish"

try:
    import enchant
    dictionary = enchant.Dict("es")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'es'.  " +
                      "Consider installing 'myspell-es'.")

dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
:class:`enchant.Dict` "es".  Provided by `myspell-es`
"""

try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('spanish'))
except LookupError:
    raise ImportError("Could not load stopwords for {0}. ".format(__name__) +
                      "You may need to install the nltk 'stopwords' " +
                      "corpora.  See http://www.nltk.org/data.html")

stopwords = Stopwords(name + ".stopwords", stopwords)
"""
:class:`~revscoring.languages.features.Stopwords` features provided by
:func:`nltk.corpus.stopwords` "spanish"
"""

try:
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("spanish")
except ValueError:
    raise ImportError("Could not load stemmer for {0}. ".format(__name__))

stemmed = Stemmed(name + ".stemmed", stemmer.stem)
"""
:class:`~revscoring.languages.features.Stemmed` word features via
:class:`nltk.stem.snowball.SnowballStemmer` "spanish"
"""

badword_regexes = [
    r"awe(onao|vo)",  # idiot
    r"babosa",  # slug
    r"bastardo",  # bastard
    r"bollo",  # drunk (to the extreme)
    r"boludo",  # asshole
    r"bugarr[óo]n",  # gay/bi man
    r"buseta",  # ???
    r"caca", r"\w*cag(a|o|ar(ro)?)(n(es)?)?\w*",  # shit / shits / dirty
    r"caquita",  # shit
    r"cabr[óo]n(es)?",  # bastards
    r"cagad[ao]",  # funny / absurd
    r"capullo",  # end of a penis
    r"carajo", r"chinga(r|da)?", r"chingu?en",  # fuck
    r"chichornia",  # ???
    r"chiguero",  # ???
    r"chima(r|da|s)?",  # fuck
    r"chingad(o|azo|erita)s?",  # fuck
    r"chingo(n[cs]i[cs]imo|ner[íi]a|nes|rrón)s?",  # fuck
    r"chino(n[cs]i[cs]imo)?",  # chinese
    r"chosto",  # ???
    r"choch[oa]",  # cunt
    r"cholo",  # mixed race person
    r"chucha",  # fuck
    r"\w*chupa(n|r|mea?|mel[ao]|medias|pollas|ban?)?s?\w*",
    r"chupen(la|me|mela)", r"chupo",  # variants of suck / blow me
    r"cipote",  # child (slang)
    r"cochetumadre",  # car of your mother ???
    r"co[gj]er",  # have sex / fuck
    r"cojio",  # wedgie
    r"cojones?",  # balls
    r"comeme",  # "eat me"
    r"comian",  # ???
    r"conch(a|etumadre)",  # vagina (of your mother)
    r"conejo",  # rabbit ???
    r"consolador",  # dildo
    r"co[ñn]o",  # dammit
    r"cupan",  # Misspelling of "chupan" ???
    r"cuca",  # vagina
    r"cule(ar|ros?)",  # fuck
    r"cul(o|ito|iaos?)",  # ass
    r"cundango",  # gay / homosexual
    r"desmadr([ae]r?|ado|arse)?",  # chaotic ???
    r"dlaversh",  # ???
    r"droga(ta)?s?",  # drugs
    r"emput(e|ar|ado)s?",  # pissed off
    r"enc", r"encabron(ar|ado|adas|arse)s?",  # pissed off
    r"encronada",  # ???
    r"enputado",  # bitch / complain
    r"fach(a|ero|era)",  # superficial appearance
    r"foll(o|ar|aban?)?",  # fuck
    r"fornicar",  # sex
    r"freg[oó]n",  # wet blanket -- no fun ???
    r"furcia",  # whore
    r"gabacho",  # frenchy (disrespectful)
    r"gamberro",  # thug
    r"gilipoll[ao]s?",  # asshole
    r"gitan[ao]s?",  # gypsy
    r"gord[ao]s?",  # fat
    r"gringo",  # US national (historically derogatory)
    r"guev(o|on(es)?)",  # cock / idiot
    r"g[uü]ey",  # bitch ???
    r"guiri",  # foreigner / tourist (derogatory)
    r"güila",  # ???
    r"golfa",  # slut
    r"huevos",  # balls
    r"huev[óo]n(es)?", r"huevospateados?",  # lazy ass
    r"hifueputa",  # ???
    r"hijode?putas?", r"hijoduta",  # motherfucker
    r"hijueputas?",  # son of a bitch
    r"holocuento",  # the Hollocaust
    r"idiotas?", r"jidiotas?",  # idiot
    r"imb[ée][cs]il(es)?",  # imbecile
    r"japo",  # japanese (derogatory)
    r"jetear",  # sleep (slang)
    r"jili(puerta|polla)s?",  # cock sucker ???
    r"joder",  # fuck
    r"jot(a|o|eria)s?",  # faggot
    r"jud[íi]o",  # jew (derogatory)
    r"jod(er|ido)",  # fucked
    r"komekaka",  # misspelling of 'come caca' -- "eat shit"
    r"lesbiana",  # lesbian
    r"lamadre",  # 'your mother' explitive
    r"mach(orra|etorra)",  # lesbian (offensive)
    r"madrea(do|r)",  # to beat someone up
    r"maldito",  # dammit
    r"malparid[oa]s?",  # miscarried
    r"mama(da|guevo)s?",  # blowjob
    r"mam[óo]+n(es)?",  # insolent little douchebag
    r"manola",  # the (female) hand one masturbates with
    r"mari[ck](a|[óo]ne?)s?",  # gay man
    r"marimach[ao]",  # tomboy
    r"maripos[óo]n",  # fag
    r"marrano",  # pig
    r"meachupan", r"meapelan",  # something 'suck me' ???
    r"me[- ]?castra[- ]?la[- ]?madre",  # 'I <something> your mother' ???
    r"me[- ]?la[- ]?pelan?s?",  # "You ain't shit" ???
    r"\w*mi?er[dg]a+s?\w*",  # shit
    r"mimsn",  # ???
    r"ming[ao]",  # ugly person
    r"mocos",  # boogers / mucus
    r"mojon",  # shit
    r"monda",  # man part / penis
    r"moro",  # moor / muslim (derogatory)
    r"nacio",  # ???
    r"nazi",  # nazi
    r"negrat[oa]",  # nigger
    r"n[oe]m[ea]mes",  # ???
    r"ojetes?", "ogts?",  # asshole / stingy
    r"paj(a|aro|ero)s?",  # wank / wanker
    r"paki",  # ???
    r"pamearlo",  # ???
    r"pario",  # ???
    r"pattaya",  # ???
    r"pedos?",  # pedofile
    r"pel(ado|an)",  # boy (derogatory)
    r"pelot(a|udo)s?",  # balls
    r"pendej(o|a|ada)s?",  # stupid people
    r"penes?",  # penis
    r"peos?",  # fart
    r"perras?",  # bitch
    r"petard[oa]s?",  # slow / retarded
    r"pich(a|ula)",  # cock
    r"pijas?",  # cock
    r"piko",  # ???
    r"pinches?",  # kitchen boy / insignificant
    r"pinga",  # dick
    r"pipi",  # small penis
    r"pirobos",  # fag
    r"pit(o|ito|ote)s?",  # penis
    r"pollas?",  # cock
    r"poronga",  # cock
    r"polvo",  # nut (testical)
    r"poto",  # ass
    r"prostituta",  # prostitute
    r"\w*put([ao]+|isim[oa]|iza)(s|n)?\w*",  # bitch/whore
    r"puñal",  # fag ???
    r"rabo",  # ass
    r"ramera",  # easy woman
    r"sida",  # AIDS
    r"sorete",  # piece of shit
    r"sudaca",  # greaser
    r"s[íi]filis",  # syphilis
    r"tajodido",  # ???
    r"tetas?",  # tits
    r"tont[ao]s?",  # fool / low intelligence
    r"torta",  # fat woman
    r"tortillera",  # lesbian
    r"tranca",  # bar ???
    r"travesti",  # transvestite
    r"travolo",  # ???
    r"trol(o|a)",  # faggot
    r"uta", r"utama", r"utamadre",  # short for 'puta'
    r"\w*verg(a|uero|ud[ao])s?\w*", r"versh",  # cock
    r"vibrador",  # vibrator
    r"viol(o|a|e|ar)",  # violate
    r"vulva",  # vulva
    r"watdafuq",  # "what the fuck"
    r"wea",  # shit
    r"webon(ada|e)?s?",  # Chileans (derogatory)
    r"zapatona",  # shoe ???
    r"zorr(a|ear)"  # bitch
]

badwords = RegexMatches(name + ".badwords", badword_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

informal_regexes = [
    r"agan",  # ???
    r"agregenme",  # ???
    r"aguante",  # stamina
    r"aki",  # ???
    r"amaona",  # ???
    r"amigui",  # best (girl) friend
    r"amo+",  # love
    r"apesta",  # stinks
    r"asco",  # disgust
    r"asi",  # so
    r"atte?",  # short for atentamente
    r"(b+l+a+h*)+",
    r"bobada",  # ???
    r"bobos?",  # stupid / simple / fool
    r"bubis",  # ???
    r"chafa",  # low quality
    r"chale",  # no way!
    r"chido",  # cool
    r"chí",  # ???
    r"chil[ei]ar",  # ???
    r"comi[ao]",  # ???
    r"copien",  # copy
    r"cursiva",  # italics
    r"esq",  # abbrev. of 'esquina' meaning "corner" -- used in addresses
    r"est[uú]pid[aeo]+[rs]?",  # stupid
    r"fulan[ao]",  # pronoun for person
    r"fe[ao]s?",  # ugly
    r"gra(x|sias)",  # thank you
    r"guapo",  # beautiful
    r"h?o+la+",  # hello
    r"ho+lis?",  # ???
    r"ijos",  # ???
    r"inserta",  # inserts
    r"j+[eaiou]+(j+[aeiou]*)*",  # jaja jeje jojojo
    r"ke",  # short for 'que'
    r"kie(n|ro)",  # I want
    r"komo",  # ???
    r"lean",  # ???
    r"lees",  # ???
    r"lo[ck]os?",  # crazy
    r"(l+[uo]+l+)([uo]+l+)*",  # lol lollloololol lulllololul
    r"madrazo",  # a crash (slang)
    r"malparida",  # miscaried
    r"mcfinnigan",  # ???
    r"mensos?",  # stupid / annoying person
    r"meti[oa]",  # ???
    r"metroflog",  # ???
    r"migu?is",  # ???
    r"muxo",  # ???
    r"negrit[ao]",  # black person (non-offensive)
    r"nocheto",  # ???
    r"noo+",  # noooo
    r"nop",  # nope
    r"ojala",  # hopefully
    r"o+l[ia]",
    r"osea",  # "that is"
    r"pollid",  # ???
    r"popo",  # ???
    r"pipi",  # small penis
    r"plis",  # ???
    r"por[- ]?favor",  # please
    r"por[- ]?[kq]ue?", r"porke", r"porqe?",  # why!?
    r"porquer[ií]as?",  # low quality
    r"profe",  # slang for teacher
    r"pupu",  # ???
    r"qiero",  # ???
    r"salud(o?s)?",  # cheers!
    r"sierto",  # ???
    r"shí",  # ???
    r"sii+",  # "yessss"
    r"soi",  # I am
    r"sophonpanich",  # ???
    r"ta?mbn", r"tkm",  # also (slang)
    r"tanga",  # thong
    r"te[- ]?quiero[- ]?mucho", r"tqm",  # I care for you a lot
    r"umaxnet",  # ???
    r"vallanse",  # move out
    r"vayanse",  # go away
    r"wen[ao]",  # Hello
    r"weon(es)?",  # dude / thing
    r"wey",  # buddy
    r"xd+",  # XD (smiley face squiting)
    r"xfarm",  # ???
    r"yolo",  # You Only Live Once
    r"zorpia"  # ???
]

informals = RegexMatches(name + ".informals", informal_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""

Revision Scoring

Navigation