LibInsult - Python
insult.py
Go to the documentation of this file.
1 ## \package insult
2 ## \copyright Copyright (C) 2016 Mattia Basaglia
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
17 import random
18 import os
19 import re
20 import json
21 
class WordList(object):
    """!
    A list of insulting words of some kind
    """

    def __init__(self, id, words=()):
        """!
        @param id    Anything accepted by word_list_id() (it is expanded
                     into a WordListId)
        @param words An iterable containing words to add to the list
        """
        id = word_list_id(id)

        ## Name of the word list
        self.name = id.name
        ## Set of words in the list
        self.words = set(words)
        ## Flags for the set of words @todo
        ## @note Two word lists can have the same name iff they have different flags
        self.flags = id.flags

    def add_word(self, word):
        """!
        Adds a word to the list (duplicates are ignored, words is a set)
        """
        self.words.add(word)

    def get(self, max_count=1, min_count=None):
        """!
        Retrieves a random subset of words
        @param max_count Maximum number of words to return,
                         clamped to the number of words in the list
        @param min_count Minimum number of words to return,
                         if omitted, returns exactly @p max_count words;
                         values larger than the (clamped) @p max_count
                         are lowered to it
        @returns A list of distinct words
        """
        available = len(self.words)
        if max_count > available:
            max_count = available
        # A minimum above the clamped maximum would make randint() raise
        # ValueError, so it is lowered to the maximum instead
        if min_count is None or min_count > max_count:
            min_count = max_count
        count = random.randint(min_count, max_count)
        # random.sample() rejects sets on recent Python versions,
        # so the population is turned into a sequence first
        return random.sample(list(self.words), count)

    def check_flags(self, flags):
        """!
        Checks if the flags match
        @todo Flags are not implemented yet, every value matches
        """
        return True
67 
68 
class WordListId(object):
    """!
    Identifier for a word list
    """

    def __init__(self, name, flags=0):
        """!
        @param name  Name of the word list to identify
        @param flags Flags of the word list to identify
        """
        self.name, self.flags = name, flags

    def check(self, word_list):
        """!
        Checks if the word list matches this Id
        @param word_list a WordList object
        @returns False when the names differ, otherwise the result of
                 word_list.check_flags() for the flags of this Id
        """
        if word_list.name != self.name:
            return False
        return word_list.check_flags(self.flags)
84 
85 
def word_list_id(*args):
    """!
    Workaround for the lack of overloading: builds a WordListId from any
    of the accepted argument forms

    Accepted forms:
      - word_list_id(name)           a string with the list name
      - word_list_id(id)             a WordListId, returned unchanged
      - word_list_id((name, flags))  a 2-tuple
      - word_list_id(name, flags)    two positional arguments

    @returns A WordListId object
    @throws TypeError if the arguments match none of the forms above
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3: basestring no longer exists

    if len(args) == 1:
        arg = args[0]
        if isinstance(arg, string_types):
            return WordListId(arg)
        if isinstance(arg, WordListId):
            return arg
        if isinstance(arg, tuple) and len(arg) == 2:
            return WordListId(*arg)
    elif len(args) == 2:
        return WordListId(*args)
    raise TypeError("Invalid arguments to the WordListId constructor")
100 
101 
class Insulter(object):
    """!
    Object that can generate insults
    @note all methods that take a WordListId argument, will expand it
          using word_list_id()
    """

    ## Regex used to recognize valid word file names
    regex_word_file = re.compile(r'^[a-z_]+$')
    ## File name for language rules
    rules_file = "rules.json"

    def __init__(self):
        """!
        Creates an insulter with no word lists, no limits and no rules
        """
        ## Word lists to look up
        self.word_lists = []
        ## Maximum number of repetitions allowed
        self.max_count = None
        ## Maximum number of repetitions allowed for a specific word list
        self.list_max_count = {}
        ## Language rules
        self.rules = {}

    def word_list(self, wl_id, add=False):
        """!
        Returns a matching word list
        @param wl_id WordListId to match the list
        @param add   if @c True, missing lists will be added to the insulter
        @throws LookupError (a subclass of Exception) if the list cannot
                be retrieved
        """
        wl_id = word_list_id(wl_id)
        for word_list in self.word_lists:
            if wl_id.check(word_list):
                return word_list

        if add:
            word_list = WordList(wl_id)
            self.word_lists.append(word_list)
            return word_list

        raise LookupError("Word list not found: %s" % wl_id.name)

    def load_directory(self, path):
        """!
        Loads all word lists in @p path
        @param path The path to the directory to load
        @note Only files matching regex_word_file will be considered
        @note The file named rules_file is loaded as language rules;
              if it does not contain valid JSON it is skipped with a warning
        """
        for basename in os.listdir(path):
            full = os.path.join(path, basename)
            if basename == Insulter.rules_file:
                try:
                    with open(full) as file:
                        rules = json.load(file)
                except ValueError as error:
                    # Best effort: a broken rules file must not prevent
                    # the word lists from loading, but it is reported
                    import warnings
                    warnings.warn(
                        "Ignoring malformed rules file %s: %s" % (full, error)
                    )
                    continue
                for name, rule in rules.items():
                    self.set_rules(name, rule)
            elif os.path.isfile(full) and self.regex_word_file.match(basename):
                # Read everything while the file is still open:
                # one word per line, blank lines are skipped
                with open(full) as file:
                    words = [
                        word
                        for word in (line.strip() for line in file)
                        if word
                    ]
                self.add_words(basename, words)

    def add_words(self, wl_id, words):
        """!
        Adds words to a word list, creating the list if needed
        @param wl_id WordListId to match the list
        @param words Iterable with words to be added
        """
        self.word_list(wl_id, True).words |= set(words)

    def get(self, wl_id, max_count=1, min_count=None):
        """!
        Retrieves a random subset of words from a word list
        @param wl_id     WordListId to match the list
        @param max_count Maximum number of words to return,
                         note that it will be checked against the values
                         set with set_max()
        @param min_count Minimum number of words to return,
                         if omitted, returns exactly @p max_count words
        @see set_max(), WordList.get()
        """
        wl_id = word_list_id(wl_id)
        if self.max_count is not None and max_count > self.max_count:
            max_count = self.max_count
        # Lists without a specific limit fall back to max_count itself,
        # which makes the comparison false
        if max_count > self.list_max_count.get(wl_id.name, max_count):
            max_count = self.list_max_count[wl_id.name]
        return self.word_list(wl_id).get(max_count, min_count)

    def format(self, string):
        """!
        Formats an insult string
        @param string Text with NotQuiteXml elements to be expanded
        @returns The text with every expanded element replaced by its
                 expansion; elements that cannot be expanded are replaced
                 by an empty string
        """
        doc = NotQuiteXml(string, lambda e: e.attrs.get("_expansion", ""))

        pending = [
            elem
            for elem in doc.contents
            if isinstance(elem, NotQuiteXmlElement)
        ]
        previous = []

        # Rules may depend on elements which are expanded later, so keep
        # retrying until a whole pass makes no progress
        while pending and len(pending) != len(previous):
            previous = pending
            pending = [
                elem
                for elem in previous
                if not self._expand_element(elem, doc)
            ]
        return str(doc)

    def _expand_element(self, element, doc):
        """!
        Tries to set the _expansion attribute of @p element
        @param element Element to expand, its tag name selects either a
                       language rule or a word list
        @param doc     Document containing @p element
        @returns @c True if the element has been expanded
        """
        try:
            if element.tag_name in self.rules:
                return self._expand_rule(element, doc)
            attrs = element.attrs
            min_count = None
            max_count = 1
            if "count" in attrs:
                max_count = int(attrs["count"])
            elif "max" in attrs:
                max_count = int(attrs["max"])
                min_count = int(attrs.get("min", max_count))
            words = self.get(element.tag_name, max_count, min_count)
            element._expansion = " ".join(words)
            return True
        except Exception:
            # Deliberately broad: any failure (missing list, bad number,
            # rule target not expanded yet...) only means "not expanded",
            # format() decides whether to retry
            return False

    def _expand_rule(self, element, doc):
        """!
        Expands @p element using the first language rule that matches
        the expansion of the element referenced by its target attribute
        @param element Element whose tag name identifies the rule set
        @param doc     Document used to look up the target by id
        @returns @c True if a rule matched
        """
        for rule in self.rules[element.tag_name]:
            pattern = rule["target"]
            target = doc.element_by_id(element.target)
            if re.match(pattern, target._expansion):
                element._expansion = re.sub(
                    pattern, rule["result"], target._expansion
                )
                return True
        return False

    def set_max(self, max_count, word_list=None):
        """!
        Set the maximum number of repetitions for get()
        @param max_count Maximum to be set,
                         if @c None will disable the maximum limit
        @param word_list Word list name (note: not a WordListId),
                         if @c None the global limit is changed
        """
        if word_list is None:
            self.max_count = max_count
        elif max_count is None:
            self.list_max_count.pop(str(word_list), None)
        else:
            self.list_max_count[str(word_list)] = max_count

    def set_rules(self, name, rules):
        """!
        Sets some language rules for the given identifier
        @param name  Tag name the rules apply to
        @param rules Rules to store; _expand_rule() reads them as a
                     sequence of mappings with "target" and "result" items
        """
        self.rules[name] = rules
257 
258 
class NotQuiteXml(object):
    """!
    Parses strings with flat xml elements intermixed with text
    Only a very minimal subset of xml/sgml is supported
    """
    _entities = {
        "lt": "<",
        "gt": ">",
        "amp": "&",
    }

    # basestring only exists on Python 2, fall back to str on Python 3
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, contents=None, to_string=lambda x: ""):
        """!
        @param contents  @c None or a string to be parsed
        @param to_string A functor to convert NotQuiteXmlElement
                         objects to a string
        """
        ## List mixing strings and NotQuiteXmlElement elements
        self.contents = []
        ## Elements with an ID
        self.elements_with_id = {}
        ## Functor to convert NotQuiteXmlElement objects to a string
        self.to_string = to_string

        if isinstance(contents, self._string_types):
            self.parse_string(contents)

    def element_by_id(self, id):
        """!
        Returns the element matching the given id
        @throws KeyError if no element has that id
        """
        return self.elements_with_id[id]

    def elements_by_tag_name(self, name):
        """!
        Returns a list of elements with the given tag name
        """
        return [
            element
            for element in self.contents
            if isinstance(element, NotQuiteXmlElement) and
                element.tag_name == name
        ]

    def elements_by_attribute(self, name, value):
        """!
        Returns a list of elements having the given attribute with the given value
        """
        return [
            element
            for element in self.contents
            if isinstance(element, NotQuiteXmlElement) and
                name in element.attrs and element.attrs[name] == value
        ]

    def __str__(self):
        """!
        Converts the document to a string (using self.to_string for elements)
        """
        return "".join(str(elem) for elem in self.contents)

    def __repr__(self):
        """!
        Rebuilds a source-like representation: text is kept as it is,
        elements use their own repr()
        """
        return "".join(
            elem if isinstance(elem, self._string_types) else repr(elem)
            for elem in self.contents
        )

    def parse_string(self, string):
        """!
        Parses a string into self.contents
        """
        self.contents = list(self._lex_text(iter(string)))

    def _lex_text(self, iterator):
        """!
        Internal lexer, starting state (text)
        yields elements for self.contents
        @param iterator A character iterator
        @note When the input ends inside an entity or an element, that
              unfinished item is discarded
        """
        string = ""
        try:
            while True:
                ch = next(iterator)
                if ch == "<":
                    if string:
                        yield string
                        string = ""
                    yield self._lex_elem_name(iterator)
                elif ch == "&":
                    string += self._lex_entity(iterator)
                else:
                    string += ch
        except StopIteration:
            # End of input, possibly raised by one of the nested states
            if string:
                yield string

    def _lex_entity(self, iterator):
        """!
        Internal lexer, entity state
        in:  (text) -> &
        out: ; -> (text)
        @returns A string corresponding to the entity,
                 empty for unknown entities
        @param iterator A character iterator
        """
        name = ""
        while True:
            ch = next(iterator)
            if ch == ";":
                break
            name += ch
        return self._entities.get(name, "")

    def _lex_elem_name(self, iterator):
        """!
        Internal lexer, element name state
        in:  (text) -> <
        out: /> | > | _ -> (attrs) -> (text)
        @returns The parsed NotQuiteXmlElement
        @param iterator A character iterator
        """
        name = ""
        while True:
            ch = next(iterator)
            if ch.isspace() or ch in "/>":
                element = NotQuiteXmlElement(self, name)
                self._lex_elem_attrs(ch, iterator, element)
                return element
            name += ch

    def _lex_elem_attrs(self, och, iterator, element):
        """!
        Internal lexer, element attributes state
        in:  (text) -> (element name) -> _ | > | />
        out: /> | > | -> (text)
        @param och      Character used to enter this state
        @param iterator A character iterator
        @param element  Element to set the attributes to
        @returns @p element
        """
        ch = och
        while True:
            if ch == ">":
                return element
            elif ch == "/":
                # Consumes the character following the slash
                # (the ">" of a "/>" tag end)
                next(iterator)
                return element
            elif ch.isspace() or ch == "":
                ch = next(iterator)
            else:
                ch = self._lex_elem_attr_name(ch, iterator, element)

    def _lex_elem_attr_name(self, och, iterator, element):
        """!
        Internal lexer, element attribute name
        in:  (attrs) -> [not space or tag end]
        out: = -> (attr_value) -> (attrs)
        out: /> | > | _ -> (attrs)
        @param och      Character used to enter this state
        @param iterator A character iterator
        @param element  Element to set the attributes to
        @returns A lookahead character
        """
        name = och
        while True:
            ch = next(iterator)
            if ch.isspace() or ch in "/>":
                # Attribute without a value: it gets its own name as
                # value, except for "id" which is ignored
                if name != "id":
                    element[name] = name
                return ch
            elif ch == "=":
                return self._lex_elem_attr_value(name, iterator, element)
            name += ch

    def _lex_elem_attr_value(self, name, iterator, element):
        """!
        Internal lexer, element attribute value
        in:  (attr_name) -> =
        out: (attr_name) -> =" -> (here) -> " -> (attrs)
        out: (attr_name) -> =' -> (here) -> ' -> (attrs)
        out: (attr_name) -> =  -> (here) -> _ -> (attrs)
        @param name     Name of the attribute
        @param iterator A character iterator
        @param element  Element to set the attributes to
        @returns A lookahead character
        """
        ch = next(iterator)
        value = ""

        if ch == '"' or ch == "'":
            # Quoted value: everything up to the matching quote
            delim = ch
            ch = next(iterator)
            while ch != delim:
                value += ch
                ch = next(iterator)
            self._elem_set_attr(element, name, value)
            # Skip the closing quote, the lookahead is what follows it
            return next(iterator)

        # Unquoted value: ends at the first space or tag end, which is
        # returned as lookahead without consuming anything beyond it
        # (this also covers the empty value of "<tag attr=>")
        while not (ch.isspace() or ch in "/>"):
            value += ch
            ch = next(iterator)
        self._elem_set_attr(element, name, value)
        return ch

    def _elem_set_attr(self, element, name, value):
        """!
        Internal, sets an element attribute during parsing
        @note An id which is already in use is silently ignored,
              an empty id clears the element id
        @todo handle this where you can set attributes in NotQuiteXmlElement
        """
        if name == "id":
            if value not in self.elements_with_id:
                element.id = value if value else None
        else:
            element[name] = value
479 
480 
class NotQuiteXmlElement(object):
    """!
    Non-text element in a NotQuiteXml document.
    Attributes can be accessed with the subscript operator or as members
    (if they don't clash with other members)
    """
    def __init__(self, document, tag_name, id=None, attrs=None):
        """!
        @param document A NotQuiteXml object which contains this element
        @param tag_name Name of the element tag in the source string
        @param id       Element id, must be unique in @p document or None
        @param attrs    Extra attributes (the mapping is copied),
                        @c None for no attributes
        """
        self._document = document
        self._tag_name = tag_name
        self.attrs = dict(attrs) if attrs else {}
        self._id = None # this must be last (see __setattr__)
        self.id = id

    @property
    def document(self):
        """!
        Document containing this element
        """
        return self._document

    @property
    def tag_name(self):
        """!
        Name of the element tag
        """
        return self._tag_name

    @property
    def id(self):
        """!
        Element id (or None), kept in sync with document.elements_with_id
        """
        return self._id

    @id.setter
    def id(self, value):
        if value == self._id:
            return

        if value is None:
            del self.id
            return

        if value in self.document.elements_with_id:
            raise KeyError("ID already in use: %s" % value)

        # Unregister the old id before registering the new one
        if self._id is not None:
            del self.document.elements_with_id[self._id]
        self._id = value
        self.document.elements_with_id[value] = self

    @id.deleter
    def id(self):
        if self._id is not None:
            del self.document.elements_with_id[self._id]
        self._id = None

    def __getitem__(self, key):
        return self.attrs[key]

    def __setitem__(self, key, value):
        self.attrs[key] = value

    def __delitem__(self, key):
        del self.attrs[key]

    def __getattr__(self, name):
        """!
        Falls back to the element attributes for unknown member names
        @note Python only calls this when the normal lookup has failed
        @throws AttributeError if @p name is not an element attribute
        """
        # attrs is read from __dict__ to avoid infinite recursion
        # when it has not been set yet
        attrs = self.__dict__.get("attrs")
        if attrs is not None and name in attrs:
            return attrs[name]
        raise AttributeError(
            "%s object has no attribute %r" % (type(self).__name__, name)
        )

    def __setattr__(self, name, value):
        """!
        Sets existing members normally, any other name becomes
        an element attribute
        @note Until _id is set (at the end of __init__()) everything
              is stored as a normal member
        """
        initializing = "_id" not in self.__dict__
        is_member = name in self.__dict__ or hasattr(type(self), name)
        if initializing or is_member:
            super(NotQuiteXmlElement, self).__setattr__(name, value)
        else:
            self.attrs[name] = value

    def __delattr__(self, name):
        """!
        Deletes a member or, if there is no such member,
        the element attribute with that name
        """
        try:
            super(NotQuiteXmlElement, self).__delattr__(name)
        except AttributeError:
            if name in self.attrs:
                del self.attrs[name]
            else:
                raise

    def __str__(self):
        """!
        Converts the element to a string using the document to_string attribute
        """
        return self.document.to_string(self)

    def __repr__(self):
        """!
        Tag-like representation, values are not escaped
        """
        return "<%s%s%s/>" % (
            self.tag_name,
            ' id="%s"' % self._id if self._id is not None else "",
            "".join(' %s="%s"' % attr for attr in self.attrs.items())
        )
def word_list(self, wl_id, add=False)
Returns a matching word list.
Definition: insult.py:126
to_string
Functor to convert NotQuiteXmlElement objects to a string.
Definition: insult.py:281
list_max_count
Maximum number of repetitions allowed for a specific word list.
Definition: insult.py:122
def __repr__(self)
Definition: insult.py:321
def check(self, word_list)
Checks if the word list matches this Id.
Definition: insult.py:78
def __init__(self, document, tag_name, id=None, attrs={})
Definition: insult.py:487
elements_with_id
Elements with an ID.
Definition: insult.py:279
def _lex_elem_attrs(self, och, iterator, element)
Internal lexer, element attributes state in: (text) -> (element name) -> _ | > | /> out: /> | > | -> ...
Definition: insult.py:390
def _elem_set_attr(self, element, name, value)
Internal, sets an element attribute during parsing.
Definition: insult.py:469
def check_flags(self, flags)
Checks if the flags match.
Definition: insult.py:61
words
Set of words in the list.
Definition: insult.py:37
def add_word(self, word)
Appends a word to the list.
Definition: insult.py:42
def __init__(self)
Definition: insult.py:114
def __init__(self, contents=None, to_string=lambda x:"")
Definition: insult.py:270
def __init__(self, name, flags=0)
Definition: insult.py:74
def add_words(self, wl_id, words)
Adds words to a word list.
Definition: insult.py:165
def _lex_entity(self, iterator)
Internal lexer, entity state in: (text) -> & out: ; -> (text)
Definition: insult.py:356
def _lex_elem_attr_value(self, name, iterator, element)
Internal lexer, element attribute value in: (attr_name) -> = out: (attr_name) -> =" -> (here) -> " ->...
Definition: insult.py:433
def set_rules(self, name, rules)
Sets some language rules for the given identifier.
Definition: insult.py:252
def __setattr__(self, name, value)
Definition: insult.py:553
def __getitem__(self, key)
Definition: insult.py:535
def _expand_element(self, element, doc)
Definition: insult.py:212
def __str__(self)
Converts the element to a string using the document to_string attribute.
Definition: insult.py:568
def set_max(self, max_count, word_list=None)
Set the maximum number of repetitions for get()
Definition: insult.py:237
Object that can generate insults.
Definition: insult.py:102
def __getattr__(self, name)
Definition: insult.py:544
name
Name of the word list.
Definition: insult.py:35
def __delitem__(self, key)
Definition: insult.py:541
def word_list_id(*args)
Workaround for the lack of overloading.
Definition: insult.py:86
def __setitem__(self, key, value)
Definition: insult.py:538
def parse_string(self, string)
Parses a string into self.contents.
Definition: insult.py:327
Non-text element in a NotQuiteXml document.
Definition: insult.py:481
def _lex_text(self, iterator)
Internal lexer, starting state (text) yields elements for self.contents.
Definition: insult.py:333
rules
Language rules.
Definition: insult.py:124
flags
Flags for the set of words.
Definition: insult.py:40
def get(self, wl_id, max_count=1, min_count=None)
Retrieves a random subset of words from a word list.
Definition: insult.py:173
def __str__(self)
Converts the document to a string (using self.to_string for elements)
Definition: insult.py:315
def get(self, max_count=1, min_count=None)
Retrieves a random subset of words.
Definition: insult.py:48
max_count
Maximum number of repetitions allowed.
Definition: insult.py:120
def load_directory(self, path)
Loads all word lists in path.
Definition: insult.py:144
Parses strings with flat xml elements intermixed with text Only a very minimal subset of xml/sgml is ...
Definition: insult.py:259
def __init__(self, id, words=[])
Definition: insult.py:27
A list of insulting words of some kind.
Definition: insult.py:22
def __delattr__(self, name)
Definition: insult.py:559
def elements_by_tag_name(self, name)
Returns a list of elements with the given tag name.
Definition: insult.py:292
contents
List mixing strings and NotQuiteXmlElement elements.
Definition: insult.py:277
def _lex_elem_attr_name(self, och, iterator, element)
Internal lexer, element attribute name in: (attrs) -> [not space or tag end] out: = -> (attr_value) -...
Definition: insult.py:411
def elements_by_attribute(self, name, value)
Returns a list of elements having the given attribute with the given value.
Definition: insult.py:303
def format(self, string)
Formats an insult string.
Definition: insult.py:191
def _lex_elem_name(self, iterator)
Internal lexer, element name state in: (text) -> < out: /> | > | _ -> (attrs) -> (text) ...
Definition: insult.py:373
def element_by_id(self, id)
Returns the element matching the given id.
Definition: insult.py:286
Identifier for a word list.
Definition: insult.py:69
def _expand_rule(self, element, doc)
Definition: insult.py:228
word_lists
Word lists to look up.
Definition: insult.py:118