# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# Copyright (C) 2013-2015 Samuel Damashek, Peter Foley, James Forcier, Srijay Kasturi, Reed Koser, Christopher Reffett, and Fox Wilson
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
import collections
import re
import string
import time
from sqlalchemy import Index, or_
from sqlalchemy.exc import OperationalError
from .orm import Babble, Babble2, Babble_count, Babble_last, Log
def get_messages(cursor, cmdchar, ctrlchan, speaker, newer_than_id):
    """Return the log rows that feed the markov chain, ordered by id.

    Only rows with an id greater than ``newer_than_id`` and a type of
    'pubmsg', 'privmsg' or 'action' are returned.  Rows whose message starts
    with ``cmdchar`` or whose target equals ``ctrlchan`` are left out.

    When ``speaker`` is not None it is matched with ``ilike`` (``$`` is the
    escape character) against the target column if it starts with ``#``,
    ``+`` or ``@``, and against the source column otherwise.
    """
    chat_types = or_(Log.type == 'pubmsg', Log.type == 'privmsg', Log.type == 'action')
    criteria = [
        Log.id > newer_than_id,
        # Ignore commands, and messages addressed to the ctrlchan
        chat_types,
        ~Log.msg.startswith(cmdchar),
        Log.target != ctrlchan,
    ]
    if speaker is not None:
        if speaker.startswith(('#', '+', '@')):
            column = Log.target
        else:
            column = Log.source
        criteria.append(column.ilike(speaker, escape='$'))
    return cursor.query(Log).filter(*criteria).order_by(Log.id).all()
# Matches tokens that start with http:// or https://, and tokens made up
# entirely of digits and punctuation.
# Don't exclude (, because lenny.
# The punctuation is run through re.escape so every character is taken
# literally inside the character class.  Interpolated raw, the `\]` pair in
# string.punctuation is parsed as an escaped `]`, which silently drops the
# backslash from the set of excluded characters.
exclude_re = re.compile(r'https?://|^[0-9%s]+$' % re.escape(string.punctuation.replace('(', '')))
def clean_msg(msg):
    """Split ``msg`` on whitespace and drop every token ``exclude_re`` matches.

    Returns the remaining tokens as a list, in their original order.
    """
    kept = []
    for token in msg.split():
        if exclude_re.match(token):
            continue
        kept.append(token)
    return kept
def get_markov(cursor, length, node, initial_run):
    """Return a ``defaultdict(int)`` of word -> frequency for ``node``.

    ``node`` is a ``(key, source, target)`` tuple.  On an initial run nothing
    is queried and the mapping starts out empty.  Otherwise it is seeded with
    the word/freq pairs of the matching rows in ``Babble`` (``length == 1``)
    or ``Babble2`` (any other length).
    """
    freqs = collections.defaultdict(int)
    if not initial_run:
        model = Babble2 if length != 1 else Babble
        key, source, target = node
        stored = cursor.query(model).filter(
            model.key == key,
            model.source == source,
            model.target == target,
        ).all()
        for entry in stored:
            freqs[entry.word] = entry.freq
    return freqs
def update_count(cursor, rows, length, source, target):
    """Increment the per-source and per-target entry counters by one.

    ``rows`` is the list of ``Babble_count`` rows already known for this
    ``length``.  For each of ('source', ``source``) and ('target', ``target``)
    the first row with that type and key has its ``count`` incremented.  If no
    such row exists, a new ``Babble_count`` with ``count=1`` is added to
    ``cursor`` and appended to ``rows``.

    ``rows`` must therefore be a mutable list.  It is extended in place so
    that later calls made with the same list find the pending row and
    increment it, instead of adding a second row for the same type and key.
    """
    for kind, key in (('source', source), ('target', target)):
        existing = next((r for r in rows if r.type == kind and r.key == key), None)
        if existing is not None:
            existing.count = existing.count + 1
            continue
        created = Babble_count(type=kind, length=length, key=key, count=1)
        cursor.add(created)
        # Remember the pending row: without this, every further new entry for
        # the same key in this run would add another count=1 row.
        rows.append(created)
def generate_markov(cursor, length, messages, initial_run):
    """Accumulate next-word counts for every context found in ``messages``.

    Each message is tokenised with ``clean_msg``.  For every word at index
    ``length`` or later, the context is the previous word (``length == 1``)
    or the previous two words joined by a space (any other length).  The
    result maps ``(context, row.source, row.target)`` to a word -> count
    mapping.  Each mapping is first obtained from ``get_markov`` and then
    incremented once per occurrence.
    """
    chains = {}
    for entry in messages:
        words = clean_msg(entry.msg)
        for pos in range(length, len(words)):
            if length == 1:
                context = words[pos - 1]
            else:
                context = "%s %s" % (words[pos - 2], words[pos - 1])
            node = (context, entry.source, entry.target)
            try:
                counts = chains[node]
            except KeyError:
                # First time this node is seen: start from whatever get_markov returns.
                counts = chains[node] = get_markov(cursor, length, node, initial_run)
            counts[words[pos]] += 1
    return chains
def build_rows(cursor, length, markov, initial_run):
    """Turn a markov mapping into rows to insert, updating existing rows in place.

    ``markov`` maps ``(key, source, target)`` to a word -> freq mapping.

    On a non-initial run, the rows already stored for each node are looked
    up.  A word that already has a row gets that row's ``freq`` overwritten.
    Every other word yields a new ``(source, target, key, word, freq)`` tuple
    and is counted through ``update_count``.

    On an initial run nothing is looked up.  Every word yields a new tuple,
    and the per-source / per-target totals are tallied locally.

    Returns ``(data, count_data)``.  ``data`` is the list of new tuples.
    ``count_data`` is a list of dicts with 'type', 'key', 'count' and
    'length', and is only populated on an initial run.
    """
    model = Babble if length == 1 else Babble2
    new_rows = []
    per_source = collections.defaultdict(int)
    per_target = collections.defaultdict(int)
    known_counts = cursor.query(Babble_count).filter(Babble_count.length == length).all()

    for (key, source, target), word_freqs in markov.items():
        if initial_run:
            stored = {}
        else:
            matches = cursor.query(model).filter(
                model.key == key,
                model.source == source,
                model.target == target,
            )
            stored = {found.word: found for found in matches}

        for word, freq in word_freqs.items():
            current = stored.get(word)
            if current:
                current.freq = freq
                continue
            if initial_run:
                per_source[source] += 1
                per_target[target] += 1
            else:
                update_count(cursor, known_counts, length, source, target)
            new_rows.append((source, target, key, word, freq))

    count_rows = [
        {'type': 'source', 'key': name, 'count': total, 'length': length}
        for name, total in per_source.items()
    ]
    count_rows += [
        {'type': 'target', 'key': name, 'count': total, 'length': length}
        for name, total in per_target.items()
    ]
    return new_rows, count_rows
def postgres_hack(cursor, length, data):
    """Insert ``data`` through the raw DB-API cursor in batches of 20000 rows.

    ``data`` is a list of ``(source, target, key, word, freq)`` tuples.  They
    go into the ``babble`` table when ``length == 1`` and into ``babble2``
    otherwise.  Each batch is rendered with ``mogrify`` into one
    newline-joined string of INSERT statements and sent in a single
    ``execute`` call.
    """
    batch_size = 20000
    table = "babble" if length == 1 else "babble2"
    insert_str = "INSERT INTO " + table + " (source,target,key,word,freq) VALUES(%s,%s,%s,%s,%s);"
    # Crazy magic to insert a ton of data really fast, drops runtime in half on large datasets.
    raw_cursor = cursor.connection().connection.cursor()
    # An empty data list produces no batches, so nothing is executed
    # (don't die on empty log table).
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        statements = [raw_cursor.mogrify(insert_str, entry).decode() for entry in batch]
        raw_cursor.execute('\n'.join(statements))
def delete_tables(cursor):
    """Drop the two key indexes and delete all rows from the babble tables.

    The DROP INDEX statement differs by dialect: the 'mysql' dialect gets
    the ``... ON <table>`` form, every other dialect gets
    ``DROP INDEX IF EXISTS``.  Afterwards every row of ``Babble``,
    ``Babble2`` and ``Babble_count`` is deleted.
    """
    if cursor.bind.dialect.name == 'mysql':
        drop_statements = (
            'DROP INDEX ix_babble_key ON babble',
            'DROP INDEX ix_babble2_key ON babble2',
        )
    else:
        drop_statements = (
            'DROP INDEX IF EXISTS ix_babble_key',
            'DROP INDEX IF EXISTS ix_babble2_key',
        )
    for statement in drop_statements:
        cursor.execute(statement)
    for model in (Babble, Babble2, Babble_count):
        cursor.execute(model.__table__.delete())
def build_markov(cursor, cmdchar, ctrlchan, speaker=None, initial_run=False, debug=False):
    """Build the markov tables from the log messages not yet processed.

    Steps, in order:

    1. On an initial run, delete the ``Babble_last`` rows.
    2. Load the ``Babble_last`` row, creating one with ``last=0`` if absent.
    3. Fetch the messages newer than ``last`` via ``get_messages``.
    4. Generate the length-1 and length-2 chains and turn them into rows.
    5. On an initial run, empty the existing tables via ``delete_tables``.
    6. Insert the new rows and the count rows.  ``postgres_hack`` is used on
       an initial run against postgresql, ``bulk_insert_mappings`` otherwise.
    7. Set ``last`` to the id of the newest message fetched, if any.
    8. On an initial run, create the ``key`` indexes on both tables.
    9. Commit.

    With ``debug`` set, the duration of each phase is printed.
    """
    columns = ('source', 'target', 'key', 'word', 'freq')

    def as_mappings(entries):
        # bulk_insert_mappings wants dicts, build_rows hands back 5-tuples.
        return [dict(zip(columns, entry)) for entry in entries]

    if initial_run:
        cursor.query(Babble_last).delete()
    marker = cursor.query(Babble_last).first()
    if not marker:
        marker = Babble_last(last=0)
        cursor.add(marker)

    phase_start = time.time()  # for debug
    messages = get_messages(cursor, cmdchar, ctrlchan, speaker, marker.last)
    # FIXME: count can be too low if speaker is not None
    newest_id = messages[-1].id if messages else None
    chain1 = generate_markov(cursor, 1, messages, initial_run)
    chain2 = generate_markov(cursor, 2, messages, initial_run)
    if debug:
        print('Generated markov in %f' % (time.time() - phase_start))
        phase_start = time.time()

    rows1, counts1 = build_rows(cursor, 1, chain1, initial_run)
    rows2, counts2 = build_rows(cursor, 2, chain2, initial_run)
    if debug:
        print('Rows built in %f' % (time.time() - phase_start))

    if initial_run:
        phase_start = time.time()  # for debug
        delete_tables(cursor)
        if debug:
            print('Tables deleted in %f' % (time.time() - phase_start))

    phase_start = time.time()  # for debug
    if initial_run and cursor.bind.dialect.name == 'postgresql':
        postgres_hack(cursor, 1, rows1)
        postgres_hack(cursor, 2, rows2)
    else:
        cursor.bulk_insert_mappings(Babble, as_mappings(rows1))
        cursor.bulk_insert_mappings(Babble2, as_mappings(rows2))
    for pending_counts in (counts1, counts2):
        cursor.bulk_insert_mappings(Babble_count, pending_counts)
    if debug:
        print('Inserted rows in %f' % (time.time() - phase_start))

    if newest_id is not None:
        marker.last = newest_id

    if initial_run:
        if debug:
            phase_start = time.time()
        indexes = (
            Index('ix_babble_key', Babble.key),
            Index('ix_babble2_key', Babble2.key),
        )
        for index in indexes:
            index.create(cursor.connection())
        if debug:
            print('Created index in %f' % (time.time() - phase_start))

    phase_start = time.time()  # for debug
    cursor.commit()
    if debug:
        print('Commited in %f' % (time.time() - phase_start))
def update_markov(cursor, config):
    """Run an incremental ``build_markov``, skipping it if the tables are locked.

    ``cmdchar`` and ``ctrlchan`` are read from ``config['core']``.  On the
    postgresql dialect the four babble tables are first locked with
    ``EXCLUSIVE MODE NOWAIT``.

    Returns True when the build ran.  Returns False when an
    ``OperationalError`` reports that a lock on a ``babble*`` relation could
    not be obtained.  Any other ``OperationalError`` is re-raised.
    """
    core = config['core']
    cmdchar = core['cmdchar']
    ctrlchan = core['ctrlchan']
    lock_statements = (
        'LOCK TABLE babble IN EXCLUSIVE MODE NOWAIT',
        'LOCK TABLE babble2 IN EXCLUSIVE MODE NOWAIT',
        'LOCK TABLE babble_count IN EXCLUSIVE MODE NOWAIT',
        'LOCK TABLE babble_last IN EXCLUSIVE MODE NOWAIT',
    )
    try:
        # FIXME: support locking for other dialects?
        if cursor.bind.dialect.name == 'postgresql':
            for statement in lock_statements:
                cursor.execute(statement)
        build_markov(cursor, cmdchar, ctrlchan)
    except OperationalError as ex:
        # If we can't lock the table, silently skip updating and wait for the next time we're called.
        if 'could not obtain lock on relation "babble' in str(ex):
            return False
        raise
    return True