# Source code for cslbot.helpers.babble

# -*- coding: utf-8 -*-
# Copyright (C) 2013-2015 Samuel Damashek, Peter Foley, James Forcier, Srijay Kasturi, Reed Koser, Christopher Reffett, and Fox Wilson
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

import collections
import re
import string
import time

from sqlalchemy import Index, or_
from sqlalchemy.exc import OperationalError

from .orm import Babble, Babble2, Babble_count, Babble_last, Log


[docs]def get_messages(cursor, cmdchar, ctrlchan, speaker, newer_than_id): query = cursor.query(Log).filter(Log.id > newer_than_id) # Ignore commands, and messages addressed to the ctrlchan query = query.filter(or_(Log.type == 'pubmsg', Log.type == 'privmsg', Log.type == 'action'), ~Log.msg.startswith(cmdchar), Log.target != ctrlchan) if speaker is not None: location = 'target' if speaker.startswith(('#', '+', '@')) else 'source' query = query.filter(getattr(Log, location).ilike(speaker, escape='$')) return query.order_by(Log.id).all()
# Matches tokens we never want in the markov chain: URLs, or tokens made
# entirely of digits/punctuation.  Don't exclude (, because lenny.
_punct = string.punctuation.replace('(', '')
exclude_re = re.compile('https?://|^[0-9%s]+$' % _punct)
def clean_msg(msg):
    """Split *msg* on whitespace, dropping tokens matched by exclude_re."""
    kept = []
    for token in msg.split():
        if exclude_re.match(token) is None:
            kept.append(token)
    return kept
def get_markov(cursor, length, node, initial_run):
    """Return a word -> frequency defaultdict for *node*, seeded from the db.

    On an initial run the tables are about to be rebuilt from scratch, so an
    empty mapping is returned without touching the database.
    """
    freqs = collections.defaultdict(int)
    if initial_run:
        return freqs
    table = Babble if length == 1 else Babble2
    key, source, target = node
    rows = cursor.query(table).filter(table.key == key, table.source == source, table.target == target).all()
    for row in rows:
        freqs[row.word] = row.freq
    return freqs
def update_count(cursor, rows, length, source, target):
    """Bump the per-source and per-target counters for one observed word.

    *rows* is the preloaded list of Babble_count rows; if no row exists for
    a key yet, a fresh one with count 1 is added to the session.
    """
    for kind, key in (('source', source), ('target', target)):
        match = next((r for r in rows if r.type == kind and r.key == key), None)
        if match is not None:
            match.count = match.count + 1
        else:
            cursor.add(Babble_count(type=kind, length=length, key=key, count=1))
def generate_markov(cursor, length, messages, initial_run):
    """Build an order-*length* markov mapping from log *messages*.

    Returns a dict keyed by (prev-words, source, target) nodes whose values
    map each following word to its frequency.  Existing frequencies are
    loaded from the database via get_markov unless *initial_run* is set.
    """
    markov = {}
    for row in messages:
        msg = clean_msg(row.msg)
        for i in range(length, len(msg)):
            # Generalized: the previous `length` words, joined by spaces,
            # form the node key.  Identical to the old hard-coded behavior
            # for lengths 1 and 2 (the only orders used here), and correct
            # for any larger order.
            prev = ' '.join(msg[i - length:i])
            node = (prev, row.source, row.target)
            if node not in markov:
                markov[node] = get_markov(cursor, length, node, initial_run)
            markov[node][msg[i]] += 1
    return markov
def build_rows(cursor, length, markov, initial_run):
    """Turn a markov mapping into insertable row tuples plus count rows.

    Existing db rows get their freq updated in place; only genuinely new
    entries are returned in *data*.  On an initial run all counting is done
    locally and returned as *count_data* mappings instead of touching
    Babble_count rows.
    """
    table = Babble if length == 1 else Babble2
    data = []
    count_source = collections.defaultdict(int)
    count_target = collections.defaultdict(int)
    count_rows = cursor.query(Babble_count).filter(Babble_count.length == length).all()
    for node, word_freqs in markov.items():
        key, source, target = node
        existing = {}
        if not initial_run:
            # Index the current db rows for this node by word.
            for row in cursor.query(table).filter(table.key == key, table.source == source, table.target == target):
                existing[row.word] = row
        for word, freq in word_freqs.items():
            row = None if initial_run else existing.get(word)
            if row:
                # Already present: just refresh the frequency in place.
                row.freq = freq
                continue
            if initial_run:
                count_source[source] += 1
                count_target[target] += 1
            else:
                update_count(cursor, count_rows, length, source, target)
            data.append((source, target, key, word, freq))
    count_data = [{'type': 'source', 'key': src, 'count': cnt, 'length': length}
                  for src, cnt in count_source.items()]
    count_data.extend({'type': 'target', 'key': tgt, 'count': cnt, 'length': length}
                      for tgt, cnt in count_target.items())
    return data, count_data
def postgres_hack(cursor, length, data):
    """Bulk-insert *data* via raw psycopg2, in batches of 20000 rows.

    Crazy magic to insert a ton of data really fast; drops runtime in half
    on large datasets compared to the ORM bulk insert.
    """
    table = "babble" if length == 1 else "babble2"
    raw_cursor = cursor.connection().connection.cursor()
    insert_str = "INSERT INTO " + table + " (source,target,key,word,freq) VALUES(%s,%s,%s,%s,%s);"
    # Single chunked loop replaces the old duplicated loop-plus-tail logic;
    # range() naturally yields nothing for an empty log table.
    for start in range(0, len(data), 20000):
        args_str = '\n'.join(raw_cursor.mogrify(insert_str, x).decode() for x in data[start:start + 20000])
        if args_str:
            raw_cursor.execute(args_str)
def delete_tables(cursor):
    """Drop the babble key indexes and empty all markov tables."""
    # MySQL lacks DROP INDEX IF EXISTS and needs the owning table spelled out.
    if cursor.bind.dialect.name == 'mysql':
        cursor.execute('DROP INDEX ix_babble_key ON babble')
        cursor.execute('DROP INDEX ix_babble2_key ON babble2')
    else:
        cursor.execute('DROP INDEX IF EXISTS ix_babble_key')
        cursor.execute('DROP INDEX IF EXISTS ix_babble2_key')
    for table in (Babble, Babble2, Babble_count):
        cursor.execute(table.__table__.delete())
def build_markov(cursor, cmdchar, ctrlchan, speaker=None, initial_run=False, debug=False):
    """Builds a markov dictionary.

    Incrementally folds all log messages newer than the Babble_last marker
    into the babble/babble2 tables (order-1 and order-2 chains).  With
    initial_run set, the tables and indexes are rebuilt from scratch; with
    debug set, per-phase timings are printed.
    """
    if initial_run:
        cursor.query(Babble_last).delete()
    lastrow = cursor.query(Babble_last).first()
    if not lastrow:
        lastrow = Babble_last(last=0)
        cursor.add(lastrow)
    t = time.time()  # for debug
    messages = get_messages(cursor, cmdchar, ctrlchan, speaker, lastrow.last)
    # FIXME: count can be too low if speaker is not None
    curr = messages[-1].id if messages else None
    markov = generate_markov(cursor, 1, messages, initial_run)
    markov2 = generate_markov(cursor, 2, messages, initial_run)
    if debug:
        print('Generated markov in %f' % (time.time() - t))
        t = time.time()
    data, count_data = build_rows(cursor, 1, markov, initial_run)
    data2, count_data2 = build_rows(cursor, 2, markov2, initial_run)
    if debug:
        print('Rows built in %f' % (time.time() - t))
    if initial_run:
        t = time.time()  # for debug
        delete_tables(cursor)
        if debug:
            print('Tables deleted in %f' % (time.time() - t))
    t = time.time()  # for debug
    if initial_run and cursor.bind.dialect.name == 'postgresql':
        # Raw-psycopg2 fast path; only safe on a full rebuild.
        postgres_hack(cursor, 1, data)
        postgres_hack(cursor, 2, data2)
    else:
        data = [{'source': x[0], 'target': x[1], 'key': x[2], 'word': x[3], 'freq': x[4]} for x in data]
        cursor.bulk_insert_mappings(Babble, data)
        data2 = [{'source': x[0], 'target': x[1], 'key': x[2], 'word': x[3], 'freq': x[4]} for x in data2]
        cursor.bulk_insert_mappings(Babble2, data2)
    cursor.bulk_insert_mappings(Babble_count, count_data)
    cursor.bulk_insert_mappings(Babble_count, count_data2)
    if debug:
        print('Inserted rows in %f' % (time.time() - t))
    if curr is not None:
        lastrow.last = curr
    if initial_run:
        if debug:
            t = time.time()
        # Recreate the key indexes dropped by delete_tables.
        key_index = Index('ix_babble_key', Babble.key)
        key_index2 = Index('ix_babble2_key', Babble2.key)
        key_index.create(cursor.connection())
        key_index2.create(cursor.connection())
        if debug:
            print('Created index in %f' % (time.time() - t))
    t = time.time()  # for debug
    cursor.commit()
    if debug:
        # Fixed typo in the debug message ('Commited' -> 'Committed').
        print('Committed in %f' % (time.time() - t))
def update_markov(cursor, config):
    """Run an incremental markov rebuild, skipping if the tables are locked.

    Returns True when the update ran, False when another worker held the
    lock on the babble tables.
    """
    cmdchar = config['core']['cmdchar']
    ctrlchan = config['core']['ctrlchan']
    try:
        # FIXME: support locking for other dialects?
        if cursor.bind.dialect.name == 'postgresql':
            cursor.execute('LOCK TABLE babble IN EXCLUSIVE MODE NOWAIT')
            cursor.execute('LOCK TABLE babble2 IN EXCLUSIVE MODE NOWAIT')
            cursor.execute('LOCK TABLE babble_count IN EXCLUSIVE MODE NOWAIT')
            cursor.execute('LOCK TABLE babble_last IN EXCLUSIVE MODE NOWAIT')
        build_markov(cursor, cmdchar, ctrlchan)
        return True
    except OperationalError as ex:
        # If we can't lock the table, silently skip updating and wait for the next time we're called.
        if 'could not obtain lock on relation "babble' not in str(ex):
            raise
        return False