Source code for pynfg.rlsolutions.mcrl

# -*- coding: utf-8 -*-
"""
Implements Monte Carlo Reinforcement Learning for iterSemiNFG objects

Created on Mon Feb 18 09:03:32 2013

Copyright (C) 2013 James Bono (jwbono@gmail.com)

GNU Affero General Public License

"""
from __future__ import division
import time
import copy
import numpy as np
import matplotlib.pylab as plt

def ewma_mcrl(Game, bn, J, N, alpha, delta, eps, uni=False, pureout=False):
    """Use EWMA MC RL to approximate the optimal CPT at bn given Game

    :arg Game: The iterated semi-NFG on which to perform the RL
    :type Game: iterSemiNFG
    :arg bn: the basename of the node with the CPT to be trained
    :type bn: str
    :arg J: The number of runs per training episode. If a schedule is desired,
       enter a list or np.array with size equal to N.
    :type J: int, list, or np.array
    :arg alpha: The exponential weight for the moving average. If a schedule
       is desired, enter a list or np.array with size equal to N
    :type alpha: int, list or np.array
    :arg N: The number of training episodes
    :type N: int
    :arg delta: The discount factor
    :type delta: float
    :arg eps: The maximum step-size for policy improvements
    :type eps: float
    :arg uni: if True, training is initialized with a uniform policy. Default
       is False to allow "seeding" with different policies, e.g. level k-1
    :type uni: bool
    :arg pureout: if True, the policy is turned into a pure policy at the end
       of training by assigning argmax actions prob 1. Default is False
    :type pureout: bool

    Example::

        import copy
        import numpy as np
        from pynfg.rlsolutions.mcrl import ewma_mcrl
        GG = copy.deepcopy(G)
        G1, fig = ewma_mcrl(GG, 'D1', J=np.floor(np.linspace(300, 100, num=50)),
                            N=50, alpha=1, delta=0.8, eps=0.4, pureout=True)

    """
    G = copy.deepcopy(Game)
    timepassed = np.zeros(N)
    # initializing training schedules from scalar inputs
    if isinstance(J, int):
        J = J*np.ones(N)
    if isinstance(alpha, (int, long, float)):
        alpha = alpha*np.ones(N)
    if isinstance(eps, (int, long, float)):
        eps = eps*np.ones(N)
    # getting shorter/more descriptive variable names to work with
    T0 = G.starttime
    T = G.endtime+1
    player = G.bn_part[bn][0].player
    shape = G.bn_part[bn][0].CPT.shape
    shape_last = shape[-1]
    if uni:
        G.bn_part[bn][0].uniformCPT() #starting with a uniform CPT
    for dn in G.bn_part[bn]: #pointing all CPTs to T0, i.e. single policy
        dn.CPT = G.bn_part[bn][0].CPT
    visit = set() #set of the messages and mapairs visited throughout training
    averew = 0
    # for x in xrange(30):
    #     rew = 0
    #     G.sample()
    #     for t in xrange(T0,T):
    #         rew += G.reward(player, t)/(T-T0-1)
    #     averew += rew/(30)
    R = averew #average reward
    A = 0 #normalizing constant for average reward
    B = {} #dict associating messages and mapairs with beta exponents
    D = {} #dict associating messages and mapairs with norm constants for Q,V
    Q = np.zeros(shape) #Q table
    V = np.zeros(shape[:-1]) #value table
    Rseries = np.zeros(N) #tracking average reward for plotting convergence
    for n in xrange(N):
        print n
        indicaten = np.zeros(Q.shape) #indicates visited mapairs
        visitn = set() #set of messages and mapairs visited in episode n
        Rseries[n] = R #adding the most recent ave reward to the data series
        A *= alpha[n] #rescaling A at start of new episode, see writeup
        for j in xrange(int(J[n])):
            visitj = set() #visitj must be cleared at the start of every run
            for t in xrange(T0, T):
                G.sample_timesteps(t, t) #sampling the timestep
                rew = G.reward(player, t) #getting the reward
                mapair = G.bn_part[bn][t-T0].get_CPTindex()
                A += 1
                r = R
                R = (1/A)*((A-1)*r+rew) #updating the average reward
                xm = set() #used below to keep track of updated messages
                for values in visitj:
                    b = B[values] #past values
                    d = D[values]
                    q = Q[values]
                    bb = (b+1) #update equations; double letters are time t
                    dd = d+1
                    qq = (1/dd)*(d*q+(delta**(bb-1))*rew)
                    B[values] = bb #update dictionaries
                    D[values] = dd
                    Q[values] = qq
                    message = values[:-1] #V indexed by message only
                    if message not in xm: #updating message only once
                        b = B[message] #past values
                        d = D[message]
                        v = V[message]
                        bb = (b+1) #update equations; double letters are time t
                        dd = d+1
                        vv = (1/dd)*(d*v+(delta**(bb-1))*rew)
                        B[message] = bb #update dictionaries
                        D[message] = dd
                        V[message] = vv
                        xm.add(message) #so that message isn't updated again
                if mapair not in visitj: #first time in j visiting mapair
                    message = mapair[:-1]
                    messtrue = (message not in xm) #for checking message visited
                    B[mapair] = 1 #whenever mapair not in visitj
                    if mapair not in visitn and mapair not in visit:
                        D[mapair] = 1
                        Q[mapair] = rew
                        if messtrue:
                            D[message] = 1
                            V[message] = rew
                    elif mapair not in visitn:
                        D[mapair] = alpha[n]*D[mapair]+1
                        Q[mapair] = (1/D[mapair])*((D[mapair]-1)*Q[mapair]+rew)
                        if messtrue:
                            D[message] = alpha[n]*D[message]+1
                            V[message] = (1/D[message])*((D[message]-1)*V[message]+rew)
                    else:
                        D[mapair] += 1
                        Q[mapair] = (1/D[mapair])*((D[mapair]-1)*Q[mapair]+rew)
                        if messtrue:
                            D[message] += 1
                            V[message] = (1/D[message])*((D[message]-1)*V[message]+rew)
                    if messtrue:
                        B[message] = 1
                    visit.add(mapair) #mapair added to visit sets the first time
                    visitn.add(mapair)
                    visitj.add(mapair)
                    indicaten[mapair] = 1 #only visited actions are updated
        # update CPT with shift towards Qtable argmax actions.
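        # The shift for each visited mapair is its advantage, Q - V: rows move
        # toward actions whose Q exceeds the message's value and away from
        # those below it. Dividing by the largest absolute shift per message
        # and scaling by eps[n] means each visited probability changes by at
        # most eps[n] of its current mass before the row is renormalized.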
        shift = Q - V[..., np.newaxis]
        idx = np.nonzero(shift) #indices of nonzero shifts (avoid divide by 0)
        # normalizing shifts to be a % of message's biggest shift
        shiftnorm = np.absolute(shift).max(axis=-1)[..., np.newaxis]
        # for each mapair shift only eps% of the percent shift
        updater = eps[n]*indicaten*G.bn_part[bn][0].CPT/shiftnorm
        # increment the CPT
        G.bn_part[bn][0].CPT[idx] += updater[idx]*shift[idx]
        # normalize after the shift
        CPTsum = G.bn_part[bn][0].CPT.sum(axis=-1)
        G.bn_part[bn][0].CPT /= CPTsum[..., np.newaxis]
    if pureout: #if True, output is a pure policy
        G.bn_part[bn][0].makeCPTpure()
    for tau in xrange(1, T-T0): #before exit, make CPTs independent in memory
        G.bn_part[bn][tau].CPT = copy.copy(G.bn_part[bn][0].CPT)
    plt.plot(Rseries) #plotting Rseries to gauge convergence
    fig = plt.gcf()
    plt.show()
    return G, fig
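A minimal usage sketch, assuming an existing iterSemiNFG named G whose decision node to be trained has basename 'D1' (both names are illustrative, mirroring the docstring example, and are not defined in this module):

import copy
import numpy as np
from pynfg.rlsolutions.mcrl import ewma_mcrl

GG = copy.deepcopy(G)  # train a copy so the original game's policy is kept
# decreasing run schedule: more sample runs per episode early in training
Jsched = np.floor(np.linspace(300, 100, num=50))
G1, fig = ewma_mcrl(GG, 'D1', J=Jsched, N=50, alpha=1, delta=0.8, eps=0.4,
                    uni=True, pureout=True)
# the trained (pure) policy is the CPT of the first 'D1' node in G1
trained_policy = G1.bn_part['D1'][0].CPT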
