Source code for flask_mab.bandits

from random import random, choice, uniform, betavariate
from math import log, exp, expm1

class Bandit(object):
    """The primary bandit interface.

    Don't use this directly unless you really want uniform random arm
    selection (which defeats the whole purpose, really). Used as a
    control to test against and as an interface to define methods
    against.
    """

    @classmethod
    def fromdict(cls, dict_spec):
        # "confidence" must be excluded here too, or it would be passed
        # to the constructor as a keyword argument
        state_keys = ["arms", "pulls", "reward", "values",
                      "confidence", "bandit_type"]
        extra_args = dict((key, value) for key, value in dict_spec.items()
                          if key not in state_keys)
        bandit = globals()[dict_spec["bandit_type"]](**extra_args)
        bandit.arms = dict_spec["arms"]
        bandit.pulls = dict_spec["pulls"]
        bandit.reward = dict_spec["reward"]
        bandit.values = dict_spec["values"]
        bandit.confidence = dict_spec.get("confidence", [0.0] * len(bandit.arms))
        return bandit
    def __init__(self):
        self.arms = []
        self.pulls = []
        self.reward = []
        self.values = []
        self.confidence = []
    def add_arm(self, arm_id, value=None):
        self.arms.append(arm_id)
        self.pulls.append(0)
        self.reward.append(0.0)
        self.confidence.append(0.0)
        self.values.append(value)
    def pull_arm(self, arm_id):
        # list.index raises ValueError for an unknown arm
        ind = self.arms.index(arm_id)
        self.pulls[ind] += 1
    def reward_arm(self, arm_id, reward):
        ind = self.arms.index(arm_id)  # raises ValueError for an unknown arm
        self.reward[ind] += reward
        self._update(ind, reward)
    def _update(self, arm_index, reward):
        # Incremental mean: confidence holds the running average reward
        # per pull for this arm
        n = self.pulls[arm_index]
        current = self.confidence[arm_index]
        self.confidence[arm_index] = ((n - 1) / float(n)) * current + (1 / float(n)) * reward
    def suggest_arm(self):
        """Uniform random for the default bandit.

        Just uses random.choice to pick between arms.
        """
        return self[choice(self.arms)]
    def __getitem__(self, key):
        try:
            ind = self.arms.index(key)
        except ValueError:
            raise KeyError("Arm is not found in this bandit")
        return {
            "id": self.arms[ind],
            "pulls": self.pulls[ind],
            "reward": self.reward[ind],
            "value": self.values[ind],
        }

    def __str__(self):
        output = '%s ' % self.__class__.__name__
        output += '; '.join(['%s:%s' % (key, val)
                             for key, val in self.__dict__.items()])
        return output
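A minimal usage sketch for the base interface (the arm ids and values are
illustrative, not part of the library):

    bandit = Bandit()
    bandit.add_arm("green_button", value="#00FF00")
    bandit.add_arm("red_button", value="#FF0000")
    arm = bandit.suggest_arm()          # uniform random choice
    bandit.pull_arm(arm["id"])          # record the impression
    bandit.reward_arm(arm["id"], 1.0)   # record a conversion

Serialization round-trips through fromdict; the dict keys mirror the
attributes copied above, and "bandit_type" names any Bandit subclass in
this module, e.g. the EpsilonGreedyBandit defined below:

    spec = {"bandit_type": "EpsilonGreedyBandit", "epsilon": 0.25,
            "arms": ["a", "b"], "pulls": [10, 12],
            "reward": [3.0, 5.0], "values": [None, None],
            "confidence": [0.3, 0.42]}
    restored = Bandit.fromdict(spec)
    assert restored.epsilon == 0.25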
class EpsilonGreedyBandit(Bandit):
    """Epsilon-greedy bandit implementation.

    Aggressively favors the present winner: assigns the winning arm
    1.0 - epsilon of the time and picks uniformly at random between
    all arms epsilon of the time. Lower values of epsilon "exploit"
    the present winner more often; higher values "explore" the other
    candidates more often.

    :param epsilon: The fraction of the time to "explore" other arms,
                    e.g. a value of 0.1 performs random assignment for
                    10% of traffic
    :type epsilon: float
    """

    def __init__(self, epsilon=0.1):
        super(EpsilonGreedyBandit, self).__init__()
        self.epsilon = epsilon
    def suggest_arm(self):
        """Get an arm according to the epsilon-greedy strategy."""
        if random() > self.epsilon:
            key = self._ind_max()
        else:
            key = choice(self.arms)
        return self[key]
    def _ind_max(self):
        # The arm with the highest average observed reward
        return self.arms[self.confidence.index(max(self.confidence))]

    def __str__(self):
        return Bandit.__str__(self)

    def __repr__(self):
        return Bandit.__str__(self)
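A rough sketch of the exploit/explore split (the hundred rewarded pulls are
fabricated here purely to give one arm a clear lead):

    bandit = EpsilonGreedyBandit(epsilon=0.1)
    bandit.add_arm("a")
    bandit.add_arm("b")
    for _ in range(100):
        bandit.pull_arm("a")
        bandit.reward_arm("a", 1.0)
    # "a" now has the highest average reward, so about 90% of
    # suggestions exploit it; the remaining ~10% are uniform random
    arm = bandit.suggest_arm()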
def all_same(items):
    return all(x == items[0] for x in items)
class NaiveStochasticBandit(Bandit):
    """A naive weighted-random bandit.

    Favors the winner by giving it greater weight in random selection.
    The winner will eventually flatten out the losers if the margin is
    great enough.
    """

    def __init__(self):
        super(NaiveStochasticBandit, self).__init__()

    def _compute_weights(self):
        # Weight each arm by its empirical conversion rate; arms that
        # have never been pulled fall back to a uniform weight
        weights = []
        for ind, n in enumerate(self.pulls):
            reward = self.reward[ind]
            try:
                weights.append(float(reward) / float(n))
            except ZeroDivisionError:
                weights.append(1.0 / len(self.arms))
        return weights
    def suggest_arm(self):
        """Get an arm according to the naive stochastic strategy."""
        weights = self._compute_weights()
        random_determination = uniform(0.0, 1.0)
        cum_weight = 0.0
        for ind, weight in enumerate(weights):
            cum_weight += weight
            if cum_weight > random_determination:
                return self[self.arms[ind]]
        # The weights are not guaranteed to sum to 1.0, so any leftover
        # probability mass falls through to the first arm
        return self[self.arms[0]]
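A sketch of the weight computation (the pull and reward counts are set by
hand here purely for illustration):

    bandit = NaiveStochasticBandit()
    bandit.add_arm("a")
    bandit.add_arm("b")
    bandit.pulls = [100, 100]
    bandit.reward = [10.0, 20.0]
    print(bandit._compute_weights())   # [0.1, 0.2]

Note the weights are raw conversion rates rather than a normalized
distribution, which is why suggest_arm needs its first-arm fallback.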
class SoftmaxBandit(NaiveStochasticBandit):
    def __init__(self, tau=1.0):
        super(SoftmaxBandit, self).__init__()
        self.tau = tau

    def _compute_weights(self):
        # Boltzmann (softmax) distribution over the average rewards;
        # tau controls how sharply the leader is favored
        total = sum(exp(x / self.tau) for x in self.confidence)
        return [exp(x / self.tau) / total for x in self.confidence]
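A worked sketch of tau's effect (average rewards assigned by hand): with
confidence values of 0.1 and 0.2, a high temperature keeps the weights
nearly uniform while a low one is almost deterministic:

    bandit = SoftmaxBandit(tau=1.0)
    bandit.add_arm("a")
    bandit.add_arm("b")
    bandit.confidence = [0.1, 0.2]
    print(bandit._compute_weights())   # ~[0.475, 0.525]
    bandit.tau = 0.01
    print(bandit._compute_weights())   # ~[0.0000454, 0.9999546]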
class AnnealingSoftmaxBandit(SoftmaxBandit):
    def __init__(self, tau=0):
        super(AnnealingSoftmaxBandit, self).__init__()
        self.tau = tau

    def _compute_weights(self):
        # Anneal tau toward zero as pulls accumulate, so the bandit
        # explores early on and exploits more as evidence builds up
        t = sum(self.pulls) + 1
        self.tau = 1 / log(t + 0.0000001)
        return SoftmaxBandit._compute_weights(self)
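A sketch of the annealing schedule (pull counts assigned by hand just to
show how the temperature decays):

    bandit = AnnealingSoftmaxBandit()
    bandit.add_arm("a")
    bandit.add_arm("b")
    for total in (1, 10, 100):
        bandit.pulls = [total, 0]
        bandit._compute_weights()
        print(bandit.tau)   # ~1.44, then ~0.42, then ~0.22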
class ThompsonBandit(NaiveStochasticBandit):
    def __init__(self, prior=(1.0, 1.0)):
        super(ThompsonBandit, self).__init__()
        self.prior = prior

    def _compute_weights(self):
        # Draw one sample per arm from its Beta posterior over the
        # conversion rate
        sampled_theta = []
        for ind in range(len(self.arms)):
            dist = betavariate(self.prior[0] + self.reward[ind],
                               self.prior[1] + self.pulls[ind] - self.reward[ind])
            sampled_theta.append(dist)
        return sampled_theta
    def suggest_arm(self):
        # Pick the arm whose posterior sample came out largest
        weights = self._compute_weights()
        return self[self.arms[weights.index(max(weights))]]
    def reward_arm(self, arm_id, reward):
        # The Beta posterior assumes Bernoulli (0/1) rewards, so any
        # reward is recorded as a single success
        if reward != 1.0:
            reward = 1.0
        super(ThompsonBandit, self).reward_arm(arm_id, reward)
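A minimal Thompson sampling sketch (binary rewards; Beta(1, 1) is the
class's default prior):

    bandit = ThompsonBandit()
    bandit.add_arm("a")
    bandit.add_arm("b")
    bandit.pull_arm("a")
    bandit.reward_arm("a", 1.0)   # posterior for "a" is now Beta(2, 1)
    arm = bandit.suggest_arm()    # arm with the largest posterior draw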