Previously, in "Can you learn addition games with reinforcement learning?", I was able to train an agent without any problems.
This time, let's assume a more realistic problem.
The problem is this: for an email campaign you could simply send to everyone, but sending too much leads to user churn, and coupons are costly, so I don't want to overdo it.
The subject of this issue is what would happen if this problem were solved in a Q-Learning framework. With Q-Learning, it's good to be able to handle multiple actions. However, think of it as a simple, completely virtual situation.
To put it simply: taking the right action yields a reward of +1 four turns later.
In the case of this game, I was worried that it would be difficult to say, "There is an interval between taking the best action and getting the reward." It seems that learning will be quick if you get a reward immediately after the optimal action, but the biggest point is that it is not. I think it's difficult to define a "penalty", but now I'm just giving it to an action that seems to be wrong. (Is this too easy for the problem ...?).
However, as you can see in Searching for a maze with reinforcement learning, you can learn retroactively, so I'm wondering if you can do it. I just wanted to verify it.
You will only get rewards when U = 2 occurs, so increase chance_count when U = 2 occurs. The maximum reward you can get over a period of time is chance_count.
Therefore, let hit_rate be the obtained reward / chance_count.
The figure below shows how it changed after 10,000 learning / evaluations.

I tried about 50 million times, but after about 30 million times, I felt that learning had peaked, and it was about hit_rate = 0.9.
Q Learning is fun because it looks like a simple AI in a sense. I hope it can be used for something.
I will post it for reference.
#!/usr/bin/env python
# coding: utf-8
import numpy as np
from random import random, choice
class Game(object):
    """Abstract base for a turn-based game driven by a player object.

    Subclasses must implement init_state() (set self.state, self.actions)
    and get_next_state_and_reward(state, action).
    """
    state = None        # current game state (subclass-defined structure)
    actions = None      # list of legal actions
    game_over = False   # play() loops until this becomes True
    def __init__(self, player):
        self.player = player
        self.turn = 0
        self.last_reward = 0
        self.total_reward = 0
        self.init_state()
    def player_action(self):
        # Ask the player for an action, validate it, then advance the state.
        action = self.player.action(self.state, self.last_reward)
        if action not in self.actions:
            raise Exception("Invalid Action: '%s'" % action)
        self.state, self.last_reward = self.get_next_state_and_reward(self.state, action)
    def play(self):
        """Generator driving the game loop; yields self once per turn.

        Note: with game_over never set, this generator is infinite by design.
        """
        yield(self)
        while not self.game_over:
            self.player_action()
            self.turn += 1
            self.total_reward += self.last_reward
            yield(self)
    def init_state(self):
        # Fixed: was `raise NotImplemented()` — NotImplemented is a sentinel
        # value, not an exception; calling it raised a misleading TypeError.
        raise NotImplementedError()
    def get_next_state_and_reward(self, state, action):
        raise NotImplementedError()
class UserAndPushEventGame(Game):
    """
    Push-notification timing game.

    State           :S : list of (U, A) pairs (most recent last)
    UserActivity    :U : int 0~3 (0=idle, 1=minor, 2=chance, 3=converted)
    Action          :A : int 0 (do nothing) or 1 (push)
    Next-State(S, A):S':
        S[-1][1] = A
        S.append((Next-U, None))
        S = S[-5:]
    Next-U          :  :
        if S[-4] == (2, 1) then 3      # pushing on a "2" pays off 4 turns later
        else 10% -> 2, 10% -> 1, 80% -> 0
    Reward(S, A)    :R :
        if S[-1] == (3, *) then R += 1
        wrong_action_count := number of ({0,1,3}, 1) pairs in S
        R -= wrong_action_count * 0.3  # penalize pushes at the wrong moment
    """
    STATE_HISTORY_SIZE = 5
    def init_state(self):
        self.actions = [0, 1]
        self.state = [(0, None)]
        self.chance_count = 0   # number of reward opportunities (U == 2) seen
    def get_next_state_and_reward(self, state, action):
        # Append the next user activity (action slot still None), keep a
        # bounded history, then record the chosen action on the previous slot.
        next_state = (state + [(self.next_user_action(state), None)])[-self.STATE_HISTORY_SIZE:]
        next_state[-2] = (next_state[-2][0], action)
        reward = 0
        if len(state) > 0 and state[-1][0] == 3:
            reward += 1
        # Fixed: bare `reduce` is a removed builtin in Python 3; sum() is
        # equivalent here (A is 1, 0, or None) and clearer.
        action_count = sum(x[1] or 0 for x in state)
        correct_action_count = sum(1 for x in state if x == (2, 1))
        wrong_action_count = action_count - correct_action_count
        reward -= wrong_action_count * 0.3
        return next_state, reward
    def next_user_action(self, state):
        # A push on a "2" four slots ago converts the user (U becomes 3).
        if len(state) > 4 and state[-4] == (2, 1):
            return 3
        else:
            rnd = np.random.random()
            if rnd < 0.8:
                return 0
            elif rnd < 0.9:
                return 1
            else:
                self.chance_count += 1
                return 2
class HumanPlayer(object):
    training = False
    def action(self, state, last_reward):
        print "LastReward=%s, CurrentState: %s" % (last_reward, state)
        while True:
            action_input = raw_input("Enter 0~1: ")
            if int(action_input) in [0, 1]:
                return int(action_input)
class QLearnPlayer(object):
    """Tabular Q-learning agent choosing between actions 0 and 1.

    Uses an epsilon-greedy policy while training and a purely greedy
    policy during evaluation (training == False).
    """
    ALPHA = 0.1       # learning rate
    GAMMA = 0.99      # discount factor
    E_GREEDY = 0.05   # exploration probability while training
    def __init__(self):
        self.actions = [0, 1]
        self.q_table = {}                            # state -> {action: Q}
        self.last_state = self.last_action = None    # previous (s, a) for the TD update
        self.training = True
    def get_q_value(self, state, action):
        # Unseen (state, action) pairs get a tiny random value to break ties.
        row = self.q_table.get(state, {})
        return row.get(action, (np.random.random() - 0.5) / 1000)
    def get_all_q_values(self, state):
        values = []
        for act in self.actions:
            values.append(self.get_q_value(state, act))
        return values
    def set_q_value(self, state, action, val):
        self.q_table.setdefault(state, {})[action] = val
    def action(self, state, last_reward):
        # Called once per turn: pick the next action, then fold the reward
        # observed for the previous (state, action) back into the table.
        state = tuple(state)
        chosen = self.select_action(state)
        if self.last_state is not None:
            self.update_q_table(self.last_state, self.last_action, state, last_reward)
        self.last_state, self.last_action = state, chosen
        return chosen
    def select_action(self, state):
        # Epsilon-greedy while training; pure greedy otherwise.
        if self.training and random() < self.E_GREEDY:
            return choice(self.actions)
        return np.argmax(self.get_all_q_values(state))
    def update_q_table(self, last_state, last_action, cur_state, last_reward):
        # Standard one-step TD update; no-op during evaluation.
        if not self.training:
            return
        td_error = (last_reward
                    + self.GAMMA * np.max(self.get_all_q_values(cur_state))
                    - self.get_q_value(last_state, last_action))
        self.set_q_value(last_state, last_action,
                         self.get_q_value(last_state, last_action) + self.ALPHA * td_error)
if __name__ == '__main__':
    # Alternate between SWITCH_MODE_TURN_NUM turns of training and the same
    # number of evaluation turns; evaluation results go to result.txt.
    # The game loop is infinite by design (run until killed), so the files
    # are flushed periodically rather than closed.
    SWITCH_MODE_TURN_NUM = 10000
    fp = open("result.txt", "w")   # fixed: open() instead of the Python-2-only file()
    dt = open("detail.txt", "w")   # per-turn trace (writes currently disabled below)
    player = QLearnPlayer()
    # player = HumanPlayer()
    game = UserAndPushEventGame(player)
    last_chance_count = last_score = 0
    for g in game.play():
        # dt.write("%s: isT?=%s LastReward=%s TotalReward=%s S=%s\n" %
        #          (g.turn, player.training, g.last_reward, g.total_reward, g.state))
        if g.turn % SWITCH_MODE_TURN_NUM == 0:
            if not player.training:
                # Just finished an evaluation phase: score it.
                this_term_score = game.total_reward - last_score
                this_term_chance = game.chance_count - last_chance_count
                if this_term_chance > 0:
                    hit_rate = 100.0*this_term_score/this_term_chance
                else:
                    hit_rate = 0
                # print "Turn=%d: This 100 turn score=%2.2f chance=%02d: HitRate=%.1f%% %s" % \
                #       (g.turn, this_term_score, this_term_chance, hit_rate, '*' * int(hit_rate/2))
                fp.write("%d\t%.2f\t%d\t%f\n" % (g.turn, this_term_score, this_term_chance, hit_rate))
            last_score = game.total_reward
            last_chance_count = game.chance_count
            player.training = not player.training   # flip train <-> eval
        if g.turn % 10000 == 0:
            fp.flush()
    # (removed stray "Recommended Posts" text left over from web extraction —
    # it was a syntax error inside the loop body)