调整激励算法

2021-12-07 10:33:18 +08:00 · 2021-12-07 10:33:18 +08:00 · c7f105d20d
parent cfa9da6b2c
commit c7f105d20d
2 changed files with 3 additions and 7 deletions
--- a/douzero/env/env.py
+++ b/douzero/env/env.py
@@ -283,17 +283,14 @@ class Env:
        self_bomb_num = self._env.pos_bomb_num[pos]
        if winner == 'landlord':
            if self.objective == 'adp':
-                return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] + self._env.multiply_count[pos]) /8
+                return (1.1 - self._env.step_count * 0.0033) * (1.3 ** bomb_num[0]) * (1.95 ** bomb_num[1]) / 8
                return (2.0 ** bomb_num[0]) * (3.0 ** bomb_num[1])
            elif self.objective == 'logadp':
                return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4
                return bomb_num[0] + bomb_num[1] + 1.0
            else:
                return 1.0 - self._env.step_count * 0.0033
                return 1.0
        else:
            if self.objective == 'adp':
-                return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] +self._env.multiply_count[pos]) /8
+                return (-1.1 + self._env.step_count * 0.0033) * (1.3 ** bomb_num[0]) * (1.95 ** bomb_num[1]) / 8
            elif self.objective == 'logadp':
                return (-1.0 + self._env.step_count * 0.0033) * 1.3**(self_bomb_num) * 2**self._env.multiply_count[pos] / 4
            else:
--- a/douzero/env/game.py
+++ b/douzero/env/game.py
@@ -139,8 +139,7 @@ class GameEnv(object):
    def step(self):
        action = self.players[self.acting_player_position].act(
            self.game_infoset)
-        assert action in self.game_infoset.legal_actions
+        self.step_count += 1
        if len(action) > 0:
            self.last_pid = self.acting_player_position