调整激励算法

This commit is contained in:
zhiyang7 2021-12-07 10:33:18 +08:00
parent cfa9da6b2c
commit c7f105d20d
2 changed files with 3 additions and 7 deletions

7
douzero/env/env.py vendored
View File

@@ -283,17 +283,14 @@ class Env:
self_bomb_num = self._env.pos_bomb_num[pos] self_bomb_num = self._env.pos_bomb_num[pos]
if winner == 'landlord': if winner == 'landlord':
if self.objective == 'adp': if self.objective == 'adp':
return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] + self._env.multiply_count[pos]) /8 return (1.1 - self._env.step_count * 0.0033) * (1.3 ** bomb_num[0]) * (1.95 ** bomb_num[1]) / 8
return (2.0 ** bomb_num[0]) * (3.0 ** bomb_num[1])
elif self.objective == 'logadp': elif self.objective == 'logadp':
return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4 return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4
return bomb_num[0] + bomb_num[1] + 1.0
else: else:
return 1.0 - self._env.step_count * 0.0033 return 1.0 - self._env.step_count * 0.0033
return 1.0
else: else:
if self.objective == 'adp': if self.objective == 'adp':
return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] +self._env.multiply_count[pos]) /8 return (-1.1 + self._env.step_count * 0.0033) * (1.3 ** bomb_num[0]) * (1.95 ** bomb_num[1]) / 8
elif self.objective == 'logadp': elif self.objective == 'logadp':
return (-1.0 + self._env.step_count * 0.0033) * 1.3**(self_bomb_num) * 2**self._env.multiply_count[pos] / 4 return (-1.0 + self._env.step_count * 0.0033) * 1.3**(self_bomb_num) * 2**self._env.multiply_count[pos] / 4
else: else:

3
douzero/env/game.py vendored
View File

@@ -139,8 +139,7 @@ class GameEnv(object):
def step(self): def step(self):
action = self.players[self.acting_player_position].act( action = self.players[self.acting_player_position].act(
self.game_infoset) self.game_infoset)
assert action in self.game_infoset.legal_actions self.step_count += 1
if len(action) > 0: if len(action) > 0:
self.last_pid = self.acting_player_position self.last_pid = self.acting_player_position