diff --git a/douzero/dmc/dmc.py b/douzero/dmc/dmc.py index 55c4a54..175a1e4 100644 --- a/douzero/dmc/dmc.py +++ b/douzero/dmc/dmc.py @@ -16,7 +16,7 @@ from .file_writer import FileWriter from .models import Model, OldModel from .utils import get_batch, log, create_env, create_optimizers, act -mean_episode_return_buf = {p:deque(maxlen=100) for p in ['landlord', 'landlord_up', 'landlord_down', 'bidding']} +mean_episode_return_buf = {p:deque(maxlen=100) for p in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']} def compute_loss(logits, targets): loss = ((logits.squeeze(-1) - targets)**2).mean() @@ -27,7 +27,7 @@ def compute_loss_for_bid(outputs, reward): def learn(position, actor_models, model, batch, optimizer, flags, lock): """Performs a learning (optimization) step.""" - position_index = {"landlord": 31, "landlord_up": 32, "landlord_down": 33} + position_index = {"landlord": 31, "landlord_up": 32, 'landlord_front': 33, "landlord_down": 34} print("Learn", position) if flags.training_device != "cpu": device = torch.device('cuda:'+str(flags.training_device)) @@ -46,7 +46,8 @@ def learn(position, actor_models, model, batch, optimizer, flags, lock): with lock: learner_outputs = model(obs_z, obs_x, return_value=True) if position == "bidding": - pass + loss = compute_loss(learner_outputs['values'], target) + # pass else: loss = compute_loss(learner_outputs['values'], target) stats = { @@ -101,7 +102,7 @@ def train(flags): # Initialize queues actor_processes = [] ctx = mp.get_context('spawn') - batch_queues = {"landlord": ctx.SimpleQueue(), "landlord_up": ctx.SimpleQueue(), "landlord_down": ctx.SimpleQueue(), "bidding": ctx.SimpleQueue()} + batch_queues = {"landlord": ctx.SimpleQueue(), "landlord_up": ctx.SimpleQueue(), 'landlord_front': ctx.SimpleQueue(), "landlord_down": ctx.SimpleQueue(), "bidding": ctx.SimpleQueue()} # Learner model for training learner_model = Model(device=flags.training_device) @@ -115,20 +116,22 @@ def train(flags): 'loss_landlord', 'mean_episode_return_landlord_up', 'loss_landlord_up', + 'mean_episode_return_landlord_front', + 'loss_landlord_front', 'mean_episode_return_landlord_down', 'loss_landlord_down', 'mean_episode_return_bidding', 'loss_bidding', ] frames, stats = 0, {k: 0 for k in stat_keys} - position_frames = {'landlord':0, 'landlord_up':0, 'landlord_down':0, 'bidding': 0} + position_frames = {'landlord':0, 'landlord_up':0, 'landlord_front':0, 'landlord_down':0, 'bidding': 0} # Load models if any if flags.load_model and os.path.exists(checkpointpath): checkpoint_states = torch.load( checkpointpath, map_location=("cuda:"+str(flags.training_device) if flags.training_device != "cpu" else "cpu") ) - for k in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down'] + for k in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down'] learner_model.get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k]) optimizers[k].load_state_dict(checkpoint_states["optimizer_state_dict"][k]) for device in device_iterator: @@ -176,12 +179,12 @@ def train(flags): threads = [] locks = {} for device in device_iterator: - locks[device] = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()} - position_locks = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()} + locks[device] = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_front': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()} + position_locks = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_front': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()} for device in device_iterator: for i in range(flags.num_threads): - for position in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: + for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: thread = threading.Thread( target=batch_and_learn, name='batch-and-learn-%d' % i, args=(i,device,position,locks[device][position],position_locks[position])) thread.start() @@ -202,7 +205,7 @@ def train(flags): }, checkpointpath) # Save the weights for evaluation purpose - for position in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down'] + for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_front', 'landlord_down'] model_weights_dir = os.path.expandvars(os.path.expanduser( '%s/%s/%s' % (flags.savedir, flags.xpid, "general_"+position+'_'+str(frames)+'.ckpt'))) torch.save(learner_model.get_model(position).state_dict(), model_weights_dir) @@ -229,15 +232,17 @@ def train(flags): fps_avg = np.mean(fps_log) position_fps = {k:(position_frames[k]-position_start_frames[k])/(end_time-start_time) for k in position_frames} - log.info('After %i (L:%i U:%i D:%i) frames: @ %.1f fps (avg@ %.1f fps) (L:%.1f U:%.1f D:%.1f) Stats:\n%s', + log.info('After %i (L:%i U:%i F:%i D:%i) frames: @ %.1f fps (avg@ %.1f fps) (L:%.1f U:%.1f F:%.1f D:%.1f) Stats:\n%s', frames, position_frames['landlord'], position_frames['landlord_up'], + position_frames['landlord_front'], position_frames['landlord_down'], fps, fps_avg, position_fps['landlord'], position_fps['landlord_up'], + position_fps['landlord_front'], position_fps['landlord_down'], pprint.pformat(stats)) diff --git a/douzero/dmc/models.py b/douzero/dmc/models.py index c333115..5a6ec2e 100644 --- a/douzero/dmc/models.py +++ b/douzero/dmc/models.py @@ -12,13 +12,13 @@ import torch.nn.functional as F class LandlordLstmModel(nn.Module): def __init__(self): super().__init__() - self.lstm = nn.LSTM(162, 128, batch_first=True) - self.dense1 = nn.Linear(373 + 128, 512) - self.dense2 = nn.Linear(512, 512) - self.dense3 = nn.Linear(512, 512) - self.dense4 = nn.Linear(512, 512) - self.dense5 = nn.Linear(512, 512) - self.dense6 = nn.Linear(512, 1) + self.lstm = nn.LSTM(432, 128, batch_first=True) + self.dense1 = nn.Linear(846 + 128, 1024) + self.dense2 = nn.Linear(1024, 1024) + self.dense3 = nn.Linear(1024, 768) + self.dense4 = nn.Linear(768, 512) + self.dense5 = nn.Linear(512, 256) + self.dense6 = nn.Linear(256, 1) def forward(self, z, x, return_value=False, flags=None): lstm_out, (h_n, _) = self.lstm(z) @@ -47,13 +47,13 @@ class LandlordLstmModel(nn.Module): class FarmerLstmModel(nn.Module): def __init__(self): super().__init__() - self.lstm = nn.LSTM(162, 128, batch_first=True) - self.dense1 = nn.Linear(484 + 128, 512) - self.dense2 = nn.Linear(512, 512) - self.dense3 = nn.Linear(512, 512) - self.dense4 = nn.Linear(512, 512) - self.dense5 = nn.Linear(512, 512) - self.dense6 = nn.Linear(512, 1) + self.lstm = nn.LSTM(432, 128, batch_first=True) + self.dense1 = nn.Linear(1178 + 128, 1024) + self.dense2 = nn.Linear(1024, 1024) + self.dense3 = nn.Linear(1024, 768) + self.dense4 = nn.Linear(768, 512) + self.dense5 = nn.Linear(512, 256) + self.dense6 = nn.Linear(256, 1) def forward(self, z, x, return_value=False, flags=None): lstm_out, (h_n, _) = self.lstm(z) @@ -82,8 +82,8 @@ class FarmerLstmModel(nn.Module): class LandlordLstmNewModel(nn.Module): def __init__(self): super().__init__() - self.lstm = nn.LSTM(162, 128, batch_first=True) - self.dense1 = nn.Linear(373 + 128, 512) + self.lstm = nn.LSTM(432, 128, batch_first=True) + self.dense1 = nn.Linear(846 + 128, 512) self.dense2 = nn.Linear(512, 512) self.dense3 = nn.Linear(512, 512) self.dense4 = nn.Linear(512, 512) @@ -117,8 +117,8 @@ class LandlordLstmNewModel(nn.Module): class FarmerLstmNewModel(nn.Module): def __init__(self): super().__init__() - self.lstm = nn.LSTM(162, 128, batch_first=True) - self.dense1 = nn.Linear(484 + 128, 512) + self.lstm = nn.LSTM(432, 128, batch_first=True) + self.dense1 = nn.Linear(1178 + 128, 512) self.dense2 = nn.Linear(512, 512) self.dense3 = nn.Linear(512, 512) self.dense4 = nn.Linear(512, 512) @@ -253,20 +253,22 @@ class GeneralModel(nn.Module): def __init__(self): super().__init__() self.in_planes = 80 - #input 1*54*41 + #input 1*108*41 self.conv1 = nn.Conv1d(40, 80, kernel_size=(3,), - stride=(2,), padding=1, bias=False) #1*27*80 + stride=(2,), padding=1, bias=False) #1*54*80 self.bn1 = nn.BatchNorm1d(80) - self.layer1 = self._make_layer(BasicBlock, 80, 2, stride=2)#1*14*80 - self.layer2 = self._make_layer(BasicBlock, 160, 2, stride=2)#1*7*160 - self.layer3 = self._make_layer(BasicBlock, 320, 2, stride=2)#1*4*320 + self.layer1 = self._make_layer(BasicBlock, 80, 2, stride=2)#1*27*80 + self.layer2 = self._make_layer(BasicBlock, 160, 2, stride=2)#1*14*160 + self.layer3 = self._make_layer(BasicBlock, 320, 2, stride=2)#1*7*320 + self.layer4 = self._make_layer(BasicBlock, 640, 2, stride=2)#1*4*320 # self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear1 = nn.Linear(320 * BasicBlock.expansion * 4 + 15 * 4, 1024) - self.linear2 = nn.Linear(1024, 512) - self.linear3 = nn.Linear(512, 256) - self.linear4 = nn.Linear(256, 1) + self.linear1 = nn.Linear(640 * BasicBlock.expansion * 4 + 24 * 4, 2048) + self.linear2 = nn.Linear(2048, 1024) + self.linear3 = nn.Linear(1024, 512) + self.linear4 = nn.Linear(512, 256) + self.linear5 = nn.Linear(256, 1) def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1] * (num_blocks - 1) @@ -281,12 +283,14 @@ class GeneralModel(nn.Module): out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) + out = self.layer4(out) out = out.flatten(1,2) out = torch.cat([x,x,x,x,out], dim=-1) out = F.leaky_relu_(self.linear1(out)) out = F.leaky_relu_(self.linear2(out)) out = F.leaky_relu_(self.linear3(out)) out = F.leaky_relu_(self.linear4(out)) + out = F.leaky_relu_(self.linear5(out)) if return_value: return dict(values=out) else: @@ -304,7 +308,7 @@ class BidModel(nn.Module): def __init__(self): super().__init__() - self.dense1 = nn.Linear(114, 512) + self.dense1 = nn.Linear(208, 512) self.dense2 = nn.Linear(512, 512) self.dense3 = nn.Linear(512, 512) self.dense4 = nn.Linear(512, 512) @@ -342,15 +346,18 @@ class BidModel(nn.Module): model_dict = {} model_dict['landlord'] = LandlordLstmModel model_dict['landlord_up'] = FarmerLstmModel +model_dict['landlord_front'] = FarmerLstmModel model_dict['landlord_down'] = FarmerLstmModel model_dict_new = {} model_dict_new['landlord'] = GeneralModel model_dict_new['landlord_up'] = GeneralModel +model_dict_new['landlord_front'] = GeneralModel model_dict_new['landlord_down'] = GeneralModel model_dict_new['bidding'] = BidModel model_dict_lstm = {} model_dict_lstm['landlord'] = GeneralModel model_dict_lstm['landlord_up'] = GeneralModel +model_dict_lstm['landlord_front'] = GeneralModel model_dict_lstm['landlord_down'] = GeneralModel class General_Model: @@ -365,6 +372,7 @@ class General_Model: # model = GeneralModel().to(torch.device(device)) self.models['landlord'] = GeneralModel1().to(torch.device(device)) self.models['landlord_up'] = GeneralModel1().to(torch.device(device)) + self.models['landlord_front'] = GeneralModel1().to(torch.device(device)) self.models['landlord_down'] = GeneralModel1().to(torch.device(device)) self.models['bidding'] = BidModel().to(torch.device(device)) @@ -375,12 +383,14 @@ class General_Model: def share_memory(self): self.models['landlord'].share_memory() self.models['landlord_up'].share_memory() + self.models['landlord_front'].share_memory() self.models['landlord_down'].share_memory() self.models['bidding'].share_memory() def eval(self): self.models['landlord'].eval() self.models['landlord_up'].eval() + self.models['landlord_front'].eval() self.models['landlord_down'].eval() self.models['bidding'].eval() @@ -404,6 +414,7 @@ class OldModel: device = 'cuda:' + str(device) self.models['landlord'] = LandlordLstmModel().to(torch.device(device)) self.models['landlord_up'] = FarmerLstmModel().to(torch.device(device)) + self.models['landlord_front'] = FarmerLstmModel().to(torch.device(device)) self.models['landlord_down'] = FarmerLstmModel().to(torch.device(device)) def forward(self, position, z, x, training=False, flags=None): @@ -413,11 +424,13 @@ class OldModel: def share_memory(self): self.models['landlord'].share_memory() self.models['landlord_up'].share_memory() + self.models['landlord_front'].share_memory() self.models['landlord_down'].share_memory() def eval(self): self.models['landlord'].eval() self.models['landlord_up'].eval() + self.models['landlord_front'].eval() self.models['landlord_down'].eval() def parameters(self, position): @@ -442,6 +455,7 @@ class Model: # model = GeneralModel().to(torch.device(device)) self.models['landlord'] = GeneralModel().to(torch.device(device)) self.models['landlord_up'] = GeneralModel().to(torch.device(device)) + self.models['landlord_front'] = GeneralModel().to(torch.device(device)) self.models['landlord_down'] = GeneralModel().to(torch.device(device)) self.models['bidding'] = BidModel().to(torch.device(device)) @@ -452,12 +466,14 @@ class Model: def share_memory(self): self.models['landlord'].share_memory() self.models['landlord_up'].share_memory() + self.models['landlord_front'].share_memory() self.models['landlord_down'].share_memory() self.models['bidding'].share_memory() def eval(self): self.models['landlord'].eval() self.models['landlord_up'].eval() + self.models['landlord_front'].eval() self.models['landlord_down'].eval() self.models['bidding'].eval() @@ -470,11 +486,3 @@ class Model: def get_models(self): return self.models - - - - - - - - diff --git a/douzero/dmc/utils.py b/douzero/dmc/utils.py index 1ed0c7d..70eced7 100644 --- a/douzero/dmc/utils.py +++ b/douzero/dmc/utils.py @@ -16,11 +16,15 @@ from douzero.env import Env Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 17: 12} -NumOnes2Array = {0: np.array([0, 0, 0, 0]), - 1: np.array([1, 0, 0, 0]), - 2: np.array([1, 1, 0, 0]), - 3: np.array([1, 1, 1, 0]), - 4: np.array([1, 1, 1, 1])} +NumOnes2Array = {0: np.array([0, 0, 0, 0, 0, 0, 0, 0]), + 1: np.array([1, 0, 0, 0, 0, 0, 0, 0]), + 2: np.array([1, 1, 0, 0, 0, 0, 0, 0]), + 3: np.array([1, 1, 1, 0, 0, 0, 0, 0]), + 4: np.array([1, 1, 1, 1, 0, 0, 0, 0]), + 5: np.array([1, 1, 1, 1, 1, 0, 0, 0]), + 6: np.array([1, 1, 1, 1, 1, 1, 0, 0]), + 7: np.array([1, 1, 1, 1, 1, 1, 1, 0]), + 8: np.array([1, 1, 1, 1, 1, 1, 1, 1])} shandle = logging.StreamHandler() shandle.setFormatter( @@ -60,7 +64,7 @@ def create_optimizers(flags, learner_model): """ Create three optimizers for the three positions """ - positions = ['landlord', 'landlord_up', 'landlord_down', 'bidding'] + positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding'] optimizers = {} for position in positions: optimizer = RAdam( @@ -72,7 +76,7 @@ def create_optimizers(flags, learner_model): def act(i, device, batch_queues, model, flags): - positions = ['landlord', 'landlord_up', 'landlord_down', 'bidding'] + positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding'] for pos in positions: model.models[pos].to(torch.device(device if device == "cpu" else ("cuda:"+str(device)))) try: @@ -90,9 +94,9 @@ def act(i, device, batch_queues, model, flags): type_buf = {p: [] for p in positions} obs_x_batch_buf = {p: [] for p in positions} - position_index = {"landlord": 31, "landlord_up": 32, "landlord_down": 33} - bid_type_index = {"landlord": 41, "landlord_up": 42, "landlord_down": 43} - bid_type_map = {41: "landlord", 42: "landlord_up", 43: "landlord_down"} + position_index = {"landlord": 31, "landlord_up": 32, "landlord_front": 33, "landlord_down": 34} + bid_type_index = {"landlord": 41, "landlord_up": 42, "landlord_front": 43, "landlord_down": 43} + bid_type_map = {41: "landlord", 42: "landlord_up", 43: "landlord_front", 44: "landlord_down"} position, obs, env_output = env.initial(model, device, flags=flags) bid_obs_buffer = env_output["begin_buf"]["bid_obs_buffer"] @@ -149,7 +153,7 @@ def act(i, device, batch_queues, model, flags): target_buf[p].append(episode_return) break for p in positions: - if size[p] > T: + while size[p] > T: # print(p, "epr", torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in episode_return_buf[p][:T]]),) batch_queues[p].put({ "done": torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in done_buf[p][:T]]), @@ -182,18 +186,22 @@ def _cards2tensor(list_cards): See Figure 2 in https://arxiv.org/pdf/2106.06135.pdf """ if len(list_cards) == 0: - return torch.zeros(54, dtype=torch.int8) + return torch.zeros(108, dtype=torch.int8) - matrix = np.zeros([4, 13], dtype=np.int8) - jokers = np.zeros(2, dtype=np.int8) + matrix = np.zeros([8, 13], dtype=np.int8) + jokers = np.zeros(4, dtype=np.int8) counter = Counter(list_cards) for card, num_times in counter.items(): if card < 20: matrix[:, Card2Column[card]] = NumOnes2Array[num_times] elif card == 20: jokers[0] = 1 + if num_times == 2: + jokers[1] = 1 elif card == 30: - jokers[1] = 1 + jokers[2] = 1 + if num_times == 2: + jokers[3] = 1 matrix = np.concatenate((matrix.flatten('F'), jokers)) matrix = torch.from_numpy(matrix) return matrix diff --git a/douzero/env/env.py b/douzero/env/env.py index 75ab10f..91947e2 100644 --- a/douzero/env/env.py +++ b/douzero/env/env.py @@ -11,17 +11,22 @@ env_url = "http://od.vcccz.com/hechuan/env.py" Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 17: 12} -NumOnes2Array = {0: np.array([0, 0, 0, 0]), - 1: np.array([1, 0, 0, 0]), - 2: np.array([1, 1, 0, 0]), - 3: np.array([1, 1, 1, 0]), - 4: np.array([1, 1, 1, 1])} +NumOnes2Array = {0: np.array([0, 0, 0, 0, 0, 0, 0, 0]), + 1: np.array([1, 0, 0, 0, 0, 0, 0, 0]), + 2: np.array([1, 1, 0, 0, 0, 0, 0, 0]), + 3: np.array([1, 1, 1, 0, 0, 0, 0, 0]), + 4: np.array([1, 1, 1, 1, 0, 0, 0, 0]), + 5: np.array([1, 1, 1, 1, 1, 0, 0, 0]), + 6: np.array([1, 1, 1, 1, 1, 1, 0, 0]), + 7: np.array([1, 1, 1, 1, 1, 1, 1, 0]), + 8: np.array([1, 1, 1, 1, 1, 1, 1, 1])} + deck = [] for i in range(3, 15): - deck.extend([i for _ in range(4)]) -deck.extend([17 for _ in range(4)]) -deck.extend([20, 30]) + deck.extend([i for _ in range(8)]) +deck.extend([17 for _ in range(8)]) +deck.extend([20, 20, 30, 30]) class Env: @@ -46,7 +51,7 @@ class Env: # Initialize players # We use three dummy player for the target position self.players = {} - for position in ['landlord', 'landlord_up', 'landlord_down']: + for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']: self.players[position] = DummyAgent(position) # Initialize the internal environment @@ -67,10 +72,11 @@ class Env: if model is None: _deck = deck.copy() np.random.shuffle(_deck) - card_play_data = {'landlord': _deck[:20], - 'landlord_up': _deck[20:37], - 'landlord_down': _deck[37:54], - 'three_landlord_cards': _deck[17:20], + card_play_data = {'landlord': _deck[:33], + 'landlord_up': _deck[33:58], + 'landlord_front': _deck[58:83], + 'landlord_down': _deck[83:108], + # 'three_landlord_cards': _deck[17:20], } for key in card_play_data: card_play_data[key].sort() @@ -97,18 +103,20 @@ class Env: _deck = deck.copy() np.random.shuffle(_deck) card_play_data = [ - _deck[:17], - _deck[17:34], - _deck[34:51], + _deck[:25], + _deck[25:50], + _deck[50:75], + _deck[75:100], ] - for i in range(3): + for i in range(4): card_play_data[i].sort() - landlord_cards = _deck[51:54] + landlord_cards = _deck[100:108] landlord_cards.sort() - bid_info = np.array([[-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1]]) + bid_info = np.array([[-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1]]) bidding_player = random.randint(0, 2) # bidding_player = 0 # debug first_bid = -1 @@ -116,7 +124,7 @@ class Env: bid_count = 0 if bid_limit <= 0: force_bid = True - for r in range(3): + for r in range(4): bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player]) with torch.no_grad(): action = model.forward("bidding", torch.tensor(bidding_obs["z_batch"], device=device), @@ -137,19 +145,19 @@ class Env: bid_count += 1 if first_bid == -1: first_bid = bidding_player - for p in range(3): + for p in range(4): if p == bidding_player: bid_info[r][p] = 1 else: bid_info[r][p] = 0 else: - bid_info[r] = [0, 0, 0] - bidding_player = (bidding_player + 1) % 3 + bid_info[r] = [0, 0, 0, 0] + bidding_player = (bidding_player + 1) % 4 one_count = np.count_nonzero(bid_info == 1) if one_count == 0: continue elif one_count > 1: - r = 3 + r = 4 bidding_player = first_bid bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player]) with torch.no_grad(): @@ -163,7 +171,7 @@ class Env: if action["action"] == 1: last_bid = bidding_player bid_count += 1 - for p in range(3): + for p in range(4): if p == bidding_player: bid_info[r][p] = 1 else: @@ -171,20 +179,23 @@ class Env: break card_play_data[last_bid].extend(landlord_cards) card_play_data = {'landlord': card_play_data[last_bid], - 'landlord_up': card_play_data[(last_bid - 1) % 3], - 'landlord_down': card_play_data[(last_bid + 1) % 3], - 'three_landlord_cards': landlord_cards, + 'landlord_up': card_play_data[(last_bid - 1) % 4], + 'landlord_down': card_play_data[(last_bid + 1) % 4], + 'landlord_front': card_play_data[(last_bid + 2) % 4], + # 'three_landlord_cards': landlord_cards, } card_play_data["landlord"].sort() player_ids = { 'landlord': last_bid, - 'landlord_up': (last_bid - 1) % 3, - 'landlord_down': (last_bid + 1) % 3, + 'landlord_up': (last_bid - 1) % 4, + 'landlord_down': (last_bid + 1) % 4, + 'landlord_front': (last_bid + 2) % 4, } player_positions = { last_bid: 'landlord', - (last_bid - 1) % 3: 'landlord_up', - (last_bid + 1) % 3: 'landlord_down' + (last_bid - 1) % 4: 'landlord_up', + (last_bid + 1) % 4: 'landlord_down', + (last_bid + 2) % 4: 'landlord_front', } for bid_obs in bid_obs_buffer: bid_obs.update({"position": player_positions[bid_obs["pid"]]}) @@ -192,14 +203,15 @@ class Env: # Initialize the cards self._env.card_play_init(card_play_data) multiply_map = [ - np.array([1, 0, 0]), - np.array([0, 1, 0]), - np.array([0, 0, 1]) + np.array([1, 0, 0, 0]), + np.array([0, 1, 0, 0]), + np.array([0, 0, 1, 0]), + np.array([0, 0, 0, 1]) ] - for pos in ["landlord", "landlord_up", "landlord_down"]: + for pos in ["landlord", "landlord_up", "landlord_front", "landlord_down"]: pid = player_ids[pos] self._env.info_sets[pos].player_id = pid - self._env.info_sets[pos].bid_info = bid_info[:, [(pid - 1) % 3, pid, (pid + 1) % 3]] + self._env.info_sets[pos].bid_info = bid_info[:, [(pid - 1) % 4, pid, (pid + 1) % 4, (pid + 2) % 4]] self._env.bid_count = bid_count # multiply_obs = _get_obs_for_multiply(pos, self._env.info_sets[pos].bid_info, card_play_data[pos], # landlord_cards) @@ -245,11 +257,13 @@ class Env: "play": { "landlord": self._get_reward("landlord"), "landlord_up": self._get_reward("landlord_up"), + "landlord_front": self._get_reward("landlord_front"), "landlord_down": self._get_reward("landlord_down") }, "bid": { - "landlord": self._get_reward_bidding("landlord")*2, + "landlord": self._get_reward_bidding("landlord")*3, "landlord_up": self._get_reward_bidding("landlord_up"), + "landlord_front": self._get_reward_bidding("landlord_front"), "landlord_down": self._get_reward_bidding("landlord_down") } } @@ -269,16 +283,19 @@ class Env: self_bomb_num = self._env.pos_bomb_num[pos] if winner == 'landlord': if self.objective == 'adp': - return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num +self._env.multiply_count[pos]) /8 + return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] + self._env.multiply_count[pos]) /8 + return (2.0 ** bomb_num[0]) * (3.0 ** bomb_num[1]) elif self.objective == 'logadp': return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4 + return bomb_num[0] + bomb_num[1] + 1.0 else: return 1.0 - self._env.step_count * 0.0033 + return 1.0 else: if self.objective == 'adp': - return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num +self._env.multiply_count[pos]) /8 + return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] +self._env.multiply_count[pos]) /8 elif self.objective == 'logadp': - return (-1.0 + self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4 + return (-1.0 + self._env.step_count * 0.0033) * 1.3**(self_bomb_num) * 2**self._env.multiply_count[pos] / 4 else: return -1.0 + self._env.step_count * 0.0033 @@ -371,12 +388,12 @@ def get_obs(infoset, use_general=True): This function obtains observations with imperfect information from the infoset. It has three branches since we encode different features for different positions. - + This function will return dictionary named `obs`. It contains several fields. These fields will be used to train the model. One can play with those features to improve the performance. - `position` is a string that can be landlord/landlord_down/landlord_up + `position` is a string that can be landlord/landlord_down/landlord_front/landlord_up `x_batch` is a batch of features (excluding the hisorical moves). It also encodes the action feature @@ -391,7 +408,7 @@ def get_obs(infoset, use_general=True): `z`: same as z_batch but not a batch. """ if use_general: - if infoset.player_position not in ["landlord", "landlord_up", "landlord_down"]: + if infoset.player_position not in ["landlord", "landlord_up", "landlord_front", "landlord_down"]: raise ValueError('') return _get_obs_general(infoset, infoset.player_position) else: @@ -399,6 +416,8 @@ def get_obs(infoset, use_general=True): return _get_obs_landlord(infoset) elif infoset.player_position == 'landlord_up': return _get_obs_landlord_up(infoset) + elif infoset.player_position == 'landlord_front': + return _get_obs_landlord_front(infoset) elif infoset.player_position == 'landlord_down': return _get_obs_landlord_down(infoset) else: @@ -424,18 +443,22 @@ def _cards2array(list_cards): the representations. """ if len(list_cards) == 0: - return np.zeros(54, dtype=np.int8) + return np.zeros(108, dtype=np.int8) - matrix = np.zeros([4, 13], dtype=np.int8) - jokers = np.zeros(2, dtype=np.int8) + matrix = np.zeros([8, 13], dtype=np.int8) + jokers = np.zeros(4, dtype=np.int8) counter = Counter(list_cards) for card, num_times in counter.items(): if card < 20: matrix[:, Card2Column[card]] = NumOnes2Array[num_times] elif card == 20: jokers[0] = 1 + if num_times == 2: + jokers[1] = 1 elif card == 30: - jokers[1] = 1 + jokers[2] = 1 + if num_times == 2: + jokers[3] = 1 return np.concatenate((matrix.flatten('F'), jokers)) @@ -449,7 +472,7 @@ def _cards2array(list_cards): # Finally, we obtain a 5x162 matrix, which will be fed # into LSTM for encoding. # """ -# action_seq_array = np.zeros((len(action_seq_list), 54)) +# action_seq_array = np.zeros((len(action_seq_list), 108)) # for row, list_cards in enumerate(action_seq_list): # action_seq_array[row, :] = _cards2array(list_cards) # # action_seq_array = action_seq_array.reshape(5, 162) @@ -458,26 +481,26 @@ def _cards2array(list_cards): def _action_seq_list2array(action_seq_list, new_model=True): """ A utility function to encode the historical moves. - We encode the historical 15 actions. If there is - no 15 actions, we pad the features with 0. Since + We encode the historical 20 actions. If there is + no 20 actions, we pad the features with 0. Since three moves is a round in DouDizhu, we concatenate the representations for each consecutive three moves. - Finally, we obtain a 5x162 matrix, which will be fed + Finally, we obtain a 5x432 matrix, which will be fed into LSTM for encoding. """ if new_model: - position_map = {"landlord": 0, "landlord_up": 1, "landlord_down": 2} - action_seq_array = np.ones((len(action_seq_list), 54)) * -1 # Default Value -1 for not using area + # position_map = {"landlord": 0, "landlord_up": 1, "landlord_front": 2, "landlord_down": 3} + action_seq_array = np.ones((len(action_seq_list), 108)) * -1 # Default Value -1 for not using area for row, list_cards in enumerate(action_seq_list): if list_cards != []: - action_seq_array[row, :54] = _cards2array(list_cards[1]) + action_seq_array[row, :108] = _cards2array(list_cards[1]) else: - action_seq_array = np.zeros((len(action_seq_list), 54)) + action_seq_array = np.zeros((len(action_seq_list), 108)) for row, list_cards in enumerate(action_seq_list): if list_cards != []: action_seq_array[row, :] = _cards2array(list_cards[1]) - action_seq_array = action_seq_array.reshape(5, 162) + action_seq_array = action_seq_array.reshape(5, 432) return action_seq_array # action_seq_array = np.zeros((len(action_seq_list), 54)) @@ -487,10 +510,10 @@ def _action_seq_list2array(action_seq_list, new_model=True): # return action_seq_array -def _process_action_seq(sequence, length=15, new_model=True): +def _process_action_seq(sequence, length=20, new_model=True): """ A utility function encoding historical moves. We - encode 15 moves. If there is no 15 moves, we pad + encode 20 moves. If there is no 20 moves, we pad with zeros. """ sequence = sequence[-length:].copy() @@ -508,8 +531,8 @@ def _get_one_hot_bomb(bomb_num): A utility function to encode the number of bombs into one-hot representation. """ - one_hot = np.zeros(15) - one_hot[bomb_num] = 1 + one_hot = np.zeros(29) + one_hot[bomb_num[0] + bomb_num[1]] = 1 return one_hot @@ -536,13 +559,19 @@ def _get_obs_landlord(infoset): my_action_batch[j, :] = _cards2array(action) landlord_up_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord_up'], 17) + infoset.num_cards_left_dict['landlord_up'], 25) landlord_up_num_cards_left_batch = np.repeat( landlord_up_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) + landlord_front_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord_front'], 25) + landlord_front_num_cards_left_batch = np.repeat( + landlord_front_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + landlord_down_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord_down'], 17) + infoset.num_cards_left_dict['landlord_down'], 25) landlord_down_num_cards_left_batch = np.repeat( landlord_down_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) @@ -553,6 +582,12 @@ def _get_obs_landlord(infoset): landlord_up_played_cards[np.newaxis, :], num_legal_actions, axis=0) + landlord_front_played_cards = _cards2array( + infoset.played_cards['landlord_front']) + landlord_front_played_cards_batch = np.repeat( + landlord_front_played_cards[np.newaxis, :], + num_legal_actions, axis=0) + landlord_down_played_cards = _cards2array( infoset.played_cards['landlord_down']) landlord_down_played_cards_batch = np.repeat( @@ -569,8 +604,10 @@ def _get_obs_landlord(infoset): other_handcards_batch, last_action_batch, landlord_up_played_cards_batch, + landlord_front_played_cards_batch, landlord_down_played_cards_batch, landlord_up_num_cards_left_batch, + landlord_front_num_cards_left_batch, landlord_down_num_cards_left_batch, bomb_num_batch, my_action_batch)) @@ -578,23 +615,25 @@ def _get_obs_landlord(infoset): other_handcards, last_action, landlord_up_played_cards, + landlord_front_played_cards, landlord_down_played_cards, landlord_up_num_cards_left, + landlord_front_num_cards_left, landlord_down_num_cards_left, bomb_num)) z = _action_seq_list2array(_process_action_seq( - infoset.card_play_action_seq, 15, False), False) + infoset.card_play_action_seq, 20, False), False) z_batch = np.repeat( z[np.newaxis, :, :], num_legal_actions, axis=0) obs = { - 'position': 'landlord', - 'x_batch': x_batch.astype(np.float32), - 'z_batch': z_batch.astype(np.float32), - 'legal_actions': infoset.legal_actions, - 'x_no_action': x_no_action.astype(np.int8), - 'z': z.astype(np.int8), - } + 'position': 'landlord', + 'x_batch': x_batch.astype(np.float32), + 'z_batch': z_batch.astype(np.float32), + 'legal_actions': infoset.legal_actions, + 'x_no_action': x_no_action.astype(np.int8), + 'z': z.astype(np.int8), + } return obs def _get_obs_landlord_up(infoset): @@ -625,7 +664,7 @@ def _get_obs_landlord_up(infoset): last_landlord_action[np.newaxis, :], num_legal_actions, axis=0) landlord_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord'], 20) + infoset.num_cards_left_dict['landlord'], 33) landlord_num_cards_left_batch = np.repeat( landlord_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) @@ -642,7 +681,7 @@ def _get_obs_landlord_up(infoset): last_teammate_action[np.newaxis, :], num_legal_actions, axis=0) teammate_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord_down'], 17) + infoset.num_cards_left_dict['landlord_down'], 25) teammate_num_cards_left_batch = np.repeat( teammate_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) @@ -653,6 +692,144 @@ def _get_obs_landlord_up(infoset): teammate_played_cards[np.newaxis, :], num_legal_actions, axis=0) + last_teammate_front_action = _cards2array( + infoset.last_move_dict['landlord_front']) + last_teammate_front_action_batch = np.repeat( + last_teammate_front_action[np.newaxis, :], + num_legal_actions, axis=0) + teammate_front_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord_front'], 25) + teammate_front_num_cards_left_batch = np.repeat( + teammate_front_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + + teammate_front_played_cards = _cards2array( + infoset.played_cards['landlord_front']) + teammate_front_played_cards_batch = np.repeat( + teammate_front_played_cards[np.newaxis, :], + num_legal_actions, axis=0) + + bomb_num = _get_one_hot_bomb( + infoset.bomb_num) + bomb_num_batch = np.repeat( + bomb_num[np.newaxis, :], + num_legal_actions, axis=0) + + x_batch = np.hstack((my_handcards_batch, + other_handcards_batch, + landlord_played_cards_batch, + teammate_played_cards_batch, + teammate_front_played_cards_batch, + last_action_batch, + last_landlord_action_batch, + last_teammate_action_batch, + last_teammate_front_action_batch, + landlord_num_cards_left_batch, + teammate_num_cards_left_batch, + teammate_front_num_cards_left_batch, + bomb_num_batch, + my_action_batch)) + x_no_action = np.hstack((my_handcards, + other_handcards, + landlord_played_cards, + teammate_played_cards, + teammate_front_played_cards, + last_action, + last_landlord_action, + last_teammate_action, + last_teammate_front_action, + landlord_num_cards_left, + teammate_num_cards_left, + teammate_front_num_cards_left, + bomb_num)) + z = _action_seq_list2array(_process_action_seq( + infoset.card_play_action_seq, 20, False), False) + z_batch = np.repeat( + z[np.newaxis, :, :], + num_legal_actions, axis=0) + obs = { + 'position': 'landlord_up', + 'x_batch': x_batch.astype(np.float32), + 'z_batch': z_batch.astype(np.float32), + 'legal_actions': infoset.legal_actions, + 'x_no_action': x_no_action.astype(np.int8), + 'z': z.astype(np.int8), + } + return obs + +def _get_obs_landlord_front(infoset): + """ + Obttain the landlord_front features. See Table 5 in + https://arxiv.org/pdf/2106.06135.pdf + """ + num_legal_actions = len(infoset.legal_actions) + my_handcards = _cards2array(infoset.player_hand_cards) + my_handcards_batch = np.repeat(my_handcards[np.newaxis, :], + num_legal_actions, axis=0) + + other_handcards = _cards2array(infoset.other_hand_cards) + other_handcards_batch = np.repeat(other_handcards[np.newaxis, :], + num_legal_actions, axis=0) + + last_action = _cards2array(infoset.last_move) + last_action_batch = np.repeat(last_action[np.newaxis, :], + num_legal_actions, axis=0) + + my_action_batch = np.zeros(my_handcards_batch.shape) + for j, action in enumerate(infoset.legal_actions): + my_action_batch[j, :] = _cards2array(action) + + last_landlord_action = _cards2array( + infoset.last_move_dict['landlord']) + last_landlord_action_batch = np.repeat( + last_landlord_action[np.newaxis, :], + num_legal_actions, axis=0) + landlord_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord'], 33) + landlord_num_cards_left_batch = np.repeat( + landlord_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + + landlord_played_cards = _cards2array( + infoset.played_cards['landlord']) + landlord_played_cards_batch = np.repeat( + landlord_played_cards[np.newaxis, :], + num_legal_actions, axis=0) + + last_teammate_action = _cards2array( + infoset.last_move_dict['landlord_down']) + last_teammate_action_batch = np.repeat( + last_teammate_action[np.newaxis, :], + num_legal_actions, axis=0) + teammate_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord_down'], 25) + teammate_num_cards_left_batch = np.repeat( + teammate_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + + teammate_played_cards = _cards2array( + infoset.played_cards['landlord_down']) + teammate_played_cards_batch = np.repeat( + teammate_played_cards[np.newaxis, :], + num_legal_actions, axis=0) + + last_teammate_front_action = _cards2array( + infoset.last_move_dict['landlord_front']) + last_teammate_front_action_batch = np.repeat( + last_teammate_front_action[np.newaxis, :], + num_legal_actions, axis=0) + teammate_front_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord_front'], 25) + teammate_front_num_cards_left_batch = np.repeat( + teammate_front_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + + teammate_front_played_cards = _cards2array( + infoset.played_cards['landlord_front']) + teammate_front_played_cards_batch = np.repeat( + teammate_played_cards[np.newaxis, :], + num_legal_actions, axis=0) + bomb_num = _get_one_hot_bomb( infoset.bomb_num) bomb_num_batch = np.repeat( @@ -663,36 +840,42 @@ def _get_obs_landlord_up(infoset): other_handcards_batch, landlord_played_cards_batch, teammate_played_cards_batch, + teammate_front_played_cards_batch, last_action_batch, last_landlord_action_batch, last_teammate_action_batch, + last_teammate_front_action_batch, landlord_num_cards_left_batch, teammate_num_cards_left_batch, + teammate_front_num_cards_left_batch, bomb_num_batch, my_action_batch)) x_no_action = np.hstack((my_handcards, other_handcards, landlord_played_cards, teammate_played_cards, + teammate_front_played_cards, last_action, last_landlord_action, last_teammate_action, + last_teammate_front_action, landlord_num_cards_left, teammate_num_cards_left, + teammate_front_num_cards_left, bomb_num)) z = _action_seq_list2array(_process_action_seq( - infoset.card_play_action_seq, 15, False), False) + infoset.card_play_action_seq, 20, False), False) z_batch = np.repeat( z[np.newaxis, :, :], num_legal_actions, axis=0) obs = { - 'position': 'landlord_up', - 'x_batch': x_batch.astype(np.float32), - 'z_batch': z_batch.astype(np.float32), - 'legal_actions': infoset.legal_actions, - 'x_no_action': x_no_action.astype(np.int8), - 'z': z.astype(np.int8), - } + 'position': 'landlord_front', + 'x_batch': x_batch.astype(np.float32), + 'z_batch': z_batch.astype(np.float32), + 'legal_actions': infoset.legal_actions, + 'x_no_action': x_no_action.astype(np.int8), + 'z': z.astype(np.int8), + } return obs def _get_obs_landlord_down(infoset): @@ -723,7 +906,7 @@ def _get_obs_landlord_down(infoset): last_landlord_action[np.newaxis, :], num_legal_actions, axis=0) landlord_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord'], 20) + infoset.num_cards_left_dict['landlord'], 33) landlord_num_cards_left_batch = np.repeat( landlord_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) @@ -740,7 +923,7 @@ def _get_obs_landlord_down(infoset): last_teammate_action[np.newaxis, :], num_legal_actions, axis=0) teammate_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord_up'], 17) + infoset.num_cards_left_dict['landlord_up'], 25) teammate_num_cards_left_batch = np.repeat( teammate_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) @@ -751,10 +934,21 @@ def _get_obs_landlord_down(infoset): teammate_played_cards[np.newaxis, :], num_legal_actions, axis=0) - landlord_played_cards = _cards2array( - infoset.played_cards['landlord']) - landlord_played_cards_batch = np.repeat( - landlord_played_cards[np.newaxis, :], + last_teammate_front_action = _cards2array( + infoset.last_move_dict['landlord_front']) + last_teammate_front_action_batch = np.repeat( + last_teammate_front_action[np.newaxis, :], + num_legal_actions, axis=0) + teammate_front_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord_front'], 25) + teammate_front_num_cards_left_batch = np.repeat( + teammate_front_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + + teammate_front_played_cards = _cards2array( + infoset.played_cards['landlord_front']) + teammate_front_played_cards_batch = np.repeat( + teammate_front_played_cards[np.newaxis, :], num_legal_actions, axis=0) bomb_num = _get_one_hot_bomb( @@ -767,36 +961,42 @@ def _get_obs_landlord_down(infoset): other_handcards_batch, landlord_played_cards_batch, teammate_played_cards_batch, + teammate_front_played_cards_batch, last_action_batch, last_landlord_action_batch, last_teammate_action_batch, + last_teammate_front_action_batch, landlord_num_cards_left_batch, teammate_num_cards_left_batch, + teammate_front_num_cards_left_batch, bomb_num_batch, my_action_batch)) x_no_action = np.hstack((my_handcards, other_handcards, landlord_played_cards, teammate_played_cards, + teammate_front_played_cards, last_action, last_landlord_action, last_teammate_action, + last_teammate_front_action, landlord_num_cards_left, teammate_num_cards_left, + teammate_front_num_cards_left, bomb_num)) z = _action_seq_list2array(_process_action_seq( - infoset.card_play_action_seq, 15, False), False) + infoset.card_play_action_seq, 20, False), False) z_batch = np.repeat( z[np.newaxis, :, :], num_legal_actions, axis=0) obs = { - 'position': 'landlord_down', - 'x_batch': x_batch.astype(np.float32), - 'z_batch': z_batch.astype(np.float32), - 'legal_actions': infoset.legal_actions, - 'x_no_action': x_no_action.astype(np.int8), - 'z': z.astype(np.int8), - } + 'position': 'landlord_down', + 'x_batch': x_batch.astype(np.float32), + 'z_batch': z_batch.astype(np.float32), + 'legal_actions': infoset.legal_actions, + 'x_no_action': x_no_action.astype(np.int8), + 'z': z.astype(np.int8), + } return obs def _get_obs_landlord_withbid(infoset): @@ -869,7 +1069,7 @@ def _get_obs_landlord_withbid(infoset): landlord_down_num_cards_left, bomb_num)) z = _action_seq_list2array(_process_action_seq( - infoset.card_play_action_seq, 15, False), False) + infoset.card_play_action_seq, 20, False), False) z_batch = np.repeat( z[np.newaxis, :, :], num_legal_actions, axis=0) @@ -970,21 +1170,21 @@ def _get_obs_general1(infoset, position): bomb_num[np.newaxis, :], num_legal_actions, axis=0) - x_batch = np.hstack((position_info_batch, # 3 - my_handcards_batch, # 54 - other_handcards_batch, # 54 - three_landlord_cards_batch, # 54 - last_action_batch, # 54 - landlord_played_cards_batch, # 54 - landlord_up_played_cards_batch, # 54 - landlord_down_played_cards_batch, # 54 + x_batch = np.hstack((position_info_batch, # 4 + my_handcards_batch, # 108 + other_handcards_batch, # 108 + three_landlord_cards_batch, # 108 + last_action_batch, # 108 + landlord_played_cards_batch, # 108 + landlord_up_played_cards_batch, # 108 + landlord_down_played_cards_batch, # 108 landlord_num_cards_left_batch, # 20 landlord_up_num_cards_left_batch, # 17 landlord_down_num_cards_left_batch, # 17 bomb_num_batch, # 15 bid_info_batch, # 12 multiply_info_batch, # 3 - my_action_batch)) # 54 + my_action_batch)) # 108 x_no_action = np.hstack((position_info, my_handcards, other_handcards, @@ -1025,9 +1225,10 @@ def _get_obs_general(infoset, position): num_legal_actions, axis=0) position_map = { - "landlord": [1, 0, 0], - "landlord_up": [0, 1, 0], - "landlord_down": [0, 0, 1] + "landlord": [1, 0, 0, 0], + "landlord_up": [0, 1, 0, 0], + "landlord_front": [0, 0, 1, 0], + "landlord_down": [0, 0, 0, 1] } position_info = np.array(position_map[position]) position_info_batch = np.repeat(position_info[np.newaxis, :], @@ -1041,9 +1242,9 @@ def _get_obs_general(infoset, position): multiply_info_batch = np.repeat(multiply_info[np.newaxis, :], num_legal_actions, axis=0) - three_landlord_cards = _cards2array(infoset.three_landlord_cards) - three_landlord_cards_batch = np.repeat(three_landlord_cards[np.newaxis, :], - num_legal_actions, axis=0) + # three_landlord_cards = _cards2array(infoset.three_landlord_cards) + # three_landlord_cards_batch = np.repeat(three_landlord_cards[np.newaxis, :], + # num_legal_actions, axis=0) last_action = _cards2array(infoset.last_move) last_action_batch = np.repeat(last_action[np.newaxis, :], @@ -1054,25 +1255,31 @@ def _get_obs_general(infoset, position): my_action_batch[j, :] = _cards2array(action) landlord_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord'], 20) + infoset.num_cards_left_dict['landlord'], 33) landlord_num_cards_left_batch = np.repeat( landlord_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) landlord_up_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord_up'], 17) + infoset.num_cards_left_dict['landlord_up'], 25) landlord_up_num_cards_left_batch = np.repeat( landlord_up_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) + landlord_front_num_cards_left = _get_one_hot_array( + infoset.num_cards_left_dict['landlord_front'], 25) + landlord_front_num_cards_left_batch = np.repeat( + landlord_front_num_cards_left[np.newaxis, :], + num_legal_actions, axis=0) + landlord_down_num_cards_left = _get_one_hot_array( - infoset.num_cards_left_dict['landlord_down'], 17) + infoset.num_cards_left_dict['landlord_down'], 25) landlord_down_num_cards_left_batch = np.repeat( landlord_down_num_cards_left[np.newaxis, :], num_legal_actions, axis=0) other_handcards_left_list = [] - for pos in ["landlord", "landlord_up", "landlord_up"]: + for pos in ["landlord", "landlord_up", "landlord_front", "landlord_down"]: if pos != position: other_handcards_left_list.extend(infoset.all_handcards[pos]) @@ -1088,6 +1295,12 @@ def _get_obs_general(infoset, position): landlord_up_played_cards[np.newaxis, :], num_legal_actions, axis=0) + landlord_front_played_cards = _cards2array( + infoset.played_cards['landlord_front']) + landlord_front_played_cards_batch = np.repeat( + landlord_front_played_cards[np.newaxis, :], + num_legal_actions, axis=0) + landlord_down_played_cards = _cards2array( infoset.played_cards['landlord_down']) landlord_down_played_cards_batch = np.repeat( @@ -1100,24 +1313,26 @@ def _get_obs_general(infoset, position): bomb_num[np.newaxis, :], num_legal_actions, axis=0) num_cards_left = np.hstack(( - landlord_num_cards_left, # 20 - landlord_up_num_cards_left, # 17 + landlord_num_cards_left, # 33 + landlord_up_num_cards_left, # 25 + landlord_front_num_cards_left, # 25 landlord_down_num_cards_left)) x_batch = np.hstack(( - bid_info_batch, # 12 - multiply_info_batch)) # 3 + bid_info_batch, # 16 + multiply_info_batch)) # 4 x_no_action = np.hstack(( bid_info, multiply_info)) z =np.vstack(( num_cards_left, - my_handcards, # 54 - other_handcards, # 54 - three_landlord_cards, # 54 - landlord_played_cards, # 54 - landlord_up_played_cards, # 54 - landlord_down_played_cards, # 54 + my_handcards, # 108 + other_handcards, # 108 + # three_landlord_cards, # 108 + landlord_played_cards, # 108 + landlord_up_played_cards, # 108 + landlord_front_played_cards, # 108 + landlord_down_played_cards, # 108 _action_seq_list2array(_process_action_seq(infoset.card_play_action_seq, 32)) )) @@ -1125,7 +1340,7 @@ def _get_obs_general(infoset, position): z[np.newaxis, :, :], num_legal_actions, axis=0) my_action_batch = my_action_batch[:,np.newaxis,:] - z_batch = np.zeros([len(_z_batch),40,54],int) + z_batch = np.zeros([len(_z_batch),40,108],int) for i in range(0,len(_z_batch)): z_batch[i] = np.vstack((my_action_batch[i],_z_batch[i])) obs = { @@ -1139,17 +1354,17 @@ def _get_obs_general(infoset, position): return obs def gen_bid_legal_actions(player_id, bid_info): - self_bid_info = bid_info[:, [(player_id - 1) % 3, player_id, (player_id + 1) % 3]] + self_bid_info = bid_info[:, [(player_id - 1) % 4, player_id, (player_id + 1) % 4, (player_id + 2) % 4]] curr_round = -1 - for r in range(4): + for r in range(5): if -1 in self_bid_info[r]: curr_round = r break bid_actions = [] if curr_round != -1: - self_bid_info[curr_round] = [0, 0, 0] + self_bid_info[curr_round] = [0, 0, 0, 0] bid_actions.append(np.array(self_bid_info).flatten()) - self_bid_info[curr_round] = [0, 1, 0] + self_bid_info[curr_round] = [0, 1, 0, 0] bid_actions.append(np.array(self_bid_info).flatten()) return np.array(bid_actions) @@ -1273,9 +1488,9 @@ def _get_obs_for_bid_legacy(player_id, bid_info, hand_cards): return obs def _get_obs_for_bid(player_id, bid_info, hand_cards): - all_cards = [3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, - 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, - 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 20, 30] + # all_cards = [3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + # 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, + # 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 20, 30] num_legal_actions = 2 my_handcards = _cards2array(hand_cards) my_handcards_batch = np.repeat(my_handcards[np.newaxis, :], diff --git a/douzero/env/game.py b/douzero/env/game.py index f9252f5..1b716f7 100644 --- a/douzero/env/game.py +++ b/douzero/env/game.py @@ -11,10 +11,19 @@ RealCard2EnvCard = {'3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10, 'J': 11, 'Q': 12, 'K': 13, 'A': 14, '2': 17, 'X': 20, 'D': 30} -bombs = [[3, 3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6], - [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9], [10, 10, 10, 10], - [11, 11, 11, 11], [12, 12, 12, 12], [13, 13, 13, 13], [14, 14, 14, 14], - [17, 17, 17, 17], [20, 30]] +bombs = [ + [[3, 3, 3, 3, 3, 3], [4, 4, 4, 4, 4, 4], [5, 5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7], + [8, 8, 8, 8, 8, 8], [9, 9, 9, 9, 9, 9], [10, 10, 10, 10, 10, 10], [11, 11, 11, 11, 11, 11], + [12, 12, 12, 12, 12, 12], [13, 13, 13, 13, 13, 13], [14, 14, 14, 14, 14, 14], [17, 17, 17, 17, 17, 17], + [3, 3, 3, 3, 3, 3, 3], [4, 4, 4, 4, 4, 4, 4], [5, 5, 5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7, 7], + [8, 8, 8, 8, 8, 8, 8], [9, 9, 9, 9, 9, 9, 9], [10, 10, 10, 10, 10, 10, 10], [11, 11, 11, 11, 11, 11, 11], + [12, 12, 12, 12, 12, 12, 12], [13, 13, 13, 13, 13, 13, 13], [14, 14, 14, 14, 14, 14, 14], + [17, 17, 17, 17, 17, 17, 17]], + [[3, 3, 3, 3, 3, 3, 3, 3], [4, 4, 4, 4, 4, 4, 4, 4], [5, 5, 5, 5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6, 6, 6], + [7, 7, 7, 7, 7, 7, 7, 7], [8, 8, 8, 8, 8, 8, 8, 8], [9, 9, 9, 9, 9, 9, 9, 9], [10, 10, 10, 10, 10, 10, 10, 10], + [11, 11, 11, 11, 11, 11, 11, 11], [12, 12, 12, 12, 12, 12, 12, 12], [13, 13, 13, 13, 13, 13, 13, 13], + [14, 14, 14, 14, 14, 14, 14, 14], [17, 17, 17, 17, 17, 17, 17, 17], + [20, 20, 30, 30]]] class GameEnv(object): @@ -22,7 +31,7 @@ class GameEnv(object): self.card_play_action_seq = [] - self.three_landlord_cards = None + # self.three_landlord_cards = None self.game_over = False self.acting_player_position = None @@ -32,10 +41,12 @@ class GameEnv(object): self.last_move_dict = {'landlord': [], 'landlord_up': [], + 'landlord_front': [], 'landlord_down': []} self.played_cards = {'landlord': [], 'landlord_up': [], + 'landlord_front': [], 'landlord_down': []} self.last_move = [] @@ -48,24 +59,28 @@ class GameEnv(object): 'farmer': 0} self.info_sets = {'landlord': InfoSet('landlord'), - 'landlord_up': InfoSet('landlord_up'), - 'landlord_down': InfoSet('landlord_down')} + 'landlord_up': InfoSet('landlord_up'), + 'landlord_front': InfoSet('landlord_front'), + 'landlord_down': InfoSet('landlord_down')} - self.bomb_num = 0 + self.bomb_num = [0, 0] self.pos_bomb_num = { "landlord": 0, "landlord_up": 0, + "landlord_front": 0, "landlord_down": 0 } self.last_pid = 'landlord' - self.bid_info = [[-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1]] + self.bid_info = [[-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1]] self.bid_count = 0 self.multiply_count = {'landlord': 0, 'landlord_up': 0, + 'landlord_front': 0, 'landlord_down': 0} self.step_count = 0 @@ -75,9 +90,11 @@ class GameEnv(object): card_play_data['landlord'] self.info_sets['landlord_up'].player_hand_cards = \ card_play_data['landlord_up'] + self.info_sets['landlord_front'].player_hand_cards = \ + card_play_data['landlord_front'] self.info_sets['landlord_down'].player_hand_cards = \ card_play_data['landlord_down'] - self.three_landlord_cards = card_play_data['three_landlord_cards'] + # self.three_landlord_cards = card_play_data['three_landlord_cards'] self.get_acting_player_position() self.game_infoset = self.get_infoset() @@ -85,6 +102,7 @@ class GameEnv(object): def game_done(self): if len(self.info_sets['landlord'].player_hand_cards) == 0 or \ len(self.info_sets['landlord_up'].player_hand_cards) == 0 or \ + len(self.info_sets['landlord_front'].player_hand_cards) == 0 or \ len(self.info_sets['landlord_down'].player_hand_cards) == 0: # if one of the three players discards his hand, # then game is over. @@ -96,21 +114,21 @@ class GameEnv(object): def compute_player_utility(self): if len(self.info_sets['landlord'].player_hand_cards) == 0: - self.player_utility_dict = {'landlord': 2, + self.player_utility_dict = {'landlord': 3, 'farmer': -1} else: - self.player_utility_dict = {'landlord': -2, + self.player_utility_dict = {'landlord': -3, 'farmer': 1} def update_num_wins_scores(self): for pos, utility in self.player_utility_dict.items(): - base_score = 2 if pos == 'landlord' else 1 + base_score = 3 if pos == 'landlord' else 1 if utility > 0: self.num_wins[pos] += 1 self.winner = pos - self.num_scores[pos] += base_score * (2 ** self.bomb_num) + self.num_scores[pos] += base_score * (2 ** self.bomb_num[0]) * (3 ** self.bomb_num[1]) else: - self.num_scores[pos] -= base_score * (2 ** self.bomb_num) + self.num_scores[pos] -= base_score * (2 ** self.bomb_num[0]) * (3 ** self.bomb_num[1]) def get_winner(self): return self.winner @@ -121,12 +139,17 @@ class GameEnv(object): def step(self): action = self.players[self.acting_player_position].act( self.game_infoset) - self.step_count += 1 + assert action in self.game_infoset.legal_actions + if len(action) > 0: self.last_pid = self.acting_player_position - if action in bombs: - self.bomb_num += 1 + if action in bombs[0]: + self.bomb_num[0] += 1 + self.pos_bomb_num[self.acting_player_position] += 1 + + if action in bombs[1]: + self.bomb_num[1] += 1 self.pos_bomb_num[self.acting_player_position] += 1 self.last_move_dict[ @@ -137,15 +160,15 @@ class GameEnv(object): self.played_cards[self.acting_player_position] += action - if self.acting_player_position == 'landlord' and \ - len(action) > 0 and \ - len(self.three_landlord_cards) > 0: - for card in action: - if len(self.three_landlord_cards) > 0: - if card in self.three_landlord_cards: - self.three_landlord_cards.remove(card) - else: - break + # if self.acting_player_position == 'landlord' and \ + # len(action) > 0 and \ + # len(self.three_landlord_cards) > 0: + # for card in action: + # if len(self.three_landlord_cards) > 0: + # if card in self.three_landlord_cards: + # self.three_landlord_cards.remove(card) + # else: + # break self.game_done() if not self.game_over: @@ -156,7 +179,7 @@ class GameEnv(object): def get_last_move(self): last_move = [] if len(self.card_play_action_seq) != 0: - if len(self.card_play_action_seq[-1][1]) == 0: + if len(self.card_play_action_seq[-1]) == 0: last_move = self.card_play_action_seq[-2][1] else: last_move = self.card_play_action_seq[-1][1] @@ -166,7 +189,7 @@ class GameEnv(object): def get_last_two_moves(self): last_two_moves = [[], []] for card in self.card_play_action_seq[-2:]: - last_two_moves.insert(0, card[1]) + last_two_moves.insert(0, card) last_two_moves = last_two_moves[:2] return last_two_moves @@ -179,6 +202,9 @@ class GameEnv(object): self.acting_player_position = 'landlord_down' elif self.acting_player_position == 'landlord_down': + self.acting_player_position = 'landlord_front' + + elif self.acting_player_position == 'landlord_front': self.acting_player_position = 'landlord_up' else: @@ -202,7 +228,10 @@ class GameEnv(object): rival_move = [] if len(action_sequence) != 0: if len(action_sequence[-1][1]) == 0: - rival_move = action_sequence[-2][1] + if len(action_sequence[-2][1]) == 0: + rival_move = action_sequence[-3][1] + else: + rival_move = action_sequence[-2][1] else: rival_move = action_sequence[-1][1] @@ -227,15 +256,36 @@ class GameEnv(object): moves = ms.filter_type_3_triple(all_moves, rival_move) elif rival_move_type == md.TYPE_4_BOMB: - all_moves = mg.gen_type_4_bomb() + mg.gen_type_5_king_bomb() + all_moves = mg.gen_type_4_bomb(4) moves = ms.filter_type_4_bomb(all_moves, rival_move) + all_moves += mg.gen_type_4_bomb(5) + mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb() + + elif rival_move_type == md.TYPE_4_BOMB5: + all_moves = mg.gen_type_4_bomb(5) + moves = ms.filter_type_4_bomb(all_moves, rival_move) + all_moves += mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb() + + elif rival_move_type == md.TYPE_4_BOMB6: + all_moves = mg.gen_type_4_bomb(6) + moves = ms.filter_type_4_bomb(all_moves, rival_move) + all_moves += mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb() + + elif rival_move_type == md.TYPE_4_BOMB7: + all_moves = mg.gen_type_4_bomb(7) + moves = ms.filter_type_4_bomb(all_moves, rival_move) + all_moves += mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb() + + elif rival_move_type == md.TYPE_4_BOMB8: + all_moves = mg.gen_type_4_bomb(8) + moves = ms.filter_type_4_bomb(all_moves, rival_move) + all_moves += mg.gen_type_5_king_bomb() elif rival_move_type == md.TYPE_5_KING_BOMB: moves = [] - elif rival_move_type == md.TYPE_6_3_1: - all_moves = mg.gen_type_6_3_1() - moves = ms.filter_type_6_3_1(all_moves, rival_move) + # elif rival_move_type == md.TYPE_6_3_1: + # all_moves = mg.gen_type_6_3_1() + # moves = ms.filter_type_6_3_1(all_moves, rival_move) elif rival_move_type == md.TYPE_7_3_2: all_moves = mg.gen_type_7_3_2() @@ -253,25 +303,24 @@ class GameEnv(object): all_moves = mg.gen_type_10_serial_triple(repeat_num=rival_move_len) moves = ms.filter_type_10_serial_triple(all_moves, rival_move) - elif rival_move_type == md.TYPE_11_SERIAL_3_1: - all_moves = mg.gen_type_11_serial_3_1(repeat_num=rival_move_len) - moves = ms.filter_type_11_serial_3_1(all_moves, rival_move) + # elif rival_move_type == md.TYPE_11_SERIAL_3_1: + # all_moves = mg.gen_type_11_serial_3_1(repeat_num=rival_move_len) + # moves = ms.filter_type_11_serial_3_1(all_moves, rival_move) elif rival_move_type == md.TYPE_12_SERIAL_3_2: all_moves = mg.gen_type_12_serial_3_2(repeat_num=rival_move_len) moves = ms.filter_type_12_serial_3_2(all_moves, rival_move) - elif rival_move_type == md.TYPE_13_4_2: - all_moves = mg.gen_type_13_4_2() - moves = ms.filter_type_13_4_2(all_moves, rival_move) + # elif rival_move_type == md.TYPE_13_4_2: + # all_moves = mg.gen_type_13_4_2() + # moves = ms.filter_type_13_4_2(all_moves, rival_move) - elif rival_move_type == md.TYPE_14_4_22: - all_moves = mg.gen_type_14_4_22() - moves = ms.filter_type_14_4_22(all_moves, rival_move) + # elif rival_move_type == md.TYPE_14_4_22: + # all_moves = mg.gen_type_14_4_22() + # moves = ms.filter_type_14_4_22(all_moves, rival_move) - if rival_move_type not in [md.TYPE_0_PASS, - md.TYPE_4_BOMB, md.TYPE_5_KING_BOMB]: - moves = moves + mg.gen_type_4_bomb() + mg.gen_type_5_king_bomb() + if rival_move_type != md.TYPE_0_PASS and rival_move_type < md.TYPE_4_BOMB: + moves = moves + mg.gen_type_4_bomb(4) + mg.gen_type_4_bomb(5) + mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb() if len(rival_move) != 0: # rival_move is not 'pass' moves = moves + [[]] @@ -284,7 +333,7 @@ class GameEnv(object): def reset(self): self.card_play_action_seq = [] - self.three_landlord_cards = None + # self.three_landlord_cards = None self.game_over = False self.acting_player_position = None @@ -292,33 +341,40 @@ class GameEnv(object): self.last_move_dict = {'landlord': [], 'landlord_up': [], + 'landlord_front': [], 'landlord_down': []} self.played_cards = {'landlord': [], 'landlord_up': [], + 'landlord_front': [], 'landlord_down': []} self.last_move = [] self.last_two_moves = [] self.info_sets = {'landlord': InfoSet('landlord'), - 'landlord_up': InfoSet('landlord_up'), - 'landlord_down': InfoSet('landlord_down')} + 'landlord_up': InfoSet('landlord_up'), + 'landlord_front': InfoSet('landlord_front'), + 'landlord_down': InfoSet('landlord_down')} - self.bomb_num = 0 + self.bomb_num = [0, 0] self.pos_bomb_num = { "landlord": 0, "landlord_up": 0, + "landlord_front": 0, "landlord_down": 0 } self.last_pid = 'landlord' - self.bid_info = [[-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1]] + + self.bid_info = [[-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1]] self.bid_count = 0 self.multiply_count = {'landlord': 0, 'landlord_up': 0, + 'landlord_front': 0, 'landlord_down': 0} self.step_count = 0 @@ -344,10 +400,10 @@ class GameEnv(object): self.info_sets[self.acting_player_position].num_cards_left_dict = \ {pos: len(self.info_sets[pos].player_hand_cards) - for pos in ['landlord', 'landlord_up', 'landlord_down']} + for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']} self.info_sets[self.acting_player_position].other_hand_cards = [] - for pos in ['landlord', 'landlord_up', 'landlord_down']: + for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']: if pos != self.acting_player_position: self.info_sets[ self.acting_player_position].other_hand_cards += \ @@ -355,15 +411,15 @@ class GameEnv(object): self.info_sets[self.acting_player_position].played_cards = \ self.played_cards - self.info_sets[self.acting_player_position].three_landlord_cards = \ - self.three_landlord_cards + # self.info_sets[self.acting_player_position].three_landlord_cards = \ + # self.three_landlord_cards self.info_sets[self.acting_player_position].card_play_action_seq = \ self.card_play_action_seq self.info_sets[ self.acting_player_position].all_handcards = \ {pos: self.info_sets[pos].player_hand_cards - for pos in ['landlord', 'landlord_up', 'landlord_down']} + for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']} return deepcopy(self.info_sets[self.acting_player_position]) @@ -379,13 +435,13 @@ class InfoSet(object): self.player_position = player_position # The hand cands of the current player. A list. self.player_hand_cards = None - # The number of cards left for each player. It is a dict with str-->int + # The number of cards left for each player. It is a dict with str-->int self.num_cards_left_dict = None # The three landload cards. A list. - self.three_landlord_cards = None + # self.three_landlord_cards = None # The historical moves. It is a list of list self.card_play_action_seq = None - # The union of the hand cards of the other two players for the current player + # The union of the hand cards of the other two players for the current player self.other_hand_cards = None # The legal actions for the current move. It is a list of list self.legal_actions = None @@ -397,18 +453,19 @@ class InfoSet(object): self.last_move_dict = None # The played cands so far. It is a list. self.played_cards = None - # The hand cards of all the players. It is a dict. + # The hand cards of all the players. It is a dict. self.all_handcards = None # Last player position that plays a valid move, i.e., not `pass` self.last_pid = None # The number of bombs played so far self.bomb_num = None - self.bid_info = [[-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1], - [-1, -1, -1]] + self.bid_info = [[-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1], + [-1, -1, -1, -1]] - self.multiply_info = [1, 0, 0] + self.multiply_info = [1, 0, 0, 0] self.player_id = None diff --git a/douzero/env/move_generator.py b/douzero/env/move_generator.py index ecebb6c..65cdafd 100644 --- a/douzero/env/move_generator.py +++ b/douzero/env/move_generator.py @@ -91,17 +91,17 @@ class MovesGener(object): self.triple_cards_moves.append([k, k, k]) return self.triple_cards_moves - def gen_type_4_bomb(self): + def gen_type_4_bomb(self, num = 4): self.bomb_moves = [] for k, v in self.cards_dict.items(): - if v == 4: - self.bomb_moves.append([k, k, k, k]) + if v == num: + self.bomb_moves.append([k] * num) return self.bomb_moves def gen_type_5_king_bomb(self): self.final_bomb_moves = [] - if 20 in self.cards_list and 30 in self.cards_list: - self.final_bomb_moves.append([20, 30]) + if 20 in self.cards_list and self.cards_dict[20] == 2 and 30 in self.cards_list and self.cards_dict[30] == 2: + self.final_bomb_moves.append([20, 20, 30, 30]) return self.final_bomb_moves def gen_type_6_3_1(self): @@ -205,15 +205,19 @@ class MovesGener(object): moves.extend(self.gen_type_1_single()) moves.extend(self.gen_type_2_pair()) moves.extend(self.gen_type_3_triple()) - moves.extend(self.gen_type_4_bomb()) + moves.extend(self.gen_type_4_bomb(4)) + moves.extend(self.gen_type_4_bomb(5)) + moves.extend(self.gen_type_4_bomb(6)) + moves.extend(self.gen_type_4_bomb(7)) + moves.extend(self.gen_type_4_bomb(8)) moves.extend(self.gen_type_5_king_bomb()) - moves.extend(self.gen_type_6_3_1()) + # moves.extend(self.gen_type_6_3_1()) moves.extend(self.gen_type_7_3_2()) moves.extend(self.gen_type_8_serial_single()) moves.extend(self.gen_type_9_serial_pair()) moves.extend(self.gen_type_10_serial_triple()) - moves.extend(self.gen_type_11_serial_3_1()) + # moves.extend(self.gen_type_11_serial_3_1()) moves.extend(self.gen_type_12_serial_3_2()) - moves.extend(self.gen_type_13_4_2()) - moves.extend(self.gen_type_14_4_22()) + # moves.extend(self.gen_type_13_4_2()) + # moves.extend(self.gen_type_14_4_22()) return moves diff --git a/douzero/env/utils.py b/douzero/env/utils.py index c3a2be7..2e83150 100644 --- a/douzero/env/utils.py +++ b/douzero/env/utils.py @@ -10,17 +10,22 @@ TYPE_0_PASS = 0 TYPE_1_SINGLE = 1 TYPE_2_PAIR = 2 TYPE_3_TRIPLE = 3 -TYPE_4_BOMB = 4 -TYPE_5_KING_BOMB = 5 +TYPE_4_BOMB = 44 +TYPE_4_BOMB5 = 45 +TYPE_4_BOMB6 = 46 +TYPE_4_BOMB7 = 47 +TYPE_4_BOMB8 = 48 +TYPE_5_KING_BOMB = 50 +#TYPE_6_3_1 = 6 TYPE_6_3_1 = 6 TYPE_7_3_2 = 7 TYPE_8_SERIAL_SINGLE = 8 TYPE_9_SERIAL_PAIR = 9 TYPE_10_SERIAL_TRIPLE = 10 -TYPE_11_SERIAL_3_1 = 11 +# TYPE_11_SERIAL_3_1 = 11 TYPE_12_SERIAL_3_2 = 12 -TYPE_13_4_2 = 13 -TYPE_14_4_22 = 14 +# TYPE_13_4_2 = 13 +# TYPE_14_4_22 = 14 TYPE_15_WRONG = 15 # betting round action diff --git a/douzero/evaluation/simulation.py b/douzero/evaluation/simulation.py index bac5cbd..1468746 100644 --- a/douzero/evaluation/simulation.py +++ b/douzero/evaluation/simulation.py @@ -10,7 +10,7 @@ import BidModel def load_card_play_models(card_play_model_path_dict): players = {} - for position in ['landlord', 'landlord_up', 'landlord_down']: + for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']: if card_play_model_path_dict[position] == 'rlcard': from .rlcard_agent import RLCardAgent players[position] = RLCardAgent(position) @@ -43,30 +43,34 @@ def mp_simulate(card_play_data_list, card_play_model_path_dict, q, output, bid_o bid_results = [] bid_values = [] bid_info_list = [ - np.array([[-1,-1,-1], - [-1,-1,-1], - [-1,-1,-1], - [-1,-1,-1]]), - np.array([[0,0,0], - [-1,-1,-1], - [-1,-1,-1], - [-1,-1,-1]]), - np.array([[1,0,0], - [-1,-1,-1], - [-1,-1,-1], - [-1,-1,-1]]), - np.array([[0,0,0], - [0,0,0], - [-1,-1,-1], - [-1,-1,-1]]), - np.array([[0,0,1], - [1,0,0], - [-1,-1,-1], - [-1,-1,-1]]), - np.array([[0,1,0], - [0,0,1], - [1,0,0], - [-1,-1,-1]]), + np.array([[-1,-1,-1,-1], + [-1,-1,-1,-1], + [-1,-1,-1,-1], + [-1,-1,-1,-1]]), + np.array([[0,0,0,0], + [-1,-1,-1,-1], + [-1,-1,-1,-1], + [-1,-1,-1,-1]]), + np.array([[1,0,0,0], + [-1,-1,-1,-1], + [-1,-1,-1,-1], + [-1,-1,-1,-1]]), + np.array([[0,0,0,0], + [0,0,0,0], + [-1,-1,-1,-1], + [-1,-1,-1,-1]]), + np.array([[0,0,1,0], + [0,0,0,1], + [-1,-1,-1,-1], + [-1,-1,-1,-1]]), + np.array([[0,1,0,0], + [0,0,1,0], + [0,0,0,1], + [-1,-1,-1,-1]]), + np.array([[0,1,0,0], + [0,0,1,0], + [1,0,0,0], + [-1,-1,-1,-1]]), ] for bid_info in bid_info_list: bid_obs = douzero.env.env._get_obs_for_bid(1, bid_info, card_play_data["landlord"]) @@ -82,6 +86,7 @@ def mp_simulate(card_play_data_list, card_play_model_path_dict, q, output, bid_o print("\nStart ------- " + title) print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord"]])) print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_down"]])) + print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_front"]])) print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_up"]])) # print(card_play_data) count = 0 @@ -127,6 +132,7 @@ def evaluate(landlord, landlord_up, landlord_down, eval_data, num_workers, outpu card_play_model_path_dict = { 'landlord': landlord, 'landlord_up': landlord_up, + 'landlord_front': landlord_up, 'landlord_down': landlord_down} num_landlord_wins = 0 diff --git a/evaluate.py b/evaluate.py index 5a838e4..45ea06e 100644 --- a/evaluate.py +++ b/evaluate.py @@ -10,6 +10,8 @@ if __name__ == '__main__': default='baselines/douzero_12/landlord_weights_39762328900.ckpt') parser.add_argument('--landlord_up', type=str, default='baselines/douzero_12/landlord_up_weights_39762328900.ckpt') + parser.add_argument('--landlord_front', type=str, + default='baselines/douzero_12/landlord_front_weights_39762328900.ckpt') parser.add_argument('--landlord_down', type=str, default='baselines/douzero_12/landlord_down_weights_39762328900.ckpt') parser.add_argument('--eval_data', type=str, @@ -25,7 +27,7 @@ if __name__ == '__main__': if args.output or args.bid: args.num_workers = 1 t = 3 - frame = 3085177900 + frame = 64000 adp_frame = 2511184300 # args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame @@ -44,6 +46,7 @@ if __name__ == '__main__': elif t == 3: args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame + args.landlord_front = 'baselines/resnet_landlord_front_%i.ckpt' % frame args.landlord_down = 'baselines/resnet_landlord_down_%i.ckpt' % frame elif t == 4: args.landlord = 'baselines/douzero_ADP/landlord.ckpt' diff --git a/generate_eval_data.py b/generate_eval_data.py index 6d0ff2a..a4d0afa 100644 --- a/generate_eval_data.py +++ b/generate_eval_data.py @@ -4,9 +4,9 @@ import numpy as np deck = [] for i in range(3, 15): - deck.extend([i for _ in range(4)]) -deck.extend([17 for _ in range(4)]) -deck.extend([20, 30]) + deck.extend([i for _ in range(8)]) +deck.extend([17 for _ in range(8)]) +deck.extend([20, 20, 30, 30]) def get_parser(): parser = argparse.ArgumentParser(description='DouZero: random data generator') @@ -17,10 +17,11 @@ def get_parser(): def generate(): _deck = deck.copy() np.random.shuffle(_deck) - card_play_data = {'landlord': _deck[:20], - 'landlord_up': _deck[20:37], - 'landlord_down': _deck[37:54], - 'three_landlord_cards': _deck[17:20], + card_play_data = {'landlord': _deck[:33], + 'landlord_up': _deck[33:58], + 'landlord_front': _deck[58:83], + 'landlord_down': _deck[83:108], + # 'three_landlord_cards': _deck[25:33], } for key in card_play_data: card_play_data[key].sort()