Convert to 4-player Dou Dizhu

zhiyang7 2021-12-05 12:03:30 +08:00
parent 9c1c56d91d
commit aab93d66c6
10 changed files with 629 additions and 317 deletions

View File

@ -16,7 +16,7 @@ from .file_writer import FileWriter
from .models import Model, OldModel
from .utils import get_batch, log, create_env, create_optimizers, act
mean_episode_return_buf = {p:deque(maxlen=100) for p in ['landlord', 'landlord_up', 'landlord_down', 'bidding']}
mean_episode_return_buf = {p:deque(maxlen=100) for p in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']}
def compute_loss(logits, targets):
loss = ((logits.squeeze(-1) - targets)**2).mean()
@ -27,7 +27,7 @@ def compute_loss_for_bid(outputs, reward):
def learn(position, actor_models, model, batch, optimizer, flags, lock):
"""Performs a learning (optimization) step."""
position_index = {"landlord": 31, "landlord_up": 32, "landlord_down": 33}
position_index = {"landlord": 31, "landlord_up": 32, 'landlord_front': 33, "landlord_down": 34}
print("Learn", position)
if flags.training_device != "cpu":
device = torch.device('cuda:'+str(flags.training_device))
@ -46,7 +46,8 @@ def learn(position, actor_models, model, batch, optimizer, flags, lock):
with lock:
learner_outputs = model(obs_z, obs_x, return_value=True)
if position == "bidding":
pass
loss = compute_loss(learner_outputs['values'], target)
# pass
else:
loss = compute_loss(learner_outputs['values'], target)
stats = {
@ -101,7 +102,7 @@ def train(flags):
# Initialize queues
actor_processes = []
ctx = mp.get_context('spawn')
batch_queues = {"landlord": ctx.SimpleQueue(), "landlord_up": ctx.SimpleQueue(), "landlord_down": ctx.SimpleQueue(), "bidding": ctx.SimpleQueue()}
batch_queues = {"landlord": ctx.SimpleQueue(), "landlord_up": ctx.SimpleQueue(), 'landlord_front': ctx.SimpleQueue(), "landlord_down": ctx.SimpleQueue(), "bidding": ctx.SimpleQueue()}
# Learner model for training
learner_model = Model(device=flags.training_device)
@ -115,20 +116,22 @@ def train(flags):
'loss_landlord',
'mean_episode_return_landlord_up',
'loss_landlord_up',
'mean_episode_return_landlord_front',
'loss_landlord_front',
'mean_episode_return_landlord_down',
'loss_landlord_down',
'mean_episode_return_bidding',
'loss_bidding',
]
frames, stats = 0, {k: 0 for k in stat_keys}
position_frames = {'landlord':0, 'landlord_up':0, 'landlord_down':0, 'bidding': 0}
position_frames = {'landlord':0, 'landlord_up':0, 'landlord_front':0, 'landlord_down':0, 'bidding': 0}
# Load models if any
if flags.load_model and os.path.exists(checkpointpath):
checkpoint_states = torch.load(
checkpointpath, map_location=("cuda:"+str(flags.training_device) if flags.training_device != "cpu" else "cpu")
)
for k in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down']
for k in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down']
learner_model.get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k])
optimizers[k].load_state_dict(checkpoint_states["optimizer_state_dict"][k])
for device in device_iterator:
@ -176,12 +179,12 @@ def train(flags):
threads = []
locks = {}
for device in device_iterator:
locks[device] = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()}
position_locks = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()}
locks[device] = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_front': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()}
position_locks = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_front': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()}
for device in device_iterator:
for i in range(flags.num_threads):
for position in ['landlord', 'landlord_up', 'landlord_down', 'bidding']:
for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']:
thread = threading.Thread(
target=batch_and_learn, name='batch-and-learn-%d' % i, args=(i,device,position,locks[device][position],position_locks[position]))
thread.start()
@ -202,7 +205,7 @@ def train(flags):
}, checkpointpath)
# Save the weights for evaluation purpose
for position in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down']
for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']
model_weights_dir = os.path.expandvars(os.path.expanduser(
'%s/%s/%s' % (flags.savedir, flags.xpid, "general_"+position+'_'+str(frames)+'.ckpt')))
torch.save(learner_model.get_model(position).state_dict(), model_weights_dir)
@ -229,15 +232,17 @@ def train(flags):
fps_avg = np.mean(fps_log)
position_fps = {k:(position_frames[k]-position_start_frames[k])/(end_time-start_time) for k in position_frames}
log.info('After %i (L:%i U:%i D:%i) frames: @ %.1f fps (avg@ %.1f fps) (L:%.1f U:%.1f D:%.1f) Stats:\n%s',
log.info('After %i (L:%i U:%i F:%i D:%i) frames: @ %.1f fps (avg@ %.1f fps) (L:%.1f U:%.1f F:%.1f D:%.1f) Stats:\n%s',
frames,
position_frames['landlord'],
position_frames['landlord_up'],
position_frames['landlord_front'],
position_frames['landlord_down'],
fps,
fps_avg,
position_fps['landlord'],
position_fps['landlord_up'],
position_fps['landlord_front'],
position_fps['landlord_down'],
pprint.pformat(stats))
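
The five position names now recur in every bookkeeping structure of this training loop (queues, locks, frame counters, stat keys, checkpoints). A minimal sketch of how they could be derived from a single list; POSITIONS is a hypothetical constant, not part of this commit:

import threading
import torch.multiprocessing as mp

POSITIONS = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']

ctx = mp.get_context('spawn')
batch_queues = {p: ctx.SimpleQueue() for p in POSITIONS}      # one queue per position
position_locks = {p: threading.Lock() for p in POSITIONS}     # one learner lock per position
position_frames = {p: 0 for p in POSITIONS}                   # per-position frame counters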

View File

@ -12,13 +12,13 @@ import torch.nn.functional as F
class LandlordLstmModel(nn.Module):
def __init__(self):
super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True)
self.dense1 = nn.Linear(373 + 128, 512)
self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512)
self.dense5 = nn.Linear(512, 512)
self.dense6 = nn.Linear(512, 1)
self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(846 + 128, 1024)
self.dense2 = nn.Linear(1024, 1024)
self.dense3 = nn.Linear(1024, 768)
self.dense4 = nn.Linear(768, 512)
self.dense5 = nn.Linear(512, 256)
self.dense6 = nn.Linear(256, 1)
def forward(self, z, x, return_value=False, flags=None):
lstm_out, (h_n, _) = self.lstm(z)
@ -47,13 +47,13 @@ class LandlordLstmModel(nn.Module):
class FarmerLstmModel(nn.Module):
def __init__(self):
super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True)
self.dense1 = nn.Linear(484 + 128, 512)
self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512)
self.dense5 = nn.Linear(512, 512)
self.dense6 = nn.Linear(512, 1)
self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(1178 + 128, 1024)
self.dense2 = nn.Linear(1024, 1024)
self.dense3 = nn.Linear(1024, 768)
self.dense4 = nn.Linear(768, 512)
self.dense5 = nn.Linear(512, 256)
self.dense6 = nn.Linear(256, 1)
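
A quick shape check of the resized LSTM heads, assuming the classes are importable from douzero.dmc.models; the batch size and history length below are arbitrary, and 846 / 1178 are simply the flat feature sizes that dense1 now expects for the landlord and farmer nets:

import torch
from douzero.dmc.models import LandlordLstmModel, FarmerLstmModel  # assumed module path

landlord_net = LandlordLstmModel()
farmer_net = FarmerLstmModel()

z = torch.zeros(2, 5, 432)        # (batch, history rows, 432 features per row)
x_landlord = torch.zeros(2, 846)  # flat features expected by the landlord net
x_farmer = torch.zeros(2, 1178)   # flat features expected by the farmer nets

print(landlord_net(z, x_landlord, return_value=True)['values'].shape)  # expected: torch.Size([2, 1])
print(farmer_net(z, x_farmer, return_value=True)['values'].shape)      # expected: torch.Size([2, 1])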
def forward(self, z, x, return_value=False, flags=None):
lstm_out, (h_n, _) = self.lstm(z)
@ -82,8 +82,8 @@ class FarmerLstmModel(nn.Module):
class LandlordLstmNewModel(nn.Module):
def __init__(self):
super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True)
self.dense1 = nn.Linear(373 + 128, 512)
self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(846 + 128, 512)
self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512)
@ -117,8 +117,8 @@ class LandlordLstmNewModel(nn.Module):
class FarmerLstmNewModel(nn.Module):
def __init__(self):
super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True)
self.dense1 = nn.Linear(484 + 128, 512)
self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(1178 + 128, 512)
self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512)
@ -253,20 +253,22 @@ class GeneralModel(nn.Module):
def __init__(self):
super().__init__()
self.in_planes = 80
#input 1*54*41
#input 1*108*41
self.conv1 = nn.Conv1d(40, 80, kernel_size=(3,),
stride=(2,), padding=1, bias=False) #1*27*80
stride=(2,), padding=1, bias=False) #1*54*80
self.bn1 = nn.BatchNorm1d(80)
self.layer1 = self._make_layer(BasicBlock, 80, 2, stride=2)#1*14*80
self.layer2 = self._make_layer(BasicBlock, 160, 2, stride=2)#1*7*160
self.layer3 = self._make_layer(BasicBlock, 320, 2, stride=2)#1*4*320
self.layer1 = self._make_layer(BasicBlock, 80, 2, stride=2)#1*27*80
self.layer2 = self._make_layer(BasicBlock, 160, 2, stride=2)#1*14*160
self.layer3 = self._make_layer(BasicBlock, 320, 2, stride=2)#1*7*320
self.layer4 = self._make_layer(BasicBlock, 640, 2, stride=2)#1*4*320
# self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.linear1 = nn.Linear(320 * BasicBlock.expansion * 4 + 15 * 4, 1024)
self.linear2 = nn.Linear(1024, 512)
self.linear3 = nn.Linear(512, 256)
self.linear4 = nn.Linear(256, 1)
self.linear1 = nn.Linear(640 * BasicBlock.expansion * 4 + 24 * 4, 2048)
self.linear2 = nn.Linear(2048, 1024)
self.linear3 = nn.Linear(1024, 512)
self.linear4 = nn.Linear(512, 256)
self.linear5 = nn.Linear(256, 1)
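
A rough shape trace of the enlarged backbone, assuming the input z is (batch, 40, 108) as the updated comments suggest and that BasicBlock.expansion == 1:

# conv1, stride 2:  108 -> 54   (40 -> 80 channels)
# layer1, stride 2:  54 -> 27
# layer2, stride 2:  27 -> 14   (80 -> 160 channels)
# layer3, stride 2:  14 -> 7    (160 -> 320 channels)
# layer4, stride 2:   7 -> 4    (320 -> 640 channels)
# flatten(1, 2):     640 * 4 = 2560 features
# torch.cat([x, x, x, x, out]) then adds a 24-dim x four times: 24 * 4 = 96
assert 640 * 1 * 4 + 24 * 4 == 2656   # in_features of linear1 above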
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
@ -281,12 +283,14 @@ class GeneralModel(nn.Module):
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = self.layer4(out)
out = out.flatten(1,2)
out = torch.cat([x,x,x,x,out], dim=-1)
out = F.leaky_relu_(self.linear1(out))
out = F.leaky_relu_(self.linear2(out))
out = F.leaky_relu_(self.linear3(out))
out = F.leaky_relu_(self.linear4(out))
out = F.leaky_relu_(self.linear5(out))
if return_value:
return dict(values=out)
else:
@ -304,7 +308,7 @@ class BidModel(nn.Module):
def __init__(self):
super().__init__()
self.dense1 = nn.Linear(114, 512)
self.dense1 = nn.Linear(208, 512)
self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512)
@ -342,15 +346,18 @@ class BidModel(nn.Module):
model_dict = {}
model_dict['landlord'] = LandlordLstmModel
model_dict['landlord_up'] = FarmerLstmModel
model_dict['landlord_front'] = FarmerLstmModel
model_dict['landlord_down'] = FarmerLstmModel
model_dict_new = {}
model_dict_new['landlord'] = GeneralModel
model_dict_new['landlord_up'] = GeneralModel
model_dict_new['landlord_front'] = GeneralModel
model_dict_new['landlord_down'] = GeneralModel
model_dict_new['bidding'] = BidModel
model_dict_lstm = {}
model_dict_lstm['landlord'] = GeneralModel
model_dict_lstm['landlord_up'] = GeneralModel
model_dict_lstm['landlord_front'] = GeneralModel
model_dict_lstm['landlord_down'] = GeneralModel
class General_Model:
@ -365,6 +372,7 @@ class General_Model:
# model = GeneralModel().to(torch.device(device))
self.models['landlord'] = GeneralModel1().to(torch.device(device))
self.models['landlord_up'] = GeneralModel1().to(torch.device(device))
self.models['landlord_front'] = GeneralModel1().to(torch.device(device))
self.models['landlord_down'] = GeneralModel1().to(torch.device(device))
self.models['bidding'] = BidModel().to(torch.device(device))
@ -375,12 +383,14 @@ class General_Model:
def share_memory(self):
self.models['landlord'].share_memory()
self.models['landlord_up'].share_memory()
self.models['landlord_front'].share_memory()
self.models['landlord_down'].share_memory()
self.models['bidding'].share_memory()
def eval(self):
self.models['landlord'].eval()
self.models['landlord_up'].eval()
self.models['landlord_front'].eval()
self.models['landlord_down'].eval()
self.models['bidding'].eval()
@ -404,6 +414,7 @@ class OldModel:
device = 'cuda:' + str(device)
self.models['landlord'] = LandlordLstmModel().to(torch.device(device))
self.models['landlord_up'] = FarmerLstmModel().to(torch.device(device))
self.models['landlord_front'] = FarmerLstmModel().to(torch.device(device))
self.models['landlord_down'] = FarmerLstmModel().to(torch.device(device))
def forward(self, position, z, x, training=False, flags=None):
@ -413,11 +424,13 @@ class OldModel:
def share_memory(self):
self.models['landlord'].share_memory()
self.models['landlord_up'].share_memory()
self.models['landlord_front'].share_memory()
self.models['landlord_down'].share_memory()
def eval(self):
self.models['landlord'].eval()
self.models['landlord_up'].eval()
self.models['landlord_front'].eval()
self.models['landlord_down'].eval()
def parameters(self, position):
@ -442,6 +455,7 @@ class Model:
# model = GeneralModel().to(torch.device(device))
self.models['landlord'] = GeneralModel().to(torch.device(device))
self.models['landlord_up'] = GeneralModel().to(torch.device(device))
self.models['landlord_front'] = GeneralModel().to(torch.device(device))
self.models['landlord_down'] = GeneralModel().to(torch.device(device))
self.models['bidding'] = BidModel().to(torch.device(device))
@ -452,12 +466,14 @@ class Model:
def share_memory(self):
self.models['landlord'].share_memory()
self.models['landlord_up'].share_memory()
self.models['landlord_front'].share_memory()
self.models['landlord_down'].share_memory()
self.models['bidding'].share_memory()
def eval(self):
self.models['landlord'].eval()
self.models['landlord_up'].eval()
self.models['landlord_front'].eval()
self.models['landlord_down'].eval()
self.models['bidding'].eval()
@ -470,11 +486,3 @@ class Model:
def get_models(self):
return self.models
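
A minimal usage sketch of the extended Model wrapper on CPU, assuming the class is importable from douzero.dmc.models; it only verifies that the new landlord_front seat gets its own network:

from douzero.dmc.models import Model  # assumed module path

model = Model(device="cpu")
model.eval()
front_net = model.get_model('landlord_front')
print(type(front_net).__name__)    # expected: GeneralModel
print(sorted(model.get_models()))  # expected: ['bidding', 'landlord', 'landlord_down', 'landlord_front', 'landlord_up']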

View File

@ -16,11 +16,15 @@ from douzero.env import Env
Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7,
11: 8, 12: 9, 13: 10, 14: 11, 17: 12}
NumOnes2Array = {0: np.array([0, 0, 0, 0]),
1: np.array([1, 0, 0, 0]),
2: np.array([1, 1, 0, 0]),
3: np.array([1, 1, 1, 0]),
4: np.array([1, 1, 1, 1])}
NumOnes2Array = {0: np.array([0, 0, 0, 0, 0, 0, 0, 0]),
1: np.array([1, 0, 0, 0, 0, 0, 0, 0]),
2: np.array([1, 1, 0, 0, 0, 0, 0, 0]),
3: np.array([1, 1, 1, 0, 0, 0, 0, 0]),
4: np.array([1, 1, 1, 1, 0, 0, 0, 0]),
5: np.array([1, 1, 1, 1, 1, 0, 0, 0]),
6: np.array([1, 1, 1, 1, 1, 1, 0, 0]),
7: np.array([1, 1, 1, 1, 1, 1, 1, 0]),
8: np.array([1, 1, 1, 1, 1, 1, 1, 1])}
shandle = logging.StreamHandler()
shandle.setFormatter(
@ -60,7 +64,7 @@ def create_optimizers(flags, learner_model):
"""
Create one optimizer for each position
"""
positions = ['landlord', 'landlord_up', 'landlord_down', 'bidding']
positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']
optimizers = {}
for position in positions:
optimizer = RAdam(
@ -72,7 +76,7 @@ def create_optimizers(flags, learner_model):
def act(i, device, batch_queues, model, flags):
positions = ['landlord', 'landlord_up', 'landlord_down', 'bidding']
positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']
for pos in positions:
model.models[pos].to(torch.device(device if device == "cpu" else ("cuda:"+str(device))))
try:
@ -90,9 +94,9 @@ def act(i, device, batch_queues, model, flags):
type_buf = {p: [] for p in positions}
obs_x_batch_buf = {p: [] for p in positions}
position_index = {"landlord": 31, "landlord_up": 32, "landlord_down": 33}
bid_type_index = {"landlord": 41, "landlord_up": 42, "landlord_down": 43}
bid_type_map = {41: "landlord", 42: "landlord_up", 43: "landlord_down"}
position_index = {"landlord": 31, "landlord_up": 32, "landlord_front": 33, "landlord_down": 34}
bid_type_index = {"landlord": 41, "landlord_up": 42, "landlord_front": 43, "landlord_down": 44}
bid_type_map = {41: "landlord", 42: "landlord_up", 43: "landlord_front", 44: "landlord_down"}
position, obs, env_output = env.initial(model, device, flags=flags)
bid_obs_buffer = env_output["begin_buf"]["bid_obs_buffer"]
@ -149,7 +153,7 @@ def act(i, device, batch_queues, model, flags):
target_buf[p].append(episode_return)
break
for p in positions:
if size[p] > T:
while size[p] > T:
# print(p, "epr", torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in episode_return_buf[p][:T]]),)
batch_queues[p].put({
"done": torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in done_buf[p][:T]]),
@ -182,18 +186,22 @@ def _cards2tensor(list_cards):
See Figure 2 in https://arxiv.org/pdf/2106.06135.pdf
"""
if len(list_cards) == 0:
return torch.zeros(54, dtype=torch.int8)
return torch.zeros(108, dtype=torch.int8)
matrix = np.zeros([4, 13], dtype=np.int8)
jokers = np.zeros(2, dtype=np.int8)
matrix = np.zeros([8, 13], dtype=np.int8)
jokers = np.zeros(4, dtype=np.int8)
counter = Counter(list_cards)
for card, num_times in counter.items():
if card < 20:
matrix[:, Card2Column[card]] = NumOnes2Array[num_times]
elif card == 20:
jokers[0] = 1
if num_times == 2:
jokers[1] = 1
elif card == 30:
jokers[1] = 1
jokers[2] = 1
if num_times == 2:
jokers[3] = 1
matrix = np.concatenate((matrix.flatten('F'), jokers))
matrix = torch.from_numpy(matrix)
return matrix
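
A short example of the widened encoding, assuming _cards2tensor is imported from douzero.dmc.utils: with two packs, every rank column now has 8 slots and the four jokers take the 4 trailing slots, giving a 108-dim int8 tensor.

from douzero.dmc.utils import _cards2tensor  # assumed module path

t = _cards2tensor([3, 3, 3, 3, 3, 3, 20, 20, 30])
print(t.shape)          # expected: torch.Size([108])
print(t[:8].tolist())   # six copies of rank 3 -> [1, 1, 1, 1, 1, 1, 0, 0]
print(t[-4:].tolist())  # both black jokers and one red joker -> [1, 1, 1, 0]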

487
douzero/env/env.py vendored
View File

@ -11,17 +11,22 @@ env_url = "http://od.vcccz.com/hechuan/env.py"
Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7,
11: 8, 12: 9, 13: 10, 14: 11, 17: 12}
NumOnes2Array = {0: np.array([0, 0, 0, 0]),
1: np.array([1, 0, 0, 0]),
2: np.array([1, 1, 0, 0]),
3: np.array([1, 1, 1, 0]),
4: np.array([1, 1, 1, 1])}
NumOnes2Array = {0: np.array([0, 0, 0, 0, 0, 0, 0, 0]),
1: np.array([1, 0, 0, 0, 0, 0, 0, 0]),
2: np.array([1, 1, 0, 0, 0, 0, 0, 0]),
3: np.array([1, 1, 1, 0, 0, 0, 0, 0]),
4: np.array([1, 1, 1, 1, 0, 0, 0, 0]),
5: np.array([1, 1, 1, 1, 1, 0, 0, 0]),
6: np.array([1, 1, 1, 1, 1, 1, 0, 0]),
7: np.array([1, 1, 1, 1, 1, 1, 1, 0]),
8: np.array([1, 1, 1, 1, 1, 1, 1, 1])}
deck = []
for i in range(3, 15):
deck.extend([i for _ in range(4)])
deck.extend([17 for _ in range(4)])
deck.extend([20, 30])
deck.extend([i for _ in range(8)])
deck.extend([17 for _ in range(8)])
deck.extend([20, 20, 30, 30])
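
With two packs the deck should now hold 108 cards: 8 copies of each of the 13 normal ranks plus 2 of each joker, dealt as 33 to the landlord (25 plus the 8 bottom cards) and 25 to each of the three farmers. A quick standalone check mirroring the construction above:

from collections import Counter

deck = []
for i in range(3, 15):
    deck.extend([i for _ in range(8)])
deck.extend([17 for _ in range(8)])
deck.extend([20, 20, 30, 30])

counts = Counter(deck)
assert len(deck) == 108
assert all(counts[rank] == 8 for rank in list(range(3, 15)) + [17])
assert counts[20] == 2 and counts[30] == 2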
class Env:
@ -46,7 +51,7 @@ class Env:
# Initialize players
# We use four dummy players, one for each position
self.players = {}
for position in ['landlord', 'landlord_up', 'landlord_down']:
for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
self.players[position] = DummyAgent(position)
# Initialize the internal environment
@ -67,10 +72,11 @@ class Env:
if model is None:
_deck = deck.copy()
np.random.shuffle(_deck)
card_play_data = {'landlord': _deck[:20],
'landlord_up': _deck[20:37],
'landlord_down': _deck[37:54],
'three_landlord_cards': _deck[17:20],
card_play_data = {'landlord': _deck[:33],
'landlord_up': _deck[33:58],
'landlord_front': _deck[58:83],
'landlord_down': _deck[83:108],
# 'three_landlord_cards': _deck[17:20],
}
for key in card_play_data:
card_play_data[key].sort()
@ -97,18 +103,20 @@ class Env:
_deck = deck.copy()
np.random.shuffle(_deck)
card_play_data = [
_deck[:17],
_deck[17:34],
_deck[34:51],
_deck[:25],
_deck[25:50],
_deck[50:75],
_deck[75:100],
]
for i in range(3):
for i in range(4):
card_play_data[i].sort()
landlord_cards = _deck[51:54]
landlord_cards = _deck[100:108]
landlord_cards.sort()
bid_info = np.array([[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]])
bid_info = np.array([[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]])
bidding_player = random.randint(0, 3)
# bidding_player = 0 # debug
first_bid = -1
@ -116,7 +124,7 @@ class Env:
bid_count = 0
if bid_limit <= 0:
force_bid = True
for r in range(3):
for r in range(4):
bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player])
with torch.no_grad():
action = model.forward("bidding", torch.tensor(bidding_obs["z_batch"], device=device),
@ -137,19 +145,19 @@ class Env:
bid_count += 1
if first_bid == -1:
first_bid = bidding_player
for p in range(3):
for p in range(4):
if p == bidding_player:
bid_info[r][p] = 1
else:
bid_info[r][p] = 0
else:
bid_info[r] = [0, 0, 0]
bidding_player = (bidding_player + 1) % 3
bid_info[r] = [0, 0, 0, 0]
bidding_player = (bidding_player + 1) % 4
one_count = np.count_nonzero(bid_info == 1)
if one_count == 0:
continue
elif one_count > 1:
r = 3
r = 4
bidding_player = first_bid
bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player])
with torch.no_grad():
@ -163,7 +171,7 @@ class Env:
if action["action"] == 1:
last_bid = bidding_player
bid_count += 1
for p in range(3):
for p in range(4):
if p == bidding_player:
bid_info[r][p] = 1
else:
@ -171,20 +179,23 @@ class Env:
break
card_play_data[last_bid].extend(landlord_cards)
card_play_data = {'landlord': card_play_data[last_bid],
'landlord_up': card_play_data[(last_bid - 1) % 3],
'landlord_down': card_play_data[(last_bid + 1) % 3],
'three_landlord_cards': landlord_cards,
'landlord_up': card_play_data[(last_bid - 1) % 4],
'landlord_down': card_play_data[(last_bid + 1) % 4],
'landlord_front': card_play_data[(last_bid + 2) % 4],
# 'three_landlord_cards': landlord_cards,
}
card_play_data["landlord"].sort()
player_ids = {
'landlord': last_bid,
'landlord_up': (last_bid - 1) % 3,
'landlord_down': (last_bid + 1) % 3,
'landlord_up': (last_bid - 1) % 4,
'landlord_down': (last_bid + 1) % 4,
'landlord_front': (last_bid + 2) % 4,
}
player_positions = {
last_bid: 'landlord',
(last_bid - 1) % 3: 'landlord_up',
(last_bid + 1) % 3: 'landlord_down'
(last_bid - 1) % 4: 'landlord_up',
(last_bid + 1) % 4: 'landlord_down',
(last_bid + 2) % 4: 'landlord_front',
}
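
With four seats the turn order is landlord -> landlord_down -> landlord_front -> landlord_up (see get_acting_player_position in game.py), so relative to the winning bidder the next seat is landlord_down, the seat across is landlord_front, and the previous seat is landlord_up. A worked example with a hypothetical last_bid of 2:

last_bid = 2                                   # hypothetical winning bidder's seat
player_ids = {
    'landlord':       last_bid,                # seat 2
    'landlord_down':  (last_bid + 1) % 4,      # seat 3, acts right after the landlord
    'landlord_front': (last_bid + 2) % 4,      # seat 0, sits across from the landlord
    'landlord_up':    (last_bid - 1) % 4,      # seat 1, acts right before the landlord
}
print(player_ids)  # {'landlord': 2, 'landlord_down': 3, 'landlord_front': 0, 'landlord_up': 1}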
for bid_obs in bid_obs_buffer:
bid_obs.update({"position": player_positions[bid_obs["pid"]]})
@ -192,14 +203,15 @@ class Env:
# Initialize the cards
self._env.card_play_init(card_play_data)
multiply_map = [
np.array([1, 0, 0]),
np.array([0, 1, 0]),
np.array([0, 0, 1])
np.array([1, 0, 0, 0]),
np.array([0, 1, 0, 0]),
np.array([0, 0, 1, 0]),
np.array([0, 0, 0, 1])
]
for pos in ["landlord", "landlord_up", "landlord_down"]:
for pos in ["landlord", "landlord_up", "landlord_front", "landlord_down"]:
pid = player_ids[pos]
self._env.info_sets[pos].player_id = pid
self._env.info_sets[pos].bid_info = bid_info[:, [(pid - 1) % 3, pid, (pid + 1) % 3]]
self._env.info_sets[pos].bid_info = bid_info[:, [(pid - 1) % 4, pid, (pid + 1) % 4, (pid + 2) % 4]]
self._env.bid_count = bid_count
# multiply_obs = _get_obs_for_multiply(pos, self._env.info_sets[pos].bid_info, card_play_data[pos],
# landlord_cards)
@ -245,11 +257,13 @@ class Env:
"play": {
"landlord": self._get_reward("landlord"),
"landlord_up": self._get_reward("landlord_up"),
"landlord_front": self._get_reward("landlord_front"),
"landlord_down": self._get_reward("landlord_down")
},
"bid": {
"landlord": self._get_reward_bidding("landlord")*2,
"landlord": self._get_reward_bidding("landlord")*3,
"landlord_up": self._get_reward_bidding("landlord_up"),
"landlord_front": self._get_reward_bidding("landlord_front"),
"landlord_down": self._get_reward_bidding("landlord_down")
}
}
@ -269,16 +283,19 @@ class Env:
self_bomb_num = self._env.pos_bomb_num[pos]
if winner == 'landlord':
if self.objective == 'adp':
return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num +self._env.multiply_count[pos]) /8
return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] + self._env.multiply_count[pos]) /8
return (2.0 ** bomb_num[0]) * (3.0 ** bomb_num[1])
elif self.objective == 'logadp':
return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4
return bomb_num[0] + bomb_num[1] + 1.0
else:
return 1.0 - self._env.step_count * 0.0033
return 1.0
else:
if self.objective == 'adp':
return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num +self._env.multiply_count[pos]) /8
return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] +self._env.multiply_count[pos]) /8
elif self.objective == 'logadp':
return (-1.0 + self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4
return (-1.0 + self._env.step_count * 0.0033) * 1.3**(self_bomb_num) * 2**self._env.multiply_count[pos] / 4
else:
return -1.0 + self._env.step_count * 0.0033
@ -371,12 +388,12 @@ def get_obs(infoset, use_general=True):
This function obtains observations with imperfect information
from the infoset. It has separate branches since we encode
different features for different positions.
This function will return a dictionary named `obs`. It contains
several fields that will be used to train the model.
One can play with those features to improve the performance.
`position` is a string that can be landlord/landlord_down/landlord_up
`position` is a string that can be landlord/landlord_down/landlord_front/landlord_up
`x_batch` is a batch of features (excluding the historical moves).
It also encodes the action feature
@ -391,7 +408,7 @@ def get_obs(infoset, use_general=True):
`z`: same as z_batch but not a batch.
"""
if use_general:
if infoset.player_position not in ["landlord", "landlord_up", "landlord_down"]:
if infoset.player_position not in ["landlord", "landlord_up", "landlord_front", "landlord_down"]:
raise ValueError('')
return _get_obs_general(infoset, infoset.player_position)
else:
@ -399,6 +416,8 @@ def get_obs(infoset, use_general=True):
return _get_obs_landlord(infoset)
elif infoset.player_position == 'landlord_up':
return _get_obs_landlord_up(infoset)
elif infoset.player_position == 'landlord_front':
return _get_obs_landlord_front(infoset)
elif infoset.player_position == 'landlord_down':
return _get_obs_landlord_down(infoset)
else:
@ -424,18 +443,22 @@ def _cards2array(list_cards):
the representations.
"""
if len(list_cards) == 0:
return np.zeros(54, dtype=np.int8)
return np.zeros(108, dtype=np.int8)
matrix = np.zeros([4, 13], dtype=np.int8)
jokers = np.zeros(2, dtype=np.int8)
matrix = np.zeros([8, 13], dtype=np.int8)
jokers = np.zeros(4, dtype=np.int8)
counter = Counter(list_cards)
for card, num_times in counter.items():
if card < 20:
matrix[:, Card2Column[card]] = NumOnes2Array[num_times]
elif card == 20:
jokers[0] = 1
if num_times == 2:
jokers[1] = 1
elif card == 30:
jokers[1] = 1
jokers[2] = 1
if num_times == 2:
jokers[3] = 1
return np.concatenate((matrix.flatten('F'), jokers))
@ -449,7 +472,7 @@ def _cards2array(list_cards):
# Finally, we obtain a 5x162 matrix, which will be fed
# into LSTM for encoding.
# """
# action_seq_array = np.zeros((len(action_seq_list), 54))
# action_seq_array = np.zeros((len(action_seq_list), 108))
# for row, list_cards in enumerate(action_seq_list):
# action_seq_array[row, :] = _cards2array(list_cards)
# # action_seq_array = action_seq_array.reshape(5, 162)
@ -458,26 +481,26 @@ def _cards2array(list_cards):
def _action_seq_list2array(action_seq_list, new_model=True):
"""
A utility function to encode the historical moves.
We encode the historical 15 actions. If there is
no 15 actions, we pad the features with 0. Since
We encode the historical 20 actions. If there are
fewer than 20 actions, we pad the features with 0. Since
four moves form a round in this 4-player variant, we concatenate
the representations for each consecutive four moves.
Finally, we obtain a 5x162 matrix, which will be fed
Finally, we obtain a 5x432 matrix, which will be fed
into LSTM for encoding.
"""
if new_model:
position_map = {"landlord": 0, "landlord_up": 1, "landlord_down": 2}
action_seq_array = np.ones((len(action_seq_list), 54)) * -1 # Default Value -1 for not using area
# position_map = {"landlord": 0, "landlord_up": 1, "landlord_front": 2, "landlord_down": 3}
action_seq_array = np.ones((len(action_seq_list), 108)) * -1 # Default Value -1 for not using area
for row, list_cards in enumerate(action_seq_list):
if list_cards != []:
action_seq_array[row, :54] = _cards2array(list_cards[1])
action_seq_array[row, :108] = _cards2array(list_cards[1])
else:
action_seq_array = np.zeros((len(action_seq_list), 54))
action_seq_array = np.zeros((len(action_seq_list), 108))
for row, list_cards in enumerate(action_seq_list):
if list_cards != []:
action_seq_array[row, :] = _cards2array(list_cards[1])
action_seq_array = action_seq_array.reshape(5, 162)
action_seq_array = action_seq_array.reshape(5, 432)
return action_seq_array
# action_seq_array = np.zeros((len(action_seq_list), 54))
@ -487,10 +510,10 @@ def _action_seq_list2array(action_seq_list, new_model=True):
# return action_seq_array
def _process_action_seq(sequence, length=15, new_model=True):
def _process_action_seq(sequence, length=20, new_model=True):
"""
A utility function encoding historical moves. We
encode 15 moves. If there is no 15 moves, we pad
encode 20 moves. If there are fewer than 20 moves, we pad
with zeros.
"""
sequence = sequence[-length:].copy()
@ -508,8 +531,8 @@ def _get_one_hot_bomb(bomb_num):
A utility function to encode the number of bombs
into one-hot representation.
"""
one_hot = np.zeros(15)
one_hot[bomb_num] = 1
one_hot = np.zeros(29)
one_hot[bomb_num[0] + bomb_num[1]] = 1
return one_hot
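
bomb_num is now a two-element list (see game.py below): index 0 counts the 6/7-card bombs and index 1 the 8-card and double-king bombs, and the one-hot vector encodes their sum over 29 slots. A small illustration with hypothetical counts:

import numpy as np

bomb_num = [2, 1]              # hypothetical: two 6/7-card bombs, one 8-card or king bomb
one_hot = np.zeros(29)
one_hot[bomb_num[0] + bomb_num[1]] = 1
print(int(one_hot.argmax()))   # 3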
@ -536,13 +559,19 @@ def _get_obs_landlord(infoset):
my_action_batch[j, :] = _cards2array(action)
landlord_up_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_up'], 17)
infoset.num_cards_left_dict['landlord_up'], 25)
landlord_up_num_cards_left_batch = np.repeat(
landlord_up_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
landlord_front_num_cards_left_batch = np.repeat(
landlord_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 17)
infoset.num_cards_left_dict['landlord_down'], 25)
landlord_down_num_cards_left_batch = np.repeat(
landlord_down_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
@ -553,6 +582,12 @@ def _get_obs_landlord(infoset):
landlord_up_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
landlord_front_played_cards_batch = np.repeat(
landlord_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_played_cards = _cards2array(
infoset.played_cards['landlord_down'])
landlord_down_played_cards_batch = np.repeat(
@ -569,8 +604,10 @@ def _get_obs_landlord(infoset):
other_handcards_batch,
last_action_batch,
landlord_up_played_cards_batch,
landlord_front_played_cards_batch,
landlord_down_played_cards_batch,
landlord_up_num_cards_left_batch,
landlord_front_num_cards_left_batch,
landlord_down_num_cards_left_batch,
bomb_num_batch,
my_action_batch))
@ -578,23 +615,25 @@ def _get_obs_landlord(infoset):
other_handcards,
last_action,
landlord_up_played_cards,
landlord_front_played_cards,
landlord_down_played_cards,
landlord_up_num_cards_left,
landlord_front_num_cards_left,
landlord_down_num_cards_left,
bomb_num))
z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False)
infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat(
z[np.newaxis, :, :],
num_legal_actions, axis=0)
obs = {
'position': 'landlord',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
'position': 'landlord',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
return obs
def _get_obs_landlord_up(infoset):
@ -625,7 +664,7 @@ def _get_obs_landlord_up(infoset):
last_landlord_action[np.newaxis, :],
num_legal_actions, axis=0)
landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 20)
infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
@ -642,7 +681,7 @@ def _get_obs_landlord_up(infoset):
last_teammate_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 17)
infoset.num_cards_left_dict['landlord_down'], 25)
teammate_num_cards_left_batch = np.repeat(
teammate_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
@ -653,6 +692,144 @@ def _get_obs_landlord_up(infoset):
teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
last_teammate_front_action = _cards2array(
infoset.last_move_dict['landlord_front'])
last_teammate_front_action_batch = np.repeat(
last_teammate_front_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
teammate_front_num_cards_left_batch = np.repeat(
teammate_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
teammate_front_played_cards_batch = np.repeat(
teammate_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
bomb_num = _get_one_hot_bomb(
infoset.bomb_num)
bomb_num_batch = np.repeat(
bomb_num[np.newaxis, :],
num_legal_actions, axis=0)
x_batch = np.hstack((my_handcards_batch,
other_handcards_batch,
landlord_played_cards_batch,
teammate_played_cards_batch,
teammate_front_played_cards_batch,
last_action_batch,
last_landlord_action_batch,
last_teammate_action_batch,
last_teammate_front_action_batch,
landlord_num_cards_left_batch,
teammate_num_cards_left_batch,
teammate_front_num_cards_left_batch,
bomb_num_batch,
my_action_batch))
x_no_action = np.hstack((my_handcards,
other_handcards,
landlord_played_cards,
teammate_played_cards,
teammate_front_played_cards,
last_action,
last_landlord_action,
last_teammate_action,
last_teammate_front_action,
landlord_num_cards_left,
teammate_num_cards_left,
teammate_front_num_cards_left,
bomb_num))
z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat(
z[np.newaxis, :, :],
num_legal_actions, axis=0)
obs = {
'position': 'landlord_up',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
return obs
def _get_obs_landlord_front(infoset):
"""
Obtain the landlord_front features. See Table 5 in
https://arxiv.org/pdf/2106.06135.pdf
"""
num_legal_actions = len(infoset.legal_actions)
my_handcards = _cards2array(infoset.player_hand_cards)
my_handcards_batch = np.repeat(my_handcards[np.newaxis, :],
num_legal_actions, axis=0)
other_handcards = _cards2array(infoset.other_hand_cards)
other_handcards_batch = np.repeat(other_handcards[np.newaxis, :],
num_legal_actions, axis=0)
last_action = _cards2array(infoset.last_move)
last_action_batch = np.repeat(last_action[np.newaxis, :],
num_legal_actions, axis=0)
my_action_batch = np.zeros(my_handcards_batch.shape)
for j, action in enumerate(infoset.legal_actions):
my_action_batch[j, :] = _cards2array(action)
last_landlord_action = _cards2array(
infoset.last_move_dict['landlord'])
last_landlord_action_batch = np.repeat(
last_landlord_action[np.newaxis, :],
num_legal_actions, axis=0)
landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_played_cards = _cards2array(
infoset.played_cards['landlord'])
landlord_played_cards_batch = np.repeat(
landlord_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
last_teammate_action = _cards2array(
infoset.last_move_dict['landlord_down'])
last_teammate_action_batch = np.repeat(
last_teammate_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 25)
teammate_num_cards_left_batch = np.repeat(
teammate_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_played_cards = _cards2array(
infoset.played_cards['landlord_down'])
teammate_played_cards_batch = np.repeat(
teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
last_teammate_front_action = _cards2array(
infoset.last_move_dict['landlord_front'])
last_teammate_front_action_batch = np.repeat(
last_teammate_front_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
teammate_front_num_cards_left_batch = np.repeat(
teammate_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
teammate_front_played_cards_batch = np.repeat(
teammate_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
bomb_num = _get_one_hot_bomb(
infoset.bomb_num)
bomb_num_batch = np.repeat(
@ -663,36 +840,42 @@ def _get_obs_landlord_up(infoset):
other_handcards_batch,
landlord_played_cards_batch,
teammate_played_cards_batch,
teammate_front_played_cards_batch,
last_action_batch,
last_landlord_action_batch,
last_teammate_action_batch,
last_teammate_front_action_batch,
landlord_num_cards_left_batch,
teammate_num_cards_left_batch,
teammate_front_num_cards_left_batch,
bomb_num_batch,
my_action_batch))
x_no_action = np.hstack((my_handcards,
other_handcards,
landlord_played_cards,
teammate_played_cards,
teammate_front_played_cards,
last_action,
last_landlord_action,
last_teammate_action,
last_teammate_front_action,
landlord_num_cards_left,
teammate_num_cards_left,
teammate_front_num_cards_left,
bomb_num))
z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False)
infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat(
z[np.newaxis, :, :],
num_legal_actions, axis=0)
obs = {
'position': 'landlord_up',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
'position': 'landlord_front',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
return obs
def _get_obs_landlord_down(infoset):
@ -723,7 +906,7 @@ def _get_obs_landlord_down(infoset):
last_landlord_action[np.newaxis, :],
num_legal_actions, axis=0)
landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 20)
infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
@ -740,7 +923,7 @@ def _get_obs_landlord_down(infoset):
last_teammate_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_up'], 17)
infoset.num_cards_left_dict['landlord_up'], 25)
teammate_num_cards_left_batch = np.repeat(
teammate_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
@ -751,10 +934,21 @@ def _get_obs_landlord_down(infoset):
teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_played_cards = _cards2array(
infoset.played_cards['landlord'])
landlord_played_cards_batch = np.repeat(
landlord_played_cards[np.newaxis, :],
last_teammate_front_action = _cards2array(
infoset.last_move_dict['landlord_front'])
last_teammate_front_action_batch = np.repeat(
last_teammate_front_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
teammate_front_num_cards_left_batch = np.repeat(
teammate_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
teammate_front_played_cards_batch = np.repeat(
teammate_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
bomb_num = _get_one_hot_bomb(
@ -767,36 +961,42 @@ def _get_obs_landlord_down(infoset):
other_handcards_batch,
landlord_played_cards_batch,
teammate_played_cards_batch,
teammate_front_played_cards_batch,
last_action_batch,
last_landlord_action_batch,
last_teammate_action_batch,
last_teammate_front_action_batch,
landlord_num_cards_left_batch,
teammate_num_cards_left_batch,
teammate_front_num_cards_left_batch,
bomb_num_batch,
my_action_batch))
x_no_action = np.hstack((my_handcards,
other_handcards,
landlord_played_cards,
teammate_played_cards,
teammate_front_played_cards,
last_action,
last_landlord_action,
last_teammate_action,
last_teammate_front_action,
landlord_num_cards_left,
teammate_num_cards_left,
teammate_front_num_cards_left,
bomb_num))
z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False)
infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat(
z[np.newaxis, :, :],
num_legal_actions, axis=0)
obs = {
'position': 'landlord_down',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
'position': 'landlord_down',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
return obs
def _get_obs_landlord_withbid(infoset):
@ -869,7 +1069,7 @@ def _get_obs_landlord_withbid(infoset):
landlord_down_num_cards_left,
bomb_num))
z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False)
infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat(
z[np.newaxis, :, :],
num_legal_actions, axis=0)
@ -970,21 +1170,21 @@ def _get_obs_general1(infoset, position):
bomb_num[np.newaxis, :],
num_legal_actions, axis=0)
x_batch = np.hstack((position_info_batch, # 3
my_handcards_batch, # 54
other_handcards_batch, # 54
three_landlord_cards_batch, # 54
last_action_batch, # 54
landlord_played_cards_batch, # 54
landlord_up_played_cards_batch, # 54
landlord_down_played_cards_batch, # 54
x_batch = np.hstack((position_info_batch, # 4
my_handcards_batch, # 108
other_handcards_batch, # 108
three_landlord_cards_batch, # 108
last_action_batch, # 108
landlord_played_cards_batch, # 108
landlord_up_played_cards_batch, # 108
landlord_down_played_cards_batch, # 108
landlord_num_cards_left_batch, # 20
landlord_up_num_cards_left_batch, # 17
landlord_down_num_cards_left_batch, # 17
bomb_num_batch, # 15
bid_info_batch, # 12
multiply_info_batch, # 3
my_action_batch)) # 54
my_action_batch)) # 108
x_no_action = np.hstack((position_info,
my_handcards,
other_handcards,
@ -1025,9 +1225,10 @@ def _get_obs_general(infoset, position):
num_legal_actions, axis=0)
position_map = {
"landlord": [1, 0, 0],
"landlord_up": [0, 1, 0],
"landlord_down": [0, 0, 1]
"landlord": [1, 0, 0, 0],
"landlord_up": [0, 1, 0, 0],
"landlord_front": [0, 0, 1, 0],
"landlord_down": [0, 0, 0, 1]
}
position_info = np.array(position_map[position])
position_info_batch = np.repeat(position_info[np.newaxis, :],
@ -1041,9 +1242,9 @@ def _get_obs_general(infoset, position):
multiply_info_batch = np.repeat(multiply_info[np.newaxis, :],
num_legal_actions, axis=0)
three_landlord_cards = _cards2array(infoset.three_landlord_cards)
three_landlord_cards_batch = np.repeat(three_landlord_cards[np.newaxis, :],
num_legal_actions, axis=0)
# three_landlord_cards = _cards2array(infoset.three_landlord_cards)
# three_landlord_cards_batch = np.repeat(three_landlord_cards[np.newaxis, :],
# num_legal_actions, axis=0)
last_action = _cards2array(infoset.last_move)
last_action_batch = np.repeat(last_action[np.newaxis, :],
@ -1054,25 +1255,31 @@ def _get_obs_general(infoset, position):
my_action_batch[j, :] = _cards2array(action)
landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 20)
infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_up_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_up'], 17)
infoset.num_cards_left_dict['landlord_up'], 25)
landlord_up_num_cards_left_batch = np.repeat(
landlord_up_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
landlord_front_num_cards_left_batch = np.repeat(
landlord_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 17)
infoset.num_cards_left_dict['landlord_down'], 25)
landlord_down_num_cards_left_batch = np.repeat(
landlord_down_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
other_handcards_left_list = []
for pos in ["landlord", "landlord_up", "landlord_up"]:
for pos in ["landlord", "landlord_up", "landlord_front", "landlord_down"]:
if pos != position:
other_handcards_left_list.extend(infoset.all_handcards[pos])
@ -1088,6 +1295,12 @@ def _get_obs_general(infoset, position):
landlord_up_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
landlord_front_played_cards_batch = np.repeat(
landlord_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_played_cards = _cards2array(
infoset.played_cards['landlord_down'])
landlord_down_played_cards_batch = np.repeat(
@ -1100,24 +1313,26 @@ def _get_obs_general(infoset, position):
bomb_num[np.newaxis, :],
num_legal_actions, axis=0)
num_cards_left = np.hstack((
landlord_num_cards_left, # 20
landlord_up_num_cards_left, # 17
landlord_num_cards_left, # 33
landlord_up_num_cards_left, # 25
landlord_front_num_cards_left, # 25
landlord_down_num_cards_left))
x_batch = np.hstack((
bid_info_batch, # 12
multiply_info_batch)) # 3
bid_info_batch, # 16
multiply_info_batch)) # 4
x_no_action = np.hstack((
bid_info,
multiply_info))
z =np.vstack((
num_cards_left,
my_handcards, # 54
other_handcards, # 54
three_landlord_cards, # 54
landlord_played_cards, # 54
landlord_up_played_cards, # 54
landlord_down_played_cards, # 54
my_handcards, # 108
other_handcards, # 108
# three_landlord_cards, # 108
landlord_played_cards, # 108
landlord_up_played_cards, # 108
landlord_front_played_cards, # 108
landlord_down_played_cards, # 108
_action_seq_list2array(_process_action_seq(infoset.card_play_action_seq, 32))
))
@ -1125,7 +1340,7 @@ def _get_obs_general(infoset, position):
z[np.newaxis, :, :],
num_legal_actions, axis=0)
my_action_batch = my_action_batch[:,np.newaxis,:]
z_batch = np.zeros([len(_z_batch),40,54],int)
z_batch = np.zeros([len(_z_batch),40,108],int)
for i in range(0,len(_z_batch)):
z_batch[i] = np.vstack((my_action_batch[i],_z_batch[i]))
obs = {
@ -1139,17 +1354,17 @@ def _get_obs_general(infoset, position):
return obs
def gen_bid_legal_actions(player_id, bid_info):
self_bid_info = bid_info[:, [(player_id - 1) % 3, player_id, (player_id + 1) % 3]]
self_bid_info = bid_info[:, [(player_id - 1) % 4, player_id, (player_id + 1) % 4, (player_id + 2) % 4]]
curr_round = -1
for r in range(4):
for r in range(5):
if -1 in self_bid_info[r]:
curr_round = r
break
bid_actions = []
if curr_round != -1:
self_bid_info[curr_round] = [0, 0, 0]
self_bid_info[curr_round] = [0, 0, 0, 0]
bid_actions.append(np.array(self_bid_info).flatten())
self_bid_info[curr_round] = [0, 1, 0]
self_bid_info[curr_round] = [0, 1, 0, 0]
bid_actions.append(np.array(self_bid_info).flatten())
return np.array(bid_actions)
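
A worked example of the widened bidding actions, assuming gen_bid_legal_actions is importable from douzero.env.env and a fresh 5x4 bid_info of -1: both legal actions are the player's own 5x4 view flattened to 20 values, differing only in the own-seat entry of the current round (0 = pass, 1 = bid).

import numpy as np
from douzero.env.env import gen_bid_legal_actions  # assumed module path

bid_info = np.full((5, 4), -1)
actions = gen_bid_legal_actions(player_id=0, bid_info=bid_info)
print(actions.shape)    # expected: (2, 20)
print(actions[0][:4])   # current round if passing: [0 0 0 0]
print(actions[1][:4])   # current round if bidding: [0 1 0 0]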
@ -1273,9 +1488,9 @@ def _get_obs_for_bid_legacy(player_id, bid_info, hand_cards):
return obs
def _get_obs_for_bid(player_id, bid_info, hand_cards):
all_cards = [3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 20, 30]
# all_cards = [3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
# 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
# 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 20, 30]
num_legal_actions = 2
my_handcards = _cards2array(hand_cards)
my_handcards_batch = np.repeat(my_handcards[np.newaxis, :],

199
douzero/env/game.py vendored
View File

@ -11,10 +11,19 @@ RealCard2EnvCard = {'3': 3, '4': 4, '5': 5, '6': 6, '7': 7,
'8': 8, '9': 9, '10': 10, 'J': 11, 'Q': 12,
'K': 13, 'A': 14, '2': 17, 'X': 20, 'D': 30}
bombs = [[3, 3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6],
[7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9], [10, 10, 10, 10],
[11, 11, 11, 11], [12, 12, 12, 12], [13, 13, 13, 13], [14, 14, 14, 14],
[17, 17, 17, 17], [20, 30]]
bombs = [
[[3, 3, 3, 3, 3, 3], [4, 4, 4, 4, 4, 4], [5, 5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7],
[8, 8, 8, 8, 8, 8], [9, 9, 9, 9, 9, 9], [10, 10, 10, 10, 10, 10], [11, 11, 11, 11, 11, 11],
[12, 12, 12, 12, 12, 12], [13, 13, 13, 13, 13, 13], [14, 14, 14, 14, 14, 14], [17, 17, 17, 17, 17, 17],
[3, 3, 3, 3, 3, 3, 3], [4, 4, 4, 4, 4, 4, 4], [5, 5, 5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7, 7],
[8, 8, 8, 8, 8, 8, 8], [9, 9, 9, 9, 9, 9, 9], [10, 10, 10, 10, 10, 10, 10], [11, 11, 11, 11, 11, 11, 11],
[12, 12, 12, 12, 12, 12, 12], [13, 13, 13, 13, 13, 13, 13], [14, 14, 14, 14, 14, 14, 14],
[17, 17, 17, 17, 17, 17, 17]],
[[3, 3, 3, 3, 3, 3, 3, 3], [4, 4, 4, 4, 4, 4, 4, 4], [5, 5, 5, 5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6, 6, 6],
[7, 7, 7, 7, 7, 7, 7, 7], [8, 8, 8, 8, 8, 8, 8, 8], [9, 9, 9, 9, 9, 9, 9, 9], [10, 10, 10, 10, 10, 10, 10, 10],
[11, 11, 11, 11, 11, 11, 11, 11], [12, 12, 12, 12, 12, 12, 12, 12], [13, 13, 13, 13, 13, 13, 13, 13],
[14, 14, 14, 14, 14, 14, 14, 14], [17, 17, 17, 17, 17, 17, 17, 17],
[20, 20, 30, 30]]]
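
The bombs table is now split by strength: bombs[0] holds the 6- and 7-card bombs and bombs[1] the 8-card bombs plus the four-joker bomb, and step() below uses membership in these lists to bump bomb_num[0] or bomb_num[1]. A minimal membership check with hypothetical moves, assuming bombs is importable from douzero.env.game:

from douzero.env.game import bombs  # assumed module path

print([9, 9, 9, 9, 9, 9] in bombs[0])                          # True: 6-card bomb -> bomb_num[0]
print([14, 14, 14, 14, 14, 14, 14, 14] in bombs[1])            # True: 8-card bomb -> bomb_num[1]
print([20, 20, 30, 30] in bombs[1])                            # True: double king bomb -> bomb_num[1]
print([3, 3, 3, 3] in bombs[0] or [3, 3, 3, 3] in bombs[1])    # False: plain 4-of-a-kind is not counted here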
class GameEnv(object):
@ -22,7 +31,7 @@ class GameEnv(object):
self.card_play_action_seq = []
self.three_landlord_cards = None
# self.three_landlord_cards = None
self.game_over = False
self.acting_player_position = None
@ -32,10 +41,12 @@ class GameEnv(object):
self.last_move_dict = {'landlord': [],
'landlord_up': [],
'landlord_front': [],
'landlord_down': []}
self.played_cards = {'landlord': [],
'landlord_up': [],
'landlord_front': [],
'landlord_down': []}
self.last_move = []
@ -48,24 +59,28 @@ class GameEnv(object):
'farmer': 0}
self.info_sets = {'landlord': InfoSet('landlord'),
'landlord_up': InfoSet('landlord_up'),
'landlord_down': InfoSet('landlord_down')}
'landlord_up': InfoSet('landlord_up'),
'landlord_front': InfoSet('landlord_front'),
'landlord_down': InfoSet('landlord_down')}
self.bomb_num = 0
self.bomb_num = [0, 0]
self.pos_bomb_num = {
"landlord": 0,
"landlord_up": 0,
"landlord_front": 0,
"landlord_down": 0
}
self.last_pid = 'landlord'
self.bid_info = [[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]
self.bid_info = [[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]]
self.bid_count = 0
self.multiply_count = {'landlord': 0,
'landlord_up': 0,
'landlord_front': 0,
'landlord_down': 0}
self.step_count = 0
@ -75,9 +90,11 @@ class GameEnv(object):
card_play_data['landlord']
self.info_sets['landlord_up'].player_hand_cards = \
card_play_data['landlord_up']
self.info_sets['landlord_front'].player_hand_cards = \
card_play_data['landlord_front']
self.info_sets['landlord_down'].player_hand_cards = \
card_play_data['landlord_down']
self.three_landlord_cards = card_play_data['three_landlord_cards']
# self.three_landlord_cards = card_play_data['three_landlord_cards']
self.get_acting_player_position()
self.game_infoset = self.get_infoset()
@ -85,6 +102,7 @@ class GameEnv(object):
def game_done(self):
if len(self.info_sets['landlord'].player_hand_cards) == 0 or \
len(self.info_sets['landlord_up'].player_hand_cards) == 0 or \
len(self.info_sets['landlord_front'].player_hand_cards) == 0 or \
len(self.info_sets['landlord_down'].player_hand_cards) == 0:
# if one of the four players has emptied his hand,
# then game is over.
@ -96,21 +114,21 @@ class GameEnv(object):
def compute_player_utility(self):
if len(self.info_sets['landlord'].player_hand_cards) == 0:
self.player_utility_dict = {'landlord': 2,
self.player_utility_dict = {'landlord': 3,
'farmer': -1}
else:
self.player_utility_dict = {'landlord': -2,
self.player_utility_dict = {'landlord': -3,
'farmer': 1}
def update_num_wins_scores(self):
for pos, utility in self.player_utility_dict.items():
base_score = 2 if pos == 'landlord' else 1
base_score = 3 if pos == 'landlord' else 1
if utility > 0:
self.num_wins[pos] += 1
self.winner = pos
self.num_scores[pos] += base_score * (2 ** self.bomb_num)
self.num_scores[pos] += base_score * (2 ** self.bomb_num[0]) * (3 ** self.bomb_num[1])
else:
self.num_scores[pos] -= base_score * (2 ** self.bomb_num)
self.num_scores[pos] -= base_score * (2 ** self.bomb_num[0]) * (3 ** self.bomb_num[1])
def get_winner(self):
return self.winner
@ -121,12 +139,17 @@ class GameEnv(object):
def step(self):
action = self.players[self.acting_player_position].act(
self.game_infoset)
self.step_count += 1
assert action in self.game_infoset.legal_actions
if len(action) > 0:
self.last_pid = self.acting_player_position
if action in bombs:
self.bomb_num += 1
if action in bombs[0]:
self.bomb_num[0] += 1
self.pos_bomb_num[self.acting_player_position] += 1
if action in bombs[1]:
self.bomb_num[1] += 1
self.pos_bomb_num[self.acting_player_position] += 1
self.last_move_dict[
@ -137,15 +160,15 @@ class GameEnv(object):
self.played_cards[self.acting_player_position] += action
if self.acting_player_position == 'landlord' and \
len(action) > 0 and \
len(self.three_landlord_cards) > 0:
for card in action:
if len(self.three_landlord_cards) > 0:
if card in self.three_landlord_cards:
self.three_landlord_cards.remove(card)
else:
break
# if self.acting_player_position == 'landlord' and \
# len(action) > 0 and \
# len(self.three_landlord_cards) > 0:
# for card in action:
# if len(self.three_landlord_cards) > 0:
# if card in self.three_landlord_cards:
# self.three_landlord_cards.remove(card)
# else:
# break
self.game_done()
if not self.game_over:
@ -156,7 +179,7 @@ class GameEnv(object):
def get_last_move(self):
last_move = []
if len(self.card_play_action_seq) != 0:
if len(self.card_play_action_seq[-1][1]) == 0:
if len(self.card_play_action_seq[-1]) == 0:
last_move = self.card_play_action_seq[-2][1]
else:
last_move = self.card_play_action_seq[-1][1]
@ -166,7 +189,7 @@ class GameEnv(object):
def get_last_two_moves(self):
last_two_moves = [[], []]
for card in self.card_play_action_seq[-2:]:
last_two_moves.insert(0, card[1])
last_two_moves.insert(0, card)
last_two_moves = last_two_moves[:2]
return last_two_moves
@ -179,6 +202,9 @@ class GameEnv(object):
self.acting_player_position = 'landlord_down'
elif self.acting_player_position == 'landlord_down':
self.acting_player_position = 'landlord_front'
elif self.acting_player_position == 'landlord_front':
self.acting_player_position = 'landlord_up'
else:
@ -202,7 +228,10 @@ class GameEnv(object):
rival_move = []
if len(action_sequence) != 0:
if len(action_sequence[-1][1]) == 0:
rival_move = action_sequence[-2][1]
if len(action_sequence[-2][1]) == 0:
rival_move = action_sequence[-3][1]
else:
rival_move = action_sequence[-2][1]
else:
rival_move = action_sequence[-1][1]
@ -227,15 +256,36 @@ class GameEnv(object):
moves = ms.filter_type_3_triple(all_moves, rival_move)
elif rival_move_type == md.TYPE_4_BOMB:
all_moves = mg.gen_type_4_bomb() + mg.gen_type_5_king_bomb()
all_moves = mg.gen_type_4_bomb(4)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
moves += mg.gen_type_4_bomb(5) + mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB5:
all_moves = mg.gen_type_4_bomb(5)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
moves += mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB6:
all_moves = mg.gen_type_4_bomb(6)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
moves += mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB7:
all_moves = mg.gen_type_4_bomb(7)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
moves += mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB8:
all_moves = mg.gen_type_4_bomb(8)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
moves += mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_5_KING_BOMB:
moves = []
elif rival_move_type == md.TYPE_6_3_1:
all_moves = mg.gen_type_6_3_1()
moves = ms.filter_type_6_3_1(all_moves, rival_move)
# elif rival_move_type == md.TYPE_6_3_1:
# all_moves = mg.gen_type_6_3_1()
# moves = ms.filter_type_6_3_1(all_moves, rival_move)
elif rival_move_type == md.TYPE_7_3_2:
all_moves = mg.gen_type_7_3_2()
@ -253,25 +303,24 @@ class GameEnv(object):
all_moves = mg.gen_type_10_serial_triple(repeat_num=rival_move_len)
moves = ms.filter_type_10_serial_triple(all_moves, rival_move)
elif rival_move_type == md.TYPE_11_SERIAL_3_1:
all_moves = mg.gen_type_11_serial_3_1(repeat_num=rival_move_len)
moves = ms.filter_type_11_serial_3_1(all_moves, rival_move)
# elif rival_move_type == md.TYPE_11_SERIAL_3_1:
# all_moves = mg.gen_type_11_serial_3_1(repeat_num=rival_move_len)
# moves = ms.filter_type_11_serial_3_1(all_moves, rival_move)
elif rival_move_type == md.TYPE_12_SERIAL_3_2:
all_moves = mg.gen_type_12_serial_3_2(repeat_num=rival_move_len)
moves = ms.filter_type_12_serial_3_2(all_moves, rival_move)
elif rival_move_type == md.TYPE_13_4_2:
all_moves = mg.gen_type_13_4_2()
moves = ms.filter_type_13_4_2(all_moves, rival_move)
# elif rival_move_type == md.TYPE_13_4_2:
# all_moves = mg.gen_type_13_4_2()
# moves = ms.filter_type_13_4_2(all_moves, rival_move)
elif rival_move_type == md.TYPE_14_4_22:
all_moves = mg.gen_type_14_4_22()
moves = ms.filter_type_14_4_22(all_moves, rival_move)
# elif rival_move_type == md.TYPE_14_4_22:
# all_moves = mg.gen_type_14_4_22()
# moves = ms.filter_type_14_4_22(all_moves, rival_move)
if rival_move_type not in [md.TYPE_0_PASS,
md.TYPE_4_BOMB, md.TYPE_5_KING_BOMB]:
moves = moves + mg.gen_type_4_bomb() + mg.gen_type_5_king_bomb()
if rival_move_type != md.TYPE_0_PASS and rival_move_type < md.TYPE_4_BOMB:
moves = moves + mg.gen_type_4_bomb(4) + mg.gen_type_4_bomb(5) + mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
if len(rival_move) != 0: # rival_move is not 'pass'
moves = moves + [[]]
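A hedged sketch of the over-trump rule the new bomb branches appear to encode: a bomb with more cards beats any smaller bomb, equal-sized bombs compare by rank, and the four-card double-joker bomb beats everything (illustrative code, not from the repo):

KING_BOMB = [20, 20, 30, 30]   # both jokers from both decks

def bomb_beats(bomb, rival_bomb):
    # Illustrative only: assumed bomb ordering in the two-deck, four-player variant.
    if bomb == KING_BOMB:
        return True
    if rival_bomb == KING_BOMB:
        return False
    if len(bomb) != len(rival_bomb):
        return len(bomb) > len(rival_bomb)   # more cards wins regardless of rank
    return bomb[0] > rival_bomb[0]           # same size: higher rank wins

assert bomb_beats([3] * 5, [14] * 4)      # a five-card bomb of 3s beats four aces
assert not bomb_beats([5] * 4, KING_BOMB)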
@ -284,7 +333,7 @@ class GameEnv(object):
def reset(self):
self.card_play_action_seq = []
self.three_landlord_cards = None
# self.three_landlord_cards = None
self.game_over = False
self.acting_player_position = None
@ -292,33 +341,40 @@ class GameEnv(object):
self.last_move_dict = {'landlord': [],
'landlord_up': [],
'landlord_front': [],
'landlord_down': []}
self.played_cards = {'landlord': [],
'landlord_up': [],
'landlord_front': [],
'landlord_down': []}
self.last_move = []
self.last_two_moves = []
self.info_sets = {'landlord': InfoSet('landlord'),
'landlord_up': InfoSet('landlord_up'),
'landlord_down': InfoSet('landlord_down')}
'landlord_up': InfoSet('landlord_up'),
'landlord_front': InfoSet('landlord_front'),
'landlord_down': InfoSet('landlord_down')}
self.bomb_num = 0
self.bomb_num = [0, 0]
self.pos_bomb_num = {
"landlord": 0,
"landlord_up": 0,
"landlord_front": 0,
"landlord_down": 0
}
self.last_pid = 'landlord'
self.bid_info = [[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]
self.bid_info = [[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]]
self.bid_count = 0
self.multiply_count = {'landlord': 0,
'landlord_up': 0,
'landlord_front': 0,
'landlord_down': 0}
self.step_count = 0
@ -344,10 +400,10 @@ class GameEnv(object):
self.info_sets[self.acting_player_position].num_cards_left_dict = \
{pos: len(self.info_sets[pos].player_hand_cards)
for pos in ['landlord', 'landlord_up', 'landlord_down']}
for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']}
self.info_sets[self.acting_player_position].other_hand_cards = []
for pos in ['landlord', 'landlord_up', 'landlord_down']:
for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
if pos != self.acting_player_position:
self.info_sets[
self.acting_player_position].other_hand_cards += \
@ -355,15 +411,15 @@ class GameEnv(object):
self.info_sets[self.acting_player_position].played_cards = \
self.played_cards
self.info_sets[self.acting_player_position].three_landlord_cards = \
self.three_landlord_cards
# self.info_sets[self.acting_player_position].three_landlord_cards = \
# self.three_landlord_cards
self.info_sets[self.acting_player_position].card_play_action_seq = \
self.card_play_action_seq
self.info_sets[
self.acting_player_position].all_handcards = \
{pos: self.info_sets[pos].player_hand_cards
for pos in ['landlord', 'landlord_up', 'landlord_down']}
for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']}
return deepcopy(self.info_sets[self.acting_player_position])
@ -379,13 +435,13 @@ class InfoSet(object):
self.player_position = player_position
        # The hand cards of the current player. A list.
        self.player_hand_cards = None
        # The number of cards left for each player. It is a dict with str-->int
        self.num_cards_left_dict = None
        # The three landlord cards. A list.
        self.three_landlord_cards = None
        # self.three_landlord_cards = None
# The historical moves. It is a list of list
self.card_play_action_seq = None
        # The union of the hand cards of the other two players for the current player
self.other_hand_cards = None
# The legal actions for the current move. It is a list of list
self.legal_actions = None
@ -397,18 +453,19 @@ class InfoSet(object):
self.last_move_dict = None
        # The played cards so far. It is a list.
        self.played_cards = None
        # The hand cards of all the players. It is a dict.
self.all_handcards = None
# Last player position that plays a valid move, i.e., not `pass`
self.last_pid = None
# The number of bombs played so far
self.bomb_num = None
self.bid_info = [[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1],
[-1, -1, -1]]
self.bid_info = [[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]]
self.multiply_info = [1, 0, 0]
self.multiply_info = [1, 0, 0, 0]
self.player_id = None
@ -91,17 +91,17 @@ class MovesGener(object):
self.triple_cards_moves.append([k, k, k])
return self.triple_cards_moves
def gen_type_4_bomb(self):
def gen_type_4_bomb(self, num = 4):
self.bomb_moves = []
for k, v in self.cards_dict.items():
if v == 4:
self.bomb_moves.append([k, k, k, k])
if v == num:
self.bomb_moves.append([k] * num)
return self.bomb_moves
def gen_type_5_king_bomb(self):
self.final_bomb_moves = []
if 20 in self.cards_list and 30 in self.cards_list:
self.final_bomb_moves.append([20, 30])
if 20 in self.cards_list and self.cards_dict[20] == 2 and 30 in self.cards_list and self.cards_dict[30] == 2:
self.final_bomb_moves.append([20, 20, 30, 30])
return self.final_bomb_moves
def gen_type_6_3_1(self):
@ -205,15 +205,19 @@ class MovesGener(object):
moves.extend(self.gen_type_1_single())
moves.extend(self.gen_type_2_pair())
moves.extend(self.gen_type_3_triple())
moves.extend(self.gen_type_4_bomb())
moves.extend(self.gen_type_4_bomb(4))
moves.extend(self.gen_type_4_bomb(5))
moves.extend(self.gen_type_4_bomb(6))
moves.extend(self.gen_type_4_bomb(7))
moves.extend(self.gen_type_4_bomb(8))
moves.extend(self.gen_type_5_king_bomb())
moves.extend(self.gen_type_6_3_1())
# moves.extend(self.gen_type_6_3_1())
moves.extend(self.gen_type_7_3_2())
moves.extend(self.gen_type_8_serial_single())
moves.extend(self.gen_type_9_serial_pair())
moves.extend(self.gen_type_10_serial_triple())
moves.extend(self.gen_type_11_serial_3_1())
# moves.extend(self.gen_type_11_serial_3_1())
moves.extend(self.gen_type_12_serial_3_2())
moves.extend(self.gen_type_13_4_2())
moves.extend(self.gen_type_14_4_22())
# moves.extend(self.gen_type_13_4_2())
# moves.extend(self.gen_type_14_4_22())
return moves
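With two decks a rank can occur up to eight times in one hand, so gen_all_moves now requests bombs of sizes four through eight plus the four-card king bomb. A minimal illustration of the per-size generation, mirroring the v == num test above (illustrative, not from the repo):

from collections import Counter

def bombs_of_size(hand, num):
    # Emit every rank that occurs exactly `num` times as a bomb of that size.
    return [[rank] * num for rank, count in Counter(hand).items() if count == num]

hand = [3] * 8 + [4] * 5 + [20, 20, 30, 30]
assert bombs_of_size(hand, 8) == [[3] * 8]
assert bombs_of_size(hand, 5) == [[4] * 5]
assert bombs_of_size(hand, 4) == []   # no rank occurs exactly four times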
douzero/env/utils.py
@ -10,17 +10,22 @@ TYPE_0_PASS = 0
TYPE_1_SINGLE = 1
TYPE_2_PAIR = 2
TYPE_3_TRIPLE = 3
TYPE_4_BOMB = 4
TYPE_5_KING_BOMB = 5
TYPE_4_BOMB = 44
TYPE_4_BOMB5 = 45
TYPE_4_BOMB6 = 46
TYPE_4_BOMB7 = 47
TYPE_4_BOMB8 = 48
TYPE_5_KING_BOMB = 50
#TYPE_6_3_1 = 6
TYPE_6_3_1 = 6
TYPE_7_3_2 = 7
TYPE_8_SERIAL_SINGLE = 8
TYPE_9_SERIAL_PAIR = 9
TYPE_10_SERIAL_TRIPLE = 10
TYPE_11_SERIAL_3_1 = 11
# TYPE_11_SERIAL_3_1 = 11
TYPE_12_SERIAL_3_2 = 12
TYPE_13_4_2 = 13
TYPE_14_4_22 = 14
# TYPE_13_4_2 = 13
# TYPE_14_4_22 = 14
TYPE_15_WRONG = 15
# betting round action
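The renumbering puts every bomb code in a 44-50 band above all non-bomb codes, so the single comparison rival_move_type < md.TYPE_4_BOMB in game.py decides whether bombs may still be offered as over-trumps. A small sanity check using the values shown above (standalone copies, not imports from the repo):

# Copied constants, for illustration only.
TYPE_3_TRIPLE = 3
TYPE_12_SERIAL_3_2 = 12
TYPE_4_BOMB, TYPE_4_BOMB5, TYPE_4_BOMB6, TYPE_4_BOMB7, TYPE_4_BOMB8 = 44, 45, 46, 47, 48
TYPE_5_KING_BOMB = 50

# Every non-bomb type stays below TYPE_4_BOMB, so "can a bomb still answer this?"
# reduces to one comparison instead of a membership test over an explicit list.
for non_bomb in (TYPE_3_TRIPLE, TYPE_12_SERIAL_3_2):
    assert non_bomb < TYPE_4_BOMB < TYPE_4_BOMB8 < TYPE_5_KING_BOMB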
@ -10,7 +10,7 @@ import BidModel
def load_card_play_models(card_play_model_path_dict):
players = {}
for position in ['landlord', 'landlord_up', 'landlord_down']:
for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
if card_play_model_path_dict[position] == 'rlcard':
from .rlcard_agent import RLCardAgent
players[position] = RLCardAgent(position)
@ -43,30 +43,34 @@ def mp_simulate(card_play_data_list, card_play_model_path_dict, q, output, bid_o
bid_results = []
bid_values = []
bid_info_list = [
np.array([[-1,-1,-1],
[-1,-1,-1],
[-1,-1,-1],
[-1,-1,-1]]),
np.array([[0,0,0],
[-1,-1,-1],
[-1,-1,-1],
[-1,-1,-1]]),
np.array([[1,0,0],
[-1,-1,-1],
[-1,-1,-1],
[-1,-1,-1]]),
np.array([[0,0,0],
[0,0,0],
[-1,-1,-1],
[-1,-1,-1]]),
np.array([[0,0,1],
[1,0,0],
[-1,-1,-1],
[-1,-1,-1]]),
np.array([[0,1,0],
[0,0,1],
[1,0,0],
[-1,-1,-1]]),
np.array([[-1,-1,-1,-1],
[-1,-1,-1,-1],
[-1,-1,-1,-1],
[-1,-1,-1,-1]]),
np.array([[0,0,0,0],
[-1,-1,-1,-1],
[-1,-1,-1,-1],
[-1,-1,-1,-1]]),
np.array([[1,0,0,0],
[-1,-1,-1,-1],
[-1,-1,-1,-1],
[-1,-1,-1,-1]]),
np.array([[0,0,0,0],
[0,0,0,0],
[-1,-1,-1,-1],
[-1,-1,-1,-1]]),
np.array([[0,0,1,0],
[0,0,0,1],
[-1,-1,-1,-1],
[-1,-1,-1,-1]]),
np.array([[0,1,0,0],
[0,0,1,0],
[0,0,0,1],
[-1,-1,-1,-1]]),
np.array([[0,1,0,0],
[0,0,1,0],
[1,0,0,0],
[-1,-1,-1,-1]]),
]
for bid_info in bid_info_list:
bid_obs = douzero.env.env._get_obs_for_bid(1, bid_info, card_play_data["landlord"])
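The bidding observations are now built from 4x4 matrices; each row appears to hold one earlier seat's bid decision, with rows of -1 for seats that have not acted yet. A hedged helper for the "nobody has bid yet" case (the row/column interpretation is my assumption, not documented by the commit):

import numpy as np

def empty_bid_info(num_seats=4, num_options=4):
    # Illustrative only: -1 marks seats that have not bid yet.
    return np.full((num_seats, num_options), -1, dtype=np.int64)

print(empty_bid_info())   # same shape and contents as the first entry of bid_info_list above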
@ -82,6 +86,7 @@ def mp_simulate(card_play_data_list, card_play_model_path_dict, q, output, bid_o
print("\nStart ------- " + title)
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord"]]))
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_down"]]))
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_front"]]))
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_up"]]))
# print(card_play_data)
count = 0
@ -127,6 +132,7 @@ def evaluate(landlord, landlord_up, landlord_down, eval_data, num_workers, outpu
card_play_model_path_dict = {
'landlord': landlord,
'landlord_up': landlord_up,
'landlord_front': landlord_up,
'landlord_down': landlord_down}
num_landlord_wins = 0
@ -10,6 +10,8 @@ if __name__ == '__main__':
default='baselines/douzero_12/landlord_weights_39762328900.ckpt')
parser.add_argument('--landlord_up', type=str,
default='baselines/douzero_12/landlord_up_weights_39762328900.ckpt')
parser.add_argument('--landlord_front', type=str,
default='baselines/douzero_12/landlord_front_weights_39762328900.ckpt')
parser.add_argument('--landlord_down', type=str,
default='baselines/douzero_12/landlord_down_weights_39762328900.ckpt')
parser.add_argument('--eval_data', type=str,
@ -25,7 +27,7 @@ if __name__ == '__main__':
if args.output or args.bid:
args.num_workers = 1
t = 3
frame = 3085177900
frame = 64000
adp_frame = 2511184300
# args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame
args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame
@ -44,6 +46,7 @@ if __name__ == '__main__':
elif t == 3:
args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame
args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame
args.landlord_front = 'baselines/resnet_landlord_front_%i.ckpt' % frame
args.landlord_down = 'baselines/resnet_landlord_down_%i.ckpt' % frame
elif t == 4:
args.landlord = 'baselines/douzero_ADP/landlord.ckpt'
@ -4,9 +4,9 @@ import numpy as np
deck = []
for i in range(3, 15):
deck.extend([i for _ in range(4)])
deck.extend([17 for _ in range(4)])
deck.extend([20, 30])
deck.extend([i for _ in range(8)])
deck.extend([17 for _ in range(8)])
deck.extend([20, 20, 30, 30])
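A quick arithmetic check, not part of the commit, that the doubled deck totals 108 cards (two 54-card Dou Dizhu decks):

# Rebuild the deck exactly as above: 12 ranks (3..14) x 8 copies, eight 2s
# (encoded as 17), and two of each joker (codes 20 and 30).
deck = []
for i in range(3, 15):
    deck.extend([i] * 8)
deck.extend([17] * 8)
deck.extend([20, 20, 30, 30])
assert len(deck) == 108   # 12 * 8 + 8 + 4 == 2 * 54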
def get_parser():
parser = argparse.ArgumentParser(description='DouZero: random data generator')
@ -17,10 +17,11 @@ def get_parser():
def generate():
_deck = deck.copy()
np.random.shuffle(_deck)
card_play_data = {'landlord': _deck[:20],
'landlord_up': _deck[20:37],
'landlord_down': _deck[37:54],
'three_landlord_cards': _deck[17:20],
card_play_data = {'landlord': _deck[:33],
'landlord_up': _deck[33:58],
'landlord_front': _deck[58:83],
'landlord_down': _deck[83:108],
# 'three_landlord_cards': _deck[25:33],
}
for key in card_play_data:
card_play_data[key].sort()
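And a matching check of the new deal: 33 cards to the landlord (25 plus what looks like an 8-card kitty replacing the old three landlord cards) and 25 to each of the other three seats, exhausting the whole deck:

# Slice sizes implied by _deck[:33], [33:58], [58:83], [83:108] above.
assert 33 + 25 + 25 + 25 == 108
assert len(range(25, 33)) == 8   # the commented-out kitty slice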