改造为4人斗地主

This commit is contained in:
zhiyang7 2021-12-05 12:03:30 +08:00
parent 9c1c56d91d
commit aab93d66c6
10 changed files with 629 additions and 317 deletions

View File

@ -16,7 +16,7 @@ from .file_writer import FileWriter
from .models import Model, OldModel from .models import Model, OldModel
from .utils import get_batch, log, create_env, create_optimizers, act from .utils import get_batch, log, create_env, create_optimizers, act
mean_episode_return_buf = {p:deque(maxlen=100) for p in ['landlord', 'landlord_up', 'landlord_down', 'bidding']} mean_episode_return_buf = {p:deque(maxlen=100) for p in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']}
def compute_loss(logits, targets): def compute_loss(logits, targets):
loss = ((logits.squeeze(-1) - targets)**2).mean() loss = ((logits.squeeze(-1) - targets)**2).mean()
@ -27,7 +27,7 @@ def compute_loss_for_bid(outputs, reward):
def learn(position, actor_models, model, batch, optimizer, flags, lock): def learn(position, actor_models, model, batch, optimizer, flags, lock):
"""Performs a learning (optimization) step.""" """Performs a learning (optimization) step."""
position_index = {"landlord": 31, "landlord_up": 32, "landlord_down": 33} position_index = {"landlord": 31, "landlord_up": 32, 'landlord_front': 33, "landlord_down": 34}
print("Learn", position) print("Learn", position)
if flags.training_device != "cpu": if flags.training_device != "cpu":
device = torch.device('cuda:'+str(flags.training_device)) device = torch.device('cuda:'+str(flags.training_device))
@ -46,7 +46,8 @@ def learn(position, actor_models, model, batch, optimizer, flags, lock):
with lock: with lock:
learner_outputs = model(obs_z, obs_x, return_value=True) learner_outputs = model(obs_z, obs_x, return_value=True)
if position == "bidding": if position == "bidding":
pass loss = compute_loss(learner_outputs['values'], target)
# pass
else: else:
loss = compute_loss(learner_outputs['values'], target) loss = compute_loss(learner_outputs['values'], target)
stats = { stats = {
@ -101,7 +102,7 @@ def train(flags):
# Initialize queues # Initialize queues
actor_processes = [] actor_processes = []
ctx = mp.get_context('spawn') ctx = mp.get_context('spawn')
batch_queues = {"landlord": ctx.SimpleQueue(), "landlord_up": ctx.SimpleQueue(), "landlord_down": ctx.SimpleQueue(), "bidding": ctx.SimpleQueue()} batch_queues = {"landlord": ctx.SimpleQueue(), "landlord_up": ctx.SimpleQueue(), 'landlord_front': ctx.SimpleQueue(), "landlord_down": ctx.SimpleQueue(), "bidding": ctx.SimpleQueue()}
# Learner model for training # Learner model for training
learner_model = Model(device=flags.training_device) learner_model = Model(device=flags.training_device)
@ -115,20 +116,22 @@ def train(flags):
'loss_landlord', 'loss_landlord',
'mean_episode_return_landlord_up', 'mean_episode_return_landlord_up',
'loss_landlord_up', 'loss_landlord_up',
'mean_episode_return_landlord_front',
'loss_landlord_front',
'mean_episode_return_landlord_down', 'mean_episode_return_landlord_down',
'loss_landlord_down', 'loss_landlord_down',
'mean_episode_return_bidding', 'mean_episode_return_bidding',
'loss_bidding', 'loss_bidding',
] ]
frames, stats = 0, {k: 0 for k in stat_keys} frames, stats = 0, {k: 0 for k in stat_keys}
position_frames = {'landlord':0, 'landlord_up':0, 'landlord_down':0, 'bidding': 0} position_frames = {'landlord':0, 'landlord_up':0, 'landlord_front':0, 'landlord_down':0, 'bidding': 0}
# Load models if any # Load models if any
if flags.load_model and os.path.exists(checkpointpath): if flags.load_model and os.path.exists(checkpointpath):
checkpoint_states = torch.load( checkpoint_states = torch.load(
checkpointpath, map_location=("cuda:"+str(flags.training_device) if flags.training_device != "cpu" else "cpu") checkpointpath, map_location=("cuda:"+str(flags.training_device) if flags.training_device != "cpu" else "cpu")
) )
for k in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down'] for k in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down']
learner_model.get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k]) learner_model.get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k])
optimizers[k].load_state_dict(checkpoint_states["optimizer_state_dict"][k]) optimizers[k].load_state_dict(checkpoint_states["optimizer_state_dict"][k])
for device in device_iterator: for device in device_iterator:
@ -176,12 +179,12 @@ def train(flags):
threads = [] threads = []
locks = {} locks = {}
for device in device_iterator: for device in device_iterator:
locks[device] = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()} locks[device] = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_front': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()}
position_locks = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()} position_locks = {'landlord': threading.Lock(), 'landlord_up': threading.Lock(), 'landlord_front': threading.Lock(), 'landlord_down': threading.Lock(), 'bidding': threading.Lock()}
for device in device_iterator: for device in device_iterator:
for i in range(flags.num_threads): for i in range(flags.num_threads):
for position in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']:
thread = threading.Thread( thread = threading.Thread(
target=batch_and_learn, name='batch-and-learn-%d' % i, args=(i,device,position,locks[device][position],position_locks[position])) target=batch_and_learn, name='batch-and-learn-%d' % i, args=(i,device,position,locks[device][position],position_locks[position]))
thread.start() thread.start()
@ -202,7 +205,7 @@ def train(flags):
}, checkpointpath) }, checkpointpath)
# Save the weights for evaluation purpose # Save the weights for evaluation purpose
for position in ['landlord', 'landlord_up', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_down'] for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']: # ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']
model_weights_dir = os.path.expandvars(os.path.expanduser( model_weights_dir = os.path.expandvars(os.path.expanduser(
'%s/%s/%s' % (flags.savedir, flags.xpid, "general_"+position+'_'+str(frames)+'.ckpt'))) '%s/%s/%s' % (flags.savedir, flags.xpid, "general_"+position+'_'+str(frames)+'.ckpt')))
torch.save(learner_model.get_model(position).state_dict(), model_weights_dir) torch.save(learner_model.get_model(position).state_dict(), model_weights_dir)
@ -229,15 +232,17 @@ def train(flags):
fps_avg = np.mean(fps_log) fps_avg = np.mean(fps_log)
position_fps = {k:(position_frames[k]-position_start_frames[k])/(end_time-start_time) for k in position_frames} position_fps = {k:(position_frames[k]-position_start_frames[k])/(end_time-start_time) for k in position_frames}
log.info('After %i (L:%i U:%i D:%i) frames: @ %.1f fps (avg@ %.1f fps) (L:%.1f U:%.1f D:%.1f) Stats:\n%s', log.info('After %i (L:%i U:%i F:%i D:%i) frames: @ %.1f fps (avg@ %.1f fps) (L:%.1f U:%.1f F:%.1f D:%.1f) Stats:\n%s',
frames, frames,
position_frames['landlord'], position_frames['landlord'],
position_frames['landlord_up'], position_frames['landlord_up'],
position_frames['landlord_front'],
position_frames['landlord_down'], position_frames['landlord_down'],
fps, fps,
fps_avg, fps_avg,
position_fps['landlord'], position_fps['landlord'],
position_fps['landlord_up'], position_fps['landlord_up'],
position_fps['landlord_front'],
position_fps['landlord_down'], position_fps['landlord_down'],
pprint.pformat(stats)) pprint.pformat(stats))

View File

@ -12,13 +12,13 @@ import torch.nn.functional as F
class LandlordLstmModel(nn.Module): class LandlordLstmModel(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True) self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(373 + 128, 512) self.dense1 = nn.Linear(846 + 128, 1024)
self.dense2 = nn.Linear(512, 512) self.dense2 = nn.Linear(1024, 1024)
self.dense3 = nn.Linear(512, 512) self.dense3 = nn.Linear(1024, 768)
self.dense4 = nn.Linear(512, 512) self.dense4 = nn.Linear(768, 512)
self.dense5 = nn.Linear(512, 512) self.dense5 = nn.Linear(512, 256)
self.dense6 = nn.Linear(512, 1) self.dense6 = nn.Linear(256, 1)
def forward(self, z, x, return_value=False, flags=None): def forward(self, z, x, return_value=False, flags=None):
lstm_out, (h_n, _) = self.lstm(z) lstm_out, (h_n, _) = self.lstm(z)
@ -47,13 +47,13 @@ class LandlordLstmModel(nn.Module):
class FarmerLstmModel(nn.Module): class FarmerLstmModel(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True) self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(484 + 128, 512) self.dense1 = nn.Linear(1178 + 128, 1024)
self.dense2 = nn.Linear(512, 512) self.dense2 = nn.Linear(1024, 1024)
self.dense3 = nn.Linear(512, 512) self.dense3 = nn.Linear(1024, 768)
self.dense4 = nn.Linear(512, 512) self.dense4 = nn.Linear(768, 512)
self.dense5 = nn.Linear(512, 512) self.dense5 = nn.Linear(512, 256)
self.dense6 = nn.Linear(512, 1) self.dense6 = nn.Linear(256, 1)
def forward(self, z, x, return_value=False, flags=None): def forward(self, z, x, return_value=False, flags=None):
lstm_out, (h_n, _) = self.lstm(z) lstm_out, (h_n, _) = self.lstm(z)
@ -82,8 +82,8 @@ class FarmerLstmModel(nn.Module):
class LandlordLstmNewModel(nn.Module): class LandlordLstmNewModel(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True) self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(373 + 128, 512) self.dense1 = nn.Linear(846 + 128, 512)
self.dense2 = nn.Linear(512, 512) self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512) self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512) self.dense4 = nn.Linear(512, 512)
@ -117,8 +117,8 @@ class LandlordLstmNewModel(nn.Module):
class FarmerLstmNewModel(nn.Module): class FarmerLstmNewModel(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.lstm = nn.LSTM(162, 128, batch_first=True) self.lstm = nn.LSTM(432, 128, batch_first=True)
self.dense1 = nn.Linear(484 + 128, 512) self.dense1 = nn.Linear(1178 + 128, 512)
self.dense2 = nn.Linear(512, 512) self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512) self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512) self.dense4 = nn.Linear(512, 512)
@ -253,20 +253,22 @@ class GeneralModel(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.in_planes = 80 self.in_planes = 80
#input 1*54*41 #input 1*108*41
self.conv1 = nn.Conv1d(40, 80, kernel_size=(3,), self.conv1 = nn.Conv1d(40, 80, kernel_size=(3,),
stride=(2,), padding=1, bias=False) #1*27*80 stride=(2,), padding=1, bias=False) #1*54*80
self.bn1 = nn.BatchNorm1d(80) self.bn1 = nn.BatchNorm1d(80)
self.layer1 = self._make_layer(BasicBlock, 80, 2, stride=2)#1*14*80 self.layer1 = self._make_layer(BasicBlock, 80, 2, stride=2)#1*27*80
self.layer2 = self._make_layer(BasicBlock, 160, 2, stride=2)#1*7*160 self.layer2 = self._make_layer(BasicBlock, 160, 2, stride=2)#1*14*160
self.layer3 = self._make_layer(BasicBlock, 320, 2, stride=2)#1*4*320 self.layer3 = self._make_layer(BasicBlock, 320, 2, stride=2)#1*7*320
self.layer4 = self._make_layer(BasicBlock, 640, 2, stride=2)#1*4*640
# self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) # self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.linear1 = nn.Linear(320 * BasicBlock.expansion * 4 + 15 * 4, 1024) self.linear1 = nn.Linear(640 * BasicBlock.expansion * 4 + 24 * 4, 2048)
self.linear2 = nn.Linear(1024, 512) self.linear2 = nn.Linear(2048, 1024)
self.linear3 = nn.Linear(512, 256) self.linear3 = nn.Linear(1024, 512)
self.linear4 = nn.Linear(256, 1) self.linear4 = nn.Linear(512, 256)
self.linear5 = nn.Linear(256, 1)
def _make_layer(self, block, planes, num_blocks, stride): def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1) strides = [stride] + [1] * (num_blocks - 1)
@ -281,12 +283,14 @@ class GeneralModel(nn.Module):
out = self.layer1(out) out = self.layer1(out)
out = self.layer2(out) out = self.layer2(out)
out = self.layer3(out) out = self.layer3(out)
out = self.layer4(out)
out = out.flatten(1,2) out = out.flatten(1,2)
out = torch.cat([x,x,x,x,out], dim=-1) out = torch.cat([x,x,x,x,out], dim=-1)
out = F.leaky_relu_(self.linear1(out)) out = F.leaky_relu_(self.linear1(out))
out = F.leaky_relu_(self.linear2(out)) out = F.leaky_relu_(self.linear2(out))
out = F.leaky_relu_(self.linear3(out)) out = F.leaky_relu_(self.linear3(out))
out = F.leaky_relu_(self.linear4(out)) out = F.leaky_relu_(self.linear4(out))
out = F.leaky_relu_(self.linear5(out))
if return_value: if return_value:
return dict(values=out) return dict(values=out)
else: else:
@ -304,7 +308,7 @@ class BidModel(nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.dense1 = nn.Linear(114, 512) self.dense1 = nn.Linear(208, 512)
self.dense2 = nn.Linear(512, 512) self.dense2 = nn.Linear(512, 512)
self.dense3 = nn.Linear(512, 512) self.dense3 = nn.Linear(512, 512)
self.dense4 = nn.Linear(512, 512) self.dense4 = nn.Linear(512, 512)
@ -342,15 +346,18 @@ class BidModel(nn.Module):
model_dict = {} model_dict = {}
model_dict['landlord'] = LandlordLstmModel model_dict['landlord'] = LandlordLstmModel
model_dict['landlord_up'] = FarmerLstmModel model_dict['landlord_up'] = FarmerLstmModel
model_dict['landlord_front'] = FarmerLstmModel
model_dict['landlord_down'] = FarmerLstmModel model_dict['landlord_down'] = FarmerLstmModel
model_dict_new = {} model_dict_new = {}
model_dict_new['landlord'] = GeneralModel model_dict_new['landlord'] = GeneralModel
model_dict_new['landlord_up'] = GeneralModel model_dict_new['landlord_up'] = GeneralModel
model_dict_new['landlord_front'] = GeneralModel
model_dict_new['landlord_down'] = GeneralModel model_dict_new['landlord_down'] = GeneralModel
model_dict_new['bidding'] = BidModel model_dict_new['bidding'] = BidModel
model_dict_lstm = {} model_dict_lstm = {}
model_dict_lstm['landlord'] = GeneralModel model_dict_lstm['landlord'] = GeneralModel
model_dict_lstm['landlord_up'] = GeneralModel model_dict_lstm['landlord_up'] = GeneralModel
model_dict_lstm['landlord_front'] = GeneralModel
model_dict_lstm['landlord_down'] = GeneralModel model_dict_lstm['landlord_down'] = GeneralModel
class General_Model: class General_Model:
@ -365,6 +372,7 @@ class General_Model:
# model = GeneralModel().to(torch.device(device)) # model = GeneralModel().to(torch.device(device))
self.models['landlord'] = GeneralModel1().to(torch.device(device)) self.models['landlord'] = GeneralModel1().to(torch.device(device))
self.models['landlord_up'] = GeneralModel1().to(torch.device(device)) self.models['landlord_up'] = GeneralModel1().to(torch.device(device))
self.models['landlord_front'] = GeneralModel1().to(torch.device(device))
self.models['landlord_down'] = GeneralModel1().to(torch.device(device)) self.models['landlord_down'] = GeneralModel1().to(torch.device(device))
self.models['bidding'] = BidModel().to(torch.device(device)) self.models['bidding'] = BidModel().to(torch.device(device))
@ -375,12 +383,14 @@ class General_Model:
def share_memory(self): def share_memory(self):
self.models['landlord'].share_memory() self.models['landlord'].share_memory()
self.models['landlord_up'].share_memory() self.models['landlord_up'].share_memory()
self.models['landlord_front'].share_memory()
self.models['landlord_down'].share_memory() self.models['landlord_down'].share_memory()
self.models['bidding'].share_memory() self.models['bidding'].share_memory()
def eval(self): def eval(self):
self.models['landlord'].eval() self.models['landlord'].eval()
self.models['landlord_up'].eval() self.models['landlord_up'].eval()
self.models['landlord_front'].eval()
self.models['landlord_down'].eval() self.models['landlord_down'].eval()
self.models['bidding'].eval() self.models['bidding'].eval()
@ -404,6 +414,7 @@ class OldModel:
device = 'cuda:' + str(device) device = 'cuda:' + str(device)
self.models['landlord'] = LandlordLstmModel().to(torch.device(device)) self.models['landlord'] = LandlordLstmModel().to(torch.device(device))
self.models['landlord_up'] = FarmerLstmModel().to(torch.device(device)) self.models['landlord_up'] = FarmerLstmModel().to(torch.device(device))
self.models['landlord_front'] = FarmerLstmModel().to(torch.device(device))
self.models['landlord_down'] = FarmerLstmModel().to(torch.device(device)) self.models['landlord_down'] = FarmerLstmModel().to(torch.device(device))
def forward(self, position, z, x, training=False, flags=None): def forward(self, position, z, x, training=False, flags=None):
@ -413,11 +424,13 @@ class OldModel:
def share_memory(self): def share_memory(self):
self.models['landlord'].share_memory() self.models['landlord'].share_memory()
self.models['landlord_up'].share_memory() self.models['landlord_up'].share_memory()
self.models['landlord_front'].share_memory()
self.models['landlord_down'].share_memory() self.models['landlord_down'].share_memory()
def eval(self): def eval(self):
self.models['landlord'].eval() self.models['landlord'].eval()
self.models['landlord_up'].eval() self.models['landlord_up'].eval()
self.models['landlord_front'].eval()
self.models['landlord_down'].eval() self.models['landlord_down'].eval()
def parameters(self, position): def parameters(self, position):
@ -442,6 +455,7 @@ class Model:
# model = GeneralModel().to(torch.device(device)) # model = GeneralModel().to(torch.device(device))
self.models['landlord'] = GeneralModel().to(torch.device(device)) self.models['landlord'] = GeneralModel().to(torch.device(device))
self.models['landlord_up'] = GeneralModel().to(torch.device(device)) self.models['landlord_up'] = GeneralModel().to(torch.device(device))
self.models['landlord_front'] = GeneralModel().to(torch.device(device))
self.models['landlord_down'] = GeneralModel().to(torch.device(device)) self.models['landlord_down'] = GeneralModel().to(torch.device(device))
self.models['bidding'] = BidModel().to(torch.device(device)) self.models['bidding'] = BidModel().to(torch.device(device))
@ -452,12 +466,14 @@ class Model:
def share_memory(self): def share_memory(self):
self.models['landlord'].share_memory() self.models['landlord'].share_memory()
self.models['landlord_up'].share_memory() self.models['landlord_up'].share_memory()
self.models['landlord_front'].share_memory()
self.models['landlord_down'].share_memory() self.models['landlord_down'].share_memory()
self.models['bidding'].share_memory() self.models['bidding'].share_memory()
def eval(self): def eval(self):
self.models['landlord'].eval() self.models['landlord'].eval()
self.models['landlord_up'].eval() self.models['landlord_up'].eval()
self.models['landlord_front'].eval()
self.models['landlord_down'].eval() self.models['landlord_down'].eval()
self.models['bidding'].eval() self.models['bidding'].eval()
@ -470,11 +486,3 @@ class Model:
def get_models(self): def get_models(self):
return self.models return self.models

View File

@ -16,11 +16,15 @@ from douzero.env import Env
Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7,
11: 8, 12: 9, 13: 10, 14: 11, 17: 12} 11: 8, 12: 9, 13: 10, 14: 11, 17: 12}
NumOnes2Array = {0: np.array([0, 0, 0, 0]), NumOnes2Array = {0: np.array([0, 0, 0, 0, 0, 0, 0, 0]),
1: np.array([1, 0, 0, 0]), 1: np.array([1, 0, 0, 0, 0, 0, 0, 0]),
2: np.array([1, 1, 0, 0]), 2: np.array([1, 1, 0, 0, 0, 0, 0, 0]),
3: np.array([1, 1, 1, 0]), 3: np.array([1, 1, 1, 0, 0, 0, 0, 0]),
4: np.array([1, 1, 1, 1])} 4: np.array([1, 1, 1, 1, 0, 0, 0, 0]),
5: np.array([1, 1, 1, 1, 1, 0, 0, 0]),
6: np.array([1, 1, 1, 1, 1, 1, 0, 0]),
7: np.array([1, 1, 1, 1, 1, 1, 1, 0]),
8: np.array([1, 1, 1, 1, 1, 1, 1, 1])}
shandle = logging.StreamHandler() shandle = logging.StreamHandler()
shandle.setFormatter( shandle.setFormatter(
@ -60,7 +64,7 @@ def create_optimizers(flags, learner_model):
""" """
Create one optimizer for each position Create one optimizer for each position
""" """
positions = ['landlord', 'landlord_up', 'landlord_down', 'bidding'] positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']
optimizers = {} optimizers = {}
for position in positions: for position in positions:
optimizer = RAdam( optimizer = RAdam(
@ -72,7 +76,7 @@ def create_optimizers(flags, learner_model):
def act(i, device, batch_queues, model, flags): def act(i, device, batch_queues, model, flags):
positions = ['landlord', 'landlord_up', 'landlord_down', 'bidding'] positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']
for pos in positions: for pos in positions:
model.models[pos].to(torch.device(device if device == "cpu" else ("cuda:"+str(device)))) model.models[pos].to(torch.device(device if device == "cpu" else ("cuda:"+str(device))))
try: try:
@ -90,9 +94,9 @@ def act(i, device, batch_queues, model, flags):
type_buf = {p: [] for p in positions} type_buf = {p: [] for p in positions}
obs_x_batch_buf = {p: [] for p in positions} obs_x_batch_buf = {p: [] for p in positions}
position_index = {"landlord": 31, "landlord_up": 32, "landlord_down": 33} position_index = {"landlord": 31, "landlord_up": 32, "landlord_front": 33, "landlord_down": 34}
bid_type_index = {"landlord": 41, "landlord_up": 42, "landlord_down": 43} bid_type_index = {"landlord": 41, "landlord_up": 42, "landlord_front": 43, "landlord_down": 43}
bid_type_map = {41: "landlord", 42: "landlord_up", 43: "landlord_down"} bid_type_map = {41: "landlord", 42: "landlord_up", 43: "landlord_front", 44: "landlord_down"}
position, obs, env_output = env.initial(model, device, flags=flags) position, obs, env_output = env.initial(model, device, flags=flags)
bid_obs_buffer = env_output["begin_buf"]["bid_obs_buffer"] bid_obs_buffer = env_output["begin_buf"]["bid_obs_buffer"]
@ -149,7 +153,7 @@ def act(i, device, batch_queues, model, flags):
target_buf[p].append(episode_return) target_buf[p].append(episode_return)
break break
for p in positions: for p in positions:
if size[p] > T: while size[p] > T:
# print(p, "epr", torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in episode_return_buf[p][:T]]),) # print(p, "epr", torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in episode_return_buf[p][:T]]),)
batch_queues[p].put({ batch_queues[p].put({
"done": torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in done_buf[p][:T]]), "done": torch.stack([torch.tensor(ndarr, device="cpu") for ndarr in done_buf[p][:T]]),
@ -182,18 +186,22 @@ def _cards2tensor(list_cards):
See Figure 2 in https://arxiv.org/pdf/2106.06135.pdf See Figure 2 in https://arxiv.org/pdf/2106.06135.pdf
""" """
if len(list_cards) == 0: if len(list_cards) == 0:
return torch.zeros(54, dtype=torch.int8) return torch.zeros(108, dtype=torch.int8)
matrix = np.zeros([4, 13], dtype=np.int8) matrix = np.zeros([8, 13], dtype=np.int8)
jokers = np.zeros(2, dtype=np.int8) jokers = np.zeros(4, dtype=np.int8)
counter = Counter(list_cards) counter = Counter(list_cards)
for card, num_times in counter.items(): for card, num_times in counter.items():
if card < 20: if card < 20:
matrix[:, Card2Column[card]] = NumOnes2Array[num_times] matrix[:, Card2Column[card]] = NumOnes2Array[num_times]
elif card == 20: elif card == 20:
jokers[0] = 1 jokers[0] = 1
if num_times == 2:
jokers[1] = 1
elif card == 30: elif card == 30:
jokers[1] = 1 jokers[2] = 1
if num_times == 2:
jokers[3] = 1
matrix = np.concatenate((matrix.flatten('F'), jokers)) matrix = np.concatenate((matrix.flatten('F'), jokers))
matrix = torch.from_numpy(matrix) matrix = torch.from_numpy(matrix)
return matrix return matrix

487
douzero/env/env.py vendored
View File

@ -11,17 +11,22 @@ env_url = "http://od.vcccz.com/hechuan/env.py"
Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, Card2Column = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7,
11: 8, 12: 9, 13: 10, 14: 11, 17: 12} 11: 8, 12: 9, 13: 10, 14: 11, 17: 12}
NumOnes2Array = {0: np.array([0, 0, 0, 0]), NumOnes2Array = {0: np.array([0, 0, 0, 0, 0, 0, 0, 0]),
1: np.array([1, 0, 0, 0]), 1: np.array([1, 0, 0, 0, 0, 0, 0, 0]),
2: np.array([1, 1, 0, 0]), 2: np.array([1, 1, 0, 0, 0, 0, 0, 0]),
3: np.array([1, 1, 1, 0]), 3: np.array([1, 1, 1, 0, 0, 0, 0, 0]),
4: np.array([1, 1, 1, 1])} 4: np.array([1, 1, 1, 1, 0, 0, 0, 0]),
5: np.array([1, 1, 1, 1, 1, 0, 0, 0]),
6: np.array([1, 1, 1, 1, 1, 1, 0, 0]),
7: np.array([1, 1, 1, 1, 1, 1, 1, 0]),
8: np.array([1, 1, 1, 1, 1, 1, 1, 1])}
deck = [] deck = []
for i in range(3, 15): for i in range(3, 15):
deck.extend([i for _ in range(4)]) deck.extend([i for _ in range(8)])
deck.extend([17 for _ in range(4)]) deck.extend([17 for _ in range(8)])
deck.extend([20, 30]) deck.extend([20, 20, 30, 30])
class Env: class Env:
@ -46,7 +51,7 @@ class Env:
# Initialize players # Initialize players
# We use a dummy player for each target position # We use a dummy player for each target position
self.players = {} self.players = {}
for position in ['landlord', 'landlord_up', 'landlord_down']: for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
self.players[position] = DummyAgent(position) self.players[position] = DummyAgent(position)
# Initialize the internal environment # Initialize the internal environment
@ -67,10 +72,11 @@ class Env:
if model is None: if model is None:
_deck = deck.copy() _deck = deck.copy()
np.random.shuffle(_deck) np.random.shuffle(_deck)
card_play_data = {'landlord': _deck[:20], card_play_data = {'landlord': _deck[:33],
'landlord_up': _deck[20:37], 'landlord_up': _deck[33:58],
'landlord_down': _deck[37:54], 'landlord_front': _deck[58:83],
'three_landlord_cards': _deck[17:20], 'landlord_down': _deck[83:108],
# 'three_landlord_cards': _deck[17:20],
} }
for key in card_play_data: for key in card_play_data:
card_play_data[key].sort() card_play_data[key].sort()
@ -97,18 +103,20 @@ class Env:
_deck = deck.copy() _deck = deck.copy()
np.random.shuffle(_deck) np.random.shuffle(_deck)
card_play_data = [ card_play_data = [
_deck[:17], _deck[:25],
_deck[17:34], _deck[25:50],
_deck[34:51], _deck[50:75],
_deck[75:100],
] ]
for i in range(3): for i in range(4):
card_play_data[i].sort() card_play_data[i].sort()
landlord_cards = _deck[51:54] landlord_cards = _deck[100:108]
landlord_cards.sort() landlord_cards.sort()
bid_info = np.array([[-1, -1, -1], bid_info = np.array([[-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1]]) [-1, -1, -1, -1],
[-1, -1, -1, -1]])
bidding_player = random.randint(0, 2) bidding_player = random.randint(0, 2)
# bidding_player = 0 # debug # bidding_player = 0 # debug
first_bid = -1 first_bid = -1
@ -116,7 +124,7 @@ class Env:
bid_count = 0 bid_count = 0
if bid_limit <= 0: if bid_limit <= 0:
force_bid = True force_bid = True
for r in range(3): for r in range(4):
bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player]) bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player])
with torch.no_grad(): with torch.no_grad():
action = model.forward("bidding", torch.tensor(bidding_obs["z_batch"], device=device), action = model.forward("bidding", torch.tensor(bidding_obs["z_batch"], device=device),
@ -137,19 +145,19 @@ class Env:
bid_count += 1 bid_count += 1
if first_bid == -1: if first_bid == -1:
first_bid = bidding_player first_bid = bidding_player
for p in range(3): for p in range(4):
if p == bidding_player: if p == bidding_player:
bid_info[r][p] = 1 bid_info[r][p] = 1
else: else:
bid_info[r][p] = 0 bid_info[r][p] = 0
else: else:
bid_info[r] = [0, 0, 0] bid_info[r] = [0, 0, 0, 0]
bidding_player = (bidding_player + 1) % 3 bidding_player = (bidding_player + 1) % 4
one_count = np.count_nonzero(bid_info == 1) one_count = np.count_nonzero(bid_info == 1)
if one_count == 0: if one_count == 0:
continue continue
elif one_count > 1: elif one_count > 1:
r = 3 r = 4
bidding_player = first_bid bidding_player = first_bid
bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player]) bidding_obs = _get_obs_for_bid(bidding_player, bid_info, card_play_data[bidding_player])
with torch.no_grad(): with torch.no_grad():
@ -163,7 +171,7 @@ class Env:
if action["action"] == 1: if action["action"] == 1:
last_bid = bidding_player last_bid = bidding_player
bid_count += 1 bid_count += 1
for p in range(3): for p in range(4):
if p == bidding_player: if p == bidding_player:
bid_info[r][p] = 1 bid_info[r][p] = 1
else: else:
@ -171,20 +179,23 @@ class Env:
break break
card_play_data[last_bid].extend(landlord_cards) card_play_data[last_bid].extend(landlord_cards)
card_play_data = {'landlord': card_play_data[last_bid], card_play_data = {'landlord': card_play_data[last_bid],
'landlord_up': card_play_data[(last_bid - 1) % 3], 'landlord_up': card_play_data[(last_bid - 1) % 4],
'landlord_down': card_play_data[(last_bid + 1) % 3], 'landlord_down': card_play_data[(last_bid + 1) % 4],
'three_landlord_cards': landlord_cards, 'landlord_front': card_play_data[(last_bid + 2) % 4],
# 'three_landlord_cards': landlord_cards,
} }
card_play_data["landlord"].sort() card_play_data["landlord"].sort()
player_ids = { player_ids = {
'landlord': last_bid, 'landlord': last_bid,
'landlord_up': (last_bid - 1) % 3, 'landlord_up': (last_bid - 1) % 4,
'landlord_down': (last_bid + 1) % 3, 'landlord_down': (last_bid + 1) % 4,
'landlord_front': (last_bid + 2) % 4,
} }
player_positions = { player_positions = {
last_bid: 'landlord', last_bid: 'landlord',
(last_bid - 1) % 3: 'landlord_up', (last_bid - 1) % 4: 'landlord_up',
(last_bid + 1) % 3: 'landlord_down' (last_bid + 1) % 4: 'landlord_down',
(last_bid + 2) % 4: 'landlord_front',
} }
for bid_obs in bid_obs_buffer: for bid_obs in bid_obs_buffer:
bid_obs.update({"position": player_positions[bid_obs["pid"]]}) bid_obs.update({"position": player_positions[bid_obs["pid"]]})
@ -192,14 +203,15 @@ class Env:
# Initialize the cards # Initialize the cards
self._env.card_play_init(card_play_data) self._env.card_play_init(card_play_data)
multiply_map = [ multiply_map = [
np.array([1, 0, 0]), np.array([1, 0, 0, 0]),
np.array([0, 1, 0]), np.array([0, 1, 0, 0]),
np.array([0, 0, 1]) np.array([0, 0, 1, 0]),
np.array([0, 0, 0, 1])
] ]
for pos in ["landlord", "landlord_up", "landlord_down"]: for pos in ["landlord", "landlord_up", "landlord_front", "landlord_down"]:
pid = player_ids[pos] pid = player_ids[pos]
self._env.info_sets[pos].player_id = pid self._env.info_sets[pos].player_id = pid
self._env.info_sets[pos].bid_info = bid_info[:, [(pid - 1) % 3, pid, (pid + 1) % 3]] self._env.info_sets[pos].bid_info = bid_info[:, [(pid - 1) % 4, pid, (pid + 1) % 4, (pid + 2) % 4]]
self._env.bid_count = bid_count self._env.bid_count = bid_count
# multiply_obs = _get_obs_for_multiply(pos, self._env.info_sets[pos].bid_info, card_play_data[pos], # multiply_obs = _get_obs_for_multiply(pos, self._env.info_sets[pos].bid_info, card_play_data[pos],
# landlord_cards) # landlord_cards)
@ -245,11 +257,13 @@ class Env:
"play": { "play": {
"landlord": self._get_reward("landlord"), "landlord": self._get_reward("landlord"),
"landlord_up": self._get_reward("landlord_up"), "landlord_up": self._get_reward("landlord_up"),
"landlord_front": self._get_reward("landlord_front"),
"landlord_down": self._get_reward("landlord_down") "landlord_down": self._get_reward("landlord_down")
}, },
"bid": { "bid": {
"landlord": self._get_reward_bidding("landlord")*2, "landlord": self._get_reward_bidding("landlord")*3,
"landlord_up": self._get_reward_bidding("landlord_up"), "landlord_up": self._get_reward_bidding("landlord_up"),
"landlord_front": self._get_reward_bidding("landlord_front"),
"landlord_down": self._get_reward_bidding("landlord_down") "landlord_down": self._get_reward_bidding("landlord_down")
} }
} }
@ -269,16 +283,19 @@ class Env:
self_bomb_num = self._env.pos_bomb_num[pos] self_bomb_num = self._env.pos_bomb_num[pos]
if winner == 'landlord': if winner == 'landlord':
if self.objective == 'adp': if self.objective == 'adp':
return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num +self._env.multiply_count[pos]) /8 return (1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] + self._env.multiply_count[pos]) /8
return (2.0 ** bomb_num[0]) * (3.0 ** bomb_num[1])
elif self.objective == 'logadp': elif self.objective == 'logadp':
return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4 return (1.0 - self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4
return bomb_num[0] + bomb_num[1] + 1.0
else: else:
return 1.0 - self._env.step_count * 0.0033 return 1.0 - self._env.step_count * 0.0033
return 1.0
else: else:
if self.objective == 'adp': if self.objective == 'adp':
return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num +self._env.multiply_count[pos]) /8 return (-1.1 - self._env.step_count * 0.0033) * 1.3 ** (bomb_num[0] + bomb_num[1] +self._env.multiply_count[pos]) /8
elif self.objective == 'logadp': elif self.objective == 'logadp':
return (-1.0 + self._env.step_count * 0.0033) * 1.3**self_bomb_num * 2**self._env.multiply_count[pos] / 4 return (-1.0 + self._env.step_count * 0.0033) * 1.3**(self_bomb_num) * 2**self._env.multiply_count[pos] / 4
else: else:
return -1.0 + self._env.step_count * 0.0033 return -1.0 + self._env.step_count * 0.0033
@ -371,12 +388,12 @@ def get_obs(infoset, use_general=True):
This function obtains observations with imperfect information This function obtains observations with imperfect information
from the infoset. It has three branches since we encode from the infoset. It has four branches since we encode
different features for different positions. different features for different positions.
This function will return dictionary named `obs`. It contains This function will return dictionary named `obs`. It contains
several fields. These fields will be used to train the model. several fields. These fields will be used to train the model.
One can play with those features to improve the performance. One can play with those features to improve the performance.
`position` is a string that can be landlord/landlord_down/landlord_up `position` is a string that can be landlord/landlord_down/landlord_front/landlord_up
`x_batch` is a batch of features (excluding the historical moves). `x_batch` is a batch of features (excluding the historical moves).
It also encodes the action feature It also encodes the action feature
@ -391,7 +408,7 @@ def get_obs(infoset, use_general=True):
`z`: same as z_batch but not a batch. `z`: same as z_batch but not a batch.
""" """
if use_general: if use_general:
if infoset.player_position not in ["landlord", "landlord_up", "landlord_down"]: if infoset.player_position not in ["landlord", "landlord_up", "landlord_front", "landlord_down"]:
raise ValueError('') raise ValueError('')
return _get_obs_general(infoset, infoset.player_position) return _get_obs_general(infoset, infoset.player_position)
else: else:
@ -399,6 +416,8 @@ def get_obs(infoset, use_general=True):
return _get_obs_landlord(infoset) return _get_obs_landlord(infoset)
elif infoset.player_position == 'landlord_up': elif infoset.player_position == 'landlord_up':
return _get_obs_landlord_up(infoset) return _get_obs_landlord_up(infoset)
elif infoset.player_position == 'landlord_front':
return _get_obs_landlord_front(infoset)
elif infoset.player_position == 'landlord_down': elif infoset.player_position == 'landlord_down':
return _get_obs_landlord_down(infoset) return _get_obs_landlord_down(infoset)
else: else:
@ -424,18 +443,22 @@ def _cards2array(list_cards):
the representations. the representations.
""" """
if len(list_cards) == 0: if len(list_cards) == 0:
return np.zeros(54, dtype=np.int8) return np.zeros(108, dtype=np.int8)
matrix = np.zeros([4, 13], dtype=np.int8) matrix = np.zeros([8, 13], dtype=np.int8)
jokers = np.zeros(2, dtype=np.int8) jokers = np.zeros(4, dtype=np.int8)
counter = Counter(list_cards) counter = Counter(list_cards)
for card, num_times in counter.items(): for card, num_times in counter.items():
if card < 20: if card < 20:
matrix[:, Card2Column[card]] = NumOnes2Array[num_times] matrix[:, Card2Column[card]] = NumOnes2Array[num_times]
elif card == 20: elif card == 20:
jokers[0] = 1 jokers[0] = 1
if num_times == 2:
jokers[1] = 1
elif card == 30: elif card == 30:
jokers[1] = 1 jokers[2] = 1
if num_times == 2:
jokers[3] = 1
return np.concatenate((matrix.flatten('F'), jokers)) return np.concatenate((matrix.flatten('F'), jokers))
@ -449,7 +472,7 @@ def _cards2array(list_cards):
# Finally, we obtain a 5x162 matrix, which will be fed # Finally, we obtain a 5x162 matrix, which will be fed
# into LSTM for encoding. # into LSTM for encoding.
# """ # """
# action_seq_array = np.zeros((len(action_seq_list), 54)) # action_seq_array = np.zeros((len(action_seq_list), 108))
# for row, list_cards in enumerate(action_seq_list): # for row, list_cards in enumerate(action_seq_list):
# action_seq_array[row, :] = _cards2array(list_cards) # action_seq_array[row, :] = _cards2array(list_cards)
# # action_seq_array = action_seq_array.reshape(5, 162) # # action_seq_array = action_seq_array.reshape(5, 162)
@ -458,26 +481,26 @@ def _cards2array(list_cards):
def _action_seq_list2array(action_seq_list, new_model=True): def _action_seq_list2array(action_seq_list, new_model=True):
""" """
A utility function to encode the historical moves. A utility function to encode the historical moves.
We encode the historical 15 actions. If there is We encode the historical 20 actions. If there is
no 15 actions, we pad the features with 0. Since no 20 actions, we pad the features with 0. Since
three moves is a round in DouDizhu, we concatenate four moves is a round in 4-player DouDizhu, we concatenate
the representations for each consecutive three moves. the representations for each consecutive four moves.
Finally, we obtain a 5x162 matrix, which will be fed Finally, we obtain a 5x432 matrix, which will be fed
into LSTM for encoding. into LSTM for encoding.
""" """
if new_model: if new_model:
position_map = {"landlord": 0, "landlord_up": 1, "landlord_down": 2} # position_map = {"landlord": 0, "landlord_up": 1, "landlord_front": 2, "landlord_down": 3}
action_seq_array = np.ones((len(action_seq_list), 54)) * -1 # Default Value -1 for not using area action_seq_array = np.ones((len(action_seq_list), 108)) * -1 # Default Value -1 for not using area
for row, list_cards in enumerate(action_seq_list): for row, list_cards in enumerate(action_seq_list):
if list_cards != []: if list_cards != []:
action_seq_array[row, :54] = _cards2array(list_cards[1]) action_seq_array[row, :108] = _cards2array(list_cards[1])
else: else:
action_seq_array = np.zeros((len(action_seq_list), 54)) action_seq_array = np.zeros((len(action_seq_list), 108))
for row, list_cards in enumerate(action_seq_list): for row, list_cards in enumerate(action_seq_list):
if list_cards != []: if list_cards != []:
action_seq_array[row, :] = _cards2array(list_cards[1]) action_seq_array[row, :] = _cards2array(list_cards[1])
action_seq_array = action_seq_array.reshape(5, 162) action_seq_array = action_seq_array.reshape(5, 432)
return action_seq_array return action_seq_array
# action_seq_array = np.zeros((len(action_seq_list), 54)) # action_seq_array = np.zeros((len(action_seq_list), 54))
@ -487,10 +510,10 @@ def _action_seq_list2array(action_seq_list, new_model=True):
# return action_seq_array # return action_seq_array
def _process_action_seq(sequence, length=15, new_model=True): def _process_action_seq(sequence, length=20, new_model=True):
""" """
A utility function encoding historical moves. We A utility function encoding historical moves. We
encode 15 moves. If there is no 15 moves, we pad encode 20 moves. If there is no 20 moves, we pad
with zeros. with zeros.
""" """
sequence = sequence[-length:].copy() sequence = sequence[-length:].copy()
@ -508,8 +531,8 @@ def _get_one_hot_bomb(bomb_num):
A utility function to encode the number of bombs A utility function to encode the number of bombs
into one-hot representation. into one-hot representation.
""" """
one_hot = np.zeros(15) one_hot = np.zeros(29)
one_hot[bomb_num] = 1 one_hot[bomb_num[0] + bomb_num[1]] = 1
return one_hot return one_hot
@ -536,13 +559,19 @@ def _get_obs_landlord(infoset):
my_action_batch[j, :] = _cards2array(action) my_action_batch[j, :] = _cards2array(action)
landlord_up_num_cards_left = _get_one_hot_array( landlord_up_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_up'], 17) infoset.num_cards_left_dict['landlord_up'], 25)
landlord_up_num_cards_left_batch = np.repeat( landlord_up_num_cards_left_batch = np.repeat(
landlord_up_num_cards_left[np.newaxis, :], landlord_up_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
landlord_front_num_cards_left_batch = np.repeat(
landlord_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_num_cards_left = _get_one_hot_array( landlord_down_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 17) infoset.num_cards_left_dict['landlord_down'], 25)
landlord_down_num_cards_left_batch = np.repeat( landlord_down_num_cards_left_batch = np.repeat(
landlord_down_num_cards_left[np.newaxis, :], landlord_down_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
@ -553,6 +582,12 @@ def _get_obs_landlord(infoset):
landlord_up_played_cards[np.newaxis, :], landlord_up_played_cards[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
landlord_front_played_cards_batch = np.repeat(
landlord_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_played_cards = _cards2array( landlord_down_played_cards = _cards2array(
infoset.played_cards['landlord_down']) infoset.played_cards['landlord_down'])
landlord_down_played_cards_batch = np.repeat( landlord_down_played_cards_batch = np.repeat(
@ -569,8 +604,10 @@ def _get_obs_landlord(infoset):
other_handcards_batch, other_handcards_batch,
last_action_batch, last_action_batch,
landlord_up_played_cards_batch, landlord_up_played_cards_batch,
landlord_front_played_cards_batch,
landlord_down_played_cards_batch, landlord_down_played_cards_batch,
landlord_up_num_cards_left_batch, landlord_up_num_cards_left_batch,
landlord_front_num_cards_left_batch,
landlord_down_num_cards_left_batch, landlord_down_num_cards_left_batch,
bomb_num_batch, bomb_num_batch,
my_action_batch)) my_action_batch))
@ -578,23 +615,25 @@ def _get_obs_landlord(infoset):
other_handcards, other_handcards,
last_action, last_action,
landlord_up_played_cards, landlord_up_played_cards,
landlord_front_played_cards,
landlord_down_played_cards, landlord_down_played_cards,
landlord_up_num_cards_left, landlord_up_num_cards_left,
landlord_front_num_cards_left,
landlord_down_num_cards_left, landlord_down_num_cards_left,
bomb_num)) bomb_num))
z = _action_seq_list2array(_process_action_seq( z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False) infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat( z_batch = np.repeat(
z[np.newaxis, :, :], z[np.newaxis, :, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
obs = { obs = {
'position': 'landlord', 'position': 'landlord',
'x_batch': x_batch.astype(np.float32), 'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32), 'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions, 'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8), 'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8), 'z': z.astype(np.int8),
} }
return obs return obs
def _get_obs_landlord_up(infoset): def _get_obs_landlord_up(infoset):
@ -625,7 +664,7 @@ def _get_obs_landlord_up(infoset):
last_landlord_action[np.newaxis, :], last_landlord_action[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_num_cards_left = _get_one_hot_array( landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 20) infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat( landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :], landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
@ -642,7 +681,7 @@ def _get_obs_landlord_up(infoset):
last_teammate_action[np.newaxis, :], last_teammate_action[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
teammate_num_cards_left = _get_one_hot_array( teammate_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 17) infoset.num_cards_left_dict['landlord_down'], 25)
teammate_num_cards_left_batch = np.repeat( teammate_num_cards_left_batch = np.repeat(
teammate_num_cards_left[np.newaxis, :], teammate_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
@ -653,6 +692,144 @@ def _get_obs_landlord_up(infoset):
teammate_played_cards[np.newaxis, :], teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
last_teammate_front_action = _cards2array(
infoset.last_move_dict['landlord_front'])
last_teammate_front_action_batch = np.repeat(
last_teammate_front_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
teammate_front_num_cards_left_batch = np.repeat(
teammate_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
teammate_front_played_cards_batch = np.repeat(
teammate_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
bomb_num = _get_one_hot_bomb(
infoset.bomb_num)
bomb_num_batch = np.repeat(
bomb_num[np.newaxis, :],
num_legal_actions, axis=0)
x_batch = np.hstack((my_handcards_batch,
other_handcards_batch,
landlord_played_cards_batch,
teammate_played_cards_batch,
teammate_front_played_cards_batch,
last_action_batch,
last_landlord_action_batch,
last_teammate_action_batch,
last_teammate_front_action_batch,
landlord_num_cards_left_batch,
teammate_num_cards_left_batch,
teammate_front_num_cards_left_batch,
bomb_num_batch,
my_action_batch))
x_no_action = np.hstack((my_handcards,
other_handcards,
landlord_played_cards,
teammate_played_cards,
teammate_front_played_cards,
last_action,
last_landlord_action,
last_teammate_action,
last_teammate_front_action,
landlord_num_cards_left,
teammate_num_cards_left,
teammate_front_num_cards_left,
bomb_num))
z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat(
z[np.newaxis, :, :],
num_legal_actions, axis=0)
obs = {
'position': 'landlord_up',
'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8),
}
return obs
def _get_obs_landlord_front(infoset):
"""
Obtain the landlord_front features. See Table 5 in
https://arxiv.org/pdf/2106.06135.pdf
"""
num_legal_actions = len(infoset.legal_actions)
my_handcards = _cards2array(infoset.player_hand_cards)
my_handcards_batch = np.repeat(my_handcards[np.newaxis, :],
num_legal_actions, axis=0)
other_handcards = _cards2array(infoset.other_hand_cards)
other_handcards_batch = np.repeat(other_handcards[np.newaxis, :],
num_legal_actions, axis=0)
last_action = _cards2array(infoset.last_move)
last_action_batch = np.repeat(last_action[np.newaxis, :],
num_legal_actions, axis=0)
my_action_batch = np.zeros(my_handcards_batch.shape)
for j, action in enumerate(infoset.legal_actions):
my_action_batch[j, :] = _cards2array(action)
last_landlord_action = _cards2array(
infoset.last_move_dict['landlord'])
last_landlord_action_batch = np.repeat(
last_landlord_action[np.newaxis, :],
num_legal_actions, axis=0)
landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_played_cards = _cards2array(
infoset.played_cards['landlord'])
landlord_played_cards_batch = np.repeat(
landlord_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
last_teammate_action = _cards2array(
infoset.last_move_dict['landlord_down'])
last_teammate_action_batch = np.repeat(
last_teammate_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 25)
teammate_num_cards_left_batch = np.repeat(
teammate_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_played_cards = _cards2array(
infoset.played_cards['landlord_down'])
teammate_played_cards_batch = np.repeat(
teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
last_teammate_front_action = _cards2array(
infoset.last_move_dict['landlord_front'])
last_teammate_front_action_batch = np.repeat(
last_teammate_front_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
teammate_front_num_cards_left_batch = np.repeat(
teammate_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
teammate_front_played_cards_batch = np.repeat(
teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
bomb_num = _get_one_hot_bomb( bomb_num = _get_one_hot_bomb(
infoset.bomb_num) infoset.bomb_num)
bomb_num_batch = np.repeat( bomb_num_batch = np.repeat(
@ -663,36 +840,42 @@ def _get_obs_landlord_up(infoset):
other_handcards_batch, other_handcards_batch,
landlord_played_cards_batch, landlord_played_cards_batch,
teammate_played_cards_batch, teammate_played_cards_batch,
teammate_front_played_cards_batch,
last_action_batch, last_action_batch,
last_landlord_action_batch, last_landlord_action_batch,
last_teammate_action_batch, last_teammate_action_batch,
last_teammate_front_action_batch,
landlord_num_cards_left_batch, landlord_num_cards_left_batch,
teammate_num_cards_left_batch, teammate_num_cards_left_batch,
teammate_front_num_cards_left_batch,
bomb_num_batch, bomb_num_batch,
my_action_batch)) my_action_batch))
x_no_action = np.hstack((my_handcards, x_no_action = np.hstack((my_handcards,
other_handcards, other_handcards,
landlord_played_cards, landlord_played_cards,
teammate_played_cards, teammate_played_cards,
teammate_front_played_cards,
last_action, last_action,
last_landlord_action, last_landlord_action,
last_teammate_action, last_teammate_action,
last_teammate_front_action,
landlord_num_cards_left, landlord_num_cards_left,
teammate_num_cards_left, teammate_num_cards_left,
teammate_front_num_cards_left,
bomb_num)) bomb_num))
z = _action_seq_list2array(_process_action_seq( z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False) infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat( z_batch = np.repeat(
z[np.newaxis, :, :], z[np.newaxis, :, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
obs = { obs = {
'position': 'landlord_up', 'position': 'landlord_front',
'x_batch': x_batch.astype(np.float32), 'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32), 'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions, 'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8), 'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8), 'z': z.astype(np.int8),
} }
return obs return obs
def _get_obs_landlord_down(infoset): def _get_obs_landlord_down(infoset):
@ -723,7 +906,7 @@ def _get_obs_landlord_down(infoset):
last_landlord_action[np.newaxis, :], last_landlord_action[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_num_cards_left = _get_one_hot_array( landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 20) infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat( landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :], landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
@ -740,7 +923,7 @@ def _get_obs_landlord_down(infoset):
last_teammate_action[np.newaxis, :], last_teammate_action[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
teammate_num_cards_left = _get_one_hot_array( teammate_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_up'], 17) infoset.num_cards_left_dict['landlord_up'], 25)
teammate_num_cards_left_batch = np.repeat( teammate_num_cards_left_batch = np.repeat(
teammate_num_cards_left[np.newaxis, :], teammate_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
@ -751,10 +934,21 @@ def _get_obs_landlord_down(infoset):
teammate_played_cards[np.newaxis, :], teammate_played_cards[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_played_cards = _cards2array( last_teammate_front_action = _cards2array(
infoset.played_cards['landlord']) infoset.last_move_dict['landlord_front'])
landlord_played_cards_batch = np.repeat( last_teammate_front_action_batch = np.repeat(
landlord_played_cards[np.newaxis, :], last_teammate_front_action[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
teammate_front_num_cards_left_batch = np.repeat(
teammate_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
teammate_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
teammate_front_played_cards_batch = np.repeat(
teammate_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
bomb_num = _get_one_hot_bomb( bomb_num = _get_one_hot_bomb(
@ -767,36 +961,42 @@ def _get_obs_landlord_down(infoset):
other_handcards_batch, other_handcards_batch,
landlord_played_cards_batch, landlord_played_cards_batch,
teammate_played_cards_batch, teammate_played_cards_batch,
teammate_front_played_cards_batch,
last_action_batch, last_action_batch,
last_landlord_action_batch, last_landlord_action_batch,
last_teammate_action_batch, last_teammate_action_batch,
last_teammate_front_action_batch,
landlord_num_cards_left_batch, landlord_num_cards_left_batch,
teammate_num_cards_left_batch, teammate_num_cards_left_batch,
teammate_front_num_cards_left_batch,
bomb_num_batch, bomb_num_batch,
my_action_batch)) my_action_batch))
x_no_action = np.hstack((my_handcards, x_no_action = np.hstack((my_handcards,
other_handcards, other_handcards,
landlord_played_cards, landlord_played_cards,
teammate_played_cards, teammate_played_cards,
teammate_front_played_cards,
last_action, last_action,
last_landlord_action, last_landlord_action,
last_teammate_action, last_teammate_action,
last_teammate_front_action,
landlord_num_cards_left, landlord_num_cards_left,
teammate_num_cards_left, teammate_num_cards_left,
teammate_front_num_cards_left,
bomb_num)) bomb_num))
z = _action_seq_list2array(_process_action_seq( z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False) infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat( z_batch = np.repeat(
z[np.newaxis, :, :], z[np.newaxis, :, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
obs = { obs = {
'position': 'landlord_down', 'position': 'landlord_down',
'x_batch': x_batch.astype(np.float32), 'x_batch': x_batch.astype(np.float32),
'z_batch': z_batch.astype(np.float32), 'z_batch': z_batch.astype(np.float32),
'legal_actions': infoset.legal_actions, 'legal_actions': infoset.legal_actions,
'x_no_action': x_no_action.astype(np.int8), 'x_no_action': x_no_action.astype(np.int8),
'z': z.astype(np.int8), 'z': z.astype(np.int8),
} }
return obs return obs
def _get_obs_landlord_withbid(infoset): def _get_obs_landlord_withbid(infoset):
@ -869,7 +1069,7 @@ def _get_obs_landlord_withbid(infoset):
landlord_down_num_cards_left, landlord_down_num_cards_left,
bomb_num)) bomb_num))
z = _action_seq_list2array(_process_action_seq( z = _action_seq_list2array(_process_action_seq(
infoset.card_play_action_seq, 15, False), False) infoset.card_play_action_seq, 20, False), False)
z_batch = np.repeat( z_batch = np.repeat(
z[np.newaxis, :, :], z[np.newaxis, :, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
@ -970,21 +1170,21 @@ def _get_obs_general1(infoset, position):
bomb_num[np.newaxis, :], bomb_num[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
x_batch = np.hstack((position_info_batch, # 3 x_batch = np.hstack((position_info_batch, # 4
my_handcards_batch, # 54 my_handcards_batch, # 108
other_handcards_batch, # 54 other_handcards_batch, # 108
three_landlord_cards_batch, # 54 three_landlord_cards_batch, # 108
last_action_batch, # 54 last_action_batch, # 108
landlord_played_cards_batch, # 54 landlord_played_cards_batch, # 108
landlord_up_played_cards_batch, # 54 landlord_up_played_cards_batch, # 108
landlord_down_played_cards_batch, # 54 landlord_down_played_cards_batch, # 108
landlord_num_cards_left_batch, # 20 landlord_num_cards_left_batch, # 20
landlord_up_num_cards_left_batch, # 17 landlord_up_num_cards_left_batch, # 17
landlord_down_num_cards_left_batch, # 17 landlord_down_num_cards_left_batch, # 17
bomb_num_batch, # 15 bomb_num_batch, # 15
bid_info_batch, # 12 bid_info_batch, # 12
multiply_info_batch, # 3 multiply_info_batch, # 3
my_action_batch)) # 54 my_action_batch)) # 108
x_no_action = np.hstack((position_info, x_no_action = np.hstack((position_info,
my_handcards, my_handcards,
other_handcards, other_handcards,
@ -1025,9 +1225,10 @@ def _get_obs_general(infoset, position):
num_legal_actions, axis=0) num_legal_actions, axis=0)
position_map = { position_map = {
"landlord": [1, 0, 0], "landlord": [1, 0, 0, 0],
"landlord_up": [0, 1, 0], "landlord_up": [0, 1, 0, 0],
"landlord_down": [0, 0, 1] "landlord_front": [0, 0, 1, 0],
"landlord_down": [0, 0, 0, 1]
} }
position_info = np.array(position_map[position]) position_info = np.array(position_map[position])
position_info_batch = np.repeat(position_info[np.newaxis, :], position_info_batch = np.repeat(position_info[np.newaxis, :],
@ -1041,9 +1242,9 @@ def _get_obs_general(infoset, position):
multiply_info_batch = np.repeat(multiply_info[np.newaxis, :], multiply_info_batch = np.repeat(multiply_info[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
three_landlord_cards = _cards2array(infoset.three_landlord_cards) # three_landlord_cards = _cards2array(infoset.three_landlord_cards)
three_landlord_cards_batch = np.repeat(three_landlord_cards[np.newaxis, :], # three_landlord_cards_batch = np.repeat(three_landlord_cards[np.newaxis, :],
num_legal_actions, axis=0) # num_legal_actions, axis=0)
last_action = _cards2array(infoset.last_move) last_action = _cards2array(infoset.last_move)
last_action_batch = np.repeat(last_action[np.newaxis, :], last_action_batch = np.repeat(last_action[np.newaxis, :],
@ -1054,25 +1255,31 @@ def _get_obs_general(infoset, position):
my_action_batch[j, :] = _cards2array(action) my_action_batch[j, :] = _cards2array(action)
landlord_num_cards_left = _get_one_hot_array( landlord_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord'], 20) infoset.num_cards_left_dict['landlord'], 33)
landlord_num_cards_left_batch = np.repeat( landlord_num_cards_left_batch = np.repeat(
landlord_num_cards_left[np.newaxis, :], landlord_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_up_num_cards_left = _get_one_hot_array( landlord_up_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_up'], 17) infoset.num_cards_left_dict['landlord_up'], 25)
landlord_up_num_cards_left_batch = np.repeat( landlord_up_num_cards_left_batch = np.repeat(
landlord_up_num_cards_left[np.newaxis, :], landlord_up_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_front_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_front'], 25)
landlord_front_num_cards_left_batch = np.repeat(
landlord_front_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_num_cards_left = _get_one_hot_array( landlord_down_num_cards_left = _get_one_hot_array(
infoset.num_cards_left_dict['landlord_down'], 17) infoset.num_cards_left_dict['landlord_down'], 25)
landlord_down_num_cards_left_batch = np.repeat( landlord_down_num_cards_left_batch = np.repeat(
landlord_down_num_cards_left[np.newaxis, :], landlord_down_num_cards_left[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
other_handcards_left_list = [] other_handcards_left_list = []
for pos in ["landlord", "landlord_up", "landlord_up"]: for pos in ["landlord", "landlord_up", "landlord_front", "landlord_down"]:
if pos != position: if pos != position:
other_handcards_left_list.extend(infoset.all_handcards[pos]) other_handcards_left_list.extend(infoset.all_handcards[pos])
@ -1088,6 +1295,12 @@ def _get_obs_general(infoset, position):
landlord_up_played_cards[np.newaxis, :], landlord_up_played_cards[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
landlord_front_played_cards = _cards2array(
infoset.played_cards['landlord_front'])
landlord_front_played_cards_batch = np.repeat(
landlord_front_played_cards[np.newaxis, :],
num_legal_actions, axis=0)
landlord_down_played_cards = _cards2array( landlord_down_played_cards = _cards2array(
infoset.played_cards['landlord_down']) infoset.played_cards['landlord_down'])
landlord_down_played_cards_batch = np.repeat( landlord_down_played_cards_batch = np.repeat(
@ -1100,24 +1313,26 @@ def _get_obs_general(infoset, position):
bomb_num[np.newaxis, :], bomb_num[np.newaxis, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
num_cards_left = np.hstack(( num_cards_left = np.hstack((
landlord_num_cards_left, # 20 landlord_num_cards_left, # 33
landlord_up_num_cards_left, # 17 landlord_up_num_cards_left, # 25
landlord_front_num_cards_left, # 25
landlord_down_num_cards_left)) landlord_down_num_cards_left))
x_batch = np.hstack(( x_batch = np.hstack((
bid_info_batch, # 12 bid_info_batch, # 16
multiply_info_batch)) # 3 multiply_info_batch)) # 4
x_no_action = np.hstack(( x_no_action = np.hstack((
bid_info, bid_info,
multiply_info)) multiply_info))
z =np.vstack(( z =np.vstack((
num_cards_left, num_cards_left,
my_handcards, # 54 my_handcards, # 108
other_handcards, # 54 other_handcards, # 108
three_landlord_cards, # 54 # three_landlord_cards, # 108
landlord_played_cards, # 54 landlord_played_cards, # 108
landlord_up_played_cards, # 54 landlord_up_played_cards, # 108
landlord_down_played_cards, # 54 landlord_front_played_cards, # 108
landlord_down_played_cards, # 108
_action_seq_list2array(_process_action_seq(infoset.card_play_action_seq, 32)) _action_seq_list2array(_process_action_seq(infoset.card_play_action_seq, 32))
)) ))
@ -1125,7 +1340,7 @@ def _get_obs_general(infoset, position):
z[np.newaxis, :, :], z[np.newaxis, :, :],
num_legal_actions, axis=0) num_legal_actions, axis=0)
my_action_batch = my_action_batch[:,np.newaxis,:] my_action_batch = my_action_batch[:,np.newaxis,:]
z_batch = np.zeros([len(_z_batch),40,54],int) z_batch = np.zeros([len(_z_batch),40,108],int)
for i in range(0,len(_z_batch)): for i in range(0,len(_z_batch)):
z_batch[i] = np.vstack((my_action_batch[i],_z_batch[i])) z_batch[i] = np.vstack((my_action_batch[i],_z_batch[i]))
obs = { obs = {
@ -1139,17 +1354,17 @@ def _get_obs_general(infoset, position):
return obs return obs
def gen_bid_legal_actions(player_id, bid_info):
    """Enumerate the two candidate bid tables for the current bidding round.

    The bid table is re-ordered column-wise so the acting player sits in
    column 1: ``[previous, self, next, opposite]``. The first row that still
    contains ``-1`` is treated as the round in progress, and two candidates
    are produced for it: one with the row set to ``[0, 0, 0, 0]`` and one with
    the row set to ``[0, 1, 0, 0]`` (a 1 in the acting player's column).

    Args:
        player_id: Column index of the acting player in ``bid_info``
            (expected to be in ``0..3``).
        bid_info: 2-D bid table with four columns; ``-1`` marks an undecided
            entry. A NumPy array or a nested list is accepted.

    Returns:
        A NumPy array holding the two flattened candidate tables, or an
        empty array when no row contains ``-1``.
    """
    bid_info = np.asarray(bid_info)
    # Fancy indexing returns a copy, so the caller's table is never mutated.
    self_bid_info = bid_info[:, [(player_id - 1) % 4,
                                 player_id,
                                 (player_id + 1) % 4,
                                 (player_id + 2) % 4]]
    curr_round = -1
    # Scan the rows that actually exist instead of a hard-coded count, so a
    # fully decided table with fewer rows cannot index out of bounds.
    for r in range(len(self_bid_info)):
        if -1 in self_bid_info[r]:
            curr_round = r
            break
    bid_actions = []
    if curr_round != -1:
        self_bid_info[curr_round] = [0, 0, 0, 0]
        bid_actions.append(np.array(self_bid_info).flatten())
        self_bid_info[curr_round] = [0, 1, 0, 0]
        bid_actions.append(np.array(self_bid_info).flatten())
    return np.array(bid_actions)
@ -1273,9 +1488,9 @@ def _get_obs_for_bid_legacy(player_id, bid_info, hand_cards):
return obs return obs
def _get_obs_for_bid(player_id, bid_info, hand_cards): def _get_obs_for_bid(player_id, bid_info, hand_cards):
all_cards = [3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, # all_cards = [3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, # 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12,
12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 20, 30] # 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 17, 17, 17, 17, 20, 30]
num_legal_actions = 2 num_legal_actions = 2
my_handcards = _cards2array(hand_cards) my_handcards = _cards2array(hand_cards)
my_handcards_batch = np.repeat(my_handcards[np.newaxis, :], my_handcards_batch = np.repeat(my_handcards[np.newaxis, :],

199
douzero/env/game.py vendored
View File

@ -11,10 +11,19 @@ RealCard2EnvCard = {'3': 3, '4': 4, '5': 5, '6': 6, '7': 7,
'8': 8, '9': 9, '10': 10, 'J': 11, 'Q': 12, '8': 8, '9': 9, '10': 10, 'J': 11, 'Q': 12,
'K': 13, 'A': 14, '2': 17, 'X': 20, 'D': 30} 'K': 13, 'A': 14, '2': 17, 'X': 20, 'D': 30}
# Bombs that raise the stake, grouped in two tiers (indexed like bomb_num):
#   bombs[0]: six and seven of a kind for ranks 3..14 and 17
#   bombs[1]: eight of a kind for the same ranks, plus the four-joker bomb
# Four and five of a kind are deliberately absent from this table.
bombs = [
    [[rank] * count for count in (6, 7) for rank in (*range(3, 15), 17)],
    [[rank] * 8 for rank in (*range(3, 15), 17)] + [[20, 20, 30, 30]],
]
class GameEnv(object): class GameEnv(object):
@ -22,7 +31,7 @@ class GameEnv(object):
self.card_play_action_seq = [] self.card_play_action_seq = []
self.three_landlord_cards = None # self.three_landlord_cards = None
self.game_over = False self.game_over = False
self.acting_player_position = None self.acting_player_position = None
@ -32,10 +41,12 @@ class GameEnv(object):
self.last_move_dict = {'landlord': [], self.last_move_dict = {'landlord': [],
'landlord_up': [], 'landlord_up': [],
'landlord_front': [],
'landlord_down': []} 'landlord_down': []}
self.played_cards = {'landlord': [], self.played_cards = {'landlord': [],
'landlord_up': [], 'landlord_up': [],
'landlord_front': [],
'landlord_down': []} 'landlord_down': []}
self.last_move = [] self.last_move = []
@ -48,24 +59,28 @@ class GameEnv(object):
'farmer': 0} 'farmer': 0}
self.info_sets = {'landlord': InfoSet('landlord'), self.info_sets = {'landlord': InfoSet('landlord'),
'landlord_up': InfoSet('landlord_up'), 'landlord_up': InfoSet('landlord_up'),
'landlord_down': InfoSet('landlord_down')} 'landlord_front': InfoSet('landlord_front'),
'landlord_down': InfoSet('landlord_down')}
self.bomb_num = 0 self.bomb_num = [0, 0]
self.pos_bomb_num = { self.pos_bomb_num = {
"landlord": 0, "landlord": 0,
"landlord_up": 0, "landlord_up": 0,
"landlord_front": 0,
"landlord_down": 0 "landlord_down": 0
} }
self.last_pid = 'landlord' self.last_pid = 'landlord'
self.bid_info = [[-1, -1, -1], self.bid_info = [[-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1]] [-1, -1, -1, -1],
[-1, -1, -1, -1]]
self.bid_count = 0 self.bid_count = 0
self.multiply_count = {'landlord': 0, self.multiply_count = {'landlord': 0,
'landlord_up': 0, 'landlord_up': 0,
'landlord_front': 0,
'landlord_down': 0} 'landlord_down': 0}
self.step_count = 0 self.step_count = 0
@ -75,9 +90,11 @@ class GameEnv(object):
card_play_data['landlord'] card_play_data['landlord']
self.info_sets['landlord_up'].player_hand_cards = \ self.info_sets['landlord_up'].player_hand_cards = \
card_play_data['landlord_up'] card_play_data['landlord_up']
self.info_sets['landlord_front'].player_hand_cards = \
card_play_data['landlord_front']
self.info_sets['landlord_down'].player_hand_cards = \ self.info_sets['landlord_down'].player_hand_cards = \
card_play_data['landlord_down'] card_play_data['landlord_down']
self.three_landlord_cards = card_play_data['three_landlord_cards'] # self.three_landlord_cards = card_play_data['three_landlord_cards']
self.get_acting_player_position() self.get_acting_player_position()
self.game_infoset = self.get_infoset() self.game_infoset = self.get_infoset()
@ -85,6 +102,7 @@ class GameEnv(object):
def game_done(self): def game_done(self):
if len(self.info_sets['landlord'].player_hand_cards) == 0 or \ if len(self.info_sets['landlord'].player_hand_cards) == 0 or \
len(self.info_sets['landlord_up'].player_hand_cards) == 0 or \ len(self.info_sets['landlord_up'].player_hand_cards) == 0 or \
len(self.info_sets['landlord_front'].player_hand_cards) == 0 or \
len(self.info_sets['landlord_down'].player_hand_cards) == 0: len(self.info_sets['landlord_down'].player_hand_cards) == 0:
# if any one of the four players has played out his whole hand,
# then the game is over.
@ -96,21 +114,21 @@ class GameEnv(object):
def compute_player_utility(self):
    """Store the end-of-game utility for the landlord and the farmer side.

    The landlord is considered the winner when its hand is empty; it then
    gets +3 and the farmers -1. Otherwise the signs are flipped.
    """
    landlord_hand = self.info_sets['landlord'].player_hand_cards
    landlord_sign = 1 if len(landlord_hand) == 0 else -1
    self.player_utility_dict = {'landlord': 3 * landlord_sign,
                                'farmer': -landlord_sign}
def update_num_wins_scores(self):
    """Apply the finished game's result to the win and score counters.

    For each side in ``player_utility_dict`` the stake is the base score
    (3 for the landlord, 1 for the farmer side) doubled once per counted
    ``bomb_num[0]`` bomb and tripled once per counted ``bomb_num[1]`` bomb.
    The side with positive utility gains the stake and is recorded as the
    winner; the other side loses it.
    """
    for pos, utility in self.player_utility_dict.items():
        stake = (3 if pos == 'landlord' else 1) \
            * (2 ** self.bomb_num[0]) * (3 ** self.bomb_num[1])
        if not utility > 0:
            self.num_scores[pos] -= stake
            continue
        self.num_wins[pos] += 1
        self.winner = pos
        self.num_scores[pos] += stake
def get_winner(self):
    """Return the value currently stored in ``self.winner``."""
    winner = self.winner
    return winner
@ -121,12 +139,17 @@ class GameEnv(object):
def step(self): def step(self):
action = self.players[self.acting_player_position].act( action = self.players[self.acting_player_position].act(
self.game_infoset) self.game_infoset)
self.step_count += 1 assert action in self.game_infoset.legal_actions
if len(action) > 0: if len(action) > 0:
self.last_pid = self.acting_player_position self.last_pid = self.acting_player_position
if action in bombs: if action in bombs[0]:
self.bomb_num += 1 self.bomb_num[0] += 1
self.pos_bomb_num[self.acting_player_position] += 1
if action in bombs[1]:
self.bomb_num[1] += 1
self.pos_bomb_num[self.acting_player_position] += 1 self.pos_bomb_num[self.acting_player_position] += 1
self.last_move_dict[ self.last_move_dict[
@ -137,15 +160,15 @@ class GameEnv(object):
self.played_cards[self.acting_player_position] += action self.played_cards[self.acting_player_position] += action
if self.acting_player_position == 'landlord' and \ # if self.acting_player_position == 'landlord' and \
len(action) > 0 and \ # len(action) > 0 and \
len(self.three_landlord_cards) > 0: # len(self.three_landlord_cards) > 0:
for card in action: # for card in action:
if len(self.three_landlord_cards) > 0: # if len(self.three_landlord_cards) > 0:
if card in self.three_landlord_cards: # if card in self.three_landlord_cards:
self.three_landlord_cards.remove(card) # self.three_landlord_cards.remove(card)
else: # else:
break # break
self.game_done() self.game_done()
if not self.game_over: if not self.game_over:
@ -156,7 +179,7 @@ class GameEnv(object):
def get_last_move(self): def get_last_move(self):
last_move = [] last_move = []
if len(self.card_play_action_seq) != 0: if len(self.card_play_action_seq) != 0:
if len(self.card_play_action_seq[-1][1]) == 0: if len(self.card_play_action_seq[-1]) == 0:
last_move = self.card_play_action_seq[-2][1] last_move = self.card_play_action_seq[-2][1]
else: else:
last_move = self.card_play_action_seq[-1][1] last_move = self.card_play_action_seq[-1][1]
@ -166,7 +189,7 @@ class GameEnv(object):
def get_last_two_moves(self):
    """Return the two most recent moves, newest first.

    The result always has exactly two entries; missing history is padded
    with empty lists.

    NOTE(review): other code in this class reads history entries as
    ``(position, move)`` pairs (``entry[1]``). Tuple entries are therefore
    unwrapped to their move so the result is uniformly a list of card
    lists; plain card-list entries are passed through unchanged. Confirm
    the entry format against the place where the history is appended.
    """
    last_two_moves = [[], []]
    for entry in self.card_play_action_seq[-2:]:
        move = entry[1] if isinstance(entry, tuple) else entry
        last_two_moves.insert(0, move)
    return last_two_moves[:2]
@ -179,6 +202,9 @@ class GameEnv(object):
self.acting_player_position = 'landlord_down' self.acting_player_position = 'landlord_down'
elif self.acting_player_position == 'landlord_down': elif self.acting_player_position == 'landlord_down':
self.acting_player_position = 'landlord_front'
elif self.acting_player_position == 'landlord_front':
self.acting_player_position = 'landlord_up' self.acting_player_position = 'landlord_up'
else: else:
@ -202,7 +228,10 @@ class GameEnv(object):
rival_move = [] rival_move = []
if len(action_sequence) != 0: if len(action_sequence) != 0:
if len(action_sequence[-1][1]) == 0: if len(action_sequence[-1][1]) == 0:
rival_move = action_sequence[-2][1] if len(action_sequence[-2][1]) == 0:
rival_move = action_sequence[-3][1]
else:
rival_move = action_sequence[-2][1]
else: else:
rival_move = action_sequence[-1][1] rival_move = action_sequence[-1][1]
@ -227,15 +256,36 @@ class GameEnv(object):
moves = ms.filter_type_3_triple(all_moves, rival_move) moves = ms.filter_type_3_triple(all_moves, rival_move)
elif rival_move_type == md.TYPE_4_BOMB: elif rival_move_type == md.TYPE_4_BOMB:
all_moves = mg.gen_type_4_bomb() + mg.gen_type_5_king_bomb() all_moves = mg.gen_type_4_bomb(4)
moves = ms.filter_type_4_bomb(all_moves, rival_move) moves = ms.filter_type_4_bomb(all_moves, rival_move)
all_moves += mg.gen_type_4_bomb(5) + mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB5:
all_moves = mg.gen_type_4_bomb(5)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
all_moves += mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB6:
all_moves = mg.gen_type_4_bomb(6)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
all_moves += mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB7:
all_moves = mg.gen_type_4_bomb(7)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
all_moves += mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_4_BOMB8:
all_moves = mg.gen_type_4_bomb(8)
moves = ms.filter_type_4_bomb(all_moves, rival_move)
all_moves += mg.gen_type_5_king_bomb()
elif rival_move_type == md.TYPE_5_KING_BOMB: elif rival_move_type == md.TYPE_5_KING_BOMB:
moves = [] moves = []
elif rival_move_type == md.TYPE_6_3_1: # elif rival_move_type == md.TYPE_6_3_1:
all_moves = mg.gen_type_6_3_1() # all_moves = mg.gen_type_6_3_1()
moves = ms.filter_type_6_3_1(all_moves, rival_move) # moves = ms.filter_type_6_3_1(all_moves, rival_move)
elif rival_move_type == md.TYPE_7_3_2: elif rival_move_type == md.TYPE_7_3_2:
all_moves = mg.gen_type_7_3_2() all_moves = mg.gen_type_7_3_2()
@ -253,25 +303,24 @@ class GameEnv(object):
all_moves = mg.gen_type_10_serial_triple(repeat_num=rival_move_len) all_moves = mg.gen_type_10_serial_triple(repeat_num=rival_move_len)
moves = ms.filter_type_10_serial_triple(all_moves, rival_move) moves = ms.filter_type_10_serial_triple(all_moves, rival_move)
elif rival_move_type == md.TYPE_11_SERIAL_3_1: # elif rival_move_type == md.TYPE_11_SERIAL_3_1:
all_moves = mg.gen_type_11_serial_3_1(repeat_num=rival_move_len) # all_moves = mg.gen_type_11_serial_3_1(repeat_num=rival_move_len)
moves = ms.filter_type_11_serial_3_1(all_moves, rival_move) # moves = ms.filter_type_11_serial_3_1(all_moves, rival_move)
elif rival_move_type == md.TYPE_12_SERIAL_3_2: elif rival_move_type == md.TYPE_12_SERIAL_3_2:
all_moves = mg.gen_type_12_serial_3_2(repeat_num=rival_move_len) all_moves = mg.gen_type_12_serial_3_2(repeat_num=rival_move_len)
moves = ms.filter_type_12_serial_3_2(all_moves, rival_move) moves = ms.filter_type_12_serial_3_2(all_moves, rival_move)
elif rival_move_type == md.TYPE_13_4_2: # elif rival_move_type == md.TYPE_13_4_2:
all_moves = mg.gen_type_13_4_2() # all_moves = mg.gen_type_13_4_2()
moves = ms.filter_type_13_4_2(all_moves, rival_move) # moves = ms.filter_type_13_4_2(all_moves, rival_move)
elif rival_move_type == md.TYPE_14_4_22: # elif rival_move_type == md.TYPE_14_4_22:
all_moves = mg.gen_type_14_4_22() # all_moves = mg.gen_type_14_4_22()
moves = ms.filter_type_14_4_22(all_moves, rival_move) # moves = ms.filter_type_14_4_22(all_moves, rival_move)
if rival_move_type not in [md.TYPE_0_PASS, if rival_move_type != md.TYPE_0_PASS and rival_move_type < md.TYPE_4_BOMB:
md.TYPE_4_BOMB, md.TYPE_5_KING_BOMB]: moves = moves + mg.gen_type_4_bomb(4) + mg.gen_type_4_bomb(5) + mg.gen_type_4_bomb(6) + mg.gen_type_4_bomb(7) + mg.gen_type_4_bomb(8) + mg.gen_type_5_king_bomb()
moves = moves + mg.gen_type_4_bomb() + mg.gen_type_5_king_bomb()
if len(rival_move) != 0: # rival_move is not 'pass' if len(rival_move) != 0: # rival_move is not 'pass'
moves = moves + [[]] moves = moves + [[]]
@ -284,7 +333,7 @@ class GameEnv(object):
def reset(self): def reset(self):
self.card_play_action_seq = [] self.card_play_action_seq = []
self.three_landlord_cards = None # self.three_landlord_cards = None
self.game_over = False self.game_over = False
self.acting_player_position = None self.acting_player_position = None
@ -292,33 +341,40 @@ class GameEnv(object):
self.last_move_dict = {'landlord': [], self.last_move_dict = {'landlord': [],
'landlord_up': [], 'landlord_up': [],
'landlord_front': [],
'landlord_down': []} 'landlord_down': []}
self.played_cards = {'landlord': [], self.played_cards = {'landlord': [],
'landlord_up': [], 'landlord_up': [],
'landlord_front': [],
'landlord_down': []} 'landlord_down': []}
self.last_move = [] self.last_move = []
self.last_two_moves = [] self.last_two_moves = []
self.info_sets = {'landlord': InfoSet('landlord'), self.info_sets = {'landlord': InfoSet('landlord'),
'landlord_up': InfoSet('landlord_up'), 'landlord_up': InfoSet('landlord_up'),
'landlord_down': InfoSet('landlord_down')} 'landlord_front': InfoSet('landlord_front'),
'landlord_down': InfoSet('landlord_down')}
self.bomb_num = 0 self.bomb_num = [0, 0]
self.pos_bomb_num = { self.pos_bomb_num = {
"landlord": 0, "landlord": 0,
"landlord_up": 0, "landlord_up": 0,
"landlord_front": 0,
"landlord_down": 0 "landlord_down": 0
} }
self.last_pid = 'landlord' self.last_pid = 'landlord'
self.bid_info = [[-1, -1, -1],
[-1, -1, -1], self.bid_info = [[-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1]] [-1, -1, -1, -1],
[-1, -1, -1, -1],
[-1, -1, -1, -1]]
self.bid_count = 0 self.bid_count = 0
self.multiply_count = {'landlord': 0, self.multiply_count = {'landlord': 0,
'landlord_up': 0, 'landlord_up': 0,
'landlord_front': 0,
'landlord_down': 0} 'landlord_down': 0}
self.step_count = 0 self.step_count = 0
@ -344,10 +400,10 @@ class GameEnv(object):
self.info_sets[self.acting_player_position].num_cards_left_dict = \ self.info_sets[self.acting_player_position].num_cards_left_dict = \
{pos: len(self.info_sets[pos].player_hand_cards) {pos: len(self.info_sets[pos].player_hand_cards)
for pos in ['landlord', 'landlord_up', 'landlord_down']} for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']}
self.info_sets[self.acting_player_position].other_hand_cards = [] self.info_sets[self.acting_player_position].other_hand_cards = []
for pos in ['landlord', 'landlord_up', 'landlord_down']: for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
if pos != self.acting_player_position: if pos != self.acting_player_position:
self.info_sets[ self.info_sets[
self.acting_player_position].other_hand_cards += \ self.acting_player_position].other_hand_cards += \
@ -355,15 +411,15 @@ class GameEnv(object):
self.info_sets[self.acting_player_position].played_cards = \ self.info_sets[self.acting_player_position].played_cards = \
self.played_cards self.played_cards
self.info_sets[self.acting_player_position].three_landlord_cards = \ # self.info_sets[self.acting_player_position].three_landlord_cards = \
self.three_landlord_cards # self.three_landlord_cards
self.info_sets[self.acting_player_position].card_play_action_seq = \ self.info_sets[self.acting_player_position].card_play_action_seq = \
self.card_play_action_seq self.card_play_action_seq
self.info_sets[ self.info_sets[
self.acting_player_position].all_handcards = \ self.acting_player_position].all_handcards = \
{pos: self.info_sets[pos].player_hand_cards {pos: self.info_sets[pos].player_hand_cards
for pos in ['landlord', 'landlord_up', 'landlord_down']} for pos in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']}
return deepcopy(self.info_sets[self.acting_player_position]) return deepcopy(self.info_sets[self.acting_player_position])
@ -379,13 +435,13 @@ class InfoSet(object):
self.player_position = player_position self.player_position = player_position
# The hand cards of the current player. A list.
self.player_hand_cards = None self.player_hand_cards = None
# The number of cards left for each player. It is a dict with str-->int # The number of cards left for each player. It is a dict with str-->int
self.num_cards_left_dict = None self.num_cards_left_dict = None
# The three landload cards. A list. # The three landload cards. A list.
self.three_landlord_cards = None # self.three_landlord_cards = None
# The historical moves. It is a list of list # The historical moves. It is a list of list
self.card_play_action_seq = None self.card_play_action_seq = None
# The union of the hand cards of the other three players for the current player
self.other_hand_cards = None self.other_hand_cards = None
# The legal actions for the current move. It is a list of list # The legal actions for the current move. It is a list of list
self.legal_actions = None self.legal_actions = None
@ -397,18 +453,19 @@ class InfoSet(object):
self.last_move_dict = None self.last_move_dict = None
# The cards played so far, keyed by player position. It is a dict.
self.played_cards = None self.played_cards = None
# The hand cards of all the players. It is a dict. # The hand cards of all the players. It is a dict.
self.all_handcards = None self.all_handcards = None
# Last player position that plays a valid move, i.e., not `pass` # Last player position that plays a valid move, i.e., not `pass`
self.last_pid = None self.last_pid = None
# The number of bombs played so far # The number of bombs played so far
self.bomb_num = None self.bomb_num = None
self.bid_info = [[-1, -1, -1], self.bid_info = [[-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1], [-1, -1, -1, -1],
[-1, -1, -1]] [-1, -1, -1, -1],
[-1, -1, -1, -1]]
self.multiply_info = [1, 0, 0] self.multiply_info = [1, 0, 0, 0]
self.player_id = None self.player_id = None

View File

@ -91,17 +91,17 @@ class MovesGener(object):
self.triple_cards_moves.append([k, k, k]) self.triple_cards_moves.append([k, k, k])
return self.triple_cards_moves return self.triple_cards_moves
def gen_type_4_bomb(self, num=4):
    """Generate every bomb made of ``num`` identical cards.

    Args:
        num: Number of identical cards in the bomb. Defaults to 4 so
            callers that pass no argument keep working.

    Returns:
        A list of moves, each ``[rank] * num``, one per rank of which the
        hand holds at least ``num`` cards. The same list is also stored on
        ``self.bomb_moves``.
    """
    # In the multi-deck variant a hand can hold more than ``num`` copies of
    # a rank; it can still play a ``num``-card bomb from them, so the test
    # is "at least ``num``" rather than "exactly ``num``".
    self.bomb_moves = [[k] * num
                       for k, v in self.cards_dict.items()
                       if v >= num]
    return self.bomb_moves
def gen_type_5_king_bomb(self):
    """Generate the joker bomb ``[20, 20, 30, 30]`` when the hand allows it.

    The bomb is produced only when both joker ranks (20 and 30) are present
    in ``cards_list`` and ``cards_dict`` counts exactly two of each.

    Returns:
        A list containing the single joker-bomb move, or an empty list.
        The same list is also stored on ``self.final_bomb_moves``.
    """
    has_all_jokers = all(
        joker in self.cards_list and self.cards_dict[joker] == 2
        for joker in (20, 30)
    )
    self.final_bomb_moves = [[20, 20, 30, 30]] if has_all_jokers else []
    return self.final_bomb_moves
def gen_type_6_3_1(self): def gen_type_6_3_1(self):
@ -205,15 +205,19 @@ class MovesGener(object):
moves.extend(self.gen_type_1_single()) moves.extend(self.gen_type_1_single())
moves.extend(self.gen_type_2_pair()) moves.extend(self.gen_type_2_pair())
moves.extend(self.gen_type_3_triple()) moves.extend(self.gen_type_3_triple())
moves.extend(self.gen_type_4_bomb()) moves.extend(self.gen_type_4_bomb(4))
moves.extend(self.gen_type_4_bomb(5))
moves.extend(self.gen_type_4_bomb(6))
moves.extend(self.gen_type_4_bomb(7))
moves.extend(self.gen_type_4_bomb(8))
moves.extend(self.gen_type_5_king_bomb()) moves.extend(self.gen_type_5_king_bomb())
moves.extend(self.gen_type_6_3_1()) # moves.extend(self.gen_type_6_3_1())
moves.extend(self.gen_type_7_3_2()) moves.extend(self.gen_type_7_3_2())
moves.extend(self.gen_type_8_serial_single()) moves.extend(self.gen_type_8_serial_single())
moves.extend(self.gen_type_9_serial_pair()) moves.extend(self.gen_type_9_serial_pair())
moves.extend(self.gen_type_10_serial_triple()) moves.extend(self.gen_type_10_serial_triple())
moves.extend(self.gen_type_11_serial_3_1()) # moves.extend(self.gen_type_11_serial_3_1())
moves.extend(self.gen_type_12_serial_3_2()) moves.extend(self.gen_type_12_serial_3_2())
moves.extend(self.gen_type_13_4_2()) # moves.extend(self.gen_type_13_4_2())
moves.extend(self.gen_type_14_4_22()) # moves.extend(self.gen_type_14_4_22())
return moves return moves

15
douzero/env/utils.py vendored
View File

@ -10,17 +10,22 @@ TYPE_0_PASS = 0
TYPE_1_SINGLE = 1 TYPE_1_SINGLE = 1
TYPE_2_PAIR = 2 TYPE_2_PAIR = 2
TYPE_3_TRIPLE = 3 TYPE_3_TRIPLE = 3
TYPE_4_BOMB = 4 TYPE_4_BOMB = 44
TYPE_5_KING_BOMB = 5 TYPE_4_BOMB5 = 45
TYPE_4_BOMB6 = 46
TYPE_4_BOMB7 = 47
TYPE_4_BOMB8 = 48
TYPE_5_KING_BOMB = 50
#TYPE_6_3_1 = 6
TYPE_6_3_1 = 6 TYPE_6_3_1 = 6
TYPE_7_3_2 = 7 TYPE_7_3_2 = 7
TYPE_8_SERIAL_SINGLE = 8 TYPE_8_SERIAL_SINGLE = 8
TYPE_9_SERIAL_PAIR = 9 TYPE_9_SERIAL_PAIR = 9
TYPE_10_SERIAL_TRIPLE = 10 TYPE_10_SERIAL_TRIPLE = 10
TYPE_11_SERIAL_3_1 = 11 # TYPE_11_SERIAL_3_1 = 11
TYPE_12_SERIAL_3_2 = 12 TYPE_12_SERIAL_3_2 = 12
TYPE_13_4_2 = 13 # TYPE_13_4_2 = 13
TYPE_14_4_22 = 14 # TYPE_14_4_22 = 14
TYPE_15_WRONG = 15 TYPE_15_WRONG = 15
# betting round action # betting round action

View File

@ -10,7 +10,7 @@ import BidModel
def load_card_play_models(card_play_model_path_dict): def load_card_play_models(card_play_model_path_dict):
players = {} players = {}
for position in ['landlord', 'landlord_up', 'landlord_down']: for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
if card_play_model_path_dict[position] == 'rlcard': if card_play_model_path_dict[position] == 'rlcard':
from .rlcard_agent import RLCardAgent from .rlcard_agent import RLCardAgent
players[position] = RLCardAgent(position) players[position] = RLCardAgent(position)
@ -43,30 +43,34 @@ def mp_simulate(card_play_data_list, card_play_model_path_dict, q, output, bid_o
bid_results = [] bid_results = []
bid_values = [] bid_values = []
bid_info_list = [ bid_info_list = [
np.array([[-1,-1,-1], np.array([[-1,-1,-1,-1],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1]]), [-1,-1,-1,-1]]),
np.array([[0,0,0], np.array([[0,0,0,0],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1]]), [-1,-1,-1,-1]]),
np.array([[1,0,0], np.array([[1,0,0,0],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1]]), [-1,-1,-1,-1]]),
np.array([[0,0,0], np.array([[0,0,0,0],
[0,0,0], [0,0,0,0],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1]]), [-1,-1,-1,-1]]),
np.array([[0,0,1], np.array([[0,0,1,0],
[1,0,0], [0,0,0,1],
[-1,-1,-1], [-1,-1,-1,-1],
[-1,-1,-1]]), [-1,-1,-1,-1]]),
np.array([[0,1,0], np.array([[0,1,0,0],
[0,0,1], [0,0,1,0],
[1,0,0], [0,0,0,1],
[-1,-1,-1]]), [-1,-1,-1,-1]]),
np.array([[0,1,0,0],
[0,0,1,0],
[1,0,0,0],
[-1,-1,-1,-1]]),
] ]
for bid_info in bid_info_list: for bid_info in bid_info_list:
bid_obs = douzero.env.env._get_obs_for_bid(1, bid_info, card_play_data["landlord"]) bid_obs = douzero.env.env._get_obs_for_bid(1, bid_info, card_play_data["landlord"])
@ -82,6 +86,7 @@ def mp_simulate(card_play_data_list, card_play_model_path_dict, q, output, bid_o
print("\nStart ------- " + title) print("\nStart ------- " + title)
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord"]])) print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord"]]))
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_down"]])) print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_down"]]))
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_front"]]))
print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_up"]])) print ("".join([EnvCard2RealCard[c] for c in card_play_data["landlord_up"]]))
# print(card_play_data) # print(card_play_data)
count = 0 count = 0
@ -127,6 +132,7 @@ def evaluate(landlord, landlord_up, landlord_down, eval_data, num_workers, outpu
card_play_model_path_dict = { card_play_model_path_dict = {
'landlord': landlord, 'landlord': landlord,
'landlord_up': landlord_up, 'landlord_up': landlord_up,
'landlord_front': landlord_up,
'landlord_down': landlord_down} 'landlord_down': landlord_down}
num_landlord_wins = 0 num_landlord_wins = 0

View File

@ -10,6 +10,8 @@ if __name__ == '__main__':
default='baselines/douzero_12/landlord_weights_39762328900.ckpt') default='baselines/douzero_12/landlord_weights_39762328900.ckpt')
parser.add_argument('--landlord_up', type=str, parser.add_argument('--landlord_up', type=str,
default='baselines/douzero_12/landlord_up_weights_39762328900.ckpt') default='baselines/douzero_12/landlord_up_weights_39762328900.ckpt')
parser.add_argument('--landlord_front', type=str,
default='baselines/douzero_12/landlord_front_weights_39762328900.ckpt')
parser.add_argument('--landlord_down', type=str, parser.add_argument('--landlord_down', type=str,
default='baselines/douzero_12/landlord_down_weights_39762328900.ckpt') default='baselines/douzero_12/landlord_down_weights_39762328900.ckpt')
parser.add_argument('--eval_data', type=str, parser.add_argument('--eval_data', type=str,
@ -25,7 +27,7 @@ if __name__ == '__main__':
if args.output or args.bid: if args.output or args.bid:
args.num_workers = 1 args.num_workers = 1
t = 3 t = 3
frame = 3085177900 frame = 64000
adp_frame = 2511184300 adp_frame = 2511184300
# args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame # args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame
args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame
@ -44,6 +46,7 @@ if __name__ == '__main__':
elif t == 3: elif t == 3:
args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame args.landlord = 'baselines/resnet_landlord_%i.ckpt' % frame
args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame args.landlord_up = 'baselines/resnet_landlord_up_%i.ckpt' % frame
args.landlord_front = 'baselines/resnet_landlord_front_%i.ckpt' % frame
args.landlord_down = 'baselines/resnet_landlord_down_%i.ckpt' % frame args.landlord_down = 'baselines/resnet_landlord_down_%i.ckpt' % frame
elif t == 4: elif t == 4:
args.landlord = 'baselines/douzero_ADP/landlord.ckpt' args.landlord = 'baselines/douzero_ADP/landlord.ckpt'

View File

@ -4,9 +4,9 @@ import numpy as np
# Full deck: eight copies of each rank 3..14 and of 17, plus two copies of
# each joker (20 and 30), 108 cards in total.
deck = []
for i in range(3, 15):
    deck += [i] * 8
deck += [17] * 8
deck += [20, 20, 30, 30]
def get_parser(): def get_parser():
parser = argparse.ArgumentParser(description='DouZero: random data generator') parser = argparse.ArgumentParser(description='DouZero: random data generator')
@ -17,10 +17,11 @@ def get_parser():
def generate(): def generate():
_deck = deck.copy() _deck = deck.copy()
np.random.shuffle(_deck) np.random.shuffle(_deck)
card_play_data = {'landlord': _deck[:20], card_play_data = {'landlord': _deck[:33],
'landlord_up': _deck[20:37], 'landlord_up': _deck[33:58],
'landlord_down': _deck[37:54], 'landlord_front': _deck[58:83],
'three_landlord_cards': _deck[17:20], 'landlord_down': _deck[83:108],
# 'three_landlord_cards': _deck[25:33],
} }
for key in card_play_data: for key in card_play_data:
card_play_data[key].sort() card_play_data[key].sort()