infer逻辑调整 (adjust inference logic)

This commit is contained in:
zhiyang7 2022-01-04 11:12:36 +08:00
parent 4571bb3dfc
commit 2b8586303b
4 changed files with 14 additions and 13 deletions

View File

@@ -13,19 +13,17 @@ parser.add_argument('--objective', default='adp', type=str, choices=['adp', 'wp'
# Training settings # Training settings
parser.add_argument('--onnx_sync_interval', default=120, type=int, parser.add_argument('--onnx_sync_interval', default=120, type=int,
help='Time interval (in seconds) at which to sync the onnx model') help='Time interval (in seconds) at which to sync the onnx model')
parser.add_argument('--actor_device_cpu', action='store_true',
help='Use CPU as actor device')
parser.add_argument('--gpu_devices', default='0', type=str, parser.add_argument('--gpu_devices', default='0', type=str,
help='Which GPUs to be used for training') help='Which GPUs to be used for training')
parser.add_argument('--infer_devices', default='0', type=str, parser.add_argument('--infer_devices', default='0', type=str,
help='Which device to be used for infer') help='Which device to be used for infer')
parser.add_argument('--num_infer', default=3, type=int, parser.add_argument('--num_infer', default=2, type=int,
help='The number of process used for infer') help='The number of process used for infer')
parser.add_argument('--num_actor_devices', default=1, type=int, parser.add_argument('--num_actor_devices', default=1, type=int,
help='The number of devices used for simulation') help='The number of devices used for simulation')
parser.add_argument('--num_actors', default=4, type=int, parser.add_argument('--num_actors', default=3, type=int,
help='The number of actors for each simulation device') help='The number of actors for each simulation device')
parser.add_argument('--num_actors_thread', default=6, type=int, parser.add_argument('--num_actors_thread', default=4, type=int,
help='The number of actors for each simulation device') help='The number of actors for each simulation device')
parser.add_argument('--training_device', default='0', type=str, parser.add_argument('--training_device', default='0', type=str,
help='The index of the GPU used for training models. `cpu` means using cpu') help='The index of the GPU used for training models. `cpu` means using cpu')

View File

@@ -70,9 +70,9 @@ def train(flags):
Then it will start subprocesses as actors. Then, it will call Then it will start subprocesses as actors. Then, it will call
learning function with multiple threads. learning function with multiple threads.
""" """
if not flags.actor_device_cpu or flags.training_device != 'cpu': if flags.training_device != 'cpu' or flags.infer_devices != 'cpu':
if not torch.cuda.is_available(): if not torch.cuda.is_available():
raise AssertionError("CUDA not available. If you have GPUs, please specify the ID after `--gpu_devices`. Otherwise, please train with CPU with `python3 train.py --actor_device_cpu --training_device cpu`") raise AssertionError("CUDA not available. If you have GPUs, please specify the ID after `--gpu_devices`. Otherwise, please train with CPU with `python3 train.py --infer_devices cpu --training_device cpu`")
plogger = FileWriter( plogger = FileWriter(
xpid=flags.xpid, xpid=flags.xpid,
xp_args=flags.__dict__, xp_args=flags.__dict__,

View File

@@ -493,8 +493,11 @@ model_dict_new_lite['landlord_up'] = GeneralModelLite
model_dict_new_lite['landlord_front'] = GeneralModelLite model_dict_new_lite['landlord_front'] = GeneralModelLite
model_dict_new_lite['landlord_down'] = GeneralModelLite model_dict_new_lite['landlord_down'] = GeneralModelLite
def forward_logic(self_model, position, z, x, return_value=False, flags=None): def forward_logic(self_model, position, z, x, device='cpu', return_value=False, flags=None):
legal_count = len(z) legal_count = len(z)
if not flags.enable_onnx:
z = torch.tensor(z, device=device)
x = torch.tensor(x, device=device)
if legal_count >= 80: if legal_count >= 80:
partition_count = int(legal_count / 40) partition_count = int(legal_count / 40)
sub_z = np.array_split(z, partition_count) sub_z = np.array_split(z, partition_count)
@@ -577,8 +577,8 @@ class OldModel:
def get_onnx_params(self, position): def get_onnx_params(self, position):
self.models[position].get_onnx_params(self.device) self.models[position].get_onnx_params(self.device)
def forward(self, position, z, x, return_value=False, flags=None): def forward(self, position, z, x, device='cpu', return_value=False, flags=None):
return forward_logic(self, position, z, x, return_value, flags) return forward_logic(self, position, z, x, device, return_value, flags)
def share_memory(self): def share_memory(self):
if self.models['landlord'] is not None: if self.models['landlord'] is not None:
@@ -646,8 +649,8 @@ class Model:
def get_onnx_params(self, position): def get_onnx_params(self, position):
self.models[position].get_onnx_params(self.device) self.models[position].get_onnx_params(self.device)
def forward(self, position, z, x, return_value=False, flags=None, debug=False): def forward(self, position, z, x, device='cpu', return_value=False, flags=None):
return forward_logic(self, position, z, x, return_value, flags) return forward_logic(self, position, z, x, device, return_value, flags)
def share_memory(self): def share_memory(self):
if self.models['landlord'] is not None: if self.models['landlord'] is not None:

View File

@@ -131,7 +131,7 @@ def infer_logic(i, device, infer_queues, model, flags, onnx_frame):
try: try:
task = infer_queue['input'].get_nowait() task = infer_queue['input'].get_nowait()
with torch.no_grad(): with torch.no_grad():
result = model.forward(task['position'], task['z_batch'], task['x_batch'], return_value=True, flags=flags) result = model.forward(task['position'], task['z_batch'], task['x_batch'], device=device, return_value=True, flags=flags)
infer_queue['output'].put({ infer_queue['output'].put({
'values': result['values'] 'values': result['values']
}) })