diff --git a/cuda_train.cmd b/cuda_train.cmd
index 5b395c3..2270905 100644
--- a/cuda_train.cmd
+++ b/cuda_train.cmd
@@ -1 +1 @@
-python train.py --load_model --batch_size 8 --learning_rate 0.0003
\ No newline at end of file
+python train.py --load_model --batch_size 8 --learning_rate 0.0003 --enable_onnx
\ No newline at end of file
diff --git a/douzero/dmc/dmc.py b/douzero/dmc/dmc.py
index 482e84d..00aa6f9 100644
--- a/douzero/dmc/dmc.py
+++ b/douzero/dmc/dmc.py
@@ -68,8 +68,9 @@ def learn(position, actor_models, model, batch, optimizer, flags, lock):
         nn.utils.clip_grad_norm_(model.parameters(), flags.max_grad_norm)
         optimizer.step()

-        for actor_model in actor_models.values():
-            actor_model.get_model(position).load_state_dict(model.state_dict())
+        if not flags.enable_onnx:
+            for actor_model in actor_models.values():
+                actor_model.get_model(position).load_state_dict(model.state_dict())
         return stats

 def train(flags):
@@ -103,9 +104,9 @@ def train(flags):
     models = {}
     for device in device_iterator:
         if flags.old_model:
-            model = OldModel(device="cpu")
+            model = OldModel(device="cpu", flags = flags)
         else:
-            model = Model(device="cpu")
+            model = Model(device="cpu", flags = flags)
         model.share_memory()
         model.eval()
         models[device] = model
@@ -149,8 +150,9 @@ def train(flags):
         for k in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']:  # ['landlord', 'landlord_up', 'landlord_down']
             learner_model.get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k])
             optimizers[k].load_state_dict(checkpoint_states["optimizer_state_dict"][k])
-            for device in device_iterator:
-                models[device].get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k])
+            if not flags.enable_onnx:
+                for device in device_iterator:
+                    models[device].get_model(k).load_state_dict(checkpoint_states["model_state_dict"][k])
         stats = checkpoint_states["stats"]

         if not 'mean_episode_return_bidding' in stats:
diff --git a/douzero/dmc/models.py b/douzero/dmc/models.py
index c876152..72b8f00 100644
--- a/douzero/dmc/models.py
+++ b/douzero/dmc/models.py
@@ -2,6 +2,7 @@
 This file includes the torch models. We wrap the
 three models into one class for convenience.
 """
+import os

 import numpy as np

@@ -331,8 +332,8 @@ class GeneralModel(nn.Module):
     def get_onnx_params(self):
         return {
             'args': (
-                torch.tensor(np.zeros([1, 40, 108]), dtype=torch.float32),
-                torch.tensor(np.zeros((1, 80)), dtype=torch.float32)
+                torch.tensor(np.zeros([1, 40, 108]), dtype=torch.float32, device='cuda:0'),
+                torch.tensor(np.zeros((1, 80)), dtype=torch.float32, device='cuda:0')
             ),
             'input_names': ['z_batch','x_batch'],
             'output_names': ['values'],
@@ -478,7 +479,7 @@ class OldModel:
     The wrapper for the three models. We also wrap several
     interfaces such as share_memory, eval, etc.
     """
-    def __init__(self, device=0):
+    def __init__(self, device=0, flags=None):
         self.models = {}
         if not device == "cpu":
             device = 'cuda:' + str(device)
@@ -519,17 +520,19 @@ class OldModel:
         return dict(action=action)

     def share_memory(self):
-        self.models['landlord'].share_memory()
-        self.models['landlord_up'].share_memory()
-        self.models['landlord_front'].share_memory()
-        self.models['landlord_down'].share_memory()
+        if self.models['landlord'] is not None:
+            self.models['landlord'].share_memory()
+            self.models['landlord_up'].share_memory()
+            self.models['landlord_front'].share_memory()
+            self.models['landlord_down'].share_memory()
         self.models['bidding'].share_memory()

     def eval(self):
-        self.models['landlord'].eval()
-        self.models['landlord_up'].eval()
-        self.models['landlord_front'].eval()
-        self.models['landlord_down'].eval()
+        if self.models['landlord'] is not None:
+            self.models['landlord'].eval()
+            self.models['landlord_up'].eval()
+            self.models['landlord_front'].eval()
+            self.models['landlord_down'].eval()
         self.models['bidding'].eval()

     def parameters(self, position):
@@ -547,32 +550,43 @@ class Model:
     The wrapper for the three models. We also wrap several
     interfaces such as share_memory, eval, etc.
     """
-    def __init__(self, device=0):
+    def __init__(self, device=0, flags=None):
         self.models = {}
+        self.onnx_models = {}
+        self.flags = flags
         if not device == "cpu":
             device = 'cuda:' + str(device)
         # model = GeneralModel().to(torch.device(device))
-        self.models['landlord'] = GeneralModel().to(torch.device(device))
-        self.models['landlord_up'] = GeneralModel().to(torch.device(device))
-        self.models['landlord_front'] = GeneralModel().to(torch.device(device))
-        self.models['landlord_down'] = GeneralModel().to(torch.device(device))
-        self.models['bidding'] = BidModel().to(torch.device(device))
-        self.onnx_models = {
-            'landlord': None,
-            'landlord_up': None,
-            'landlord_front': None,
-            'landlord_down': None,
-            'bidding': None
-        }
-        self.models['bidding'] = BidModel().to(torch.device(device))
+        positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']
+        if flags is not None and flags.enable_onnx:
+            self.models['bidding'] = BidModel().to(torch.device(device))
+            for position in positions:
+                self.models[position] = None
+        else:
+            for position in positions:
+                self.models[position] = GeneralModel().to(torch.device(device))
+            self.models['bidding'] = BidModel().to(torch.device(device))
+            self.onnx_models = {
+                'landlord': None,
+                'landlord_up': None,
+                'landlord_front': None,
+                'landlord_down': None,
+                'bidding': None
+            }

-    def set_onnx_model(self, position, model_path):
-        self.onnx_models[position] = onnxruntime.InferenceSession(get_example(model_path))
+    def set_onnx_model(self):
+        positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']
+        for position in positions:
+            model_path = os.path.abspath('%s/%s/model_%s.onnx' % (self.flags.savedir, self.flags.xpid, position))
+            self.onnx_models[position] = onnxruntime.InferenceSession(get_example(model_path), providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+        self.onnx_models['bidding'] = None

     def get_onnx_params(self, position):
         self.models[position].get_onnx_params()

     def forward(self, position, z, x, return_value=False, flags=None, debug=False):
+        if self.flags.enable_onnx and len(self.onnx_models) == 0:
+            self.set_onnx_model()
         model = self.onnx_models[position]
         if model is None:
             model = self.models[position]
@@ -590,17 +604,19 @@ class Model:
         return dict(action=action)

     def share_memory(self):
-        self.models['landlord'].share_memory()
-        self.models['landlord_up'].share_memory()
-        self.models['landlord_front'].share_memory()
-        self.models['landlord_down'].share_memory()
+        if self.models['landlord'] is not None:
+            self.models['landlord'].share_memory()
+            self.models['landlord_up'].share_memory()
+            self.models['landlord_front'].share_memory()
+            self.models['landlord_down'].share_memory()
         self.models['bidding'].share_memory()

     def eval(self):
-        self.models['landlord'].eval()
-        self.models['landlord_up'].eval()
-        self.models['landlord_front'].eval()
-        self.models['landlord_down'].eval()
+        if self.models['landlord'] is not None:
+            self.models['landlord'].eval()
+            self.models['landlord_up'].eval()
+            self.models['landlord_front'].eval()
+            self.models['landlord_down'].eval()
         self.models['bidding'].eval()

     def parameters(self, position):
diff --git a/douzero/dmc/utils.py b/douzero/dmc/utils.py
index d49aded..3df3f34 100644
--- a/douzero/dmc/utils.py
+++ b/douzero/dmc/utils.py
@@ -83,8 +83,11 @@ def create_optimizers(flags, learner_model):

 def act(i, device, batch_queues, model, flags, onnx_frame):
     positions = ['landlord', 'landlord_up', 'landlord_front', 'landlord_down', 'bidding']
-    for pos in positions:
-        model.models[pos].to(torch.device(device if device == "cpu" else ("cuda:"+str(device))))
+    if not flags.enable_onnx:
+        for pos in positions:
+            model.models[pos].to(torch.device(device if device == "cpu" else ("cuda:"+str(device))))
+    else:
+        model.models['bidding'].to(torch.device(device if device == "cpu" else ("cuda:" + str(device))))
     try:
         T = flags.unroll_length
         log.info('Device %s Actor %i started.', str(device), i)
@@ -117,9 +120,7 @@ def act(i, device, batch_queues, model, flags, onnx_frame):
                 last_onnx_frame = onnx_frame.value
                 for p in positions:
                     if p != 'bidding':
-                        model_path = '%s/%s/model_%s.onnx' % (flags.savedir, flags.xpid, p)
-                        if os.path.exists(model_path):
-                            model.set_onnx_model(p, os.path.abspath(model_path))
+                        model.set_onnx_model()

             for bid_obs in bid_obs_buffer:
                 obs_z_buf["bidding"].append(bid_obs['z_batch'])
diff --git a/requirements.txt b/requirements.txt
index 41b5f02..2dfca1a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ gitdb2
 rlcard
 psutil
 onnx
-onnxruntime
+onnxruntime-gpu
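Note: the actor side of this patch assumes ONNX files already exist at <savedir>/<xpid>/model_<position>.onnx when Model.set_onnx_model() is called; the learner-side export step is not shown in the diff. A rough sketch of what that step could look like, using the get_onnx_params() dict visible above (the helper name export_onnx_models and its call site are hypothetical, not part of this patch):

    import os
    import torch

    def export_onnx_models(learner_model, flags):
        # Hypothetical helper: write one ONNX graph per playing position so the
        # actor processes can reload it with onnxruntime-gpu via set_onnx_model().
        for position in ['landlord', 'landlord_up', 'landlord_front', 'landlord_down']:
            net = learner_model.get_model(position)
            params = net.get_onnx_params()  # provides 'args', 'input_names', 'output_names'
            path = os.path.join(flags.savedir, flags.xpid, 'model_%s.onnx' % position)
            # get_onnx_params() now builds its dummy inputs on cuda:0 (see the patch
            # above), so the model is assumed to live on cuda:0 at export time.
            torch.onnx.export(net, params['args'], path,
                              input_names=params['input_names'],
                              output_names=params['output_names'])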