Commit 1139854d authored by Yuxin Wu's avatar Yuxin Wu

handle model with different parameter dtypes

parent 92a9315e
...@@ -218,10 +218,17 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder): ...@@ -218,10 +218,17 @@ class SyncMultiGPUReplicatedBuilder(DataParallelBuilder):
logger.warn("mode='hierarchical' require >= 8 GPUs. Fallback to mode='cpu'.") logger.warn("mode='hierarchical' require >= 8 GPUs. Fallback to mode='cpu'.")
self._mode = 'cpu' self._mode = 'cpu'
dtypes = set([x[0].dtype.base_dtype for x in grad_list[0]])
valid_for_nccl = all([k in [tf.float32, tf.float64] for k in dtypes])
if self._mode == 'nccl' and not valid_for_nccl:
logger.warn("Cannot use mode='nccl' because some gradients have unsupported types. Fallback to mode='cpu'")
self._mode = 'cpu'
if self._mode in ['nccl', 'hierarchical']: if self._mode in ['nccl', 'hierarchical']:
all_grads, all_vars = split_grad_list(grad_list) all_grads, all_vars = split_grad_list(grad_list)
if self._mode == 'nccl': if self._mode == 'nccl':
all_grads = allreduce_grads(all_grads, average=self._average) # #gpu x #param x 2 all_grads = allreduce_grads(all_grads, average=self._average) # #gpu x #param
else: else:
packer = GradientPacker(len(raw_devices)) packer = GradientPacker(len(raw_devices))
succ = packer.compute_strategy(all_grads[0]) succ = packer.compute_strategy(all_grads[0])
......
...@@ -23,7 +23,6 @@ def describe_trainable_vars(): ...@@ -23,7 +23,6 @@ def describe_trainable_vars():
total = 0 total = 0
total_bytes = 0 total_bytes = 0
data = [] data = []
devices = set()
for v in train_vars: for v in train_vars:
if v.name.startswith('tower'): if v.name.startswith('tower'):
continue continue
...@@ -31,16 +30,23 @@ def describe_trainable_vars(): ...@@ -31,16 +30,23 @@ def describe_trainable_vars():
ele = shape.num_elements() ele = shape.num_elements()
total += ele total += ele
total_bytes += ele * v.dtype.size total_bytes += ele * v.dtype.size
devices.add(v.device) data.append([v.name, shape.as_list(), ele, v.device, v.dtype.base_dtype.name])
data.append([v.name, shape.as_list(), ele, v.device]) headers = ['name', 'shape', 'dim', 'device', 'dtype']
dtypes = set([x[4] for x in data])
if len(dtypes) == 1:
for x in data:
del x[4]
del headers[4]
devices = set([x[3] for x in data])
if len(devices) == 1: if len(devices) == 1:
# don't log the device if all vars on the same device # don't log the device if all vars on the same device
for d in data: for x in data:
d.pop() del x[3]
table = tabulate(data, headers=['name', 'shape', 'dim']) del headers[3]
else:
table = tabulate(data, headers=['name', 'shape', 'dim', 'device']) table = tabulate(data, headers=headers)
size_mb = total_bytes / 1024.0**2 size_mb = total_bytes / 1024.0**2
summary_msg = colored( summary_msg = colored(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment