Commit 0f0a9c53 authored by Yuxin Wu

update docs; check TF is built with CUDA

parent ef62f188
......@@ -11,26 +11,27 @@ Any unexpected problems: __PLEASE ALWAYS INCLUDE__:
+ If not, tell us what you did that may be relevant.
But we may not be able to resolve it if there is no reproducible code.
+ Better to paste what you did instead of describing it.
2. What you observed, e.g. as much logs as possible.
2. What you observed, e.g. the entire log:
+ Better to paste what you observed instead of describing it.
3. What you expected, if not obvious.
4. Your environment:
+ Python version.
+ TF version: `python -c 'import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)'`.
+ Tensorpack version: `python3 -c 'import tensorpack; print(tensorpack.__version__)'`. You can install Tensorpack master by `pip install -U git+https://github.com/ppwwyyxx/tensorpack.git`.
+ Tensorpack version: `python3 -c 'import tensorpack; print(tensorpack.__version__)'`.
You can install Tensorpack master by `pip install -U git+https://github.com/ppwwyyxx/tensorpack.git`.
5. About efficiency, PLEASE first read http://tensorpack.readthedocs.io/en/latest/tutorial/performance-tuning.html
Feature Requests:
+ You can implement a lot of features by extending tensorpack
(See http://tensorpack.readthedocs.io/en/latest/tutorial/index.html#extend-tensorpack).
It does not have to be added to tensorpack unless you have a good reason.
+ We don't take feature requests for implementing new techniques.
+ We don't take feature requests for implementing new papers.
If you don't know how, ask it as a usage question.
Usage Questions:
+ Read the [tutorials](http://tensorpack.readthedocs.io/en/latest/tutorial/index.html#user-tutorials) first.
+ We answer "HOW to do X in tensorpack" for a specific well-defined X.
+ We answer "HOW to do X in tensorpack" for a well-defined X.
We don't answer general machine learning questions,
such as "what networks to use" or "I don't understand the paper".
......
......@@ -57,7 +57,10 @@ def fbresnet_augmentor(isTrain):
if isTrain:
augmentors = [
GoogleNetResize(),
imgaug.RandomOrderAug( # Remove these augs if your CPU is not fast enough
# It's OK to remove these augs if your CPU is not fast enough.
# Removing brightness/contrast/saturation does not have a significant effect on accuracy.
# Removing lighting leads to a tiny drop in accuracy.
imgaug.RandomOrderAug(
[imgaug.BrightnessScale((0.6, 1.4), clip=False),
imgaug.Contrast((0.6, 1.4), clip=False),
imgaug.Saturation(0.4, rgb=False),
......
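For context, a minimal sketch of how an augmentor list like the one above is typically applied to an ImageNet dataflow; the dataset path is a placeholder and this composition is an assumption, not part of this commit:

```python
# Hypothetical usage: apply the list returned by fbresnet_augmentor to a dataflow.
# '/path/to/ilsvrc12' is a placeholder path.
from tensorpack.dataflow import AugmentImageComponent, dataset

ds = dataset.ILSVRC12('/path/to/ilsvrc12', 'train')
ds = AugmentImageComponent(ds, fbresnet_augmentor(isTrain=True))
```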
......@@ -31,7 +31,7 @@ To train, first decompress ImageNet data into [this structure](http://tensorpack
```
You should see good GPU utilization (95%~99%) if your data pipeline is fast enough.
It can finish training [within 20 hours](http://dawn.cs.stanford.edu/benchmark/ImageNet/train.html) on AWS p3.16xlarge.
With batch=64x8, it can finish 100 epochs in 16 hours on AWS p3.16xlarge (8 V100s).
The default data pipeline is probably OK for machines with SSD & 20 CPU cores.
See the [tutorial](http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html) on other options to speed up your data.
......
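One option from that tutorial is to parallelize the per-image work; a minimal sketch continuing the dataflow example above (the worker count and batch size are illustrative):

```python
# Hypothetical sketch: run augmentation in 20 worker processes, then batch.
# 'ds' is the augmented ILSVRC12 dataflow from the earlier sketch.
from tensorpack.dataflow import BatchData, PrefetchDataZMQ

ds = PrefetchDataZMQ(ds, nr_proc=20)
ds = BatchData(ds, 64)
```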
......@@ -119,7 +119,8 @@ if __name__ == '__main__':
parser.add_argument('--eval', action='store_true', help='run offline evaluation instead of training')
parser.add_argument('--batch', default=256, type=int,
help="total batch size. "
"Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy.")
"Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy."
"Pretrained models listed in README were trained with batch=32x8.")
parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
help='variants of resnet to use', default='resnet')
args = parser.parse_args()
......
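As a sketch of what the help text in the hunk above implies, the total batch size is split evenly across GPUs, so batch=256 on 8 GPUs gives a per-GPU batch of 32. The variable names below are illustrative, not taken from this commit:

```python
# Illustrative only: mapping a total batch size to a per-GPU batch size.
from tensorpack.utils.gpu import get_num_gpu

total_batch = 256
num_gpu = max(get_num_gpu(), 1)
assert total_batch % num_gpu == 0, "total batch must be divisible by #GPUs"
per_gpu_batch = total_batch // num_gpu  # e.g. 256 // 8 = 32, inside [32, 64]
```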
......@@ -39,6 +39,8 @@ class DataParallelBuilder(GraphBuilder):
"""
if len(towers) > 1:
logger.info("[DataParallel] Training a model of {} towers.".format(len(towers)))
if not tf.test.is_built_with_cuda():
logger.warn("TensorFlow was not built with CUDA support!")
self.towers = towers
......
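The check added above can be reproduced standalone; a minimal sketch, assuming TensorFlow is importable:

```python
# Minimal standalone version of the check added above.
import tensorflow as tf

if not tf.test.is_built_with_cuda():
    print("TensorFlow was not built with CUDA support!")
```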
......@@ -27,26 +27,37 @@ def get_num_gpu():
Returns:
int: #available GPUs in CUDA_VISIBLE_DEVICES, or in the system.
"""
def warn_return(ret, message):
try:
import tensorflow as tf
except ImportError:
return ret
built_with_cuda = tf.test.is_built_with_cuda()
if not built_with_cuda and ret > 0:
logger.warn(message + "But TensorFlow was not built with CUDA support!")
return ret
env = os.environ.get('CUDA_VISIBLE_DEVICES', None)
if env is not None:
return len(env.split(','))
return warn_return(len(env.split(',')), "Found non-empty CUDA_VISIBLE_DEVICES. ")
output, code = subproc_call("nvidia-smi -L", timeout=5)
if code == 0:
output = output.decode('utf-8')
return len(output.strip().split('\n'))
else:
try:
# Use NVML to query device properties
with NVMLContext() as ctx:
return ctx.num_devices()
except Exception:
# Fallback
# Note this will initialize all GPUs and therefore has side effect
# https://github.com/tensorflow/tensorflow/issues/8136
logger.info("Loading local devices by TensorFlow ...")
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
return len([x.name for x in local_device_protos if x.device_type == 'GPU'])
return warn_return(len(output.strip().split('\n')), "Found nvidia-smi. ")
try:
# Use NVML to query device properties
with NVMLContext() as ctx:
return warn_return(ctx.num_devices(), "NVML found nvidia devices. ")
except Exception:
# Fallback
# Note this will initialize all GPUs and therefore has side effects
# https://github.com/tensorflow/tensorflow/issues/8136
logger.info("Loading local devices by TensorFlow ...")
from tensorflow.python.client import device_lib
local_device_protos = device_lib.list_local_devices()
return len([x.name for x in local_device_protos if x.device_type == 'GPU'])
get_nr_gpu = get_num_gpu
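A short usage sketch of `get_num_gpu` as changed above; the printed wording is illustrative:

```python
# Count GPUs visible to this process; warns if TF was not built with CUDA.
from tensorpack.utils.gpu import get_num_gpu

print("Visible GPUs: {}".format(get_num_gpu()))
```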