NVIDIA · nvpstr · Nov 6, 2019 · Nov 6, 2019 · Nov 6, 2019 · Nov 6, 2019
diff --git a/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile b/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:19.08-py3
+FROM nvcr.io/nvidia/pytorch:19.10-py3
 
 ADD . /workspace/tacotron2
 WORKDIR /workspace/tacotron2

diff --git a/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile_trtis_client b/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile_trtis_client
@@ -0,0 +1,41 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM nvcr.io/nvidia/tensorrtserver:19.10-py3-clientsdk AS trt
+
+# NOTE(review): this base image is untagged and therefore resolves to :latest;
+# pin a specific tag (or digest) once a known-good version has been chosen.
+FROM continuumio/miniconda3
+
+# Command-line utilities for the demo. The apt package lists are removed in the
+# same layer so they do not end up in the image.
+RUN apt-get update \
+    && apt-get install -y pbzip2 pv bzip2 cabextract mc iputils-ping wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Relative paths in the instructions below resolve against this directory, so
+# no "RUN cd ..." is needed (a cd inside RUN would not persist anyway).
+WORKDIR /workspace/speech_ai_demo__TTS/
+
+# Copy the perf_client over (the whole client install tree, including its
+# lib/ and python/ sub-directories used below)
+COPY --from=trt /workspace/install/ /workspace/install/
+ENV LD_LIBRARY_PATH=/workspace/install/lib:${LD_LIBRARY_PATH}
+
+# set up env variables
+ENV PATH="$PATH:/opt/conda/bin"
+
+# jupyter lab extensions
+RUN conda install -y -c conda-forge jupyterlab=1.0 ipywidgets=7.5 nodejs python-sounddevice librosa unidecode inflect
+RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
+
+# Install the tensorrtserver Python wheel from the install tree copied above;
+# it only needs to be installed once.
+RUN pip install --no-cache-dir /workspace/install/python/tensorrtserver*.whl
+
+# Copy the notebook directory contents and the tacotron2/text directory. COPY
+# creates the destination directory, so no separate mkdir is needed.
+COPY ./notebooks/trtis/ .
+COPY ./tacotron2/text /workspace/speech_ai_demo__TTS/tacotron2/text
+RUN chmod a+x /workspace/speech_ai_demo__TTS/run_this.sh
diff --git a/PyTorch/SpeechSynthesis/Tacotron2/README.md b/PyTorch/SpeechSynthesis/Tacotron2/README.md
@@ -1,4 +1,4 @@
-# Tacotron 2 And WaveGlow v1.7 For PyTorch
+# Tacotron 2 And WaveGlow v1.10 For PyTorch
 
 This repository provides a script and recipe to train Tacotron 2 and WaveGlow
 v1.6 models to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
@@ -33,13 +33,13 @@ v1.6 models to achieve state of the art accuracy, and is tested and maintained b
       * [Inference performance benchmark](#inference-performance-benchmark)
    * [Results](#results)
       * [Training accuracy results](#training-accuracy-results)
-         * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
+         * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
       * [Training performance results](#training-performance-results)
-         * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
+         * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
          * [Expected training time](#expected-training-time)
       * [Inference performance results](#inference-performance-results)
-         * [NVIDIA V100 16G](#nvidia-v100-16g)
-         * [NVIDIA T4](#nvidia-t4)
+         * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g)
+         * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
 * [Release notes](#release-notes)
    * [Changelog](#changelog)
    * [Known issues](#known-issues)
@@ -471,7 +471,7 @@ To run inference, issue:
 ```bash
 python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ --include-warmup -i phrases/phrase.txt --amp-run
 ```
-Here, `Tacotron2_checkpoint` and `WaveGlow_checkpoint` are pre-trained 
+Here, `Tacotron2_checkpoint` and `WaveGlow_checkpoint` are pre-trained
 checkpoints for the respective models, and `phrases/phrase.txt` contains input 
 phrases. The number of text lines determines the inference batch size. Audio 
 will be saved in the output folder. The audio files [audio_fp16](./audio/audio_fp16.wav)
@@ -564,7 +564,7 @@ and accuracy in training and inference.
 
 #### Training accuracy results
 
-##### NVIDIA DGX-1 (8x V100 16G)
+##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
 
 Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{AMP,FP32}_DGX1_16GB_8GPU.sh` training script in the PyTorch-19.06-py3
 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
@@ -594,7 +594,7 @@ WaveGlow FP32 loss - batch size 4 (mean and std over 16 runs)
 
 #### Training performance results
 
-##### NVIDIA DGX-1 (8x V100 16G)
+##### Training performance: NVIDIA DGX-1 (8x V100 16G)
 
 Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{AMP,FP32}_DGX1_16GB_8GPU.sh`
 training script in the PyTorch-19.06-py3 NGC container on NVIDIA DGX-1 with
@@ -648,26 +648,27 @@ deviation, and latency confidence intervals. Throughput is measured
 as the number of generated audio samples per second. RTF is the real-time factor
 which tells how many seconds of speech are generated in 1 second of compute.
 
-##### NVIDIA V100 16G
+##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
 
-|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 50% (s)|Latency confidence interval 100% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
-|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-|1| 128| FP16| 1.73| 0.07| 1.72| 2.11|  89,162| 1.09| 601| 6.98| 4.04|
-|4| 128| FP16| 4.21| 0.17| 4.19| 4.84| 145,800| 1.16| 600| 6.97| 1.65|
-|1| 128| FP32| 1.85| 0.06| 1.84| 2.19|  81,868| 1.00| 590| 6.85| 3.71|
-|4| 128| FP32| 4.80| 0.15| 4.79| 5.43| 125,930| 1.00| 590| 6.85| 1.43|
+|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+|1| 128| FP16| 1.27| 0.06| 1.34| 1.38| 1.41| 121,190| 1.37| 603| 7.00| 5.51|
+|4| 128| FP16| 2.32| 0.09| 2.42| 2.45| 2.59| 277,711| 2.03| 628| 7.23| 3.12|
+|1| 128| FP32| 1.70| 0.05| 1.77| 1.79| 1.84|  88,650| 1.00| 590| 6.85| 4.03|
+|4| 128| FP32| 4.56| 0.12| 4.72| 4.77| 4.87| 136,518| 1.00| 608| 7.06| 1.55|
 
-##### NVIDIA T4
+##### Inference performance: NVIDIA T4
+
+|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+|1| 128| FP16|  3.13| 0.13|  3.28|  3.36|  3.46| 49,276| 1.26| 602| 6.99| 2.24|
+|4| 128| FP16| 11.98| 0.42| 12.44| 12.70| 13.29| 53,676| 1.23| 628| 7.29| 0.61| 
+|1| 128| FP32|  3.88| 0.12|  4.04|  4.09|  4.19| 38,964| 1.00| 591| 6.86| 1.77|
+|4| 128| FP32| 14.34| 0.42| 14.89| 15.08| 15.55| 43,489| 1.00| 609| 7.07| 0.49|
 
-|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 50% (s)|Latency confidence interval 100% (s)|Throughput (samples/sec)|Speed-up with mixed precision|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
-|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
-|1| 128| FP16|  3.16| 0.13|  3.16|  3.81| 48,792| 1.23| 603| 7.00| 2.21|
-|4| 128| FP16| 11.45| 0.49| 11.39| 14.38| 53,771| 1.22| 601| 6.98| 0.61|
-|1| 128| FP32|  3.82| 0.11|  3.81|  4.24| 39,603| 1.00| 591| 6.86| 1.80|
-|4| 128| FP32| 13.80| 0.45| 13.74| 16.09| 43,915| 1.00| 592| 6.87| 0.50|
 
 Our results were obtained by running the `./run_latency_tests.sh` script in
-the PyTorch-19.06-py3 NGC container. Please note that to reproduce the results,
+the PyTorch-19.09-py3 NGC container. Please note that to reproduce the results,
 you need to provide pretrained checkpoints for Tacotron 2 and WaveGlow. Please
 edit the script to provide your checkpoint filenames.
 
@@ -696,7 +697,13 @@ August 2019
 September 2019
 * Introduced inference statistics
 
+October 2019
+* Added Tacotron 2 inference with `torch.jit.script`
+
+November 2019
+* Implemented training resume from checkpoint
+* Added a notebook for running Tacotron 2 and WaveGlow in TRTIS
+
 ### Known issues
 
 There are no known issues in this release.
-
diff --git a/PyTorch/SpeechSynthesis/Tacotron2/common/utils.py b/PyTorch/SpeechSynthesis/Tacotron2/common/utils.py
@@ -33,8 +33,9 @@
 
 def get_mask_from_lengths(lengths):
     max_len = torch.max(lengths).item()
-    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
+    ids = torch.arange(0, max_len, device=lengths.device, dtype=lengths.dtype)  # follows lengths' device/dtype
     mask = (ids < lengths.unsqueeze(1)).byte()
+    mask = torch.le(mask, 0)  # invert: set entries now mark positions >= length
     return mask
 
 

diff --git a/PyTorch/SpeechSynthesis/Tacotron2/export_tacotron2_ts.py b/PyTorch/SpeechSynthesis/Tacotron2/export_tacotron2_ts.py
@@ -0,0 +1,67 @@
+# *****************************************************************************
+#  Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+import torch
+import argparse
+from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model
+from dllogger.autologging import log_hardware, log_args
+
+def parse_args(parser):
+    """
+    Parse commandline arguments.
+    """
+    parser.add_argument('--tacotron2', type=str, required=True,
+                        help='full path to the Tacotron2 model checkpoint file')
+
+    parser.add_argument('-o', '--output', type=str, default="trtis_repo/tacotron/1/model.pt",
+                        help='filename for the Tacotron 2 TorchScript model')
+    parser.add_argument('--amp-run', action='store_true',
+                        help='inference with AMP')
+
+    return parser
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description='PyTorch Tacotron 2 Inference')
+    parser = parse_args(parser)
+    args = parser.parse_args()
+
+    log_args(args)    
+    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
+                                     args.amp_run, rename=True)
+
+    jitted_tacotron2 = torch.jit.script(tacotron2)
+
+    torch.jit.save(jitted_tacotron2, args.output)
+
+
+if __name__ == '__main__':
+    main()
+
+
diff --git a/PyTorch/SpeechSynthesis/Tacotron2/export_tacotron2_ts_config.py b/PyTorch/SpeechSynthesis/Tacotron2/export_tacotron2_ts_config.py
@@ -0,0 +1,117 @@
+# *****************************************************************************
+#  Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#        notice, this list of conditions and the following disclaimer in the
+#        documentation and/or other materials provided with the distribution.
+#      * Neither the name of the NVIDIA CORPORATION nor the
+#        names of its contributors may be used to endorse or promote products
+#        derived from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+
+
+import os
+import argparse
+from dllogger.autologging import log_hardware, log_args
+
+
+def parse_args(parser):
+    """
+        Parse commandline arguments. 
+    """
+    parser.add_argument("--trtis_model_name",
+                        type=str,
+                        default='tacotron2',
+                        help="exports to appropriate directory for TRTIS")
+    parser.add_argument("--trtis_model_version",
+                        type=int,
+                        default=1,
+                        help="exports to appropriate directory for TRTIS")
+    parser.add_argument("--trtis_max_batch_size",
+                        type=int,
+                        default=8,
+                        help="Specifies the 'max_batch_size' in the TRTIS model config.\
+                              See the TRTIS documentation for more info.")
+    parser.add_argument('--amp-run', action='store_true',
+                        help='inference with AMP')
+    return parser
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='PyTorch Tacotron 2 TRTIS config exporter')
+    parser = parse_args(parser)
+    args = parser.parse_args()
+
+    log_args(args)    
+
+    # prepare repository
+    model_folder = os.path.join('./trtis_repo', args.trtis_model_name)
+    version_folder = os.path.join(model_folder, str(args.trtis_model_version))
+    if not os.path.exists(version_folder):
+        os.makedirs(version_folder)
+
+    # build the config for TRTIS
+    config_filename = os.path.join(model_folder, "config.pbtxt")
+    config_template = r"""
+name: "{model_name}"
+platform: "pytorch_libtorch"
+max_batch_size: {max_batch_size}
+input [
+  {{
+    name: "sequence__0"
+    data_type: TYPE_INT64
+    dims: [-1]
+  }},
+  {{
+    name: "input_lengths__1"
+    data_type: TYPE_INT64
+    dims: [1]
+    reshape: {{ shape: [ ] }}
+  }}
+]
+output [
+  {{
+    name: "mel_outputs_postnet__0"
+    data_type: {fp_type}
+    dims: [80,-1]
+  }},
+  {{
+    name: "mel_lengths__1"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: {{ shape: [ ] }}
+  }}
+]
+"""
+
+    config_values = {
+        "model_name": args.trtis_model_name,
+        "max_batch_size": args.trtis_max_batch_size,
+        "fp_type": "TYPE_FP16" if args.amp_run else "TYPE_FP32"
+    }
+
+    with open(model_folder + "/config.pbtxt", "w") as file:
+        final_config_str = config_template.format_map(config_values)
+        file.write(final_config_str)
+
+
+if __name__ == '__main__':
+    main()
+