20 changes: 10 additions & 10 deletions README.md
@@ -37,20 +37,20 @@ We developed a wide range of speedup techniques, including improving beam search

If you use only fairseq or only transformers, you need to install just that one; if you use both, install both.

-### Install from PIP package
-
-`fastseq` Python package can be directly installed with pip using
+### Install from the source
 
 ```bash
-$ pip install fastseq
-```
+# when fairseq and/or transformers have been installed
+$ pip install git+https://github.com/microsoft/fastseq.git
 
-### Install from the source
+# install fastseq + transformers
+$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[transformers]
 
-```bash
-$ git clone https://github.com/microsoft/fastseq
-$ cd fastseq
-$ pip install --editable ./
+# install fastseq + fairseq
+$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[fairseq]
+
+# install fastseq + transformers + fairseq
+$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[transformers,fairseq]
 ```
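After installing one of the extras above, fastseq is typically enabled by importing it before the downstream library so its patches take effect. A minimal sketch, assuming the usual `import fastseq` pattern; the checkpoint name and `generate()` arguments below are illustrative, and the project's Usage section remains the authoritative reference:

```python
# A hedged sketch, not part of this PR: import fastseq first so its
# optimizations can patch the downstream library. The checkpoint name and
# generation arguments here are illustrative assumptions.
import fastseq  # noqa: F401  -- import before transformers/fairseq
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

batch = tokenizer(["FastSeq speeds up sequence generation."], return_tensors="pt")
summary_ids = model.generate(batch["input_ids"], num_beams=4, max_length=50)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
```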

## Usage
22 changes: 16 additions & 6 deletions azure-pipelines.yml
@@ -21,7 +21,14 @@ jobs:
steps:
- script: |
#install fastseq
pip install --editable .
pip install --editable .[transformers,fairseq]

echo "******* Running fastseq unittests *******"
pip install pytorch-transformers==1.0.0
bash tests/run_fastseq_tests.sh

#cd benchmarks/
#bash run_all_benchmarks.sh

#show environment
which python
@@ -32,11 +39,14 @@

echo "******* Running fairseq unittests *******"
bash tests/run_fairseq_tests.sh

echo "******* Running transformers unittests *******"
bash tests/run_transformers_tests.sh
echo "******* Running fastseq unittests *******"
pip install pytorch-transformers==1.0.0
python -m unittest discover -s tests/ -p 'test_*.py' -v
#cd benchmarks/
#bash run_all_benchmarks.sh

displayName: 'run fastseq unit tests'
- task: PublishTestResults@2
condition: succeededOrFailed()
inputs:
testRunTitle: 'Publish test results for Python $(python.version)'
testResultsFiles: 'tests/log_xml/*.xml'
failTaskOnFailedTests: true
4 changes: 2 additions & 2 deletions benchmarks/models/fast_test.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# Run it at its parent folder, and check result at ../perf.
# USAGE -./benchmark.sh
# Run it at its parent folder, and check result at ../perf.
# USAGE -./benchmark.sh
# [fairseq|fairseq+fastseq|transformers|transformers+fastseq]
# <model>
# <task>
4 changes: 3 additions & 1 deletion fastseq/config.py
@@ -9,9 +9,11 @@
FASTSEQ_DEFAULT_LOG_LEVEL = 'INFO'
FASTSEQ_LOG_LEVEL = os.getenv('FASTSEQ_LOG_LEVEL', FASTSEQ_DEFAULT_LOG_LEVEL)
FASTSEQ_CACHE_DIR = os.getenv('FASTSEQ_CACHE_DIR', os.path.join(os.sep, 'tmp'))
FASTSEQ_UNITTEST_LOG_XML_DIR = os.getenv(
'FASTSEQ_UNITTEST_LOG_XML_DIR', os.path.join('tests', 'log_xml'))

FASTSEQ_LOG_FORMAT = (
'%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
'%(levelname)s %(asctime)s %(pathname)s:%(lineno)d] %(message)s')

FASTSEQ_VERSION = '0.0.4'

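The new FASTSEQ_UNITTEST_LOG_XML_DIR setting pairs with the PublishTestResults step added to azure-pipelines.yml above, which collects tests/log_xml/*.xml. The run_*_tests.sh scripts themselves are not shown in this diff; a minimal sketch of how a test entry point could write JUnit-style XML into that directory, assuming the third-party unittest-xml-reporting package:

```python
# Sketch only: emit JUnit-style XML into FASTSEQ_UNITTEST_LOG_XML_DIR so the
# pipeline's PublishTestResults step can pick up tests/log_xml/*.xml.
# Assumes the third-party unittest-xml-reporting package (xmlrunner).
import os
import unittest

import xmlrunner

from fastseq.config import FASTSEQ_UNITTEST_LOG_XML_DIR

os.makedirs(FASTSEQ_UNITTEST_LOG_XML_DIR, exist_ok=True)
suite = unittest.defaultTestLoader.discover('tests', pattern='test_*.py')
xmlrunner.XMLTestRunner(output=FASTSEQ_UNITTEST_LOG_XML_DIR, verbosity=2).run(suite)
```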
14 changes: 11 additions & 3 deletions fastseq/models/prophetnet_fs/bert_dictionary.py
@@ -7,14 +7,16 @@

from collections import Counter
from multiprocessing import Pool
import logging
import os

import torch

from fairseq.tokenizer import tokenize_line
from fairseq.binarizer import safe_readline
from fairseq.data import data_utils, Dictionary
from fastseq.logging import get_logger

logger = get_logger(__name__, logging.INFO)

class BertDictionary(Dictionary):
"""A mapping from symbols to consecutive integers"""
@@ -37,11 +39,17 @@ def load_from_file(cls, filename):
d.count = []
d.indices = {}

line_cnt = 0
with open(
filename, 'r', encoding='utf-8', errors='ignore') as input_file:
for line in input_file:
k, v = line.split()
d.add_symbol(k)
line_cnt += 1
try:
k, v = line.split(" ")
d.add_symbol(k)
except Exception:
logger.error("Bad line %d (1-based): '%s'", line_cnt, line)
raise

d.unk_word = '[UNK]'
d.pad_word = '[PAD]'
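To make the new error path concrete, a standalone sketch of the same parse-and-log pattern on a toy vocabulary; the file contents below are made up, and each line is expected to be `symbol count`:

```python
# Sketch: reproduce the parse loop on toy input to show what the new error
# message reports when a line cannot be split into (symbol, count).
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

lines = ["[PAD] 0\n", "[UNK] 0\n", "a-line-with-no-count\n"]

line_cnt = 0
for line in lines:
    line_cnt += 1
    try:
        k, v = line.split(" ")
        print("symbol:", k)
    except ValueError:
        logger.error("Bad line %d (1-based): '%s'", line_cnt, line)
        raise
```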
2 changes: 2 additions & 0 deletions fastseq/ops/__init__.py
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
19 changes: 13 additions & 6 deletions fastseq/optimizer/fairseq/__init__.py
@@ -40,7 +40,9 @@ def apply_fairseq_optimization():
f"fairseq(v{fairseq.__version__}) is not supported by fastseq(v"
f"{FASTSEQ_VERSION}) yet, please change fairseq to "
f"v{MIN_FAIRSEQ_VERSION} ~ v{MAX_FAIRSEQ_VERSION}, or check other "
"versions of fastseq.")
"versions of fastseq. Currently, no optimization in fastseq has "
"been applied. Please ignore this warning if you are not using "
"fairseq")
return

import fastseq.optimizer.fairseq.beam_search_optimizer # pylint: disable=import-outside-toplevel
@@ -68,15 +70,20 @@ def _update_fairseq_model_registration():
"Update the register model arch {} from {} to {}".format(
arch_name, model_class, OPTIMIZED_CLASSES[model_class]))

is_fairseq_installed = True

try:
import fairseq # pylint: disable=ungrouped-imports
from fairseq.models import ARCH_MODEL_REGISTRY, MODEL_REGISTRY # pylint: disable=ungrouped-imports
from fairseq.sequence_generator import SequenceGenerator # pylint: disable=ungrouped-imports
apply_fairseq_optimization()
except ImportError as error:
is_fairseq_installed = False
logger.warning('fairseq can not be imported. Please ignore this warning if '
'you are not using fairseq')
except:
logger.error("Unexpected error: {}".format(sys.exc_info()[0]))
raise
'you are not using fairseq: {}'.format(error))

if is_fairseq_installed:
try:
apply_fairseq_optimization()
except:
logger.error("Unexpected error: {}".format(sys.exc_info()[0]))
raise
92 changes: 83 additions & 9 deletions fastseq/optimizer/fairseq/beam_search_optimizer.py
@@ -13,10 +13,40 @@
from fairseq import utils
from fairseq.models.transformer import TransformerEncoder, TransformerModel
from fairseq.modules.multihead_attention import MultiheadAttention
from fairseq.search import BeamSearch
from fairseq.sequence_generator import SequenceGenerator
from fastseq.ops.ngram_repeat_block import NGramRepeatBlock
from fastseq.utils.api_decorator import replace

@replace(BeamSearch)
class BeamSearchV2(BeamSearch):

def step(self, step, lprobs, scores):
super()._init_buffers(lprobs)
bsz, beam_size, vocab_size = lprobs.size()

if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
lprobs = lprobs[:, ::beam_size, :].contiguous()
else:
# make probs contain cumulative scores for each hypothesis
lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))

torch.topk(
lprobs.view(bsz, -1),
k=min(
# Take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
beam_size * 2,
lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
),
out=(self.scores_buf, self.indices_buf),
)
self.beams_buf = torch.floor_divide(self.indices_buf, vocab_size)
self.indices_buf.fmod_(vocab_size)
return self.scores_buf, self.indices_buf, self.beams_buf

@replace(TransformerEncoder)
class TransformerEncoderV2(TransformerEncoder):
"""
@@ -494,6 +524,49 @@ def is_finished(sent, step, unfin_idx):
return True
return False

def apply_no_repeat_ngram_cpu(self, tokens, lprobs, bsz, step,
beam_size, no_repeat_ngram_size):
"""Fairseq's CPU implementation of blocking repeated ngrams."""
banned_list = [[] for bbsz_idx in range(bsz * beam_size)]
cpu_tokens = tokens.cpu()[:, :step + 1].numpy()
check_start_pos = step + 2 - no_repeat_ngram_size
for bbsz_idx in range(bsz * beam_size):
for i in range(check_start_pos):
is_banned = True
for k in range(no_repeat_ngram_size - 1):
if cpu_tokens[bbsz_idx, i + k] != cpu_tokens[
bbsz_idx, check_start_pos + k]:
is_banned = False
break
if is_banned:
banned_list[bbsz_idx].append(
cpu_tokens[bbsz_idx,
i + no_repeat_ngram_size - 1])

def calculate_banned_tokens(bbsz_idx):
"""before decoding the next token, prevent decoding
of ngrams that have already appeared
"""
banned_tokens_per_sample = [
(bbsz_idx, t) for t in banned_list[bbsz_idx]
]
return banned_tokens_per_sample

banned_tokens = []
if step + 2 - no_repeat_ngram_size >= 0:
for bbsz_idx in range(bsz * beam_size):
banned_tokens.extend(calculate_banned_tokens(bbsz_idx))

if banned_tokens:
banned_tokens = torch.LongTensor(banned_tokens)
lprobs.index_put_(
tuple(banned_tokens.t()),
lprobs.new_tensor([-math.inf] * len(banned_tokens)))

return lprobs

def finalize_hypos(step, bbsz_idx, eos_scores):
"""
Finalize the given hypotheses at this step, while keeping the total
@@ -658,8 +731,12 @@ def replicate_first_beam(tensor, mask):

if self.no_repeat_ngram_size > 0:
#Applying Cuda Op for NGram repeat Blocking
lprobs = self.no_repeat_ngram_op(tokens,lprobs, bsz, step,
beam_size, self.no_repeat_ngram_size)
if tokens.is_cuda and lprobs.is_cuda:
lprobs = self.no_repeat_ngram_op(tokens, lprobs, bsz, step,
beam_size, self.no_repeat_ngram_size)
else:
lprobs = apply_no_repeat_ngram_cpu(self, tokens, lprobs, bsz,
step, beam_size, self.no_repeat_ngram_size)

cand_scores, cand_indices, cand_beams = self.search.step(
step,
@@ -678,18 +755,16 @@ def replicate_first_beam(tensor, mask):
eos_mask[:, :beam_size][blacklist] = 0

# only consider eos when it's among the top beam_size indices
torch.masked_select(
eos_bbsz_idx = torch.masked_select(
cand_bbsz_idx[:, :beam_size],
mask=eos_mask[:, :beam_size],
out=eos_bbsz_idx,
)

finalized_sents = set()
if eos_bbsz_idx.numel() > 0:
torch.masked_select(
eos_scores = torch.masked_select(
cand_scores[:, :beam_size],
mask=eos_mask[:, :beam_size],
out=eos_scores,
)
finalized_sents = finalize_hypos(step, eos_bbsz_idx,
eos_scores)
@@ -706,7 +781,7 @@ def replicate_first_beam(tensor, mask):
# construct batch_idxs which holds indices of batches to keep for the next pass
batch_mask = cand_indices.new_ones(bsz)
batch_mask[cand_indices.new(finalized_sents)] = 0
batch_idxs = batch_mask.nonzero().squeeze(-1)
batch_idxs = torch.nonzero(batch_mask).squeeze(-1)

eos_mask = eos_mask[batch_idxs]
cand_beams = cand_beams[batch_idxs]
@@ -739,10 +814,9 @@ def replicate_first_beam(tensor, mask):
# candidate active hypos.
active_mask = buffer('active_mask')
eos_mask[:, :beam_size] |= blacklist
torch.add(
active_mask = torch.add(
eos_mask.type_as(cand_offsets) * cand_size,
cand_offsets[:eos_mask.size(1)],
out=active_mask,
)

# get the top beam_size active hypotheses, which are just the hypos
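As a side note on the index arithmetic in BeamSearchV2.step above: top-k is taken over the flattened beam × vocab dimension, and floor_divide/fmod split each flat index back into a beam index and a token id. A self-contained sketch with toy sizes (not fastseq code):

```python
# Sketch: recover (beam, token) pairs from top-k indices taken over the
# flattened beam*vocab dimension, mirroring the floor_divide/fmod logic
# in BeamSearchV2.step. Sizes are toy values.
import torch

bsz, beam_size, vocab_size = 2, 3, 7
lprobs = torch.randn(bsz, beam_size, vocab_size)

scores, indices = torch.topk(
    lprobs.view(bsz, -1),
    k=min(beam_size * 2, lprobs.view(bsz, -1).size(1) - 1),
)
beams = torch.floor_divide(indices, vocab_size)   # which beam each candidate came from
tokens = indices.fmod(vocab_size)                 # token id within the vocabulary
assert torch.equal(beams * vocab_size + tokens, indices)
```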
2 changes: 2 additions & 0 deletions fastseq/optimizer/jit/__init__.py
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
63 changes: 63 additions & 0 deletions fastseq/optimizer/jit/einsum_rewriter.py
@@ -0,0 +1,63 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Optmize einsum operation in the graph"""

from typing import List

import torch
from torch import Tensor

from fastseq.optimizer.jit.utils import graph_pattern, rewrite_graph

def einsum_pattern_0(t0: str, t1: List[Tensor]):
r = torch.einsum(t0, t1)
return r

def einsum_rewrite_pattern_0(eqn: str, operands: List[Tensor]):
# eqn = eqn.replace(' ', '') # TODO: fix the issue: ValueError: stoll
# for cases like "bmhtd,bnhsd->bmhts"
if (len(eqn) == 18 and eqn[0:4] == eqn[13:17] and eqn[0] == eqn[6] and
Contributor:
Can we make this more general? The same pattern can be used for equations without a batch dimension.

Contributor (Author):
I prefer to leave it as it is. If we hit such cases in the future, they can be added easily with a similar code block; making it fully general would amount to re-implementing the einsum kernel.

From the micro-benchmarking results, the runtime for large tensors is very similar with and without the optimization.

eqn[2] == eqn[8] and eqn[4] == eqn[10] and eqn[9] == eqn[17]):
t0 = operands[0]
t1 = operands[1]
b, m, h, t, d = t0.shape
s = t1.size(3)
n = t1.size(1)
t1 = t1.permute(0, 2, 3, 4, 1) # (b, h, s, d, n)
if n > 1:
t1 = t1.sum(dim=4, keepdim=True) # (b, h, s, d, 1)

t0 = t0.permute(0, 2, 1, 3, 4) # (b, h, m, t, d)
t1 = t1.permute(0, 1, 3, 4, 2) # (b, h, d, 1, s)
t0 = t0.reshape(b*h, m*t, d)
t1 = t1.view(b*h, d, s)
r = torch.bmm(t0, t1).view(b, h, m, t, s).permute(0, 2, 1, 3, 4)
return r

# for cases like "bmhts,bnhsd->bmhtd"
if (len(eqn) == 18 and eqn[0:4] == eqn[13:17] and eqn[0] == eqn[6] and
eqn[2] == eqn[8] and eqn[4] == eqn[9] and eqn[10] == eqn[17]):
Contributor:
Same comment as above.

t0 = operands[0]
t1 = operands[1]
b, m, h, t, s = t0.shape
n = t1.size(1)
d = t1.size(4)
t1 = t1.permute(0, 2, 4, 3, 1) # (b, h, d, s, n)
if n > 1:
t1 = t1.sum(dim=4, keepdim=True) # (b, h, d, s, 1)
# t1 = t1.squeeze(1) # (b, h, s, d)
t0 = t0.permute(0, 2, 1, 3, 4) # (b, h, m, t, s)
t1 = t1.permute(0, 1, 3, 4, 2) # (b, h, s, 1, d)
t0 = t0.reshape(b*h, m*t, s)
t1 = t1.view(b*h, s, d)
r = torch.bmm(t0, t1).view(b, h, m, t, d).permute(0, 2, 1, 3, 4)
return r
Contributor:
Is the returned tensor contiguous? When comparing the speedup against einsum, please take this into account as well.

Contributor (Author):
No, the returned tensor is not contiguous, and the output of einsum is not contiguous either, so I think it is an apples-to-apples comparison.
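One way to sanity-check this discussion is to compare the rewritten path against torch.einsum directly. An editor's sketch, assuming einsum_rewrite_pattern_0 is importable as added in this PR; the shapes are arbitrary toy values:

```python
# Editor's sketch (not part of the PR): compare the rewritten path with
# torch.einsum for the "bmhtd,bnhsd->bmhts" case and inspect contiguity.
import torch

from fastseq.optimizer.jit.einsum_rewriter import einsum_rewrite_pattern_0

b, m, n, h, t, s, d = 2, 3, 1, 4, 5, 6, 8
t0 = torch.randn(b, m, h, t, d)
t1 = torch.randn(b, n, h, s, d)

eqn = "bmhtd,bnhsd->bmhts"
ref = torch.einsum(eqn, t0, t1)
out = einsum_rewrite_pattern_0(eqn, [t0, t1])

print(torch.allclose(ref, out, atol=1e-5))        # numerical agreement
print(ref.is_contiguous(), out.is_contiguous())   # neither is guaranteed contiguous
```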


return torch.einsum(eqn, operands)

EINSUM_PATTERN_STR = graph_pattern(einsum_pattern_0)()
EINSUM_REWRITE_PATTERN_STR = graph_pattern(einsum_rewrite_pattern_0)()

def rewrite_einsum(input_graph: torch._C.Graph):
rewrite_graph(EINSUM_PATTERN_STR, EINSUM_REWRITE_PATTERN_STR, input_graph)
11 changes: 11 additions & 0 deletions fastseq/optimizer/jit/graph_rewriter.py
@@ -0,0 +1,11 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Load and apply the registered graph rewrite patterns"""

import torch

from fastseq.optimizer.jit.einsum_rewriter import rewrite_einsum

def optimize_graph(input_graph: torch._C.Graph):
rewrite_einsum(input_graph)