20 changes: 10 additions & 10 deletions README.md
@@ -37,20 +37,20 @@ We developed a wide range of speedup techniques, including improving beam search

If you use only fairseq or only transformers, you need to install just that one; if you use both, install both.

-### Install from PIP package
-
-`fastseq` Python package can be directly installed with pip using
+### Install from the source
 
 ```bash
-$ pip install fastseq
-```
+# when fairseq and/or transformers have been installed
+$ pip install git+https://github.com/microsoft/fastseq.git
 
-### Install from the source
+# install fastseq + transformers
+$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[transformers]
 
-```bash
-$ git clone https://github.com/microsoft/fastseq
-$ cd fastseq
-$ pip install --editable ./
+# install fastseq + fairseq
+$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[fairseq]
+
+# install fastseq + transformers + fairseq
+$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[transformers,fairseq]
 ```
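After installing one of the extras above, fastseq is typically enabled by importing it before the downstream library so its patches take effect. A minimal sketch, assuming the usual `import fastseq` pattern; the checkpoint name and `generate()` arguments below are illustrative, and the project's Usage section remains the authoritative reference:

```python
# A hedged sketch, not part of this PR: import fastseq first so its
# optimizations can patch the downstream library. The checkpoint name and
# generation arguments here are illustrative assumptions.
import fastseq  # noqa: F401  -- import before transformers/fairseq
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

batch = tokenizer(["FastSeq speeds up sequence generation."], return_tensors="pt")
summary_ids = model.generate(batch["input_ids"], num_beams=4, max_length=50)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
```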

## Usage
22 changes: 16 additions & 6 deletions azure-pipelines.yml
@@ -21,7 +21,14 @@ jobs:
steps:
- script: |
#install fastseq
pip install --editable .
pip install --editable .[transformers,fairseq]

echo "******* Running fastseq unittests *******"
pip install pytorch-transformers==1.0.0
bash tests/run_fastseq_tests.sh

#cd benchmarks/
#bash run_all_benchmarks.sh

#show environment
which python
@@ -32,11 +39,14 @@

echo "******* Running fairseq unittests *******"
bash tests/run_fairseq_tests.sh

echo "******* Running transformers unittests *******"
bash tests/run_transformers_tests.sh
echo "******* Running fastseq unittests *******"
pip install pytorch-transformers==1.0.0
python -m unittest discover -s tests/ -p 'test_*.py' -v
#cd benchmarks/
#bash run_all_benchmarks.sh

displayName: 'run fastseq unit tests'
- task: PublishTestResults@2
condition: succeededOrFailed()
inputs:
testRunTitle: 'Publish test results for Python $(python.version)'
testResultsFiles: 'tests/log_xml/*.xml'
failTaskOnFailedTests: true
4 changes: 2 additions & 2 deletions benchmarks/models/fast_test.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# Run it at its parent folder, and check result at ../perf.
# USAGE -./benchmark.sh
# Run it at its parent folder, and check result at ../perf.
# USAGE -./benchmark.sh
# [fairseq|fairseq+fastseq|transformers|transformers+fastseq]
# <model>
# <task>
4 changes: 3 additions & 1 deletion fastseq/config.py
@@ -9,9 +9,11 @@
FASTSEQ_DEFAULT_LOG_LEVEL = 'INFO'
FASTSEQ_LOG_LEVEL = os.getenv('FASTSEQ_LOG_LEVEL', FASTSEQ_DEFAULT_LOG_LEVEL)
FASTSEQ_CACHE_DIR = os.getenv('FASTSEQ_CACHE_DIR', os.path.join(os.sep, 'tmp'))
FASTSEQ_UNITTEST_LOG_XML_DIR = os.getenv(
'FASTSEQ_UNITTEST_LOG_XML_DIR', os.path.join('tests', 'log_xml'))

FASTSEQ_LOG_FORMAT = (
'%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
'%(levelname)s %(asctime)s %(pathname)s:%(lineno)d] %(message)s')

FASTSEQ_VERSION = '0.0.4'

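The new FASTSEQ_UNITTEST_LOG_XML_DIR setting pairs with the PublishTestResults step added to azure-pipelines.yml above, which collects tests/log_xml/*.xml. The run_*_tests.sh scripts themselves are not shown in this diff; a minimal sketch of how a test entry point could write JUnit-style XML into that directory, assuming the third-party unittest-xml-reporting package:

```python
# Sketch only: emit JUnit-style XML into FASTSEQ_UNITTEST_LOG_XML_DIR so the
# pipeline's PublishTestResults step can pick up tests/log_xml/*.xml.
# Assumes the third-party unittest-xml-reporting package (xmlrunner).
import os
import unittest

import xmlrunner

from fastseq.config import FASTSEQ_UNITTEST_LOG_XML_DIR

os.makedirs(FASTSEQ_UNITTEST_LOG_XML_DIR, exist_ok=True)
suite = unittest.defaultTestLoader.discover('tests', pattern='test_*.py')
xmlrunner.XMLTestRunner(output=FASTSEQ_UNITTEST_LOG_XML_DIR, verbosity=2).run(suite)
```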
14 changes: 11 additions & 3 deletions fastseq/models/prophetnet_fs/bert_dictionary.py
@@ -7,14 +7,16 @@

from collections import Counter
from multiprocessing import Pool
import logging
import os

import torch

from fairseq.tokenizer import tokenize_line
from fairseq.binarizer import safe_readline
from fairseq.data import data_utils, Dictionary
from fastseq.logging import get_logger

logger = get_logger(__name__, logging.INFO)

class BertDictionary(Dictionary):
"""A mapping from symbols to consecutive integers"""
@@ -37,11 +39,17 @@ def load_from_file(cls, filename):
d.count = []
d.indices = {}

line_cnt = 0
with open(
filename, 'r', encoding='utf-8', errors='ignore') as input_file:
for line in input_file:
k, v = line.split()
d.add_symbol(k)
line_cnt += 1
try:
k, v = line.split(" ")
d.add_symbol(k)
except Exception:
logger.error("Bad line %d (1-based): '%s'", line_cnt, line)
raise

d.unk_word = '[UNK]'
d.pad_word = '[PAD]'
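To make the new error path concrete, a standalone sketch of the same parse-and-log pattern on a toy vocabulary; the file contents below are made up, and each line is expected to be `symbol count`:

```python
# Sketch: reproduce the parse loop on toy input to show what the new error
# message reports when a line cannot be split into (symbol, count).
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

lines = ["[PAD] 0\n", "[UNK] 0\n", "a-line-with-no-count\n"]

line_cnt = 0
for line in lines:
    line_cnt += 1
    try:
        k, v = line.split(" ")
        print("symbol:", k)
    except ValueError:
        logger.error("Bad line %d (1-based): '%s'", line_cnt, line)
        raise
```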
2 changes: 2 additions & 0 deletions fastseq/ops/__init__.py
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
19 changes: 13 additions & 6 deletions fastseq/optimizer/fairseq/__init__.py
@@ -40,7 +40,9 @@ def apply_fairseq_optimization():
f"fairseq(v{fairseq.__version__}) is not supported by fastseq(v"
f"{FASTSEQ_VERSION}) yet, please change fairseq to "
f"v{MIN_FAIRSEQ_VERSION} ~ v{MAX_FAIRSEQ_VERSION}, or check other "
"versions of fastseq.")
"versions of fastseq. Currently, no optimization in fastseq has "
"been applied. Please ignore this warning if you are not using "
"fairseq")
return

import fastseq.optimizer.fairseq.beam_search_optimizer # pylint: disable=import-outside-toplevel
@@ -68,15 +70,20 @@ def _update_fairseq_model_registration():
"Update the register model arch {} from {} to {}".format(
arch_name, model_class, OPTIMIZED_CLASSES[model_class]))

is_fairseq_installed = True

try:
import fairseq # pylint: disable=ungrouped-imports
from fairseq.models import ARCH_MODEL_REGISTRY, MODEL_REGISTRY # pylint: disable=ungrouped-imports
from fairseq.sequence_generator import SequenceGenerator # pylint: disable=ungrouped-imports
apply_fairseq_optimization()
except ImportError as error:
is_fairseq_installed = False
logger.warning('fairseq can not be imported. Please ignore this warning if '
'you are not using fairseq')
except:
logger.error("Unexpected error: {}".format(sys.exc_info()[0]))
raise
'you are not using fairseq: {}'.format(error))

if is_fairseq_installed:
try:
apply_fairseq_optimization()
except:
logger.error("Unexpected error: {}".format(sys.exc_info()[0]))
raise
92 changes: 83 additions & 9 deletions fastseq/optimizer/fairseq/beam_search_optimizer.py
@@ -13,10 +13,40 @@
from fairseq import utils
from fairseq.models.transformer import TransformerEncoder, TransformerModel
from fairseq.modules.multihead_attention import MultiheadAttention
from fairseq.search import BeamSearch
from fairseq.sequence_generator import SequenceGenerator
from fastseq.ops.ngram_repeat_block import NGramRepeatBlock
from fastseq.utils.api_decorator import replace

@replace(BeamSearch)
class BeamSearchV2(BeamSearch):

def step(self, step, lprobs, scores):
super()._init_buffers(lprobs)
bsz, beam_size, vocab_size = lprobs.size()

if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
lprobs = lprobs[:, ::beam_size, :].contiguous()
else:
# make probs contain cumulative scores for each hypothesis
lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))

torch.topk(
lprobs.view(bsz, -1),
k=min(
# Take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
beam_size * 2,
lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
),
out=(self.scores_buf, self.indices_buf),
)
self.beams_buf = torch.floor_divide(self.indices_buf, vocab_size)
self.indices_buf.fmod_(vocab_size)
return self.scores_buf, self.indices_buf, self.beams_buf

@replace(TransformerEncoder)
class TransformerEncoderV2(TransformerEncoder):
"""
@@ -494,6 +524,49 @@ def is_finished(sent, step, unfin_idx):
return True
return False

def apply_no_repeat_ngram_cpu(self, tokens, lprobs, bsz, step,
beam_size, no_repeat_ngram_size):
"""Fairseq's CPU implementation of blocking repeated ngrams."""
banned_list = [[] for bbsz_idx in range(bsz * beam_size)]
cpu_tokens = tokens.cpu()[:, :step + 1].numpy()
check_start_pos = step + 2 - no_repeat_ngram_size
for bbsz_idx in range(bsz * beam_size):
for i in range(check_start_pos):
is_banned = True
for k in range(no_repeat_ngram_size - 1):
if cpu_tokens[bbsz_idx, i + k] != cpu_tokens[
bbsz_idx, check_start_pos + k]:
is_banned = False
break
if is_banned:
banned_list[bbsz_idx].append(
cpu_tokens[bbsz_idx,
i + no_repeat_ngram_size - 1])

def calculate_banned_tokens(bbsz_idx):
"""before decoding the next token, prevent decoding
of ngrams that have already appeared
"""
banned_tokens_per_sample = [
(bbsz_idx, t) for t in banned_list[bbsz_idx]
]
return banned_tokens_per_sample

banned_tokens = []
if step + 2 - no_repeat_ngram_size >= 0:
for bbsz_idx in range(bsz * beam_size):
banned_tokens.extend(calculate_banned_tokens(bbsz_idx))

if banned_tokens:
banned_tokens = torch.LongTensor(banned_tokens)
lprobs.index_put_(
tuple(banned_tokens.t()),
lprobs.new_tensor([-math.inf] * len(banned_tokens)))

return lprobs

def finalize_hypos(step, bbsz_idx, eos_scores):
"""
Finalize the given hypotheses at this step, while keeping the total
@@ -658,8 +731,12 @@ def replicate_first_beam(tensor, mask):

if self.no_repeat_ngram_size > 0:
#Applying Cuda Op for NGram repeat Blocking
lprobs = self.no_repeat_ngram_op(tokens,lprobs, bsz, step,
beam_size, self.no_repeat_ngram_size)
if tokens.is_cuda and lprobs.is_cuda:
lprobs = self.no_repeat_ngram_op(tokens, lprobs, bsz, step,
beam_size, self.no_repeat_ngram_size)
else:
lprobs = apply_no_repeat_ngram_cpu(self, tokens, lprobs, bsz,
step, beam_size, self.no_repeat_ngram_size)

cand_scores, cand_indices, cand_beams = self.search.step(
step,
@@ -678,18 +755,16 @@ def replicate_first_beam(tensor, mask):
eos_mask[:, :beam_size][blacklist] = 0

# only consider eos when it's among the top beam_size indices
torch.masked_select(
eos_bbsz_idx = torch.masked_select(
cand_bbsz_idx[:, :beam_size],
mask=eos_mask[:, :beam_size],
out=eos_bbsz_idx,
)

finalized_sents = set()
if eos_bbsz_idx.numel() > 0:
torch.masked_select(
eos_scores = torch.masked_select(
cand_scores[:, :beam_size],
mask=eos_mask[:, :beam_size],
out=eos_scores,
)
finalized_sents = finalize_hypos(step, eos_bbsz_idx,
eos_scores)
@@ -706,7 +781,7 @@ def replicate_first_beam(tensor, mask):
# construct batch_idxs which holds indices of batches to keep for the next pass
batch_mask = cand_indices.new_ones(bsz)
batch_mask[cand_indices.new(finalized_sents)] = 0
batch_idxs = batch_mask.nonzero().squeeze(-1)
batch_idxs = torch.nonzero(batch_mask).squeeze(-1)

eos_mask = eos_mask[batch_idxs]
cand_beams = cand_beams[batch_idxs]
@@ -739,10 +814,9 @@ def replicate_first_beam(tensor, mask):
# candidate active hypos.
active_mask = buffer('active_mask')
eos_mask[:, :beam_size] |= blacklist
torch.add(
active_mask = torch.add(
eos_mask.type_as(cand_offsets) * cand_size,
cand_offsets[:eos_mask.size(1)],
out=active_mask,
)

# get the top beam_size active hypotheses, which are just the hypos
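As a side note on the index arithmetic in BeamSearchV2.step above: top-k is taken over the flattened beam × vocab dimension, and floor_divide/fmod split each flat index back into a beam index and a token id. A self-contained sketch with toy sizes (not fastseq code):

```python
# Sketch: recover (beam, token) pairs from top-k indices taken over the
# flattened beam*vocab dimension, mirroring the floor_divide/fmod logic
# in BeamSearchV2.step. Sizes are toy values.
import torch

bsz, beam_size, vocab_size = 2, 3, 7
lprobs = torch.randn(bsz, beam_size, vocab_size)

scores, indices = torch.topk(
    lprobs.view(bsz, -1),
    k=min(beam_size * 2, lprobs.view(bsz, -1).size(1) - 1),
)
beams = torch.floor_divide(indices, vocab_size)   # which beam each candidate came from
tokens = indices.fmod(vocab_size)                 # token id within the vocabulary
assert torch.equal(beams * vocab_size + tokens, indices)
```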
2 changes: 2 additions & 0 deletions fastseq/optimizer/jit/__init__.py
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
63 changes: 63 additions & 0 deletions fastseq/optimizer/jit/einsum_rewriter.py
@@ -0,0 +1,63 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Optmize einsum operation in the graph"""

from typing import List

import torch
from torch import Tensor

from fastseq.optimizer.jit.utils import graph_pattern, rewrite_graph

def einsum_pattern_0(t0: str, t1: List[Tensor]):
r = torch.einsum(t0, t1)
return r

def einsum_rewrite_pattern_0(eqn: str, operands: List[Tensor]):
# eqn = eqn.replace(' ', '') # TODO: fix the issue: ValueError: stoll
# for cases like "bmhtd,bnhsd->bmhts"
if (len(eqn) == 18 and eqn[0:4] == eqn[13:17] and eqn[0] == eqn[6] and
Contributor:
Can we make this more general? The same pattern can be used for equations without a batch dimension.

Contributor (Author):
I prefer to leave it as it is. If we hit such cases in the future, they can be added easily with a similar code block; making it fully general would amount to re-implementing the einsum kernel.

From the micro-benchmarking results, the runtime for large tensors is very similar with and without the optimization.

eqn[2] == eqn[8] and eqn[4] == eqn[10] and eqn[9] == eqn[17]):
t0 = operands[0]
t1 = operands[1]
b, m, h, t, d = t0.shape
s = t1.size(3)
n = t1.size(1)
t1 = t1.permute(0, 2, 3, 4, 1) # (b, h, s, d, n)
if n > 1:
t1 = t1.sum(dim=4, keepdim=True) # (b, h, s, d, 1)

t0 = t0.permute(0, 2, 1, 3, 4) # (b, h, m, t, d)
t1 = t1.permute(0, 1, 3, 4, 2) # (b, h, d, 1, s)
t0 = t0.reshape(b*h, m*t, d)
t1 = t1.view(b*h, d, s)
r = torch.bmm(t0, t1).view(b, h, m, t, s).permute(0, 2, 1, 3, 4)
return r

# for cases like "bmhts,bnhsd->bmhtd"
if (len(eqn) == 18 and eqn[0:4] == eqn[13:17] and eqn[0] == eqn[6] and
eqn[2] == eqn[8] and eqn[4] == eqn[9] and eqn[10] == eqn[17]):
Contributor:
Same comment as above.

t0 = operands[0]
t1 = operands[1]
b, m, h, t, s = t0.shape
n = t1.size(1)
d = t1.size(4)
t1 = t1.permute(0, 2, 4, 3, 1) # (b, h, d, s, n)
if n > 1:
t1 = t1.sum(dim=4, keepdim=True) # (b, h, d, s, 1)
# t1 = t1.squeeze(1) # (b, h, s, d)
t0 = t0.permute(0, 2, 1, 3, 4) # (b, h, m, t, s)
t1 = t1.permute(0, 1, 3, 4, 2) # (b, h, s, 1, d)
t0 = t0.reshape(b*h, m*t, s)
t1 = t1.view(b*h, s, d)
r = torch.bmm(t0, t1).view(b, h, m, t, d).permute(0, 2, 1, 3, 4)
return r
Contributor:
Is the returned tensor contiguous? When comparing the speedup against einsum, please take this into account as well.

Contributor (Author):
No, the returned tensor is not contiguous, and the output of einsum is not contiguous either, so I think it is an apples-to-apples comparison.
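One way to sanity-check this discussion is to compare the rewritten path against torch.einsum directly. An editor's sketch, assuming einsum_rewrite_pattern_0 is importable as added in this PR; the shapes are arbitrary toy values:

```python
# Editor's sketch (not part of the PR): compare the rewritten path with
# torch.einsum for the "bmhtd,bnhsd->bmhts" case and inspect contiguity.
import torch

from fastseq.optimizer.jit.einsum_rewriter import einsum_rewrite_pattern_0

b, m, n, h, t, s, d = 2, 3, 1, 4, 5, 6, 8
t0 = torch.randn(b, m, h, t, d)
t1 = torch.randn(b, n, h, s, d)

eqn = "bmhtd,bnhsd->bmhts"
ref = torch.einsum(eqn, t0, t1)
out = einsum_rewrite_pattern_0(eqn, [t0, t1])

print(torch.allclose(ref, out, atol=1e-5))        # numerical agreement
print(ref.is_contiguous(), out.is_contiguous())   # neither is guaranteed contiguous
```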


return torch.einsum(eqn, operands)

EINSUM_PATTERN_STR = graph_pattern(einsum_pattern_0)()
EINSUM_REWRITE_PATTERN_STR = graph_pattern(einsum_rewrite_pattern_0)()

def rewrite_einsum(input_graph: torch._C.Graph):
rewrite_graph(EINSUM_PATTERN_STR, EINSUM_REWRITE_PATTERN_STR, input_graph)
11 changes: 11 additions & 0 deletions fastseq/optimizer/jit/graph_rewriter.py
@@ -0,0 +1,11 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Load and apply the registered graph rewrite patterns"""

import torch

from fastseq.optimizer.jit.einsum_rewriter import rewrite_einsum

def optimize_graph(input_graph: torch._C.Graph):
rewrite_einsum(input_graph)