diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex
index 13928ee1..7a78d7df 100644
--- a/lib/bumblebee.ex
+++ b/lib/bumblebee.ex
@@ -176,6 +176,7 @@ defmodule Bumblebee do
     "MPNetForTokenClassification" => {Bumblebee.Text.MpNet, :for_token_classification},
     "MPNetForQuestionAnswering" => {Bumblebee.Text.MpNet, :for_question_answering},
     "MPNetForMultipleChoice" => {Bumblebee.Text.MpNet, :for_multiple_choice},
+    "NomicBertModel" => {Bumblebee.Text.NomicBert, :base},
     "PhiModel" => {Bumblebee.Text.Phi, :base},
     "PhiForCausalLM" => {Bumblebee.Text.Phi, :for_causal_language_modeling},
     "PhiForSequenceClassification" => {Bumblebee.Text.Phi, :for_sequence_classification},
@@ -266,6 +267,7 @@ defmodule Bumblebee do
     "mistral" => :llama,
     "mbart" => :mbart,
     "mpnet" => :mpnet,
+    "nomic_bert" => :bert,
     "phi" => :code_gen,
     "phi3" => :llama,
     "qwen3" => :qwen2,
diff --git a/lib/bumblebee/text/nomic_bert.ex b/lib/bumblebee/text/nomic_bert.ex
new file mode 100644
index 00000000..98a4ef84
--- /dev/null
+++ b/lib/bumblebee/text/nomic_bert.ex
@@ -0,0 +1,357 @@
+defmodule Bumblebee.Text.NomicBert do
+  alias Bumblebee.Shared
+
+  options =
+    [
+      vocab_size: [
+        default: 30528,
+        doc: """
+        the vocabulary size of the token embedding. This corresponds to the number of distinct
+        tokens that can be represented in model input and output
+        """
+      ],
+      max_positions: [
+        default: 8192,
+        doc: """
+        the maximum sequence length that this model can process. Typically this is set to a large
+        value just in case, such as 2048, 4096 or 8192
+        """
+      ],
+      max_token_types: [
+        default: 2,
+        doc: """
+        the vocabulary size of the token type embedding (also referred to as segment embedding).
+        This corresponds to how many different token groups can be distinguished in the input
+        """
+      ],
+      hidden_size: [
+        default: 768,
+        doc: "the dimensionality of hidden layers"
+      ],
+      num_blocks: [
+        default: 12,
+        doc: "the number of Transformer blocks in the encoder"
+      ],
+      num_attention_heads: [
+        default: 12,
+        doc: "the number of attention heads for each attention layer in the encoder"
+      ],
+      intermediate_size: [
+        default: nil,
+        doc:
+          "the dimensionality of the intermediate layer in the transformer feed-forward network (FFN) in the encoder. Defaults to `div(8 * hidden_size, 3)`, rounded up to the next multiple of 256"
+      ],
+      activation: [
+        default: :silu,
+        doc: "the activation function"
+      ],
+      rotary_embedding_base: [
+        default: 1000,
+        doc: "base for computing rotary embedding frequency"
+      ],
+      rotary_embedding_percentage: [
+        default: 1.0,
+        doc: "percentage of the attention head size to use for rotary embeddings"
+      ],
+      layer_norm_epsilon: [
+        default: 1.0e-5,
+        doc: "the epsilon used by the layer normalization layers"
+      ],
+      initializer_scale: [
+        default: 0.02,
+        doc:
+          "the standard deviation of the normal initializer used for initializing kernel parameters"
+      ],
+      ffn_gate_bias: [
+        default: true,
+        doc: "whether to use bias in the up and gate projections of the FFN"
+      ],
+      ffn_output_bias: [
+        default: true,
+        doc: "whether to use bias in the output projection of the FFN"
+      ]
+    ] ++ Shared.common_options([:num_labels, :id_to_label])
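+
+  # A worked example of the FFN sizing above (illustrative only): with the
+  # default hidden_size of 768 and no explicit :intermediate_size, the
+  # fallback in intermediate_size/1 below is div(8 * 768, 3) = 2048, which
+  # is already a multiple of 256, so the round-up step is a no-op there.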
+
+  @moduledoc """
+  Nomic BERT model family.
+
+  This is a variant of BERT that uses:
+
+  - Rotary position embeddings (RoPE) instead of absolute position embeddings
+  - SwiGLU activation in the feed-forward network
+  - Post-normalization (like the original BERT)
+  - No biases in the attention projections (biases in the feed-forward
+    network are configurable via `:ffn_gate_bias` and `:ffn_output_bias`)
+
+  ## Architectures
+
+    * `:base` - plain Nomic BERT without any head on top
+
+  ## Inputs
+
+    * `"input_ids"` - `{batch_size, sequence_length}`
+
+      Indices of input sequence tokens in the vocabulary.
+
+    * `"attention_mask"` - `{batch_size, sequence_length}`
+
+      Mask indicating which tokens to attend to. This is used to ignore
+      padding tokens, which are added when processing a batch of sequences
+      with different length.
+
+    * `"token_type_ids"` - `{batch_size, sequence_length}`
+
+      Mask distinguishing groups in the input sequence. This is used when
+      the input sequence is semantically a pair of sequences.
+
+    * `"position_ids"` - `{batch_size, sequence_length}`
+
+      Indices of positions of each input sequence token, used when applying
+      rotary position embeddings (RoPE).
+
+    * `"attention_head_mask"` - `{num_blocks, num_attention_heads}`
+
+      Mask to nullify selected heads of the self-attention blocks in
+      the encoder.
+
+  ## Global layer options
+
+  #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])}
+
+  ## Configuration
+
+  #{Shared.options_doc(options)}
+  """
+
+  defstruct [architecture: :base] ++ Shared.option_defaults(options)
+
+  @behaviour Bumblebee.ModelSpec
+  @behaviour Bumblebee.Configurable
+
+  import Bumblebee.Utils.Model, only: [join: 2]
+
+  alias Bumblebee.Layers
+
+  @impl true
+  def architectures(), do: [:base]
+
+  @impl true
+  def config(spec, opts) do
+    spec
+    |> Shared.put_config_attrs(opts)
+    |> Shared.validate_label_options()
+  end
+
+  @impl true
+  def input_template(_spec) do
+    %{"input_ids" => Nx.template({1, 1}, :u32)}
+  end
+
+  @impl true
+  def model(%__MODULE__{architecture: :base} = spec) do
+    inputs = inputs(spec)
+
+    inputs
+    |> core(spec)
+    |> Layers.output()
+  end
+
+  defp inputs(spec) do
+    shape = {nil, nil}
+    attention_head_mask_shape = {spec.num_blocks, spec.num_attention_heads}
+
+    Bumblebee.Utils.Model.inputs_to_map([
+      Axon.input("input_ids", shape: shape),
+      Axon.input("attention_mask", optional: true, shape: shape),
+      Axon.input("token_type_ids", optional: true, shape: shape),
+      Axon.input("position_ids", optional: true, shape: shape),
+      Axon.input("attention_head_mask", optional: true, shape: attention_head_mask_shape)
+    ])
+  end
+
+  defp core(inputs, spec) do
+    token_type_ids =
+      Layers.default inputs["token_type_ids"] do
+        Layers.default_token_type_ids(inputs["input_ids"])
+      end
+
+    embeddings = embedder(inputs["input_ids"], token_type_ids, spec, name: "embedder")
+
+    position_ids =
+      Layers.default inputs["position_ids"] do
+        Layers.default_position_ids(embeddings)
+      end
+
+    encoder_outputs =
+      encoder(
+        embeddings,
+        position_ids,
+        inputs["attention_mask"],
+        inputs["attention_head_mask"],
+        spec,
+        name: "encoder"
+      )
+
+    pooled_state = pooler(encoder_outputs.hidden_state, spec, name: "pooler")
+
+    %{
+      hidden_state: encoder_outputs.hidden_state,
+      pooled_state: pooled_state,
+      hidden_states: encoder_outputs.hidden_states,
+      attentions: encoder_outputs.attentions
+    }
+  end
+
+  defp embedder(input_ids, token_type_ids, spec, opts) do
+    name = opts[:name]
+
+    token_embeddings =
+      Axon.embedding(input_ids, spec.vocab_size, spec.hidden_size,
+        kernel_initializer: kernel_initializer(spec),
+        name: join(name, "token_embedding")
+      )
+
+    token_type_embeddings =
+      Axon.embedding(token_type_ids, spec.max_token_types, spec.hidden_size,
+        kernel_initializer: kernel_initializer(spec),
+        name: join(name, "token_type_embedding")
+      )
+
+    Axon.add([token_embeddings, token_type_embeddings])
+    |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
+  end
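+
+  # Unlike the original BERT, the embedder above adds no absolute position
+  # embedding. Position information enters only through the rotary embedding
+  # applied inside each attention block below, using position ids that
+  # default to 0..(sequence_length - 1).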
+
+  defp encoder(hidden_state, position_ids, attention_mask, attention_head_mask, spec, opts) do
+    name = opts[:name]
+
+    Layers.Transformer.blocks(hidden_state,
+      attention_mask: attention_mask,
+      attention_head_mask: attention_head_mask,
+      num_blocks: spec.num_blocks,
+      num_attention_heads: spec.num_attention_heads,
+      hidden_size: spec.hidden_size,
+      kernel_initializer: kernel_initializer(spec),
+      layer_norm: [epsilon: spec.layer_norm_epsilon],
+      ffn:
+        &gated_ffn(&1, intermediate_size(spec), spec.hidden_size,
+          name: &2,
+          activation: spec.activation,
+          gate_use_bias: spec.ffn_gate_bias,
+          output_use_bias: spec.ffn_output_bias
+        ),
+      block_type: :standard,
+      causal: false,
+      rotary_embedding: [
+        position_ids: position_ids,
+        max_positions: spec.max_positions,
+        base: spec.rotary_embedding_base,
+        percentage: spec.rotary_embedding_percentage
+      ],
+      query_use_bias: false,
+      key_use_bias: false,
+      value_use_bias: false,
+      output_use_bias: false,
+      name: join(name, "blocks")
+    )
+  end
+
+  defp pooler(hidden_state, spec, opts) do
+    name = opts[:name]
+
+    hidden_state
+    |> Layers.take_token(index: 0, axis: 1)
+    |> Axon.dense(spec.hidden_size,
+      kernel_initializer: kernel_initializer(spec),
+      name: join(name, "output")
+    )
+    |> Axon.tanh()
+  end
+
+  defp gated_ffn(hidden_state, intermediate_size, output_size, opts) do
+    name = opts[:name]
+    activation = opts[:activation]
+    gate_use_bias = opts[:gate_use_bias]
+    output_use_bias = opts[:output_use_bias]
+
+    # Nomic MLP: y = fc2(fc11(x) * activation(fc12(x))), where fc11 is "up",
+    # fc12 is "gate" and fc2 is "down"
+    up =
+      Axon.dense(hidden_state, intermediate_size,
+        name: join(name, "up"),
+        use_bias: gate_use_bias
+      )
+
+    gate =
+      Axon.dense(hidden_state, intermediate_size,
+        name: join(name, "gate"),
+        use_bias: gate_use_bias
+      )
+
+    # Nomic applies the activation to gate, not up: up * activation(gate)
+    hidden_state = Axon.multiply(up, Axon.activation(gate, activation))
+
+    Axon.dense(hidden_state, output_size, name: join(name, "down"), use_bias: output_use_bias)
+  end
+
+  defp kernel_initializer(spec) do
+    Axon.Initializers.normal(scale: spec.initializer_scale)
+  end
+
+  # NomicBERT rounds the intermediate size up to the next multiple of 256
+  # for hardware efficiency
+  defp intermediate_size(spec) do
+    size = spec.intermediate_size || div(8 * spec.hidden_size, 3)
+    multiple_of = 256
+    div(size + multiple_of - 1, multiple_of) * multiple_of
+  end
+
+  defimpl Bumblebee.HuggingFace.Transformers.Config do
+    def load(spec, data) do
+      import Shared.Converters
+
+      opts =
+        convert!(data,
+          vocab_size: {"vocab_size", number()},
+          max_positions: {"n_positions", number()},
+          max_token_types: {"type_vocab_size", number()},
+          hidden_size: {"n_embd", number()},
+          num_blocks: {"n_layer", number()},
+          num_attention_heads: {"n_head", number()},
+          intermediate_size: {"n_inner", optional(number())},
+          rotary_embedding_base: {"rotary_emb_base", number()},
+          rotary_embedding_percentage: {"rotary_emb_fraction", optional(number())},
+          layer_norm_epsilon: {"layer_norm_epsilon", number()},
+          initializer_scale: {"initializer_range", number()},
+          ffn_gate_bias: {"mlp_fc1_bias", boolean()},
+          ffn_output_bias: {"mlp_fc2_bias", boolean()}
+        ) ++ Shared.common_options_from_transformers(data, spec)
+
+      @for.config(spec, opts)
+    end
+  end
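+
+  # The checkpoint stores query, key and value as a single fused Wqkv matrix
+  # of shape {3 * hidden_size, hidden_size} (e.g. {2304, 768} with the
+  # defaults above), so the mapping below slices it into three equal chunks
+  # along the first axis.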
"encoder.blocks.{n}.self_attention.query" => qkv_dense("encoder.layers.{n}.attn.Wqkv", 0), + "encoder.blocks.{n}.self_attention.key" => qkv_dense("encoder.layers.{n}.attn.Wqkv", 1), + "encoder.blocks.{n}.self_attention.value" => qkv_dense("encoder.layers.{n}.attn.Wqkv", 2), + "encoder.blocks.{n}.self_attention.output" => "encoder.layers.{n}.attn.out_proj", + "encoder.blocks.{n}.self_attention_norm" => "encoder.layers.{n}.norm1", + "encoder.blocks.{n}.ffn.up" => "encoder.layers.{n}.mlp.fc11", + "encoder.blocks.{n}.ffn.gate" => "encoder.layers.{n}.mlp.fc12", + "encoder.blocks.{n}.ffn.down" => "encoder.layers.{n}.mlp.fc2", + "encoder.blocks.{n}.output_norm" => "encoder.layers.{n}.norm2", + "pooler.output" => "pooler.dense" + } + end + + defp qkv_dense(source_layer_name, chunk_idx) do + # Wqkv is [3 * hidden_size, hidden_size] in PyTorch format + # After slicing, transpose to get [hidden_size, hidden_size] for Axon + %{ + "kernel" => { + [{source_layer_name, "weight"}], + fn [kernel] -> + size = Nx.axis_size(kernel, 0) + step = div(size, 3) + + kernel + |> Nx.slice_along_axis(chunk_idx * step, step, axis: 0) + |> Nx.transpose() + end + } + } + end + end +end diff --git a/mix.exs b/mix.exs index 47b2f90f..089c7f9d 100644 --- a/mix.exs +++ b/mix.exs @@ -101,6 +101,7 @@ defmodule Bumblebee.MixProject do Bumblebee.Text.Mbart, Bumblebee.Text.Mistral, Bumblebee.Text.MpNet, + Bumblebee.Text.NomicBert, Bumblebee.Text.Phi, Bumblebee.Text.Phi3, Bumblebee.Text.Roberta, diff --git a/test/bumblebee/text/nomic_bert_test.exs b/test/bumblebee/text/nomic_bert_test.exs new file mode 100644 index 00000000..92b1829b --- /dev/null +++ b/test/bumblebee/text/nomic_bert_test.exs @@ -0,0 +1,36 @@ +defmodule Bumblebee.Text.NomicBertTest do + use ExUnit.Case, async: true + + import Bumblebee.TestHelpers + + @moduletag model_test_tags() + + test ":base" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-NomicBertModel"}) + + assert %Bumblebee.Text.NomicBert{architecture: :base} = spec + + inputs = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + outputs = Axon.predict(model, params, inputs) + + assert Nx.shape(outputs.hidden_state) == {1, 10, 32} + assert Nx.shape(outputs.pooled_state) == {1, 32} + + assert_all_close( + outputs.hidden_state[[.., 1..3, 1..3]], + Nx.tensor([ + [[0.3062, -2.1607, -0.1782], [0.5486, 0.5353, -0.0453], [0.6323, -0.7370, -2.0245]] + ]) + ) + + assert_all_close( + outputs.pooled_state[[.., 1..3]], + Nx.tensor([[0.0197, -0.2129, -0.0071]]) + ) + end +end