@@ -0,0 +1,69 @@
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=30,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

# past_key_values
# ---------------
# During prefill, "past_key_values" not None, but an empty Cache instance.
# Passing None makes torch.export happy.


input_to_remove = [
"attention_mask",
# For left pad, [0, ⋯, 0, 1, ⋯, 1]
# For right right pad, [1, ⋯, 1, 0, ⋯, 0]
# ( 0 is pad-token )
# This script uses right pad and pass all-1 attention mask (including pad).
# Npu computes all positions whether it is pad or not.
]
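# Capture a decode-step call only: during prefill the KV cache is still empty
# (get_seq_length() == 0), so the condition below filters the prefill call out.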
condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0

with torch.no_grad(), RecordingInput(
    model, condition_fn, input_to_remove=input_to_remove
) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    captured_input = rec.captured_input

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Tico
import tico
from tico.serialize.operators.onert.op_attention import llama_attention_forward_adapter
from transformers.models.llama.modeling_llama import LlamaAttention

LlamaAttention.forward = llama_attention_forward_adapter
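# The patch above swaps LlamaAttention.forward for tico's onert attention adapter
# at the class level, so the freshly loaded model below is exported with the
# adapted attention.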

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
circle_model = tico.convert(model, captured_input)
circle_model.save(f"tinyllama.decode.circle")
@@ -0,0 +1,56 @@
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=31,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

target_model = model.model.layers[0]
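# Record the forward inputs of the first decoder layer only. The condition below
# captures a decode-step call, where the layer's "past_key_value" cache is no
# longer empty.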
condition_fn = lambda args_dict: args_dict["past_key_value"].get_seq_length() != 0

with torch.no_grad(), RecordingInput(target_model, condition_fn) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    captured_input = rec.captured_input

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


# Convert

import tico
from tico.serialize.operators.onert.op_attention import llama_attention_forward_adapter
from transformers.models.llama.modeling_llama import LlamaAttention

LlamaAttention.forward = llama_attention_forward_adapter

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
circle_model = tico.convert(model.model.layers[0], captured_input)
circle_model.save(f"tinyllama.layer.attn.circle")
@@ -0,0 +1,110 @@
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=32,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

target_model = model.model.layers[0]
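# The inputs are recorded from the first decoder layer, but they are reused later
# for the wrapper over all decoder layers; every layer consumes tensors of the
# same shapes.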
condition_fn = lambda args_dict: args_dict["past_key_value"].get_seq_length() != 0

with torch.no_grad(), RecordingInput(target_model, condition_fn) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    captured_input = rec.captured_input

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

from typing import Any, Optional, Tuple

# Define DecoderLayers

from torch import nn
from transformers.cache_utils import Cache
from transformers.models.llama.modeling_llama import LlamaAttention, LlamaModel


# model.model.layers is an nn.ModuleList with no forward(), so it cannot be
# torch.export-ed as-is. Define the decoder layers as a single nn.Module.


class LlamaDecoderLayers(nn.Module):
    def __init__(self, model: LlamaModel):
        super().__init__()
        self.config = model.config
        self.layers = model.layers

    # Keep the signature identical to the captured input.
    # Copied from LlamaDecoderLayer.forward.
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            Tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Any,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                past_key_value=past_key_value,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )
            # layer_outputs[0] is the updated hidden state; any additional
            # outputs (e.g. attention weights) are ignored.
            hidden_states = layer_outputs[0]

        return hidden_states


# Convert

import tico

# NOTE:
# If you need to restore the original forward, this patch could be wrapped in a
# context manager. However, this is just a simple export script; nothing uses the
# original forward after tico conversion.
from tico.serialize.operators.onert.op_attention import llama_attention_forward_adapter

LlamaAttention.forward = llama_attention_forward_adapter

model = AutoModelForCausalLM.from_pretrained(model_name)
layers = LlamaDecoderLayers(model.model)
layers.eval()
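# Export the whole decoder stack; the input recorded from a single layer has the
# same shapes that every layer expects.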
circle_model = tico.convert(layers, captured_input)
circle_model.save(f"tinyllama.layers.attn.circle")
@@ -0,0 +1,76 @@
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=32,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

# past_key_values
# ---------------
# During prefill, "past_key_values" not None, but an empty Cache instance.
# Passing None makes torch.export happy.


input_to_remove = [
"past_key_values",
# DynamicCache is flatten-able operator since 4.50.
# See _pytree.py > tree_flatten
# SUPPORTED_NODES has *transformers.DynamicCache*
# After flattening, DynamicCache becomes { "key_cache": [] , "value_cache": [ ] }
# dict.value is returne. dict.key is stored in treespec.
#
# On prefill, DynamicCache is empty, and dict is empty after flattening.
# PyTorch removes empty dict!
# If number of args is 4 (including cache), it becomes 3!
# To avoid this error, don't pass empty cache, just pass None.
"attention_mask",
# For left pad, [0, ⋯, 0, 1, ⋯, 1]
# For right right pad, [1, ⋯, 1, 0, ⋯, 0]
# ( 0 is pad-token )
# This script uses right pad and pass all-1 attention mask (including pad).
# Npu computes all positions whether it is pad or not.
"cache_position"
# It is the list of cache position like [0, 1, ..., 11].
# For npu, we always store all values (including pad).
]
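# Illustration of the flattening pitfall described above (a sketch assuming
# DynamicCache is pytree-registered, as in transformers >= 4.50; not required
# for the export itself):
#
#   import torch.utils._pytree as pytree
#   from transformers.cache_utils import DynamicCache
#
#   leaves, spec = pytree.tree_flatten({"cache": DynamicCache(), "ids": torch.zeros(1)})
#   # With an empty cache, `leaves` holds only the tensor: the cache's empty
#   # key/value lists contribute no leaves, so the argument effectively vanishes.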

with torch.no_grad(), RecordingInput(model, input_to_remove=input_to_remove) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    captured_input = rec.captured_input
# Review note (Contributor Author): Retrieve captured_input.


generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Tico
import tico

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
circle_model = tico.convert(model, captured_input)
# Review note (@glistening, Jul 23, 2025): Then, pass captured_input to tico.convert.

circle_model.save(f"tinyllama.prefill.circle")
@@ -0,0 +1 @@
transformers>=4.50.1