Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -616,13 +616,13 @@ jobs:
runs-on: windows-2022

env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
HIPSDK_INSTALLER_VERSION: "26.Q1"

strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

steps:
- name: Clone
Expand All @@ -632,7 +632,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar

Expand All @@ -655,7 +655,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
Expand Down Expand Up @@ -689,20 +689,20 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `
-DGGML_CPU=OFF `
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
Expand Down
40 changes: 19 additions & 21 deletions scripts/compare-logprobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,12 @@
"""


def generate_input_prompt(length: int) -> list[str]:
CORPUS = """
You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.

### Tool Call Format:
When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.

You can make multiple calls in one go by placing them one after another.
"""
words = [w.strip() for w in CORPUS.strip().split(" ")]
def get_remote_corpus(url: str, length: int) -> list[str]:
response = requests.get(url)
response.raise_for_status()
corpus = response.text
words = [w.strip() for w in corpus.strip().split(" ")]
words = [w for w in words if "<" not in w] # make sure nothing looks like special tokens
words = [w for w in words if len(w) > 0] # filter out empty strings
while len(words) < length:
words += words
Expand Down Expand Up @@ -226,9 +222,9 @@ def parse_args() -> argparse.Namespace:
)
parser_dump.add_argument(
"--file",
type=Path,
default=None,
help="File containing prompt to use instead of the default",
type=str,
default="https://raw.githubusercontent.com/ggml-org/llama.cpp/eaba92c3dcc980ebe753348855d4a5d75c069997/tools/server/README.md",
help="File containing prompt to use instead of the default (can also be an URL)",
)
parser_dump.add_argument(
"--pattern",
Expand Down Expand Up @@ -259,17 +255,19 @@ def main():

if args.verb == "dump":
pattern = parse_pattern(args.pattern)
input_length = sum(n for _, n in pattern)
input_words = generate_input_prompt(input_length)
if args.file is not None:
with args.file.open("r") as f:
required_words = sum(n for _, n in pattern)
if args.file.startswith("http"):
input_words = get_remote_corpus(args.file, required_words)
logger.info(f"Fetched {len(input_words)} words from remote {args.file}")
else:
with open(args.file, "r") as f:
input_words = f.read().strip().split(" ")
if input_length < sum(n for _, n in pattern):
input_words = [w for w in input_words if len(w) > 0] # filter out empty strings
if len(input_words) < required_words:
raise ValueError(
f"Input file has only {input_length} words, but pattern requires at least {input_length} words."
f"Input file has only {len(input_words)} words, but pattern requires at least {required_words} words."
)
input_length = len(input_words)
logger.info(f"Using {input_length} words")
logger.info(f"Using {len(input_words)} words")
dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
elif args.verb == "compare":
compare_logits(args.input1, args.input2, args.output)
Expand Down
2 changes: 1 addition & 1 deletion src/llama-memory-recurrent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
const auto & cell = cells[tail_id];
// partial intersection is invalid if it includes the final pos
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
return false;
}
// invalidate tails which will be cleared
Expand Down
72 changes: 65 additions & 7 deletions tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,19 +231,77 @@ server_tokens::server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) :
// Construct from a plain (text-only) token list; has_mtmd records whether the
// owning context supports multimodal media chunks (no chunks are added here).
server_tokens::server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
}

// Returns the position immediately following the first n_tokens tokens.
// If n_tokens < 0, returns the position following all tokens.
// Token index and position coincide unless multimodal (mtmd) media chunks are
// present: a media chunk may occupy a different number of positions than the
// number of tokens it contributes, so the two counts must be reconciled.
llama_pos server_tokens::pos_next(int64_t n_tokens) const {
    if (!has_mtmd) {
        // no media chunks: index == position
        if (n_tokens < 0) {
            return tokens.size();
        }

        return n_tokens;
    }

    if (n_tokens < 0) {
        // whole sequence: start from the token count, then correct by the
        // position/token delta of every media chunk
        llama_pos res = tokens.size();

        for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
            const auto & chunk = it->second;
            res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
        }

        return res;
    }

    int64_t idx = 0;
    llama_pos pos = 0;

    GGML_ASSERT(n_tokens <= (int64_t)tokens.size());

    // walk the first n_tokens tokens, stepping over each media chunk as a unit
    while (idx < n_tokens) {
        const auto media_it = map_idx_to_media.find(idx);
        if (media_it != map_idx_to_media.end()) {
            const auto & chunk = media_it->second;
            const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
            const size_t n_tok = mtmd_input_chunk_get_n_tokens(chunk.get());

            pos += n_pos;
            idx += n_tok;
        } else {
            pos++;
            idx++;
        }
    }

    return pos;
}

// Returns the number of leading tokens whose position is <= max_pos.
// Without mtmd media chunks, index == position, so this is simply
// min(max_pos + 1, size). With media chunks, the sequence is walked and each
// chunk is consumed as a unit, so the result can include a full chunk that
// straddles max_pos — presumably intentional since a chunk cannot be split;
// TODO(review): confirm with callers.
size_t server_tokens::size_up_to_pos(llama_pos max_pos) const {
    if (!has_mtmd) {
        return std::min((size_t)(max_pos + 1), tokens.size());
    }

    size_t idx = 0;
    llama_pos pos = 0;

    while (idx < tokens.size()) {
        const auto media_it = map_idx_to_media.find(idx);
        if (media_it != map_idx_to_media.end()) {
            // media chunk: advance by its full position/token spans at once
            const auto & chunk = media_it->second;
            const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
            const size_t n_tok = mtmd_input_chunk_get_n_tokens(chunk.get());

            pos += n_pos;
            idx += n_tok;
        } else {
            // plain text token: one token, one position
            pos++;
            idx++;
        }

        // stop once the last consumed token/chunk passed max_pos
        if (pos > max_pos) {
            break;
        }
    }

    return idx;
}

std::string server_tokens::str() const {
Expand Down
7 changes: 6 additions & 1 deletion tools/server/server-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,12 @@ struct server_tokens {
// for debugging
std::string str() const;

llama_pos pos_next() const;
    // Returns the position immediately following the first n_tokens tokens;
    // if n_tokens < 0, returns the position following all tokens.
llama_pos pos_next(int64_t n_tokens = -1) const;

    // number of leading tokens whose position is <= max_pos
size_t size_up_to_pos(llama_pos max_pos) const;

const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;

void push_back(llama_token tok);
Expand Down
Loading
Loading