diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 860acc6b1b6..1f79a838159 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -616,13 +616,13 @@ jobs: runs-on: windows-2022 env: - HIPSDK_INSTALLER_VERSION: "25.Q3" + HIPSDK_INSTALLER_VERSION: "26.Q1" strategy: matrix: include: - name: "radeon" - gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone @@ -632,7 +632,7 @@ jobs: - name: Grab rocWMMA package id: grab_rocwmma run: | - curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb" + curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb" 7z x rocwmma.deb 7z x data.tar @@ -655,7 +655,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $completed = $proc.WaitForExit(600000) @@ -689,20 +689,20 @@ jobs: cmake -G "Unix Makefiles" -B build -S . ` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" ` -DCMAKE_BUILD_TYPE=Release ` -DGGML_BACKEND_DL=ON ` -DGGML_NATIVE=OFF ` -DGGML_CPU=OFF ` - -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" ` + -DGPU_TARGETS="${{ matrix.gpu_targets }}" ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" md "build\bin\hipblaslt\library" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" diff --git a/scripts/compare-logprobs.py b/scripts/compare-logprobs.py index 63861dd9a48..ac10085b788 100644 --- a/scripts/compare-logprobs.py +++ b/scripts/compare-logprobs.py @@ -25,16 +25,12 @@ """ -def generate_input_prompt(length: int) -> list[str]: - CORPUS = """ - You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls. - - ### Tool Call Format: - When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text. - - You can make multiple calls in one go by placing them one after another. - """ - words = [w.strip() for w in CORPUS.strip().split(" ")] +def get_remote_corpus(url: str, length: int) -> list[str]: + response = requests.get(url) + response.raise_for_status() + corpus = response.text + words = [w.strip() for w in corpus.strip().split(" ")] + words = [w for w in words if "<" not in w] # make sure nothing looks like special tokens words = [w for w in words if len(w) > 0] # filter out empty strings while len(words) < length: words += words @@ -226,9 +222,9 @@ def parse_args() -> argparse.Namespace: ) parser_dump.add_argument( "--file", - type=Path, - default=None, - help="File containing prompt to use instead of the default", + type=str, + default="https://raw.githubusercontent.com/ggml-org/llama.cpp/eaba92c3dcc980ebe753348855d4a5d75c069997/tools/server/README.md", + help="File containing prompt to use instead of the default (can also be an URL)", ) parser_dump.add_argument( "--pattern", @@ -259,17 +255,19 @@ def main(): if args.verb == "dump": pattern = parse_pattern(args.pattern) - input_length = sum(n for _, n in pattern) - input_words = generate_input_prompt(input_length) - if args.file is not None: - with args.file.open("r") as f: + required_words = sum(n for _, n in pattern) + if args.file.startswith("http"): + input_words = get_remote_corpus(args.file, required_words) + logger.info(f"Fetched {len(input_words)} words from remote {args.file}") + else: + with open(args.file, "r") as f: input_words = f.read().strip().split(" ") - if input_length < sum(n for _, n in pattern): + input_words = [w for w in input_words if len(w) > 0] # filter out empty strings + if len(input_words) < required_words: raise ValueError( - f"Input file has only {input_length} words, but pattern requires at least {input_length} words." + f"Input file has only {len(input_words)} words, but pattern requires at least {required_words} words." ) - input_length = len(input_words) - logger.info(f"Using {input_length} words") + logger.info(f"Using {len(input_words)} words") dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key) elif args.verb == "compare": compare_logits(args.input1, args.input2, args.output) diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index f0038036dcb..6e8413f493d 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -163,7 +163,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos const auto & cell = cells[tail_id]; // partial intersection is invalid if it includes the final pos if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) { - //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n"); + //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1); return false; } // invalidate tails which will be cleared diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 88b6e77d82b..ff3c6d3c2b0 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -231,19 +231,77 @@ server_tokens::server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : server_tokens::server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) { } -llama_pos server_tokens::pos_next() const { +llama_pos server_tokens::pos_next(int64_t n_tokens) const { if (!has_mtmd) { - return tokens.size(); + if (n_tokens < 0) { + return tokens.size(); + } + + return n_tokens; } - llama_pos res = tokens.size(); + if (n_tokens < 0) { + llama_pos res = tokens.size(); - for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) { - const auto & chunk = it->second; - res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get()); + for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) { + const auto & chunk = it->second; + res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get()); + } + + return res; } - return res; + int64_t idx = 0; + llama_pos pos = 0; + + GGML_ASSERT(n_tokens <= (int64_t)tokens.size()); + + while (idx < n_tokens) { + const auto media_it = map_idx_to_media.find(idx); + if (media_it != map_idx_to_media.end()) { + const auto & chunk = media_it->second; + const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get()); + const size_t n_tok = mtmd_input_chunk_get_n_tokens(chunk.get()); + + pos += n_pos; + idx += n_tok; + } else { + pos++; + idx++; + } + } + + return pos; +} + +size_t server_tokens::size_up_to_pos(llama_pos max_pos) const { + if (!has_mtmd) { + return std::min((size_t)(max_pos + 1), tokens.size()); + } + + size_t idx = 0; + llama_pos pos = 0; + + while (idx < tokens.size()) { + const auto media_it = map_idx_to_media.find(idx); + if (media_it != map_idx_to_media.end()) { + const auto & chunk = media_it->second; + const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get()); + const size_t n_tok = mtmd_input_chunk_get_n_tokens(chunk.get()); + + pos += n_pos; + idx += n_tok; + } else { + pos++; + idx++; + } + + if (pos > max_pos) { + break; + } + } + + return idx; } std::string server_tokens::str() const { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 2629a6bee92..4fb9e488dfd 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -167,7 +167,12 @@ struct server_tokens { // for debugging std::string str() const; - llama_pos pos_next() const; + // the next position after n_tokens. if n_tokens < 0, return the next position after all tokens. + llama_pos pos_next(int64_t n_tokens = -1) const; + + // number of tokens with position <= max_pos + size_t size_up_to_pos(llama_pos max_pos) const; + const mtmd::input_chunk_ptr & find_chunk(size_t idx) const; void push_back(llama_token tok); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 0f2f3a45aaa..73af812437e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -995,9 +995,6 @@ struct server_context_impl { // don't update the cache if the slot's context is empty update_cache = update_cache && tokens.size() > 0; - // TODO: mtmd does not support prompt cache - update_cache = update_cache && (ret->mctx == nullptr); - if (update_cache) { SRV_WRN("%s", "updating prompt cache\n"); @@ -1442,7 +1439,7 @@ struct server_context_impl { res->id = slot.task->id; res->id_slot = slot.id; - res->index = slot.task->index; + res->index = slot.task->index; // keep copy of last generated text for debugging purposes if (slots_debug) { @@ -2282,15 +2279,15 @@ struct server_context_impl { n_past = 0; } + llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); + // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1 const auto n_swa = std::max(1, llama_model_n_swa(model)); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, n_past - n_swa); + const auto pos_min_thold = std::max(0, pos_next - n_swa); - // note: disallow with mtmd contexts for now - // https://github.com/ggml-org/llama.cpp/issues/17043 - if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) { + if (n_past > 0 && n_past < slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); if (pos_min == -1) { SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); @@ -2341,9 +2338,6 @@ struct server_context_impl { } if (pos_min > pos_min_thold) { - // TODO: support can be added in the future when corresponding vision models get released - GGML_ASSERT(!slot.prompt.tokens.has_mtmd); - SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); // search for a context checkpoint @@ -2364,18 +2358,20 @@ struct server_context_impl { const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); if (n != checkpoint_size) { - SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); + SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024); do_reset = true; //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); } else { - n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max)); - SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024); + pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); + n_past = slot.prompt.tokens.size_up_to_pos(pos_next); + SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024); } } if (do_reset) { SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n", "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); + pos_next = 0; n_past = 0; } } @@ -2386,7 +2382,7 @@ struct server_context_impl { for (auto it = slot.prompt.checkpoints.begin(); it != slot.prompt.checkpoints.end();) { const auto & cur = *it; if (cur.pos_min > pos_min_thold) { - SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024); + SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, cur.n_tokens, n_swa, (float) cur.data.size() / 1024 / 1024); it = slot.prompt.checkpoints.erase(it); } else { ++it; @@ -2402,7 +2398,7 @@ struct server_context_impl { SLT_WRN(slot, "n_past was set to %d\n", n_past); } - slot.n_prompt_tokens_cache = n_past; + slot.n_prompt_tokens_cache = n_past; slot.n_prompt_tokens_processed = 0; slot.prompt.tokens.keep_first(n_past); @@ -2520,10 +2516,6 @@ struct server_context_impl { } } - // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.slot.prompt.tokens.str().c_str()); - - SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); - // entire prompt has been processed if (slot.prompt.n_tokens() == slot.task->n_tokens()) { slot.state = SLOT_STATE_DONE_PROMPT; @@ -2536,8 +2528,6 @@ struct server_context_impl { slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - SLT_INF(slot, "prompt done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); - slot.init_sampler(); const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id); @@ -2549,13 +2539,15 @@ struct server_context_impl { // no need to create checkpoints that are too close together do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || pos_max > slot.prompt.checkpoints.back().pos_max + 64); + // note: we create the checkpoint before calling llama_decode(), so the current batch is not + // yet processed and therefore it is not part of the checkpoint. if (do_checkpoint) { while (slot.prompt.checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) { // make room for the new checkpoint, if needed const auto & cur = slot.prompt.checkpoints.front(); - SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", - cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); + SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", + cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024); slot.prompt.checkpoints.erase(slot.prompt.checkpoints.begin()); } @@ -2563,16 +2555,21 @@ struct server_context_impl { const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{ - /*.pos_min = */ pos_min, - /*.pos_max = */ pos_max, - /*.data = */ std::vector(checkpoint_size), + /*.pos_min = */ pos_min, + /*.pos_max = */ pos_max, + /*.n_tokens = */ slot.prompt.n_tokens() - batch.n_tokens, + /*.data = */ std::vector(checkpoint_size), }); llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", - (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024); + SLT_WRN(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", + (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.data.size() / 1024 / 1024); } + + SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); + } else { + SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); } } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 739e30a7046..d3aba18489b 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1900,10 +1900,9 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t return nullptr; } - // TODO: for some reason we can't copy server_tokens, so we have to do this workaround auto & cur = states.emplace_back(); cur = { - /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false), + /*.tokens =*/ prompt.tokens.clone(), /*.data =*/ std::move(state_data), /*.checkpoints =*/ prompt.checkpoints, }; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index a69e8f1a3d2..e2e3e5a5828 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -557,6 +557,8 @@ struct server_prompt_checkpoint { llama_pos pos_min; llama_pos pos_max; + int64_t n_tokens; + std::vector data; size_t size() const {