
Commit ef1646e

ggerganov and ngxson committed
server : fix pos_next() usage
Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>
1 parent beb5c03 · commit ef1646e

File tree: 2 files changed, +12 -8 lines

  tools/server/server.cpp
  tools/server/utils.hpp

tools/server/server.cpp

Lines changed: 9 additions & 6 deletions
@@ -3728,6 +3728,8 @@ struct server_context {
 
                     // reuse chunks from the cached prompt by shifting their KV cache in the new position
                     if (params_base.n_cache_reuse > 0) {
+                        GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
+
                         size_t head_c = n_past; // cache
                         size_t head_p = n_past; // current prompt
 
@@ -3836,6 +3838,9 @@
                     }
 
                     if (pos_min > pos_min_thold) {
+                        // TODO: support can be added in the future when corresponding vision models get released
+                        GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
+
                         SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
 
                         // search for a context checkpoint
@@ -3908,8 +3913,9 @@
                 }
 
                 // truncate any tokens that are beyond n_past for this slot
-                if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1)) {
-                    SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", slot.prompt.n_tokens());
+                const llama_pos p0 = slot.prompt.tokens.pos_next();
+                if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
+                    SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0);
                     llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
 
                     // there is no common part left
@@ -3918,10 +3924,7 @@
                     slot.prompt.tokens.clear();
                 }
 
-                SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), slot.prompt.n_tokens());
-
-                // remove the non-common part from the cache
-                slot.prompt.tokens.keep_first(slot.prompt.n_tokens());
+                SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
 
                 // check if we should process the image
                 if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
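Why the last two hunks switch from n_tokens() to pos_next(): the KV cache is indexed by position, and with multimodal (mtmd) prompts under M-RoPE a media chunk can occupy more token cells than positions, so the token count is no longer a valid position bound. Below is a minimal standalone sketch of that divergence, not the server_tokens implementation; the Chunk type is hypothetical, and the numbers mirror the 5-text-token / 2-image example cited in utils.hpp:

    // Minimal sketch, not the server_tokens implementation: Chunk is a
    // hypothetical stand-in; the numbers mirror the utils.hpp example.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_pos = int32_t;

    struct Chunk {
        int       n_tokens; // cells occupied in the token list
        llama_pos n_pos;    // positions advanced (== n_tokens for text, fewer for M-RoPE images)
    };

    int main() {
        // 5 text tokens, then 2 images that each occupy 3 tokens but only 2 pos
        const std::vector<Chunk> prompt = { {5, 5}, {3, 2}, {3, 2} };

        int       n_tokens = 0;
        llama_pos pos_next = 0;
        for (const auto & c : prompt) {
            n_tokens += c.n_tokens;
            pos_next += c.n_pos;
        }

        // prints: n_tokens = 11, pos_next = 9
        std::printf("n_tokens = %d, pos_next = %d\n", n_tokens, pos_next);
        return 0;
    }

With these numbers, truncating the cache at n_tokens() (11) would leave any stale entries at positions [9, 11) from a longer previous prompt untouched, whereas pos_next() (9) is the first position not covered by the kept prefix, which appears to be the failure mode this fix addresses.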

tools/server/utils.hpp

Lines changed: 3 additions & 2 deletions
@@ -1088,6 +1088,7 @@ struct server_tokens {
     // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk
     // otherwise, it is a normal text token
     // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list
+    // note(2): for M-RoPE, an image can occupy different number of pos; do not assume 1-to-1 mapping tokens <-> pos
     llama_tokens tokens;
 
     // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos):
@@ -1366,7 +1367,7 @@
             llama_context * ctx,
             mtmd_context * mctx,
             size_t idx,
-            llama_pos n_past,
+            llama_pos pos,
             int32_t seq_id,
             size_t & n_tokens_out) const {
         const auto & chunk = find_chunk(idx);
@@ -1378,7 +1379,7 @@
         llama_pos new_n_past; // unused for now
         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
             chunk.get(),
-            n_past,
+            pos,
             seq_id,
             n_batch,
             true, // logits last
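For context on note(2), here is an illustrative sketch of the token-list layout the surrounding comments describe; the token values are made up, and only the LLAMA_TOKEN_NULL convention plus the 3-tokens/2-pos image example come from the diff (LLAMA_TOKEN_NULL is defined as -1 in llama.h):

    // Illustrative only: token values are made up; the LLAMA_TOKEN_NULL
    // convention and the 3-tokens/2-pos image example come from the diff.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;
    constexpr llama_token LLAMA_TOKEN_NULL = -1; // matches the definition in llama.h

    int main() {
        // 5 text tokens followed by 2 images; each image occupies 3 cells in
        // the token list, marked with LLAMA_TOKEN_NULL:
        const std::vector<llama_token> tokens = {
            101, 102, 103, 104, 105,                              // text, pos 0..4
            LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, // image 1, pos 5..6
            LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, // image 2, pos 7..8
        };

        // 11 cells but only 9 positions: do not assume a 1-to-1 mapping
        std::printf("cells = %zu\n", tokens.size());
        return 0;
    }

This divergence is also why process_chunk now takes a plain pos argument instead of n_past: the caller must supply the actual next position, which server.cpp obtains via pos_next() rather than from the token count.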
