@@ -407,17 +407,31 @@ void llamaCPP::embedding(
     std::function<void(const HttpResponsePtr &)> &&callback) {
   check_model_loaded(llama, req, callback);
 
+  auto state = create_inference_state(this);
+
   const auto &jsonBody = req->getJsonObject();
 
   Json::Value responseData(Json::arrayValue);
 
   if (jsonBody->isMember("input")) {
+    // If the single queue is busy, wait for it to be released; otherwise go
+    // ahead, process, and mark it busy. Note this is not DRY: the same logic
+    // exists in the chat completion handler as well.
+    if (state->instance->llama.params.n_parallel == 1) {
+      while (state->instance->single_queue_is_busy) {
+        LOG_INFO << "Waiting for task to be released status:"
+                 << state->instance->single_queue_is_busy;
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(500)); // wait in 500 millisecond steps
+      }
+    }
     const Json::Value &input = (*jsonBody)["input"];
     if (input.isString()) {
       // Process the single string input
-      const int task_id = llama.request_completion(
+      state->task_id = llama.request_completion(
           {{"prompt", input.asString()}, {"n_predict", 0}}, false, true, -1);
-      task_result result = llama.next_result(task_id);
+      state->instance->single_queue_is_busy = true;
+      task_result result = llama.next_result(state->task_id);
       std::vector<float> embedding_result = result.result_json["embedding"];
       responseData.append(create_embedding_payload(embedding_result, 0));
     } else if (input.isArray()) {
@@ -434,6 +448,9 @@ void llamaCPP::embedding(
     }
   }
 
+  // The embedding result has been obtained, so the single queue is no longer busy
+  state->instance->single_queue_is_busy = false;
+
   auto resp = nitro_utils::nitroHttpResponse();
   Json::Value root;
   root["data"] = responseData;
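
For readers following the patch, here is a minimal, self-contained sketch of the busy-flag polling pattern the diff adds around the single-parallel queue. It is not the Nitro implementation: the `single_queue_is_busy` flag and `run_exclusive` helper below are hypothetical stand-ins for `state->instance->single_queue_is_busy` and the embedding work.

```cpp
// Sketch only: one global busy flag guards a single work queue. Each request
// polls the flag in 500 ms steps until it is released, marks it busy, does
// its work, then clears it again.
#include <atomic>
#include <chrono>
#include <iostream>
#include <string>
#include <thread>

std::atomic<bool> single_queue_is_busy{false};

void run_exclusive(const std::string &label) {
  // Wait until the single queue is free, polling in 500 ms steps.
  while (single_queue_is_busy.load()) {
    std::cout << "Waiting for task to be released\n";
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }
  // As in the patch, the check above and the set below are separate steps,
  // so this is polling rather than strict mutual exclusion.
  single_queue_is_busy.store(true);                 // mark the queue busy
  std::cout << "Processing " << label << "\n";      // stand-in for the embedding work
  single_queue_is_busy.store(false);                // release the queue for the next request
}

int main() {
  std::thread t1(run_exclusive, "request A");
  std::thread t2(run_exclusive, "request B");
  t1.join();
  t2.join();
}
```

As in the patch, the check-then-set on the flag is not atomic, so two requests arriving at the same instant could both pass the wait loop; a `std::mutex` paired with a `std::condition_variable` would close that window and avoid the 500 ms polling delay.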