@@ -10,11 +10,7 @@ using json = nlohmann::json;
 /**
  * The state of the inference task
  */
-enum InferenceStatus {
-  PENDING,
-  RUNNING,
-  FINISHED
-};
+enum InferenceStatus { PENDING, RUNNING, FINISHED };
 
 /**
  * There is a need to save state of current ongoing inference status of a
@@ -141,7 +137,9 @@ std::string create_return_json(const std::string &id, const std::string &model,
   return Json::writeString(writer, root);
 }
 
-llamaCPP::llamaCPP(): queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, "llamaCPP")) {
+llamaCPP::llamaCPP()
+    : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel,
+                                             "llamaCPP")) {
   // Some default values for now below
   log_disable(); // Disable the log to file feature, reduce bloat for
                  // target
@@ -172,7 +170,7 @@ void llamaCPP::inference(
 
   const auto &jsonBody = req->getJsonObject();
   // Check if model is loaded
-  if (checkModelLoaded(callback)) {
+  if (checkModelLoaded(callback)) {
     // Model is loaded
     // Do Inference
     inferenceImpl(jsonBody, callback);
@@ -329,8 +327,7 @@ void llamaCPP::inferenceImpl(
     auto state = create_inference_state(this);
     auto chunked_content_provider =
         [state, data](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
-
-      if (state->inferenceStatus == PENDING) {
+      if (state->inferenceStatus == PENDING) {
         state->inferenceStatus = RUNNING;
       } else if (state->inferenceStatus == FINISHED) {
         return 0;
@@ -341,7 +338,7 @@ void llamaCPP::inferenceImpl(
         state->inferenceStatus = FINISHED;
         return 0;
       }
-
+
       task_result result = state->instance->llama.next_result(state->task_id);
       if (!result.error) {
         const std::string to_send = result.result_json["content"];
@@ -367,10 +364,10 @@ void llamaCPP::inferenceImpl(
         LOG_INFO << "reached result stop";
         state->inferenceStatus = FINISHED;
       }
-
+
       // Make sure nBufferSize is not zero
       // Otherwise it stop streaming
-      if (!nRead) {
+      if (!nRead) {
         state->inferenceStatus = FINISHED;
       }
 
@@ -380,31 +377,33 @@ void llamaCPP::inferenceImpl(
       return 0;
     };
     // Queued task
-    state->instance->queue->runTaskInQueue([callback, state, data,
-                                            chunked_content_provider]() {
-      state->task_id =
-          state->instance->llama.request_completion(data, false, false, -1);
-
-      // Start streaming response
-      auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
-                                                   "chat_completions.txt");
-      callback(resp);
-
-      int retries = 0;
-
-      // Since this is an async task, we will wait for the task to be completed
-      while (state->inferenceStatus != FINISHED && retries < 10) {
-        // Should wait chunked_content_provider lambda to be called within 3s
-        if (state->inferenceStatus == PENDING) {
-          retries += 1;
-        }
-        if (state->inferenceStatus != RUNNING)
-          LOG_INFO << "Wait for task to be released:" << state->task_id;
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      }
-      // Request completed, release it
-      state->instance->llama.request_cancel(state->task_id);
-    });
+    state->instance->queue->runTaskInQueue(
+        [callback, state, data, chunked_content_provider]() {
+          state->task_id =
+              state->instance->llama.request_completion(data, false, false, -1);
+
+          // Start streaming response
+          auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
+                                                       "chat_completions.txt");
+          callback(resp);
+
+          int retries = 0;
+
+          // Since this is an async task, we will wait for the task to be
+          // completed
+          while (state->inferenceStatus != FINISHED && retries < 10) {
+            // Should wait chunked_content_provider lambda to be called within
+            // 3s
+            if (state->inferenceStatus == PENDING) {
+              retries += 1;
+            }
+            if (state->inferenceStatus != RUNNING)
+              LOG_INFO << "Wait for task to be released:" << state->task_id;
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+          }
+          // Request completed, release it
+          state->instance->llama.request_cancel(state->task_id);
+        });
   } else {
     Json::Value respData;
     auto resp = nitro_utils::nitroHttpResponse();
@@ -434,7 +433,7 @@ void llamaCPP::embedding(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   // Check if model is loaded
-  if (checkModelLoaded(callback)) {
+  if (checkModelLoaded(callback)) {
     // Model is loaded
     const auto &jsonBody = req->getJsonObject();
     // Run embedding
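For context on the queued task above: the worker polls the shared inference status with a bounded number of retries while the stream is still PENDING, and stops waiting once the chunked content provider reports FINISHED. The standalone sketch below reproduces that polling pattern using only the C++ standard library; it is a minimal illustration, not the project's code. The producer thread is a hypothetical stand-in for the chunked_content_provider, while the InferenceStatus values, the retry limit of 10, and the 100 ms sleep mirror the diff.

#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Mirrors the InferenceStatus enum introduced in the diff.
enum InferenceStatus { PENDING, RUNNING, FINISHED };

int main() {
  std::atomic<InferenceStatus> status{PENDING};

  // Hypothetical stand-in for the chunked content provider: it marks the
  // stream RUNNING when the first chunk is produced and FINISHED at the end.
  std::thread producer([&status]() {
    std::this_thread::sleep_for(std::chrono::milliseconds(150));
    status = RUNNING;
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    status = FINISHED;
  });

  // Bounded wait, as in the queued task: retries accumulate only while the
  // provider is still PENDING, so a stream that never starts is abandoned
  // after roughly 10 polls of 100 ms each.
  int retries = 0;
  while (status != FINISHED && retries < 10) {
    if (status == PENDING) {
      retries += 1;
    }
    if (status != RUNNING) {
      std::cout << "Wait for task to be released\n";
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }

  producer.join();
  std::cout << (status == FINISHED ? "Task finished\n"
                                   : "Gave up while still pending\n");
  return 0;
}

As in the original loop, the retry counter stops increasing once the status is RUNNING, so the timeout only applies to a stream that never starts; an already running stream is waited on until it reports FINISHED.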