This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 5fdbdd4

Merge pull request #435 from janhq/429-feat-properly-decoupling-cors-and-handleprelight-into-a-seperated-controller
429 feat properly decoupling cors and handleprelight into a seperated controller
2 parents 627a597 + ab408c9 commit 5fdbdd4

File tree: 4 files changed, +71 -40 lines

  controllers/llamaCPP.cc
  controllers/llamaCPP.h
  controllers/prelight.cc
  controllers/prelight.h

controllers/llamaCPP.cc

Lines changed: 37 additions & 38 deletions
@@ -10,11 +10,7 @@ using json = nlohmann::json;
 /**
  * The state of the inference task
  */
-enum InferenceStatus {
-  PENDING,
-  RUNNING,
-  FINISHED
-};
+enum InferenceStatus { PENDING, RUNNING, FINISHED };
 
 /**
  * There is a need to save state of current ongoing inference status of a
@@ -141,7 +137,9 @@ std::string create_return_json(const std::string &id, const std::string &model,
   return Json::writeString(writer, root);
 }
 
-llamaCPP::llamaCPP(): queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, "llamaCPP")) {
+llamaCPP::llamaCPP()
+    : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel,
+                                             "llamaCPP")) {
   // Some default values for now below
   log_disable(); // Disable the log to file feature, reduce bloat for
                  // target
@@ -172,7 +170,7 @@ void llamaCPP::inference(
 
   const auto &jsonBody = req->getJsonObject();
   // Check if model is loaded
-  if(checkModelLoaded(callback)) {
+  if (checkModelLoaded(callback)) {
     // Model is loaded
     // Do Inference
     inferenceImpl(jsonBody, callback);
@@ -329,8 +327,7 @@ void llamaCPP::inferenceImpl(
   auto state = create_inference_state(this);
   auto chunked_content_provider =
       [state, data](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
-
-    if(state->inferenceStatus == PENDING) {
+    if (state->inferenceStatus == PENDING) {
      state->inferenceStatus = RUNNING;
    } else if (state->inferenceStatus == FINISHED) {
      return 0;
@@ -341,7 +338,7 @@ void llamaCPP::inferenceImpl(
      state->inferenceStatus = FINISHED;
      return 0;
    }
-
+
    task_result result = state->instance->llama.next_result(state->task_id);
    if (!result.error) {
      const std::string to_send = result.result_json["content"];
@@ -367,10 +364,10 @@ void llamaCPP::inferenceImpl(
      LOG_INFO << "reached result stop";
      state->inferenceStatus = FINISHED;
    }
-
+
    // Make sure nBufferSize is not zero
    // Otherwise it stop streaming
-    if(!nRead) {
+    if (!nRead) {
      state->inferenceStatus = FINISHED;
    }
 
@@ -380,31 +377,33 @@ void llamaCPP::inferenceImpl(
    return 0;
  };
  // Queued task
-  state->instance->queue->runTaskInQueue([callback, state, data,
-                                          chunked_content_provider]() {
-    state->task_id =
-        state->instance->llama.request_completion(data, false, false, -1);
-
-    // Start streaming response
-    auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
-                                                 "chat_completions.txt");
-    callback(resp);
-
-    int retries = 0;
-
-    // Since this is an async task, we will wait for the task to be completed
-    while (state->inferenceStatus != FINISHED && retries < 10) {
-      // Should wait chunked_content_provider lambda to be called within 3s
-      if(state->inferenceStatus == PENDING) {
-        retries += 1;
-      }
-      if(state->inferenceStatus != RUNNING)
-        LOG_INFO << "Wait for task to be released:" << state->task_id;
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-    }
-    // Request completed, release it
-    state->instance->llama.request_cancel(state->task_id);
-  });
+  state->instance->queue->runTaskInQueue(
+      [callback, state, data, chunked_content_provider]() {
+        state->task_id =
+            state->instance->llama.request_completion(data, false, false, -1);
+
+        // Start streaming response
+        auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
+                                                     "chat_completions.txt");
+        callback(resp);
+
+        int retries = 0;
+
+        // Since this is an async task, we will wait for the task to be
+        // completed
+        while (state->inferenceStatus != FINISHED && retries < 10) {
+          // Should wait chunked_content_provider lambda to be called within
+          // 3s
+          if (state->inferenceStatus == PENDING) {
+            retries += 1;
+          }
+          if (state->inferenceStatus != RUNNING)
+            LOG_INFO << "Wait for task to be released:" << state->task_id;
+          std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        }
+        // Request completed, release it
+        state->instance->llama.request_cancel(state->task_id);
+      });
  } else {
    Json::Value respData;
    auto resp = nitro_utils::nitroHttpResponse();
@@ -434,7 +433,7 @@ void llamaCPP::embedding(
    const HttpRequestPtr &req,
    std::function<void(const HttpResponsePtr &)> &&callback) {
  // Check if model is loaded
-  if(checkModelLoaded(callback)) {
+  if (checkModelLoaded(callback)) {
    // Model is loaded
    const auto &jsonBody = req->getJsonObject();
    // Run embedding
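For orientation, the reworked block above queues an async task that requests a completion, starts the streamed response, then polls the shared inferenceStatus every 100 ms, counting retries only while the task is still PENDING (10 polls, roughly one second of waiting for the stream to start), and finally releases the request with request_cancel. Below is a standalone sketch of that polling pattern; the names are simplified and not part of this codebase, and std::atomic stands in for the shared status member.

#include <atomic>
#include <chrono>
#include <thread>

enum InferenceStatus { PENDING, RUNNING, FINISHED };

// Poll a shared status flag every 100 ms. Retries are counted only while the
// task is still PENDING, so the loop gives the streaming callback roughly one
// second (10 x 100 ms) to pick the task up; once the task is RUNNING it
// simply waits for FINISHED.
void waitForInference(std::atomic<InferenceStatus> &status,
                      int maxPendingPolls = 10) {
  int retries = 0;
  while (status.load() != FINISHED && retries < maxPendingPolls) {
    if (status.load() == PENDING) {
      retries += 1;
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  // The caller would cancel/release the completion request here.
}

int main() {
  std::atomic<InferenceStatus> status{PENDING};
  // Simulate the streaming provider picking the task up and finishing it.
  std::thread producer([&status] {
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
    status = RUNNING;
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    status = FINISHED;
  });
  waitForInference(status);
  producer.join();
  return 0;
}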

controllers/llamaCPP.h

Lines changed: 3 additions & 2 deletions
@@ -2526,10 +2526,11 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
 
   // Openai compatible path
   ADD_METHOD_TO(llamaCPP::inference, "/v1/chat/completions", Post);
-  // ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options); NOTE: prelight will be added back when browser support is properly planned
+  // ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options);
+  // NOTE: prelight will be added back when browser support is properly planned
 
   ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post);
-  //ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);
+  // ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);
 
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END

controllers/prelight.cc

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+#include "prelight.h"
+
+void prelight::handlePrelight(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+  auto resp = drogon::HttpResponse::newHttpResponse();
+  resp->setStatusCode(drogon::HttpStatusCode::k200OK);
+  resp->addHeader("Access-Control-Allow-Origin", "*");
+  resp->addHeader("Access-Control-Allow-Methods", "POST, OPTIONS");
+  resp->addHeader("Access-Control-Allow-Headers", "*");
+  callback(resp);
+}
+

controllers/prelight.h

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <drogon/HttpController.h>
+
+using namespace drogon;
+
+class prelight : public drogon::HttpController<prelight> {
+ public:
+  METHOD_LIST_BEGIN
+  ADD_METHOD_TO(prelight::handlePrelight, "/v1/chat/completions", Options);
+  ADD_METHOD_TO(prelight::handlePrelight, "/v1/embeddings", Options);
+  ADD_METHOD_TO(prelight::handlePrelight, "/v1/audio/transcriptions", Options);
+  ADD_METHOD_TO(prelight::handlePrelight, "/v1/audio/translations", Options);
+  METHOD_LIST_END
+
+  void handlePrelight(const HttpRequestPtr &req,
+                      std::function<void(const HttpResponsePtr &)> &&callback);
+};
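The new prelight controller registers only OPTIONS routes, so Drogon dispatches browser CORS preflight requests to it while the POST handlers remain in llamaCPP. Below is a minimal sketch of exercising one of these routes with Drogon's HttpClient; the host and port (127.0.0.1:3928) are assumptions about a local deployment, not something this diff specifies.

#include <drogon/HttpClient.h>
#include <drogon/drogon.h>
#include <iostream>

int main() {
  // Assumed local endpoint; adjust host/port to wherever the server listens.
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  // Issue the same kind of request a browser sends as a CORS preflight.
  auto req = drogon::HttpRequest::newHttpRequest();
  req->setMethod(drogon::Options);
  req->setPath("/v1/chat/completions");

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok) {
      // prelight::handlePrelight should answer 200 with the CORS headers
      // added in controllers/prelight.cc.
      std::cout << "Access-Control-Allow-Origin: "
                << resp->getHeader("Access-Control-Allow-Origin") << "\n";
    } else {
      std::cerr << "request failed\n";
    }
    drogon::app().quit();  // stop the event loop once we have an answer
  });

  drogon::app().run();  // the client runs on the framework's event loop
  return 0;
}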

0 commit comments
