Please see https://nitro.jan.ai/ for documentation.
Given a list of messages comprising a conversation, the model will return a response.
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| messages | array | | The input messages (prompts) for the model to process. |
| stream | boolean | true | Enables continuous output generation, streaming model responses as they are produced. |
| model | string | | The model to use for inference or processing tasks. |
| max_tokens | number | 2048 | The maximum number of tokens the model will generate in a single response. |
| stop | array | | Tokens or phrases at which the model stops generating further output. |
| frequency_penalty | number | 0 | Penalizes the model for repeating words or phrases in its output. |
| presence_penalty | number | 0 | Encourages the model to introduce new and varied concepts in its output. |
| temperature | number | 0.7 | Controls the randomness of the model's output. |
| top_p | number | 0.95 | Probability threshold for nucleus sampling; lower values restrict output to more likely tokens. |
{- "messages": [
- {
- "content": "Hello there :wave:",
- "role": "assistant"
}, - {
- "content": "Can you write a long story",
- "role": "user"
}
], - "stream": true,
- "model": "gpt-3.5-turbo",
- "max_tokens": 2048,
- "stop": [
- "hello"
], - "frequency_penalty": 0,
- "presence_penalty": 0,
- "temperature": 0.7,
- "top_p": 0.95
}
{- "choices": [
- {
- "finish_reason": null,
- "index": 0,
- "message": {
- "content": "Hello user. What can I help you with?",
- "role": "assistant"
}
}
], - "created": 1700193928,
- "id": "ebwd2niJvJB1Q2Whyvkz",
- "model": "_",
- "object": "chat.completion",
- "system_fingerprint": "_",
- "usage": {
- "completion_tokens": 500,
- "prompt_tokens": 33,
- "total_tokens": 533
}
}
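As a concrete sketch of calling this endpoint: the command below assumes the server is listening locally on port 3928 (the port used by the model-status example further down) and exposes an OpenAI-compatible /v1/chat/completions route; adjust the host, port, and path to match your deployment.

# Sketch: send a chat completion request.
# Host, port, and the /v1/chat/completions path are assumptions here.
curl http://localhost:3928/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [
          {"content": "Can you write a long story", "role": "user"}
        ],
        "stream": false,
        "model": "gpt-3.5-turbo",
        "max_tokens": 2048,
        "temperature": 0.7,
        "top_p": 0.95
      }'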
Creates an embedding vector representing the input text.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| input | string or array | | Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or an array of token arrays. |
| encoding_format | string | | The format for the returned embeddings; the sample below uses "float". |
{- "input": "hello",
- "encoding_format": "float"
}
{- "data": [
- {
- "embedding": [
- [
- 0.06781931221485138,
- 0.17273959517478943,
- -0.31053683161735535,
- "...",
- 0.361769437789917
]
], - "index": 0,
- "object": "embedding"
}
], - "model": "_",
- "object": "list",
- "usage": {
- "prompt_tokens": 33,
- "total_tokens": 533
}
}
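The embedding call can be sketched the same way, again assuming localhost:3928 and an OpenAI-compatible /v1/embeddings path:

# Sketch: embed a single string.
# The /v1/embeddings path is an assumption; adjust to your deployment.
curl http://localhost:3928/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
        "input": "hello",
        "encoding_format": "float"
      }'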
Loads a local model into the inference server.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| llama_model_path (required) | string | | Path to your local LLM. |
| ngl | number or null | 100 | The number of layers (0 to 100) to load onto the GPU for acceleration. |
| ctx_len | number or null | 2048 | The context length for model operations; the maximum depends on the specific model used. |
| embedding | boolean or null | true | Whether to enable embedding. |
| cont_batching | boolean or null | false | Whether to use continuous batching. |
| n_parallel | integer or null | 1 | The number of parallel operations. Only set this when continuous batching is enabled. |
| cpu_threads | integer or null | | The number of threads for CPU-based inference. |
| pre_prompt | string or null | "A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what." | The prompt used for internal configuration. |
| system_prompt | string or null | "ASSISTANT's RULE:" | The prefix for the system prompt. |
| user_prompt | string or null | "USER:" | The prefix for user prompts. |
| ai_prompt | string or null | "ASSISTANT:" | The prefix for assistant prompts. |
| clean_cache_threshold | integer or null | 5 | The number of chats that triggers a cache-clean action. |
{- "llama_model_path": "nitro/model/zephyr-7b-beta.Q5_K_M.gguf",
- "ngl": 100,
- "ctx_len": 2048,
- "embedding": true,
- "cont_batching": false,
- "n_parallel": 1,
- "cpu_threads": 4,
- "pre_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.",
- "system_prompt": "ASSISTANT's RULE:",
- "user_prompt": "USER:",
- "ai_prompt": "ASSISTANT:",
- "clean_cache_threshold": 5
}
{- "message": "Model loaded successfully",
- "code": "Model loaded successfully"
}
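A minimal sketch of loading the model shown above; the /inferences/llamacpp/loadmodel path is assumed by analogy with the modelstatus endpoint used below:

# Sketch: load a local GGUF model.
# The loadmodel path is an assumption based on the modelstatus route.
curl http://localhost:3928/inferences/llamacpp/loadmodel \
  -H "Content-Type: application/json" \
  -d '{
        "llama_model_path": "nitro/model/zephyr-7b-beta.Q5_K_M.gguf",
        "ngl": 100,
        "ctx_len": 2048
      }'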
Check the status of the currently loaded model:

curl http://localhost:3928/inferences/llamacpp/modelstatus
{- "model_data": {
- "model_loaded": true,
- "frequency_penalty": 0,
- "grammar": "",
- "ignore_eos": false,
- "logit_bias": [ ],
- "mirostat": 0,
- "mirostat_eta": 0.1,
- "mirostat_tau": 5,
- "model": "nitro/model/zephyr-7b-beta.Q5_K_M.gguf",
- "n_ctx": 42,
- "n_keep": 0,
- "n_predict": 100,
- "n_probs": 0,
- "penalize_nl": true,
- "presence_penalty": 0,
- "repeat_last_n": 64,
- "repeat_penalty": 1.1,
- "seed": 4294967295,
- "stop": [
- "hello",
- "USER: "
], - "stream": true,
- "temp": 0.7,
- "tfs_z": 1,
- "top_k": 40,
- "top_p": 0.95,
- "typical_p": 1
}
}
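In a script, the status response can be reduced to the model_loaded flag, for example with jq (a sketch assuming jq is installed):

# Prints "true" once the model has finished loading.
curl -s http://localhost:3928/inferences/llamacpp/modelstatus \
  | jq '.model_data.model_loaded'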