diff --git a/external/packages/bsp/rk3588/usr/bin/rknn_server b/external/packages/bsp/rk3588/usr/bin/rknn_server index 742c0cb468c7..7a436c91eace 100755 Binary files a/external/packages/bsp/rk3588/usr/bin/rknn_server and b/external/packages/bsp/rk3588/usr/bin/rknn_server differ diff --git a/external/packages/bsp/rk3588/usr/include/rkllm.h b/external/packages/bsp/rk3588/usr/include/rkllm.h index ce1b2ffdace5..e565e6c89002 100644 --- a/external/packages/bsp/rk3588/usr/include/rkllm.h +++ b/external/packages/bsp/rk3588/usr/include/rkllm.h @@ -1,119 +1,271 @@ -#ifndef _LLM_H_ -#define _LLM_H_ +#ifndef _RKLLM_H_ +#define _RKLLM_H_ #ifdef __cplusplus extern "C" { #endif -typedef void* LLMHandle; /* Handle for an instance of a language model. */ +/** + * @typedef LLMHandle + * @brief A handle used to manage and interact with the large language model. + */ +typedef void* LLMHandle; /** - * @brief Structure for possible states of an inference call. - * + * @enum LLMCallState + * @brief Describes the possible states of an LLM call. */ typedef enum { - LLM_RUN_NORMAL = 0, /* Inference status is normal and inference has not yet finished. */ - LLM_RUN_FINISH = 1, /* Inference status is normal and inference has finished. */ - LLM_RUN_ERROR = 2 /* Inference status is abnormal. */ + RKLLM_RUN_NORMAL = 0, /**< The LLM call is in a normal running state. */ + RKLLM_RUN_WAITING = 1, /**< The LLM call is waiting for complete UTF-8 encoded character. */ + RKLLM_RUN_FINISH = 2, /**< The LLM call has finished execution. */ + RKLLM_RUN_ERROR = 3, /**< An error occurred during the LLM call. */ + RKLLM_RUN_GET_LAST_HIDDEN_LAYER = 4 /**< Retrieve the last hidden layer during inference. */ } LLMCallState; /** - * @brief Structure for setting up parameters for the language model - * + * @enum RKLLMInputType + * @brief Defines the types of inputs that can be fed into the LLM. + */ +typedef enum { + RKLLM_INPUT_PROMPT = 0, /**< Input is a text prompt. */ + RKLLM_INPUT_TOKEN = 1, /**< Input is a sequence of tokens. */ + RKLLM_INPUT_EMBED = 2, /**< Input is an embedding vector. */ + RKLLM_INPUT_MULTIMODAL = 3, /**< Input is multimodal (e.g., text and image). */ +} RKLLMInputType; + +/** + * @enum RKLLMInferMode + * @brief Specifies the inference modes of the LLM. + */ +typedef enum { + RKLLM_INFER_GENERATE = 0, /**< The LLM generates text based on input. */ + RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1, /**< The LLM retrieves the last hidden layer for further processing. */ +} RKLLMInferMode; + +/** + * @struct RKLLMExtendParam + * @brief The extend parameters for configuring an LLM instance. */ typedef struct { - const char* model_path; /* Path where the model file is located. */ - int32_t num_npu_core; /* Number of NPU cores used for model inference. */ - int32_t max_context_len; /* Maximum size of the context. */ - int32_t max_new_tokens; /* Maximum number of tokens to generate during model inference. */ - int32_t top_k; /* The number of highest probability tokens to consider for generation. */ - float top_p; /* Nucleus sampling: cumulative probability cutoff to use for token selection. */ - float temperature; /* Hyperparameter to control the randomness of predictions by scaling the logits before applying softmax. */ - float repeat_penalty; /* Penalty applied to the logits of previously generated tokens, helps prevent repetitive or monotonic text. */ - float frequency_penalty; /* Penalty for repeating the same word or phrase, reducing the likelihood of repeated content. */ - float presence_penalty; /* Penalty or reward for introducing new tokens into the generated text. */ - int32_t mirostat; /* Enables mirostat algorithm, where 0 = off, 1 = use mirostat algorithm, 2 = use mirostat 2.0 algorithm. */ - float mirostat_tau; /* Target entropy (perplexity) for mirostat algorithm, setting the desired complexity of the generated text. */ - float mirostat_eta; /* Learning rate for the mirostat algorithm. */ - bool logprobs; /* Whether to return the log probabilities for each output token along with their token ids. */ - int32_t top_logprobs; /* The number of top tokens for which to return log probabilities, along with their token ids. */ - bool use_gpu; /* Flag to indicate whether to use GPU for inference. */ + int32_t base_domain_id; /**< base_domain_id */ + uint8_t reserved[112]; /**< reserved */ +} RKLLMExtendParam; + +/** + * @struct RKLLMParam + * @brief Defines the parameters for configuring an LLM instance. + */ +typedef struct { + const char* model_path; /**< Path to the model file. */ + int32_t max_context_len; /**< Maximum number of tokens in the context window. */ + int32_t max_new_tokens; /**< Maximum number of new tokens to generate. */ + int32_t top_k; /**< Top-K sampling parameter for token generation. */ + float top_p; /**< Top-P (nucleus) sampling parameter. */ + float temperature; /**< Sampling temperature, affecting the randomness of token selection. */ + float repeat_penalty; /**< Penalty for repeating tokens in generation. */ + float frequency_penalty; /**< Penalizes frequent tokens during generation. */ + float presence_penalty; /**< Penalizes tokens based on their presence in the input. */ + int32_t mirostat; /**< Mirostat sampling strategy flag (0 to disable). */ + float mirostat_tau; /**< Tau parameter for Mirostat sampling. */ + float mirostat_eta; /**< Eta parameter for Mirostat sampling. */ + bool skip_special_token; /**< Whether to skip special tokens during generation. */ + bool is_async; /**< Whether to run inference asynchronously. */ + const char* img_start; /**< Starting position of an image in multimodal input. */ + const char* img_end; /**< Ending position of an image in multimodal input. */ + const char* img_content; /**< Pointer to the image content. */ + RKLLMExtendParam extend_param; /**< Extend parameters. */ } RKLLMParam; /** - * @brief Structure representing a token with its associated log probability. - * + * @struct RKLLMLoraAdapter + * @brief Defines parameters for a Lora adapter used in model fine-tuning. */ typedef struct { - float logprob; /* Log probability corresponding to the token ID. */ - int id; /* Token ID. */ -} Token; + const char* lora_adapter_path; /**< Path to the Lora adapter file. */ + const char* lora_adapter_name; /**< Name of the Lora adapter. */ + float scale; /**< Scaling factor for applying the Lora adapter. */ +} RKLLMLoraAdapter; /** - * @brief Structure to hold the results from the language model inference, including text and token details. - * + * @struct RKLLMEmbedInput + * @brief Represents an embedding input to the LLM. */ typedef struct { - const char* text; /* Decoded text from the inference output. */ - Token* tokens; /* Array of Token structures, each containing a log probability and a token ID. */ - int num; /* Number of top tokens returned, typically those with the highest probabilities. */ + float* embed; /**< Pointer to the embedding vector (of size n_tokens * n_embed). */ + size_t n_tokens; /**< Number of tokens represented in the embedding. */ +} RKLLMEmbedInput; + +/** + * @struct RKLLMTokenInput + * @brief Represents token input to the LLM. + */ +typedef struct { + int32_t* input_ids; /**< Array of token IDs. */ + size_t n_tokens; /**< Number of tokens in the input. */ +} RKLLMTokenInput; + +/** + * @struct RKLLMMultiModelInput + * @brief Represents multimodal input (e.g., text and image). + */ +typedef struct { + char* prompt; /**< Text prompt input. */ + float* image_embed; /**< Embedding of the image (of size n_image_tokens * n_image_embed). */ + size_t n_image_tokens; /**< Number of image tokens. */ +} RKLLMMultiModelInput; + +/** + * @struct RKLLMInput + * @brief Represents different types of input to the LLM via a union. + */ +typedef struct { + RKLLMInputType input_type; /**< Specifies the type of input provided (e.g., prompt, token, embed, multimodal). */ + union { + const char* prompt_input; /**< Text prompt input if input_type is RKLLM_INPUT_PROMPT. */ + RKLLMEmbedInput embed_input; /**< Embedding input if input_type is RKLLM_INPUT_EMBED. */ + RKLLMTokenInput token_input; /**< Token input if input_type is RKLLM_INPUT_TOKEN. */ + RKLLMMultiModelInput multimodal_input; /**< Multimodal input if input_type is RKLLM_INPUT_MULTIMODAL. */ + }; +} RKLLMInput; + +/** + * @struct RKLLMLoraParam + * @brief Structure defining parameters for Lora adapters. + */ +typedef struct { + const char* lora_adapter_name; /**< Name of the Lora adapter. */ +} RKLLMLoraParam; + +/** + * @struct RKLLMPromptCacheParam + * @brief Structure to define parameters for caching prompts. + */ +typedef struct { + int save_prompt_cache; /**< Flag to indicate whether to save the prompt cache (0 = don't save, 1 = save). */ + const char* prompt_cache_path; /**< Path to the prompt cache file. */ +} RKLLMPromptCacheParam; + +/** + * @struct RKLLMInferParam + * @brief Structure for defining parameters during inference. + */ +typedef struct { + RKLLMInferMode mode; /**< Inference mode (e.g., generate or get last hidden layer). */ + RKLLMLoraParam* lora_params; /**< Pointer to Lora adapter parameters. */ + RKLLMPromptCacheParam* prompt_cache_params; /**< Pointer to prompt cache parameters. */ +} RKLLMInferParam; + +/** + * @struct RKLLMResultLastHiddenLayer + * @brief Structure to hold the hidden states from the last layer. + */ +typedef struct { + const float* hidden_states; /**< Pointer to the hidden states (of size num_tokens * embd_size). */ + int embd_size; /**< Size of the embedding vector. */ + int num_tokens; /**< Number of tokens for which hidden states are stored. */ +} RKLLMResultLastHiddenLayer; + +/** + * @struct RKLLMResult + * @brief Structure to represent the result of LLM inference. + */ +typedef struct { + const char* text; /**< Generated text result. */ + int32_t token_id; /**< ID of the generated token. */ + RKLLMResultLastHiddenLayer last_hidden_layer; /**< Hidden states of the last layer (if requested). */ } RKLLMResult; /** - * @brief Callback function for handling inference results. - * - * @param result A pointer to an RKLLMResult struct containing the inference results. - * @param userdata A pointer to user-defined function or null if no user function was provided. - * @param state The state of the inference process, indicating success, failure, or completion. + * @typedef LLMResultCallback + * @brief Callback function to handle LLM results. + * @param result Pointer to the LLM result. + * @param userdata Pointer to user data for the callback. + * @param state State of the LLM call (e.g., finished, error). */ typedef void(*LLMResultCallback)(RKLLMResult* result, void* userdata, LLMCallState state); /** - * @brief Initializes RKLLMParam with default settings. - * - * @return RKLLMParam An RKLLMParam struct with default values set. + * @brief Creates a default RKLLMParam structure with preset values. + * @return A default RKLLMParam structure. */ RKLLMParam rkllm_createDefaultParam(); /** - * @brief Initializes the model with specified parameters. - * - * @param handle Pointer to a handle for the language model, which will be initialized by this function. - * @param param An RKLLMParam struct containing all the parameters needed for the model. - * @param callback A function pointer to the callback that handles the results of the inference. - * @return int Returns 0 on success, or a negative error code on failure. + * @brief Initializes the LLM with the given parameters. + * @param handle Pointer to the LLM handle. + * @param param Configuration parameters for the LLM. + * @param callback Callback function to handle LLM results. + * @return Status code (0 for success, non-zero for failure). */ -int rkllm_init(LLMHandle* handle, RKLLMParam param, LLMResultCallback callback); +int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback); /** - * @brief Releases the model resources. - * - * @param handle The handle to the language model to be destroyed. - * @return int Returns 0 on successful release, or a negative error code if an error occurs. + * @brief Loads a Lora adapter into the LLM. + * @param handle LLM handle. + * @param lora_adapter Pointer to the Lora adapter structure. + * @return Status code (0 for success, non-zero for failure). + */ +int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter); + +/** + * @brief Loads a prompt cache from a file. + * @param handle LLM handle. + * @param prompt_cache_path Path to the prompt cache file. + * @return Status code (0 for success, non-zero for failure). + */ +int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path); + +/** + * @brief Releases the prompt cache from memory. + * @param handle LLM handle. + * @return Status code (0 for success, non-zero for failure). + */ +int rkllm_release_prompt_cache(LLMHandle handle); + +/** + * @brief Destroys the LLM instance and releases resources. + * @param handle LLM handle. + * @return Status code (0 for success, non-zero for failure). */ int rkllm_destroy(LLMHandle handle); /** - * @brief Runs model inference on the given prompt. - * - * @param handle The handle to the initialized language model. - * @param prompt The text prompt on which to perform inference. - * @param userdata Optional user-defined function that will be passed to the callback. - * @return int Returns 0 on success, or a negative error code if an error occurs during inference. + * @brief Runs an LLM inference task synchronously. + * @param handle LLM handle. + * @param rkllm_input Input data for the LLM. + * @param rkllm_infer_params Parameters for the inference task. + * @param userdata Pointer to user data for the callback. + * @return Status code (0 for success, non-zero for failure). */ -int rkllm_run(LLMHandle handle, const char* prompt, void* userdata); +int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata); /** - * @brief Aborts the current inference process. - * - * @param handle The handle to the language model whose inference is to be aborted. - * @return int Returns 0 if the process is successfully aborted, or a negative error code - * if no process was running or if the abort fails. + * @brief Runs an LLM inference task asynchronously. + * @param handle LLM handle. + * @param rkllm_input Input data for the LLM. + * @param rkllm_infer_params Parameters for the inference task. + * @param userdata Pointer to user data for the callback. + * @return Status code (0 for success, non-zero for failure). + */ +int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata); + +/** + * @brief Aborts an ongoing LLM task. + * @param handle LLM handle. + * @return Status code (0 for success, non-zero for failure). */ int rkllm_abort(LLMHandle handle); +/** + * @brief Checks if an LLM task is currently running. + * @param handle LLM handle. + * @return Status code (0 if a task is running, non-zero for otherwise). + */ +int rkllm_is_running(LLMHandle handle); + #ifdef __cplusplus -} //extern "C" +} #endif -#endif \ No newline at end of file +#endif diff --git a/external/packages/bsp/rk3588/usr/lib/librkllmrt.so b/external/packages/bsp/rk3588/usr/lib/librkllmrt.so index c913ab1569df..3bb8cb6172c4 100755 Binary files a/external/packages/bsp/rk3588/usr/lib/librkllmrt.so and b/external/packages/bsp/rk3588/usr/lib/librkllmrt.so differ diff --git a/external/packages/bsp/rk3588/usr/lib/librknnrt.so b/external/packages/bsp/rk3588/usr/lib/librknnrt.so index 5c886a02670b..ec8a54512475 100755 Binary files a/external/packages/bsp/rk3588/usr/lib/librknnrt.so and b/external/packages/bsp/rk3588/usr/lib/librknnrt.so differ