Mirror of https://github.com/bytecodealliance/wasm-micro-runtime.git (synced 2025-09-08 02:41:13 +00:00)
wasi-nn: make the host use the wasi_ephemeral_nn version of tensor_data (#4411)
Motivations:
* Make the actual input size available to the backends. (Previously the backends had to guess it from shape/type.)
* Make the host logic look a bit more similar to wasi_ephemeral_nn.

This is a backend API/ABI change.
Commit 2372a472aa (parent 23799a2cb6)
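
In short, the host build of wasi-nn now uses the same tensor_data representation that the wasm/wasi_ephemeral_nn build already used: a buffer pointer paired with an explicit byte length, instead of a bare pointer whose length the backend had to infer. A minimal before/after sketch follows; the type names with suffixes are illustrative only (the real type keeps the name tensor_data, as defined in the first hunk below):

/* Before (host build): only a pointer crossed the host/backend boundary,
 * so a backend had to guess the buffer length from shape/type. */
typedef uint8_t *tensor_data_before; /* illustrative name */

/* After (all builds): the wasi_ephemeral_nn-style pair of buffer + size,
 * so the actual input size is available to the backends. */
typedef struct {
    uint8_t *buf;
    uint32_t size;
} tensor_data_after; /* illustrative name */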
@@ -99,7 +99,7 @@ typedef enum {
 // 4-byte f32 elements would have a data array of length 16). Naturally, this
 // representation requires some knowledge of how to lay out data in
 // memory--e.g., using row-major ordering--and could perhaps be improved.
-#if WASM_ENABLE_WASI_EPHEMERAL_NN != 0 && defined(__wasm__)
+#if !defined(__wasm__) || WASM_ENABLE_WASI_EPHEMERAL_NN != 0
 typedef struct {
     uint8_t *buf;
     uint32_t size;
@@ -99,7 +99,8 @@ graph_builder_array_app_native(wasm_module_inst_t instance,
 
 static wasi_nn_error
 tensor_data_app_native(wasm_module_inst_t instance, uint32_t total_elements,
-                       tensor_wasm *input_tensor_wasm, tensor_data *data)
+                       tensor_wasm *input_tensor_wasm, void **data,
+                       uint32_t *size)
 {
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
 #define data_size input_tensor_wasm->data_size
@@ -113,8 +114,9 @@ tensor_data_app_native(wasm_module_inst_t instance, uint32_t total_elements,
         NN_ERR_PRINTF("input_tensor_wasm->data_offset is invalid");
         return invalid_argument;
     }
-    *data = (tensor_data)wasm_runtime_addr_app_to_native(
+    *data = wasm_runtime_addr_app_to_native(
         instance, (uint64)input_tensor_wasm->data_offset);
+    *size = data_size;
     return success;
 #undef data_size
 }
@@ -188,16 +190,19 @@ tensor_app_native(wasm_module_inst_t instance, tensor_wasm *input_tensor_wasm,
     NN_DBG_PRINTF("Tensor type: %d", input_tensor_wasm->type);
     NN_DBG_PRINTF("Total number of elements: %d", total_elements);
 
-    tensor_data data = NULL;
+    void *data = NULL;
+    uint32_t datasize;
     if (success
-        != (res = tensor_data_app_native(instance, total_elements,
-                                         input_tensor_wasm, &data))) {
+        != (res =
+                tensor_data_app_native(instance, total_elements,
+                                       input_tensor_wasm, &data, &datasize))) {
         wasm_runtime_free(dimensions);
         return res;
     }
 
     input_tensor->type = input_tensor_wasm->type;
     input_tensor->dimensions = dimensions;
-    input_tensor->data = data;
+    input_tensor->data.buf = data;
+    input_tensor->data.size = datasize;
     return success;
 }
@@ -720,12 +720,12 @@ fail:
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
 wasi_nn_error
 wasi_nn_get_output(wasm_exec_env_t exec_env, graph_execution_context ctx,
-                   uint32_t index, tensor_data output_tensor,
+                   uint32_t index, void *output_tensor,
                    uint32_t output_tensor_len, uint32_t *output_tensor_size)
 #else /* WASM_ENABLE_WASI_EPHEMERAL_NN == 0 */
 wasi_nn_error
 wasi_nn_get_output(wasm_exec_env_t exec_env, graph_execution_context ctx,
-                   uint32_t index, tensor_data output_tensor,
+                   uint32_t index, void *output_tensor,
                    uint32_t *output_tensor_size)
 #endif /* WASM_ENABLE_WASI_EPHEMERAL_NN != 0 */
 {
@@ -753,16 +753,17 @@ wasi_nn_get_output(wasm_exec_env_t exec_env, graph_execution_context ctx,
         goto fail;
     }
 
+    tensor_data tensor = {
+        .buf = output_tensor,
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
+        .size = output_tensor_len,
+#else
+        .size = *output_tensor_size,
+#endif
+    };
     call_wasi_nn_func(wasi_nn_ctx->backend, get_output, res,
-                      wasi_nn_ctx->backend_ctx, ctx, index, output_tensor,
-                      &output_tensor_len);
-    *output_tensor_size = output_tensor_len;
-#else /* WASM_ENABLE_WASI_EPHEMERAL_NN == 0 */
-    call_wasi_nn_func(wasi_nn_ctx->backend, get_output, res,
-                      wasi_nn_ctx->backend_ctx, ctx, index, output_tensor,
+                      wasi_nn_ctx->backend_ctx, ctx, index, &tensor,
                       output_tensor_size);
-#endif /* WASM_ENABLE_WASI_EPHEMERAL_NN != 0 */
 fail:
     unlock_ctx(wasi_nn_ctx);
     return res;
@@ -385,7 +385,7 @@ set_input(void *ctx, graph_execution_context exec_ctx, uint32_t index,
 {
     struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
     // tensor->data is the prompt string. ends with \0
-    char *prompt_text = (char *)wasi_nn_tensor->data;
+    char *prompt_text = (char *)wasi_nn_tensor->data.buf;
 
 #ifndef NDEBUG
     NN_DBG_PRINTF("--------------------------------------------------");
@@ -552,7 +552,7 @@ fail:
 
 __attribute__((visibility("default"))) wasi_nn_error
 get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
-           tensor_data output_tensor, uint32_t *output_tensor_size)
+           tensor_data *output_tensor, uint32_t *output_tensor_size)
 {
     struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
 
@@ -571,7 +571,7 @@ get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
         printf("%s\n", output_metadata);
     }
 
-    memcpy(output_tensor, output_metadata, strlen(output_metadata));
+    memcpy(output_tensor->buf, output_metadata, strlen(output_metadata));
     *output_tensor_size = strlen(output_metadata);
     return success;
 }
@@ -591,7 +591,7 @@ get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
             printf("%s", buf);
         }
 
-        memcpy(output_tensor + end_pos, buf, strlen(buf));
+        memcpy(output_tensor->buf + end_pos, buf, strlen(buf));
         end_pos += strlen(buf);
     }
 
@@ -402,7 +402,7 @@ set_input(void *ctx, graph_execution_context exec_ctx, uint32_t index,
                                    shape_info);
 
     CHECK_OV_STATUS(ov_tensor_create_from_host_ptr(input_type, input_shape,
-                                                   wasi_nn_tensor->data,
+                                                   wasi_nn_tensor->data.buf,
                                                    &input_tensor),
                     ret);
 }
@@ -441,7 +441,7 @@ fail:
 
 __attribute__((visibility("default"))) wasi_nn_error
 get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
-           tensor_data output_tensor, uint32_t *output_tensor_size)
+           tensor_data *output_tensor, uint32_t *output_tensor_size)
 {
     OpenVINOContext *ov_ctx = (OpenVINOContext *)ctx;
     struct OpenVINOExecutionContext *exec;
@@ -460,14 +460,14 @@ get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
 
     CHECK_OV_STATUS(ov_tensor_get_byte_size(ov_tensor, &byte_size), ret);
 
-    if (byte_size > *output_tensor_size) {
+    if (byte_size > output_tensor->size) {
         ret = too_large;
         goto fail;
     }
 
     CHECK_OV_STATUS(ov_tensor_data(ov_tensor, &data), ret);
 
-    memcpy(output_tensor, data, byte_size);
+    memcpy(output_tensor->buf, data, byte_size);
 
     *output_tensor_size = (uint32_t)byte_size;
 
@@ -24,7 +24,7 @@ compute(void *ctx, graph_execution_context exec_ctx);
 
 __attribute__((visibility("default"))) wasi_nn_error
 get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
-           tensor_data output_tensor, uint32_t *output_tensor_size);
+           tensor_data *output_tensor, uint32_t *output_tensor_size);
 
 __attribute__((visibility("default"))) wasi_nn_error
 init_backend(void **ctx);
@@ -32,7 +32,7 @@ typedef wasi_nn_error (*SET_INPUT)(void *, graph_execution_context, uint32_t,
                                    tensor *);
 typedef wasi_nn_error (*COMPUTE)(void *, graph_execution_context);
 typedef wasi_nn_error (*GET_OUTPUT)(void *, graph_execution_context, uint32_t,
-                                    tensor_data, uint32_t *);
+                                    tensor_data *, uint32_t *);
 /* wasi-nn general APIs */
 typedef wasi_nn_error (*BACKEND_INITIALIZE)(void **);
 typedef wasi_nn_error (*BACKEND_DEINITIALIZE)(void *);
@@ -324,7 +324,7 @@ set_input(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
                       index);
 
         int size = model_tensor_size * sizeof(float);
-        bh_memcpy_s(it, size, input_tensor->data, size);
+        bh_memcpy_s(it, size, input_tensor->data.buf, size);
     }
     else { // TODO: Assuming uint8 quantized networks.
         TfLiteAffineQuantization *quant_info =
@@ -342,7 +342,7 @@ set_input(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
         NN_DBG_PRINTF("input tensor: (scale, offset) = (%f, %f)", scale,
                       zero_point);
 
-        float *input_tensor_f = (float *)input_tensor->data;
+        float *input_tensor_f = (float *)input_tensor->data.buf;
         for (uint32_t i = 0; i < model_tensor_size; ++i) {
             it[i] = (uint8_t)(input_tensor_f[i] / scale + zero_point);
         }
@@ -366,7 +366,7 @@ compute(void *tflite_ctx, graph_execution_context ctx)
 
 __attribute__((visibility("default"))) wasi_nn_error
 get_output(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
-           tensor_data output_tensor, uint32_t *output_tensor_size)
+           tensor_data *output_tensor, uint32_t *output_tensor_size)
 {
     TFLiteContext *tfl_ctx = (TFLiteContext *)tflite_ctx;
 
@@ -392,7 +392,7 @@ get_output(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
     if (tensor->quantization.type == kTfLiteNoQuantization) {
         NN_DBG_PRINTF("No quantization information");
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
-        if (*output_tensor_size < tensor->bytes) {
+        if (output_tensor->size < tensor->bytes) {
             NN_ERR_PRINTF("Insufficient memory to copy tensor %d", index);
             return too_large;
         }
@@ -401,12 +401,12 @@ get_output(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
          * for now, maintain the bug-to-bug compatibility with the old abi,
          * where the size here is the number of fp32, not bytes.
          */
-        if (*output_tensor_size < tensor->bytes / sizeof(float)) {
+        if (output_tensor->size < tensor->bytes / sizeof(float)) {
             NN_ERR_PRINTF("Insufficient memory to copy tensor %d", index);
             return too_large;
         }
 #endif
-        bh_memcpy_s(output_tensor, *output_tensor_size, tensor->data.data,
+        bh_memcpy_s(output_tensor->buf, output_tensor->size, tensor->data.data,
                     tensor->bytes);
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
         *output_tensor_size = tensor->bytes;
@@ -431,7 +431,7 @@ get_output(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
             model_tensor_size *= (uint32_t)tensor->dims->data[i];
 
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
-        if (*output_tensor_size / sizeof(float) < model_tensor_size) {
+        if (output_tensor->size / sizeof(float) < model_tensor_size) {
             NN_ERR_PRINTF("Insufficient memory to copy tensor %d", index);
             return too_large;
         }
@@ -440,7 +440,7 @@ get_output(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
          * for now, maintain the bug-to-bug compatibility with the old abi,
          * where the size here is the number of fp32, not bytes.
          */
-        if (*output_tensor_size < model_tensor_size) {
+        if (output_tensor->size < model_tensor_size) {
             NN_ERR_PRINTF("Insufficient memory to copy tensor %d", index);
             return too_large;
         }
|
||||||
NN_DBG_PRINTF("output tensor: (scale, offset) = (%f, %f)", scale,
|
NN_DBG_PRINTF("output tensor: (scale, offset) = (%f, %f)", scale,
|
||||||
zero_point);
|
zero_point);
|
||||||
|
|
||||||
float *output_tensor_f = (float *)output_tensor;
|
float *output_tensor_f = (float *)output_tensor->buf;
|
||||||
for (uint32_t i = 0; i < model_tensor_size; ++i) {
|
for (uint32_t i = 0; i < model_tensor_size; ++i) {
|
||||||
output_tensor_f[i] = (ot[i] - zero_point) * scale;
|
output_tensor_f[i] = (ot[i] - zero_point) * scale;
|
||||||
}
|
}
|
||||||
|
|
|
@@ -32,7 +32,7 @@ compute(void *tflite_ctx, graph_execution_context ctx);
 
 __attribute__((visibility("default"))) wasi_nn_error
 get_output(void *tflite_ctx, graph_execution_context ctx, uint32_t index,
-           tensor_data output_tensor, uint32_t *output_tensor_size);
+           tensor_data *output_tensor, uint32_t *output_tensor_size);
 
 __attribute__((visibility("default"))) wasi_nn_error
 init_backend(void **tflite_ctx);
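
Taken together, the backend-facing get_output signature changes from (tensor_data, uint32_t *) to (tensor_data *, uint32_t *): the destination buffer and its capacity now arrive in one struct, and the size out-parameter still reports what was actually written. Below is a minimal sketch of a backend get_output under the new signature, assuming the project's wasi_nn_types.h provides tensor_data, wasi_nn_error, graph_execution_context, success, and too_large; the result buffer here is a stand-in for whatever the real backend computed, not the behavior of any particular backend.

#include <string.h>

#include "wasi_nn_types.h" /* assumed to define tensor_data, wasi_nn_error, ... */

__attribute__((visibility("default"))) wasi_nn_error
get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
           tensor_data *output_tensor, uint32_t *output_tensor_size)
{
    /* Stand-in result; a real backend would read the tensor selected by
     * exec_ctx/index out of its execution context. */
    static const uint8_t result[] = { 0x01, 0x02, 0x03, 0x04 };
    uint32_t nbytes = (uint32_t)sizeof(result);

    (void)ctx;
    (void)exec_ctx;
    (void)index;

    /* The caller's capacity now travels with the buffer, so the backend can
     * check it instead of guessing from shape/type. */
    if (output_tensor->size < nbytes)
        return too_large;

    memcpy(output_tensor->buf, result, nbytes);
    *output_tensor_size = nbytes; /* bytes actually written */
    return success;
}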