diff --git a/.github/workflows/compilation_on_nuttx.yml b/.github/workflows/compilation_on_nuttx.yml
index 2f8014fac..e10784fe2 100644
--- a/.github/workflows/compilation_on_nuttx.yml
+++ b/.github/workflows/compilation_on_nuttx.yml
@@ -119,11 +119,12 @@ jobs:
         run: make -j$(nproc) EXTRAFLAGS=-Werror
 
       - name: Checkout Bloaty
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: google/bloaty
           submodules: recursive
           path: bloaty
+          ref: 34f4a66559ad4938c1e629e9b5f54630b2b4d7b0
 
       - name: Build Bloaty
         run: |
diff --git a/.github/workflows/spec_test_on_nuttx.yml b/.github/workflows/spec_test_on_nuttx.yml
index f2e59ba68..712bd06bc 100644
--- a/.github/workflows/spec_test_on_nuttx.yml
+++ b/.github/workflows/spec_test_on_nuttx.yml
@@ -74,11 +74,11 @@ jobs:
             target: "riscv32",
             fpu_type: "none"
           },
-          {
-            config: "boards/risc-v/qemu-rv/rv-virt/configs/nsh",
-            target: "riscv32_ilp32f",
-            fpu_type: "fp"
-          },
+          #{
+          #  config: "boards/risc-v/qemu-rv/rv-virt/configs/nsh",
+          #  target: "riscv32_ilp32f",
+          #  fpu_type: "fp"
+          #},
           # {
           #   config: "boards/risc-v/qemu-rv/rv-virt/configs/nsh",
           #   target: "riscv32_ilp32d",
@@ -327,19 +327,6 @@ jobs:
         working-directory: apps/interpreters/wamr/wamr
 
       - name: Test
-        if: matrix.target_config.target != 'xtensa'
-        run: |
-          cd apps/interpreters/wamr/wamr/tests/wamr-test-suites
-          ./test_wamr.sh -s spec ${{ matrix.wamr_test_option.mode }} -m ${{ matrix.target_config.target }} -b -Q -P -F ${{ steps.build_firmware_path.outputs.firmware }} ${{ matrix.wamr_feature_option.mode}}
-
-      # for xtensa, for some reasons, when running the tests
-      # with test_wamr.sh -P, nuttx occasionally hangs after
-      # "total segments stored 6" on the CI.
-      # i (yamamoto) couldn't reproduce it locally (macOS) even
-      # with the identical flash image.
-      # for now, run the tests without -P.
-      - name: Test
-        if: matrix.target_config.target == 'xtensa'
         run: |
           cd apps/interpreters/wamr/wamr/tests/wamr-test-suites
           ./test_wamr.sh -s spec ${{ matrix.wamr_test_option.mode }} -m ${{ matrix.target_config.target }} -b -Q -F ${{ steps.build_firmware_path.outputs.firmware }} ${{ matrix.wamr_feature_option.mode}}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7f766689..0531ec411 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,10 +121,14 @@ set (WAMR_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 include (${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
 
-set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wformat -Wformat-security -Wshadow -Wno-unused-parameter -fvisibility=hidden")
-# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wconversion -Wsign-conversion")
-
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wformat -Wformat-security -Wno-unused")
+if (NOT WIN32)
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wformat -Wformat-security \
+                                       -ffunction-sections -fdata-sections \
+                                       -Wno-unused-parameter -Wno-pedantic")
+  # Collapse the continuation-line padding (cleaner make log); quoted so a ';' in the flags is kept
+  string (REGEX REPLACE "  *" " " CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wformat -Wformat-security -Wno-unused")
+endif()
 
 if (WAMR_BUILD_TARGET MATCHES "X86_.*" OR WAMR_BUILD_TARGET STREQUAL "AMD_64")
   if (NOT (CMAKE_C_COMPILER MATCHES ".*clang.*" OR CMAKE_C_COMPILER_ID MATCHES ".*Clang"))
@@ -145,6 +149,10 @@ include (${SHARED_DIR}/utils/uncommon/shared_uncommon.cmake)
 set (THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
+if (MSVC)
+  add_definitions(-DCOMPILING_WASM_RUNTIME_API=1)
+endif ()
+
 # STATIC LIBRARY
 if (WAMR_BUILD_STATIC)
     add_library(iwasm_static STATIC ${WAMR_RUNTIME_LIB_SOURCE})
@@ -155,6 +163,14 @@ if (WAMR_BUILD_STATIC)
       target_link_libraries(iwasm_static INTERFACE boringssl_crypto)
     endif ()
 
+    if (MINGW)
+      target_link_libraries (iwasm_static PRIVATE ws2_32)
+    endif ()
+
+    if (WIN32)
+      target_link_libraries(iwasm_static PRIVATE ntdll)
+    endif()
+
     install (TARGETS iwasm_static ARCHIVE DESTINATION lib)
 endif ()
 
@@ -169,9 +185,14 @@ if (WAMR_BUILD_SHARED)
     endif ()
 
     if (MINGW)
-      target_link_libraries (iwasm_shared INTERFACE -lWs2_32 -lwsock32)
+      target_link_libraries(iwasm_shared INTERFACE -lWs2_32 -lwsock32)
+      target_link_libraries(iwasm_shared PRIVATE ws2_32)
     endif ()
 
+    if (WIN32)
+      target_link_libraries(iwasm_shared PRIVATE ntdll)
+    endif()
+
     install (TARGETS iwasm_shared LIBRARY DESTINATION lib)
 endif ()
 
diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake
index 252ba3a84..12fc06bd7 100644
--- a/build-scripts/config_common.cmake
+++ b/build-scripts/config_common.cmake
@@ -442,7 +442,9 @@ if (WAMR_BUILD_WASI_NN EQUAL 1)
   message ("     WASI-NN enabled")
   add_definitions (-DWASM_ENABLE_WASI_NN=1)
   # Variant backends
-  if (NOT WAMR_BUILD_WASI_NN_TFLITE EQUAL 1 AND NOT WAMR_BUILD_WASI_NN_OPENVINO EQUAL 1)
+  if (NOT WAMR_BUILD_WASI_NN_TFLITE EQUAL 1 AND
+      NOT WAMR_BUILD_WASI_NN_OPENVINO EQUAL 1 AND
+      NOT WAMR_BUILD_WASI_NN_LLAMACPP EQUAL 1)
     message (FATAL_ERROR "   Need to select a backend for WASI-NN")
   endif ()
 
@@ -454,6 +456,10 @@ if (WAMR_BUILD_WASI_NN EQUAL 1)
     message ("     WASI-NN: backend openvino enabled")
     add_definitions (-DWASM_ENABLE_WASI_NN_OPENVINO)
   endif ()
+  if (WAMR_BUILD_WASI_NN_LLAMACPP EQUAL 1)
+    message ("     WASI-NN: backend llamacpp enabled")
+    add_definitions (-DWASM_ENABLE_WASI_NN_LLAMACPP)
+  endif ()
   # Variant devices
   if (WAMR_BUILD_WASI_NN_ENABLE_GPU EQUAL 1)
       message ("     WASI-NN: GPU enabled")
diff --git a/core/iwasm/aot/aot_loader.c b/core/iwasm/aot/aot_loader.c
index b96079d3b..0abafd9dd 100644
--- a/core/iwasm/aot/aot_loader.c
+++ b/core/iwasm/aot/aot_loader.c
@@ -302,7 +302,10 @@ loader_mmap(uint32 size, bool prot_exec, char *error_buf, uint32 error_buf_size)
     int map_flags;
     void *mem;
 
-#if UINTPTR_MAX == UINT64_MAX
+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64) \
+    || defined(BUILD_TARGET_RISCV64_LP64D)                       \
+    || defined(BUILD_TARGET_RISCV64_LP64)
+#ifndef __APPLE__
     /* The mmapped AOT data and code in 64-bit targets had better be in
        range 0 to 2G, or aot loader may fail to apply some relocations,
        e.g., R_X86_64_32/R_X86_64_32S/R_X86_64_PC32/R_RISCV_32.
@@ -316,6 +319,7 @@ loader_mmap(uint32 size, bool prot_exec, char *error_buf, uint32 error_buf_size)
         bh_assert((uintptr_t)mem < INT32_MAX);
         return mem;
     }
+#endif
 #endif
 
     map_flags = MMAP_MAP_NONE;
@@ -579,6 +583,10 @@ load_target_info_section(const uint8 *buf, const uint8 *buf_end,
         return false;
     }
 
+    /* for backwards compatibility with previous wamrc aot files */
+    if (!strcmp(target_info.arch, "arm64"))
+        bh_strcpy_s(target_info.arch, sizeof(target_info.arch), "aarch64v8");
+
     /* Check machine info */
     if (!check_machine_info(&target_info, error_buf, error_buf_size)) {
         return false;
@@ -589,6 +597,10 @@ load_target_info_section(const uint8 *buf, const uint8 *buf_end,
         return false;
     }
 
+#if WASM_ENABLE_AOT_STACK_FRAME != 0 /* same guard as AOTModule.feature_flags and its readers */
+    module->feature_flags = target_info.feature_flags;
+#endif
+
     /* Finally, check feature flags */
     return check_feature_flags(error_buf, error_buf_size,
                                target_info.feature_flags);
diff --git a/core/iwasm/aot/aot_reloc.h b/core/iwasm/aot/aot_reloc.h
index 8ead3cd93..f7ada4d8d 100644
--- a/core/iwasm/aot/aot_reloc.h
+++ b/core/iwasm/aot/aot_reloc.h
@@ -226,7 +226,7 @@ SymbolMap *
 get_target_symbol_map(uint32 *sym_num);
 
 uint32
-get_plt_table_size();
+get_plt_table_size(void);
 
 void
 init_plt_table(uint8 *plt);
diff --git a/core/iwasm/aot/aot_runtime.c b/core/iwasm/aot/aot_runtime.c
index 3ca26114f..013c761a0 100644
--- a/core/iwasm/aot/aot_runtime.c
+++ b/core/iwasm/aot/aot_runtime.c
@@ -4,6 +4,7 @@
  */
 
 #include "aot_runtime.h"
+#include "../compilation/aot_stack_frame.h"
 #include "bh_log.h"
 #include "mem_alloc.h"
 #include "../common/wasm_runtime_common.h"
@@ -72,6 +73,10 @@ bh_static_assert(offsetof(AOTFrame, sp) == sizeof(uintptr_t) * 5);
 bh_static_assert(offsetof(AOTFrame, frame_ref) == sizeof(uintptr_t) * 6);
 bh_static_assert(offsetof(AOTFrame, lp) == sizeof(uintptr_t) * 7);
 
+bh_static_assert(offsetof(AOTTinyFrame, func_index) == sizeof(uint32) * 0);
+bh_static_assert(offsetof(AOTTinyFrame, ip_offset) == sizeof(uint32) * 1);
+bh_static_assert(sizeof(AOTTinyFrame) == sizeof(uint32) * 2);
+
 static void
 set_error_buf(char *error_buf, uint32 error_buf_size, const char *string)
 {
@@ -110,6 +115,55 @@ runtime_malloc(uint64 size, char *error_buf, uint32 error_buf_size)
     return mem;
 }
 
+#if WASM_ENABLE_AOT_STACK_FRAME != 0
+static bool
+is_tiny_frame(WASMExecEnv *exec_env)
+{
+    AOTModule *module =
+        (AOTModule *)((AOTModuleInstance *)exec_env->module_inst)->module;
+    /* Compare explicitly so the result is 0/1 however `bool` is defined */
+    return (module->feature_flags & WASM_FEATURE_TINY_STACK_FRAME) != 0;
+}
+
+static bool
+is_frame_per_function(WASMExecEnv *exec_env)
+{
+    AOTModule *module =
+        (AOTModule *)((AOTModuleInstance *)exec_env->module_inst)->module;
+    /* Bit 12 would be lost if `bool` were a narrow integer; compare to 0 */
+    return (module->feature_flags & WASM_FEATURE_FRAME_PER_FUNCTION) != 0;
+}
+
+static void *
+get_top_frame(WASMExecEnv *exec_env)
+{
+    if (is_tiny_frame(exec_env)) {
+        return exec_env->wasm_stack.top > exec_env->wasm_stack.bottom
+                   ? exec_env->wasm_stack.top - sizeof(AOTTinyFrame)
+                   : NULL;
+    }
+    else {
+        return exec_env->cur_frame;
+    }
+}
+
+static void *
+get_prev_frame(WASMExecEnv *exec_env, void *cur_frame)
+{
+    bh_assert(cur_frame);
+
+    if (is_tiny_frame(exec_env)) {
+        if ((uint8 *)cur_frame == exec_env->wasm_stack.bottom) {
+            return NULL;
+        }
+        return ((AOTTinyFrame *)cur_frame) - 1;
+    }
+    else {
+        return ((AOTFrame *)cur_frame)->prev_frame;
+    }
+}
+#endif
+
 static bool
 check_global_init_expr(const AOTModule *module, uint32 global_index,
                        char *error_buf, uint32 error_buf_size)
@@ -2265,7 +2319,7 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         uint32 ext_ret_cell = wasm_get_cell_num(ext_ret_types, ext_ret_count);
         uint64 size;
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        struct WASMInterpFrame *prev_frame = exec_env->cur_frame;
+        void *prev_frame = get_top_frame(exec_env);
 #endif
 
         /* Allocate memory all arguments */
@@ -2296,7 +2350,8 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         }
 
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        if (!aot_alloc_frame(exec_env, function->func_index)) {
+        if (!is_frame_per_function(exec_env)
+            && !aot_alloc_frame(exec_env, function->func_index)) {
             if (argv1 != argv1_buf)
                 wasm_runtime_free(argv1);
             return false;
@@ -2324,7 +2379,7 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         /* Free all frames allocated, note that some frames
            may be allocated in AOT code and haven't been
            freed if exception occurred */
-        while (exec_env->cur_frame != prev_frame)
+        while (get_top_frame(exec_env) != prev_frame)
             aot_free_frame(exec_env);
 #endif
         if (!ret) {
@@ -2367,9 +2422,12 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
     }
     else {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        struct WASMInterpFrame *prev_frame = exec_env->cur_frame;
-
-        if (!aot_alloc_frame(exec_env, function->func_index)) {
+        void *prev_frame = get_top_frame(exec_env);
+        /* Only allocate frame for frame-per-call mode; in the
+           frame-per-function mode the frame is allocated at the
+           beginning of the function. */
+        if (!is_frame_per_function(exec_env)
+            && !aot_alloc_frame(exec_env, function->func_index)) {
             return false;
         }
 #endif
@@ -2394,7 +2452,7 @@ aot_call_function(WASMExecEnv *exec_env, AOTFunctionInstance *function,
         /* Free all frames allocated, note that some frames
            may be allocated in AOT code and haven't been
            freed if exception occurred */
-        while (exec_env->cur_frame != prev_frame)
+        while (get_top_frame(exec_env) != prev_frame)
             aot_free_frame(exec_env);
 #endif
 
@@ -2880,7 +2938,7 @@ aot_invoke_native(WASMExecEnv *exec_env, uint32 func_idx, uint32 argc,
             goto fail;
         }
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        struct WASMInterpFrame *prev_frame = exec_env->cur_frame;
+        void *prev_frame = get_top_frame(exec_env);
 
         if (!aot_alloc_frame(exec_env, func_idx)) {
             goto fail;
@@ -2894,7 +2952,7 @@ aot_invoke_native(WASMExecEnv *exec_env, uint32 func_idx, uint32 argc,
         /* Free all frames allocated, note that some frames
            may be allocated in AOT code and haven't been
            freed if exception occurred */
-        while (exec_env->cur_frame != prev_frame)
+        while (get_top_frame(exec_env) != prev_frame)
             aot_free_frame(exec_env);
 #endif
     }
@@ -3622,8 +3680,8 @@ get_func_name_from_index(const AOTModuleInstance *module_inst,
           WASM_ENABLE_PERF_PROFILING != 0 */
 
 #if WASM_ENABLE_GC == 0
-bool
-aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
+static bool
+aot_alloc_standard_frame(WASMExecEnv *exec_env, uint32 func_index)
 {
     AOTModuleInstance *module_inst = (AOTModuleInstance *)exec_env->module_inst;
 #if WASM_ENABLE_PERF_PROFILING != 0
@@ -3668,37 +3726,10 @@ aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
     return true;
 }
 
-static inline void
-aot_free_frame_internal(WASMExecEnv *exec_env)
-{
-    AOTFrame *cur_frame = (AOTFrame *)exec_env->cur_frame;
-    AOTFrame *prev_frame = cur_frame->prev_frame;
-
-#if WASM_ENABLE_PERF_PROFILING != 0
-    uint64 time_elapsed =
-        (uintptr_t)os_time_thread_cputime_us() - cur_frame->time_started;
-
-    cur_frame->func_perf_prof_info->total_exec_time += time_elapsed;
-    cur_frame->func_perf_prof_info->total_exec_cnt++;
-
-    /* parent function */
-    if (prev_frame)
-        prev_frame->func_perf_prof_info->children_exec_time += time_elapsed;
-#endif
-
-    exec_env->cur_frame = (struct WASMInterpFrame *)prev_frame;
-}
-
-void
-aot_free_frame(WASMExecEnv *exec_env)
-{
-    aot_free_frame_internal(exec_env);
-}
-
 #else /* else of WASM_ENABLE_GC == 0 */
 
-bool
-aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
+static bool
+aot_alloc_standard_frame(WASMExecEnv *exec_env, uint32 func_index)
 {
     AOTModuleInstance *module_inst = (AOTModuleInstance *)exec_env->module_inst;
     AOTModule *module = (AOTModule *)module_inst->module;
@@ -3752,12 +3783,50 @@ aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
     frame->func_index = func_index;
     return true;
 }
+#endif /* end of WASM_ENABLE_GC == 0 */
+
+static bool
+aot_alloc_tiny_frame(WASMExecEnv *exec_env, uint32 func_index)
+{
+    AOTTinyFrame *new_frame = (AOTTinyFrame *)exec_env->wasm_stack.top;
+
+    /* The end of the new frame, not just its start, must not pass the boundary */
+    if ((uint8 *)(new_frame + 1) > exec_env->wasm_stack.top_boundary) {
+        aot_set_exception((WASMModuleInstance *)exec_env->module_inst,
+                          "wasm operand stack overflow");
+        return false;
+    }
+    new_frame->func_index = func_index;
+    exec_env->wasm_stack.top += sizeof(AOTTinyFrame);
+    return true;
+}
+
+bool
+aot_alloc_frame(WASMExecEnv *exec_env, uint32 func_index)
+{
+    AOTModule *module =
+        (AOTModule *)((AOTModuleInstance *)exec_env->module_inst)->module;
+
+    if (is_frame_per_function(exec_env)
+        && func_index >= module->import_func_count) {
+        /* in frame-per-function mode the frame is allocated at
+        the beginning of each function, so we only need to allocate
+        the frame for imported functions */
+        return true;
+    }
+    if (is_tiny_frame(exec_env)) {
+        return aot_alloc_tiny_frame(exec_env, func_index);
+    }
+    else {
+        return aot_alloc_standard_frame(exec_env, func_index);
+    }
+}
 
 static inline void
-aot_free_frame_internal(WASMExecEnv *exec_env)
+aot_free_standard_frame(WASMExecEnv *exec_env)
 {
     AOTFrame *cur_frame = (AOTFrame *)exec_env->cur_frame;
-    AOTFrame *prev_frame = cur_frame->prev_frame;
+    AOTFrame *prev_frame = (AOTFrame *)cur_frame->prev_frame;
 
 #if WASM_ENABLE_PERF_PROFILING != 0
     uint64 time_elapsed =
@@ -3771,18 +3840,30 @@ aot_free_frame_internal(WASMExecEnv *exec_env)
         prev_frame->func_perf_prof_info->children_exec_time += time_elapsed;
 #endif
 
+#if WASM_ENABLE_GC != 0
     wasm_exec_env_free_wasm_frame(exec_env, cur_frame);
+#endif
     exec_env->cur_frame = (struct WASMInterpFrame *)prev_frame;
 }
 
+static inline void
+aot_free_tiny_frame(WASMExecEnv *exec_env)
+{
+    exec_env->wasm_stack.top =
+        get_prev_frame(exec_env, exec_env->wasm_stack.top);
+}
+
 void
 aot_free_frame(WASMExecEnv *exec_env)
 {
-    aot_free_frame_internal(exec_env);
+    if (is_tiny_frame(exec_env)) {
+        aot_free_tiny_frame(exec_env);
+    }
+    else {
+        aot_free_standard_frame(exec_env);
+    }
 }
 
-#endif /* end of WASM_ENABLE_GC == 0 */
-
 void
 aot_frame_update_profile_info(WASMExecEnv *exec_env, bool alloc_frame)
 {
@@ -3831,14 +3912,13 @@ aot_frame_update_profile_info(WASMExecEnv *exec_env, bool alloc_frame)
 bool
 aot_create_call_stack(struct WASMExecEnv *exec_env)
 {
-    AOTFrame *cur_frame = (AOTFrame *)exec_env->cur_frame,
-             *first_frame = cur_frame;
     AOTModuleInstance *module_inst = (AOTModuleInstance *)exec_env->module_inst;
     AOTModule *module = (AOTModule *)module_inst->module;
     uint32 n = 0;
 
-    while (cur_frame) {
-        cur_frame = cur_frame->prev_frame;
+    void *top_frame = get_top_frame(exec_env);
+    while (top_frame) {
+        top_frame = get_prev_frame(exec_env, top_frame);
         n++;
     }
 
@@ -3848,28 +3928,46 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
         return false;
     }
 
-    cur_frame = first_frame;
-    while (cur_frame) {
+    top_frame = get_top_frame(exec_env);
+    while (n-- > 0) {
+        uint32 func_index, ip_offset;
+        uint32 *lp = NULL;
+#if WASM_ENABLE_GC != 0
+        uint32 *sp = NULL;
+        uint8 *frame_ref = NULL;
+#endif
+        if (is_tiny_frame(exec_env)) {
+            AOTTinyFrame *frame = (AOTTinyFrame *)top_frame;
+            func_index = (uint32)frame->func_index;
+            ip_offset = (uint32)frame->ip_offset;
+        }
+        else {
+            AOTFrame *frame = (AOTFrame *)top_frame;
+            func_index = (uint32)frame->func_index;
+            ip_offset = (uint32)frame->ip_offset;
+            lp = frame->lp;
+#if WASM_ENABLE_GC != 0
+            sp = frame->sp;
+            frame_ref = frame->frame_ref;
+#endif
+        }
         WASMCApiFrame frame = { 0 };
         uint32 max_local_cell_num, max_stack_cell_num;
         uint32 all_cell_num, lp_size;
 
         frame.instance = module_inst;
         frame.module_offset = 0;
-        frame.func_index = (uint32)cur_frame->func_index;
-        frame.func_offset = (uint32)cur_frame->ip_offset;
-        frame.func_name_wp = get_func_name_from_index(
-            module_inst, (uint32)cur_frame->func_index);
+        frame.func_index = func_index;
+        frame.func_offset = ip_offset;
+        frame.func_name_wp = get_func_name_from_index(module_inst, func_index);
 
-        if (cur_frame->func_index >= module->import_func_count) {
-            uint32 aot_func_idx =
-                (uint32)(cur_frame->func_index - module->import_func_count);
+        if (func_index >= module->import_func_count) {
+            uint32 aot_func_idx = func_index - module->import_func_count;
             max_local_cell_num = module->max_local_cell_nums[aot_func_idx];
             max_stack_cell_num = module->max_stack_cell_nums[aot_func_idx];
         }
         else {
-            AOTFuncType *func_type =
-                module->import_funcs[cur_frame->func_index].func_type;
+            AOTFuncType *func_type = module->import_funcs[func_index].func_type;
             max_local_cell_num =
                 func_type->param_cell_num > 2 ? func_type->param_cell_num : 2;
             max_stack_cell_num = 0;
@@ -3881,12 +3979,12 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
 #else
         lp_size = align_uint(all_cell_num * 5, 4);
 #endif
-        if (lp_size > 0) {
+        if (lp_size > 0 && !is_tiny_frame(exec_env)) {
             if (!(frame.lp = wasm_runtime_malloc(lp_size))) {
                 destroy_c_api_frames(module_inst->frames);
                 return false;
             }
-            bh_memcpy_s(frame.lp, lp_size, cur_frame->lp, lp_size);
+            bh_memcpy_s(frame.lp, lp_size, lp, lp_size);
 
 #if WASM_ENABLE_GC != 0
             uint32 local_ref_flags_cell_num =
@@ -3894,9 +3992,8 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
                     .local_ref_flag_cell_num;
             uint8 *local_ref_flags =
                 module->func_local_ref_flags[frame.func_index].local_ref_flags;
-            frame.sp = frame.lp + (cur_frame->sp - cur_frame->lp);
-            frame.frame_ref = (uint8 *)frame.lp
-                              + (cur_frame->frame_ref - (uint8 *)cur_frame->lp);
+            frame.sp = frame.lp + (sp - lp);
+            frame.frame_ref = (uint8 *)frame.lp + (frame_ref - (uint8 *)lp);
             /* copy local ref flags from AOT module */
             bh_memcpy_s(frame.frame_ref, local_ref_flags_cell_num,
                         local_ref_flags, lp_size);
@@ -3910,7 +4007,7 @@ aot_create_call_stack(struct WASMExecEnv *exec_env)
             return false;
         }
 
-        cur_frame = cur_frame->prev_frame;
+        top_frame = get_prev_frame(exec_env, top_frame);
     }
 
     return true;
diff --git a/core/iwasm/aot/aot_runtime.h b/core/iwasm/aot/aot_runtime.h
index 0eb647987..3ff0e0e3c 100644
--- a/core/iwasm/aot/aot_runtime.h
+++ b/core/iwasm/aot/aot_runtime.h
@@ -25,12 +25,15 @@ extern "C" {
 #define WASM_FEATURE_REF_TYPES (1 << 3)
 #define WASM_FEATURE_GARBAGE_COLLECTION (1 << 4)
 #define WASM_FEATURE_EXCEPTION_HANDLING (1 << 5)
-#define WASM_FEATURE_MEMORY64 (1 << 6)
+#define WASM_FEATURE_TINY_STACK_FRAME (1 << 6)
 #define WASM_FEATURE_MULTI_MEMORY (1 << 7)
 #define WASM_FEATURE_DYNAMIC_LINKING (1 << 8)
 #define WASM_FEATURE_COMPONENT_MODEL (1 << 9)
 #define WASM_FEATURE_RELAXED_SIMD (1 << 10)
 #define WASM_FEATURE_FLEXIBLE_VECTORS (1 << 11)
+/* Stack frame is created at the beginning of the function,
+ * and not at the beginning of each function call */
+#define WASM_FEATURE_FRAME_PER_FUNCTION (1 << 12)
 
 typedef enum AOTSectionType {
     AOT_SECTION_TYPE_TARGET_INFO = 0,
@@ -39,6 +42,10 @@ typedef enum AOTSectionType {
     AOT_SECTION_TYPE_FUNCTION = 3,
     AOT_SECTION_TYPE_EXPORT = 4,
     AOT_SECTION_TYPE_RELOCATION = 5,
+    /*
+     * Note: nothing uses AOT_SECTION_TYPE_SIGNATURE yet.
+     * It is only reserved for possible module-signing features.
+     */
     AOT_SECTION_TYPE_SIGNATURE = 6,
     AOT_SECTION_TYPE_CUSTOM = 100,
 } AOTSectionType;
@@ -322,6 +329,10 @@ typedef struct AOTModule {
     /* `.data` and `.text` sections merged into one large mmaped section */
     uint8 *merged_data_text_sections;
     uint32 merged_data_text_sections_size;
+
+#if WASM_ENABLE_AOT_STACK_FRAME != 0
+    uint32 feature_flags;
+#endif
 } AOTModule;
 
 #define AOTMemoryInstance WASMMemoryInstance
@@ -637,7 +648,7 @@ aot_check_app_addr_and_convert(AOTModuleInstance *module_inst, bool is_str,
                                void **p_native_addr);
 
 uint32
-aot_get_plt_table_size();
+aot_get_plt_table_size(void);
 
 void *
 aot_memmove(void *dest, const void *src, size_t n);
diff --git a/core/iwasm/aot/arch/aot_reloc_aarch64.c b/core/iwasm/aot/arch/aot_reloc_aarch64.c
index b4bb6024a..26815334f 100644
--- a/core/iwasm/aot/arch/aot_reloc_aarch64.c
+++ b/core/iwasm/aot/arch/aot_reloc_aarch64.c
@@ -53,12 +53,6 @@ get_target_symbol_map(uint32 *sym_num)
     return target_sym_map;
 }
 
-#if (defined(__APPLE__) || defined(__MACH__)) && defined(__arm64__)
-#define BUILD_TARGET_AARCH64_DEFAULT "arm64"
-#else
-#define BUILD_TARGET_AARCH64_DEFAULT "aarch64v8"
-#endif
-
 void
 get_current_target(char *target_buf, uint32 target_buf_size)
 {
@@ -68,8 +62,8 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 
     /* Set to "aarch64v8" by default if sub version isn't specified */
     if (strcmp(s, "AARCH64") == 0) {
-        s = BUILD_TARGET_AARCH64_DEFAULT;
-        s_size = sizeof(BUILD_TARGET_AARCH64_DEFAULT);
+        s = "aarch64v8";
+        s_size = 9; /* strlen("aarch64v8"); */
     }
     if (target_buf_size < s_size) {
         s_size = target_buf_size;
@@ -83,10 +77,9 @@ get_current_target(char *target_buf, uint32 target_buf_size)
     /* Ensure the string is null byte ('\0') terminated */
     *d = '\0';
 }
-#undef BUILD_TARGET_AARCH64_DEFAULT
 
 static uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
     /* 6*4 bytes instructions and 8 bytes symbol address */
     return 32;
diff --git a/core/iwasm/aot/arch/aot_reloc_arm.c b/core/iwasm/aot/arch/aot_reloc_arm.c
index bb492bfeb..0be17ef4c 100644
--- a/core/iwasm/aot/arch/aot_reloc_arm.c
+++ b/core/iwasm/aot/arch/aot_reloc_arm.c
@@ -12,102 +12,102 @@
 #define R_ARM_MOVT_ABS 44
 
 /* clang-format off */
-void __adddf3();
-void __addsf3();
-void __aeabi_d2f();
-void __aeabi_d2iz();
-void __aeabi_d2lz();
-void __aeabi_d2uiz();
-void __aeabi_d2ulz();
-void __aeabi_dadd();
-void __aeabi_dcmpeq();
-void __aeabi_dcmpge();
-void __aeabi_dcmpgt();
-void __aeabi_dcmple();
-void __aeabi_dcmplt();
-void __aeabi_dcmpun();
-void __aeabi_ddiv();
-void __aeabi_dmul();
-void __aeabi_dsub();
-void __aeabi_f2d();
-void __aeabi_f2iz();
-void __aeabi_f2lz();
-void __aeabi_f2ulz();
-void __aeabi_fadd();
-void __aeabi_fcmpeq();
-void __aeabi_fcmpge();
-void __aeabi_fcmpgt();
-void __aeabi_fcmple();
-void __aeabi_fcmplt();
-void __aeabi_fcmpun();
-void __aeabi_fdiv();
-void __aeabi_fmul();
-void __aeabi_fsub();
-void __aeabi_i2d();
-void __aeabi_i2f();
-void __aeabi_idiv();
-void __aeabi_idivmod();
-void __aeabi_l2d();
-void __aeabi_l2f();
-void __aeabi_ldivmod();
-void __aeabi_memclr();
-void __aeabi_memcpy();
-void __aeabi_memmove();
-void __aeabi_memset();
-void __aeabi_ui2d();
-void __aeabi_ui2f();
-void __aeabi_uidiv();
-void __aeabi_uidivmod();
-void __aeabi_ul2d();
-void __aeabi_ul2f();
-void __aeabi_uldivmod();
-void __clzsi2();
-void __divdf3();
-void __divdi3();
-void __divsf3();
-void __divsi3();
-void __eqdf2();
-void __eqsf2();
-void __extendsfdf2();
-void __fixdfdi();
-void __fixdfsi();
-void __fixsfdi();
-void __fixsfsi();
-void __fixunsdfdi();
-void __fixunsdfsi();
-void __fixunssfdi();
-void __floatdidf();
-void __floatdisf();
-void __floatsidf();
-void __floatsisf();
-void __floatundidf();
-void __floatundisf();
-void __floatunsidf();
-void __floatunsisf();
-void __gedf2();
-void __gesf2();
-void __gtdf2();
-void __gtsf2();
-void __ledf2();
-void __lesf2();
-void __ltdf2();
-void __ltsf2();
-void __moddi3();
-void __modsi3();
-void __muldf3();
-void __mulsf3();
-void __nedf2();
-void __nesf2();
-void __subdf3();
-void __subsf3();
-void __truncdfsf2();
-void __udivdi3();
-void __udivmoddi4();
-void __udivsi3();
-void __umoddi3();
-void __umodsi3();
-void __unorddf2();
-void __unordsf2();
+void __adddf3(void);
+void __addsf3(void);
+void __aeabi_d2f(void);
+void __aeabi_d2iz(void);
+void __aeabi_d2lz(void);
+void __aeabi_d2uiz(void);
+void __aeabi_d2ulz(void);
+void __aeabi_dadd(void);
+void __aeabi_dcmpeq(void);
+void __aeabi_dcmpge(void);
+void __aeabi_dcmpgt(void);
+void __aeabi_dcmple(void);
+void __aeabi_dcmplt(void);
+void __aeabi_dcmpun(void);
+void __aeabi_ddiv(void);
+void __aeabi_dmul(void);
+void __aeabi_dsub(void);
+void __aeabi_f2d(void);
+void __aeabi_f2iz(void);
+void __aeabi_f2lz(void);
+void __aeabi_f2ulz(void);
+void __aeabi_fadd(void);
+void __aeabi_fcmpeq(void);
+void __aeabi_fcmpge(void);
+void __aeabi_fcmpgt(void);
+void __aeabi_fcmple(void);
+void __aeabi_fcmplt(void);
+void __aeabi_fcmpun(void);
+void __aeabi_fdiv(void);
+void __aeabi_fmul(void);
+void __aeabi_fsub(void);
+void __aeabi_i2d(void);
+void __aeabi_i2f(void);
+void __aeabi_idiv(void);
+void __aeabi_idivmod(void);
+void __aeabi_l2d(void);
+void __aeabi_l2f(void);
+void __aeabi_ldivmod(void);
+void __aeabi_memclr(void);
+void __aeabi_memcpy(void);
+void __aeabi_memmove(void);
+void __aeabi_memset(void);
+void __aeabi_ui2d(void);
+void __aeabi_ui2f(void);
+void __aeabi_uidiv(void);
+void __aeabi_uidivmod(void);
+void __aeabi_ul2d(void);
+void __aeabi_ul2f(void);
+void __aeabi_uldivmod(void);
+void __clzsi2(void);
+void __divdf3(void);
+void __divdi3(void);
+void __divsf3(void);
+void __divsi3(void);
+void __eqdf2(void);
+void __eqsf2(void);
+void __extendsfdf2(void);
+void __fixdfdi(void);
+void __fixdfsi(void);
+void __fixsfdi(void);
+void __fixsfsi(void);
+void __fixunsdfdi(void);
+void __fixunsdfsi(void);
+void __fixunssfdi(void);
+void __floatdidf(void);
+void __floatdisf(void);
+void __floatsidf(void);
+void __floatsisf(void);
+void __floatundidf(void);
+void __floatundisf(void);
+void __floatunsidf(void);
+void __floatunsisf(void);
+void __gedf2(void);
+void __gesf2(void);
+void __gtdf2(void);
+void __gtsf2(void);
+void __ledf2(void);
+void __lesf2(void);
+void __ltdf2(void);
+void __ltsf2(void);
+void __moddi3(void);
+void __modsi3(void);
+void __muldf3(void);
+void __mulsf3(void);
+void __nedf2(void);
+void __nesf2(void);
+void __subdf3(void);
+void __subsf3(void);
+void __truncdfsf2(void);
+void __udivdi3(void);
+void __udivmoddi4(void);
+void __udivsi3(void);
+void __umoddi3(void);
+void __umodsi3(void);
+void __unorddf2(void);
+void __unordsf2(void);
 /* clang-format on */
 
 static SymbolMap target_sym_map[] = {
@@ -255,7 +255,7 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 #undef BUILD_TARGET_ARM_DEFAULT
 
 uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
     /* 8 bytes instructions and 4 bytes symbol address */
     return 12;
diff --git a/core/iwasm/aot/arch/aot_reloc_mips.c b/core/iwasm/aot/arch/aot_reloc_mips.c
index f9f06a053..4b856119c 100644
--- a/core/iwasm/aot/arch/aot_reloc_mips.c
+++ b/core/iwasm/aot/arch/aot_reloc_mips.c
@@ -28,7 +28,7 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 }
 
 static uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
     return 0;
 }
diff --git a/core/iwasm/aot/arch/aot_reloc_riscv.c b/core/iwasm/aot/arch/aot_reloc_riscv.c
index b87bb2000..058ad0e10 100644
--- a/core/iwasm/aot/arch/aot_reloc_riscv.c
+++ b/core/iwasm/aot/arch/aot_reloc_riscv.c
@@ -49,58 +49,58 @@
 #endif
 
 /* clang-format off */
-void __adddf3();
-void __addsf3();
-void __divdf3();
-void __divdi3();
-void __divsf3();
-void __divsi3();
-void __eqdf2();
-void __eqsf2();
-void __extendsfdf2();
-void __fixdfdi();
-void __fixdfsi();
-void __fixsfdi();
-void __fixsfsi();
-void __fixunsdfdi();
-void __fixunsdfsi();
-void __fixunssfdi();
-void __fixunssfsi();
-void __floatdidf();
-void __floatdisf();
-void __floatsidf();
-void __floatsisf();
-void __floatundidf();
-void __floatundisf();
-void __floatunsidf();
-void __floatunsisf();
-void __gedf2();
-void __gesf2();
-void __gtdf2();
-void __gtsf2();
-void __ledf2();
-void __lesf2();
-void __ltdf2();
-void __ltsf2();
-void __moddi3();
-void __modsi3();
-void __muldf3();
-void __muldi3();
-void __mulsf3();
-void __mulsi3();
-void __nedf2();
-void __negdf2();
-void __negsf2();
-void __nesf2();
-void __subdf3();
-void __subsf3();
-void __truncdfsf2();
-void __udivdi3();
-void __udivsi3();
-void __umoddi3();
-void __umodsi3();
-void __unorddf2();
-void __unordsf2();
+void __adddf3(void);
+void __addsf3(void);
+void __divdf3(void);
+void __divdi3(void);
+void __divsf3(void);
+void __divsi3(void);
+void __eqdf2(void);
+void __eqsf2(void);
+void __extendsfdf2(void);
+void __fixdfdi(void);
+void __fixdfsi(void);
+void __fixsfdi(void);
+void __fixsfsi(void);
+void __fixunsdfdi(void);
+void __fixunsdfsi(void);
+void __fixunssfdi(void);
+void __fixunssfsi(void);
+void __floatdidf(void);
+void __floatdisf(void);
+void __floatsidf(void);
+void __floatsisf(void);
+void __floatundidf(void);
+void __floatundisf(void);
+void __floatunsidf(void);
+void __floatunsisf(void);
+void __gedf2(void);
+void __gesf2(void);
+void __gtdf2(void);
+void __gtsf2(void);
+void __ledf2(void);
+void __lesf2(void);
+void __ltdf2(void);
+void __ltsf2(void);
+void __moddi3(void);
+void __modsi3(void);
+void __muldf3(void);
+void __muldi3(void);
+void __mulsf3(void);
+void __mulsi3(void);
+void __nedf2(void);
+void __negdf2(void);
+void __negsf2(void);
+void __nesf2(void);
+void __subdf3(void);
+void __subsf3(void);
+void __truncdfsf2(void);
+void __udivdi3(void);
+void __udivsi3(void);
+void __umoddi3(void);
+void __umodsi3(void);
+void __unorddf2(void);
+void __unordsf2(void);
 /* clang-format on */
 
 static SymbolMap target_sym_map[] = {
@@ -193,7 +193,7 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 }
 
 uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
 #if __riscv_xlen == 64
     /* auipc + ld + jalr + nop + addr */
diff --git a/core/iwasm/aot/arch/aot_reloc_thumb.c b/core/iwasm/aot/arch/aot_reloc_thumb.c
index f90507dec..c0957a42a 100644
--- a/core/iwasm/aot/arch/aot_reloc_thumb.c
+++ b/core/iwasm/aot/arch/aot_reloc_thumb.c
@@ -14,102 +14,102 @@
 #define R_ARM_THM_MOVT_PREL 50
 
 /* clang-format off */
-void __adddf3();
-void __addsf3();
-void __aeabi_d2f();
-void __aeabi_d2iz();
-void __aeabi_d2lz();
-void __aeabi_d2uiz();
-void __aeabi_d2ulz();
-void __aeabi_dadd();
-void __aeabi_dcmpeq();
-void __aeabi_dcmpge();
-void __aeabi_dcmpgt();
-void __aeabi_dcmple();
-void __aeabi_dcmplt();
-void __aeabi_dcmpun();
-void __aeabi_ddiv();
-void __aeabi_dmul();
-void __aeabi_dsub();
-void __aeabi_f2d();
-void __aeabi_f2iz();
-void __aeabi_f2lz();
-void __aeabi_f2ulz();
-void __aeabi_fadd();
-void __aeabi_fcmpeq();
-void __aeabi_fcmpge();
-void __aeabi_fcmpgt();
-void __aeabi_fcmple();
-void __aeabi_fcmplt();
-void __aeabi_fcmpun();
-void __aeabi_fdiv();
-void __aeabi_fmul();
-void __aeabi_fsub();
-void __aeabi_i2d();
-void __aeabi_i2f();
-void __aeabi_idiv();
-void __aeabi_idivmod();
-void __aeabi_l2d();
-void __aeabi_l2f();
-void __aeabi_ldivmod();
-void __aeabi_llsl();
-void __aeabi_llsr();
-void __aeabi_lmul();
-void __aeabi_ui2d();
-void __aeabi_ui2f();
-void __aeabi_uidiv();
-void __aeabi_uidivmod();
-void __aeabi_ul2d();
-void __aeabi_ul2f();
-void __aeabi_uldivmod();
-void __ashldi3();
-void __clzsi2();
-void __divdf3();
-void __divdi3();
-void __divsi3();
-void __eqdf2();
-void __eqsf2();
-void __extendsfdf2();
-void __fixdfdi();
-void __fixdfsi();
-void __fixsfdi();
-void __fixunsdfdi();
-void __fixunsdfsi();
-void __fixunssfdi();
-void __floatdidf();
-void __floatdisf();
-void __floatsidf();
-void __floatsisf();
-void __floatundidf();
-void __floatundisf();
-void __floatunsidf();
-void __floatunsisf();
-void __gedf2();
-void __gesf2();
-void __gtdf2();
-void __gtsf2();
-void __ledf2();
-void __lesf2();
-void __lshrdi3();
-void __ltdf2();
-void __ltsf2();
-void __moddi3();
-void __modsi3();
-void __muldf3();
-void __muldi3();
-void __mulsf3();
-void __nedf2();
-void __nesf2();
-void __subdf3();
-void __subsf3();
-void __truncdfsf2();
-void __udivdi3();
-void __udivmoddi4();
-void __udivsi3();
-void __umoddi3();
-void __umodsi3();
-void __unorddf2();
-void __unordsf2();
+void __adddf3(void);
+void __addsf3(void);
+void __aeabi_d2f(void);
+void __aeabi_d2iz(void);
+void __aeabi_d2lz(void);
+void __aeabi_d2uiz(void);
+void __aeabi_d2ulz(void);
+void __aeabi_dadd(void);
+void __aeabi_dcmpeq(void);
+void __aeabi_dcmpge(void);
+void __aeabi_dcmpgt(void);
+void __aeabi_dcmple(void);
+void __aeabi_dcmplt(void);
+void __aeabi_dcmpun(void);
+void __aeabi_ddiv(void);
+void __aeabi_dmul(void);
+void __aeabi_dsub(void);
+void __aeabi_f2d(void);
+void __aeabi_f2iz(void);
+void __aeabi_f2lz(void);
+void __aeabi_f2ulz(void);
+void __aeabi_fadd(void);
+void __aeabi_fcmpeq(void);
+void __aeabi_fcmpge(void);
+void __aeabi_fcmpgt(void);
+void __aeabi_fcmple(void);
+void __aeabi_fcmplt(void);
+void __aeabi_fcmpun(void);
+void __aeabi_fdiv(void);
+void __aeabi_fmul(void);
+void __aeabi_fsub(void);
+void __aeabi_i2d(void);
+void __aeabi_i2f(void);
+void __aeabi_idiv(void);
+void __aeabi_idivmod(void);
+void __aeabi_l2d(void);
+void __aeabi_l2f(void);
+void __aeabi_ldivmod(void);
+void __aeabi_llsl(void);
+void __aeabi_llsr(void);
+void __aeabi_lmul(void);
+void __aeabi_ui2d(void);
+void __aeabi_ui2f(void);
+void __aeabi_uidiv(void);
+void __aeabi_uidivmod(void);
+void __aeabi_ul2d(void);
+void __aeabi_ul2f(void);
+void __aeabi_uldivmod(void);
+void __ashldi3(void);
+void __clzsi2(void);
+void __divdf3(void);
+void __divdi3(void);
+void __divsi3(void);
+void __eqdf2(void);
+void __eqsf2(void);
+void __extendsfdf2(void);
+void __fixdfdi(void);
+void __fixdfsi(void);
+void __fixsfdi(void);
+void __fixunsdfdi(void);
+void __fixunsdfsi(void);
+void __fixunssfdi(void);
+void __floatdidf(void);
+void __floatdisf(void);
+void __floatsidf(void);
+void __floatsisf(void);
+void __floatundidf(void);
+void __floatundisf(void);
+void __floatunsidf(void);
+void __floatunsisf(void);
+void __gedf2(void);
+void __gesf2(void);
+void __gtdf2(void);
+void __gtsf2(void);
+void __ledf2(void);
+void __lesf2(void);
+void __lshrdi3(void);
+void __ltdf2(void);
+void __ltsf2(void);
+void __moddi3(void);
+void __modsi3(void);
+void __muldf3(void);
+void __muldi3(void);
+void __mulsf3(void);
+void __nedf2(void);
+void __nesf2(void);
+void __subdf3(void);
+void __subsf3(void);
+void __truncdfsf2(void);
+void __udivdi3(void);
+void __udivmoddi4(void);
+void __udivsi3(void);
+void __umoddi3(void);
+void __umodsi3(void);
+void __unorddf2(void);
+void __unordsf2(void);
 /* clang-format on */
 
 static SymbolMap target_sym_map[] = {
@@ -259,7 +259,7 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 #undef BUILD_TARGET_THUMB_V4T
 
 uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
     /* 16 bytes instructions and 4 bytes symbol address */
     return 20;
diff --git a/core/iwasm/aot/arch/aot_reloc_x86_64.c b/core/iwasm/aot/arch/aot_reloc_x86_64.c
index d1f5cb5ac..fe18d79c6 100644
--- a/core/iwasm/aot/arch/aot_reloc_x86_64.c
+++ b/core/iwasm/aot/arch/aot_reloc_x86_64.c
@@ -58,7 +58,7 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 }
 
 static uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
     /* size of mov instruction and jmp instruction */
     return 12;
diff --git a/core/iwasm/aot/arch/aot_reloc_xtensa.c b/core/iwasm/aot/arch/aot_reloc_xtensa.c
index a29c9f2b9..fca1b80da 100644
--- a/core/iwasm/aot/arch/aot_reloc_xtensa.c
+++ b/core/iwasm/aot/arch/aot_reloc_xtensa.c
@@ -10,44 +10,44 @@
 
 /* clang-format off */
 /* for soft-float */
-void __floatsidf();
-void __divdf3();
-void __ltdf2();
+void __floatsidf(void);
+void __divdf3(void);
+void __ltdf2(void);
 
 /* for mul32 */
-void __mulsi3();
-void __muldi3();
+void __mulsi3(void);
+void __muldi3(void);
 
-void __modsi3();
+void __modsi3(void);
 
-void __divdi3();
+void __divdi3(void);
 
-void __udivdi3();
-void __unorddf2();
-void __adddf3();
-void __eqdf2();
-void __muldf3();
-void __gedf2();
-void __ledf2();
-void __fixunsdfsi();
-void __floatunsidf();
-void __subdf3();
-void __nedf2();
-void __fixdfsi();
-void __moddi3();
-void __extendsfdf2();
-void __truncdfsf2();
-void __gtdf2();
-void __umoddi3();
-void __floatdidf();
-void __divsf3();
-void __fixdfdi();
-void __floatundidf();
-void __fixsfdi();
-void __fixunssfdi();
-void __fixunsdfdi();
-void __floatdisf();
-void __floatundisf();
+void __udivdi3(void);
+void __unorddf2(void);
+void __adddf3(void);
+void __eqdf2(void);
+void __muldf3(void);
+void __gedf2(void);
+void __ledf2(void);
+void __fixunsdfsi(void);
+void __floatunsidf(void);
+void __subdf3(void);
+void __nedf2(void);
+void __fixdfsi(void);
+void __moddi3(void);
+void __extendsfdf2(void);
+void __truncdfsf2(void);
+void __gtdf2(void);
+void __umoddi3(void);
+void __floatdidf(void);
+void __divsf3(void);
+void __fixdfdi(void);
+void __floatundidf(void);
+void __fixsfdi(void);
+void __fixunssfdi(void);
+void __fixunsdfdi(void);
+void __floatdisf(void);
+void __floatundisf(void);
 
 
 static SymbolMap target_sym_map[] = {
@@ -119,7 +119,7 @@ get_current_target(char *target_buf, uint32 target_buf_size)
 }
 
 static uint32
-get_plt_item_size()
+get_plt_item_size(void)
 {
     return 0;
 }
diff --git a/core/iwasm/aot/debug/jit_debug.c b/core/iwasm/aot/debug/jit_debug.c
index 261c20546..9f92dd393 100644
--- a/core/iwasm/aot/debug/jit_debug.c
+++ b/core/iwasm/aot/debug/jit_debug.c
@@ -69,10 +69,10 @@ typedef struct JITDescriptor {
  * and inline assembler statement inside.
  */
 void attribute_noinline
-__jit_debug_register_code();
+__jit_debug_register_code(void);
 
 void attribute_noinline
-__jit_debug_register_code()
+__jit_debug_register_code(void)
 {
     int x;
     *(char *)&x = '\0';
@@ -96,7 +96,7 @@ extern JITDescriptor __jit_debug_descriptor;
  * This gives the debugger an easy way to inject custom code to
  * handle the events.
  */
-void (*__jit_debug_register_code_ptr)() = __jit_debug_register_code;
+void (*__jit_debug_register_code_ptr)(void) = __jit_debug_register_code;
 
 #ifdef __cplusplus
 }
@@ -171,7 +171,7 @@ DestroyJITCodeEntryInternal(JITCodeEntry *entry)
 }
 
 bool
-jit_debug_engine_init()
+jit_debug_engine_init(void)
 {
     if (jit_debug_engine) {
         return true;
@@ -194,7 +194,7 @@ jit_debug_engine_init()
 }
 
 void
-jit_debug_engine_destroy()
+jit_debug_engine_destroy(void)
 {
     if (jit_debug_engine) {
         WASMJITEntryNode *node, *node_next;
diff --git a/core/iwasm/aot/debug/jit_debug.h b/core/iwasm/aot/debug/jit_debug.h
index 5e3e36519..813c8b782 100644
--- a/core/iwasm/aot/debug/jit_debug.h
+++ b/core/iwasm/aot/debug/jit_debug.h
@@ -11,10 +11,10 @@ extern "C" {
 #endif
 
 bool
-jit_debug_engine_init();
+jit_debug_engine_init(void);
 
 void
-jit_debug_engine_destroy();
+jit_debug_engine_destroy(void);
 
 bool
 jit_code_entry_create(const uint8 *symfile_addr, uint64 symfile_size);
diff --git a/core/iwasm/common/wasm_memory.c b/core/iwasm/common/wasm_memory.c
index 71d337549..82eebbf30 100644
--- a/core/iwasm/common/wasm_memory.c
+++ b/core/iwasm/common/wasm_memory.c
@@ -159,7 +159,7 @@ wasm_runtime_memory_init(mem_alloc_type_t mem_alloc_type,
 }
 
 void
-wasm_runtime_memory_destroy()
+wasm_runtime_memory_destroy(void)
 {
     if (memory_mode == MEMORY_MODE_POOL) {
 #if BH_ENABLE_GC_VERIFY == 0
@@ -176,7 +176,7 @@ wasm_runtime_memory_destroy()
 }
 
 unsigned
-wasm_runtime_memory_pool_size()
+wasm_runtime_memory_pool_size(void)
 {
     if (memory_mode == MEMORY_MODE_POOL)
         return global_pool_size;
diff --git a/core/iwasm/common/wasm_memory.h b/core/iwasm/common/wasm_memory.h
index a5dfefae9..2f20d3f68 100644
--- a/core/iwasm/common/wasm_memory.h
+++ b/core/iwasm/common/wasm_memory.h
@@ -46,10 +46,10 @@ wasm_runtime_memory_init(mem_alloc_type_t mem_alloc_type,
                          const MemAllocOption *alloc_option);
 
 void
-wasm_runtime_memory_destroy();
+wasm_runtime_memory_destroy(void);
 
 unsigned
-wasm_runtime_memory_pool_size();
+wasm_runtime_memory_pool_size(void);
 
 void
 wasm_runtime_set_mem_bound_check_bytes(WASMMemoryInstance *memory,
diff --git a/core/iwasm/common/wasm_native.c b/core/iwasm/common/wasm_native.c
index 9e8764a22..0ff3053fa 100644
--- a/core/iwasm/common/wasm_native.c
+++ b/core/iwasm/common/wasm_native.c
@@ -469,7 +469,7 @@ wasi_context_dtor(WASMModuleInstanceCommon *inst, void *ctx)
 
 #if WASM_ENABLE_QUICK_AOT_ENTRY != 0
 static bool
-quick_aot_entry_init();
+quick_aot_entry_init(void);
 #endif
 
 bool
@@ -1461,7 +1461,7 @@ quick_aot_entry_cmp(const void *quick_aot_entry1, const void *quick_aot_entry2)
 }
 
 static bool
-quick_aot_entry_init()
+quick_aot_entry_init(void)
 {
     qsort(quick_aot_entries, sizeof(quick_aot_entries) / sizeof(QuickAOTEntry),
           sizeof(QuickAOTEntry), quick_aot_entry_cmp);
diff --git a/core/iwasm/common/wasm_native.h b/core/iwasm/common/wasm_native.h
index 5cb78bf91..9a6afee19 100644
--- a/core/iwasm/common/wasm_native.h
+++ b/core/iwasm/common/wasm_native.h
@@ -100,10 +100,10 @@ wasm_native_inherit_contexts(struct WASMModuleInstanceCommon *child,
 #endif /* WASM_ENABLE_MODULE_INST_CONTEXT */
 
 bool
-wasm_native_init();
+wasm_native_init(void);
 
 void
-wasm_native_destroy();
+wasm_native_destroy(void);
 
 #if WASM_ENABLE_QUICK_AOT_ENTRY != 0
 void *
diff --git a/core/iwasm/common/wasm_runtime_common.c b/core/iwasm/common/wasm_runtime_common.c
index 5dd2957de..314dc7ddb 100644
--- a/core/iwasm/common/wasm_runtime_common.c
+++ b/core/iwasm/common/wasm_runtime_common.c
@@ -86,7 +86,7 @@ static bh_list registered_module_list_head;
 static bh_list *const registered_module_list = &registered_module_list_head;
 static korp_mutex registered_module_list_lock;
 static void
-wasm_runtime_destroy_registered_module_list();
+wasm_runtime_destroy_registered_module_list(void);
 #endif /* WASM_ENABLE_MULTI_MODULE */
 
 #define E_TYPE_XIP 4
@@ -97,11 +97,11 @@ val_type_to_val_kind(uint8 value_type);
 #if WASM_ENABLE_GC == 0 && WASM_ENABLE_REF_TYPES != 0
 /* Initialize externref hashmap */
 static bool
-wasm_externref_map_init();
+wasm_externref_map_init(void);
 
 /* Destroy externref hashmap */
 static void
-wasm_externref_map_destroy();
+wasm_externref_map_destroy(void);
 #endif /* end of WASM_ENABLE_GC == 0 && WASM_ENABLE_REF_TYPES != 0 */
 
 static void
@@ -438,7 +438,7 @@ wasm_runtime_get_exec_env_tls()
 #endif /* end of OS_ENABLE_HW_BOUND_CHECK */
 
 static bool
-wasm_runtime_env_init()
+wasm_runtime_env_init(void)
 {
     if (bh_platform_init() != 0)
         return false;
@@ -584,7 +584,7 @@ static korp_mutex runtime_lock = OS_THREAD_MUTEX_INITIALIZER;
 static int32 runtime_ref_count = 0;
 
 static bool
-wasm_runtime_init_internal()
+wasm_runtime_init_internal(void)
 {
     if (!wasm_runtime_memory_init(Alloc_With_System_Allocator, NULL))
         return false;
@@ -622,7 +622,7 @@ wasm_runtime_init()
 }
 
 static void
-wasm_runtime_destroy_internal()
+wasm_runtime_destroy_internal(void)
 {
 #if WASM_ENABLE_GC == 0 && WASM_ENABLE_REF_TYPES != 0
     wasm_externref_map_destroy();
@@ -4747,7 +4747,7 @@ fail:
     || defined(BUILD_TARGET_RISCV32_ILP32D)                          \
     || defined(BUILD_TARGET_RISCV32_ILP32F)                          \
     || defined(BUILD_TARGET_RISCV32_ILP32) || defined(BUILD_TARGET_ARC)
-typedef void (*GenericFunctionPointer)();
+typedef void (*GenericFunctionPointer)(void);
 void
 invokeNative(GenericFunctionPointer f, uint32 *args, uint32 n_stacks);
 
@@ -5312,7 +5312,7 @@ fail:
 #if defined(BUILD_TARGET_X86_32) || defined(BUILD_TARGET_ARM)    \
     || defined(BUILD_TARGET_THUMB) || defined(BUILD_TARGET_MIPS) \
     || defined(BUILD_TARGET_XTENSA)
-typedef void (*GenericFunctionPointer)();
+typedef void (*GenericFunctionPointer)(void);
 void
 invokeNative(GenericFunctionPointer f, uint32 *args, uint32 sz);
 
@@ -5597,7 +5597,7 @@ typedef uint32x4_t __m128i;
 
 #endif /* end of WASM_ENABLE_SIMD != 0 */
 
-typedef void (*GenericFunctionPointer)();
+typedef void (*GenericFunctionPointer)(void);
 void
 invokeNative(GenericFunctionPointer f, uint64 *args, uint64 n_stacks);
 
diff --git a/core/iwasm/common/wasm_runtime_common.h b/core/iwasm/common/wasm_runtime_common.h
index 71264ca7e..fb2c79408 100644
--- a/core/iwasm/common/wasm_runtime_common.h
+++ b/core/iwasm/common/wasm_runtime_common.h
@@ -852,10 +852,10 @@ wasm_runtime_set_module_reader(const module_reader reader,
                                const module_destroyer destroyer);
 
 module_reader
-wasm_runtime_get_module_reader();
+wasm_runtime_get_module_reader(void);
 
 module_destroyer
-wasm_runtime_get_module_destroyer();
+wasm_runtime_get_module_destroyer(void);
 
 bool
 wasm_runtime_register_module_internal(const char *module_name,
@@ -881,7 +881,7 @@ bool
 wasm_runtime_is_loading_module(const char *module_name);
 
 void
-wasm_runtime_destroy_loading_module_list();
+wasm_runtime_destroy_loading_module_list(void);
 
 WASMModuleCommon *
 wasm_runtime_search_sub_module(const WASMModuleCommon *parent_module,
@@ -1168,7 +1168,7 @@ wasm_runtime_quick_invoke_c_api_native(WASMModuleInstanceCommon *module_inst,
                                        uint32 result_count);
 
 void
-wasm_runtime_show_app_heap_corrupted_prompt();
+wasm_runtime_show_app_heap_corrupted_prompt(void);
 
 #if WASM_ENABLE_LOAD_CUSTOM_SECTION != 0
 void
diff --git a/core/iwasm/common/wasm_shared_memory.h b/core/iwasm/common/wasm_shared_memory.h
index 8bbc4a800..e1c5154a5 100644
--- a/core/iwasm/common/wasm_shared_memory.h
+++ b/core/iwasm/common/wasm_shared_memory.h
@@ -17,10 +17,10 @@ extern "C" {
 extern korp_mutex g_shared_memory_lock;
 
 bool
-wasm_shared_memory_init();
+wasm_shared_memory_init(void);
 
 void
-wasm_shared_memory_destroy();
+wasm_shared_memory_destroy(void);
 
 uint16
 shared_memory_inc_reference(WASMMemoryInstance *memory);
diff --git a/core/iwasm/compilation/aot.h b/core/iwasm/compilation/aot.h
index dcf9bbe12..98d2cc6cc 100644
--- a/core/iwasm/compilation/aot.h
+++ b/core/iwasm/compilation/aot.h
@@ -312,7 +312,7 @@ void
 aot_destroy_comp_data(AOTCompData *comp_data);
 
 char *
-aot_get_last_error();
+aot_get_last_error(void);
 
 void
 aot_set_last_error(const char *error);
diff --git a/core/iwasm/compilation/aot_compiler.c b/core/iwasm/compilation/aot_compiler.c
index 96ed8facf..e56004972 100644
--- a/core/iwasm/compilation/aot_compiler.c
+++ b/core/iwasm/compilation/aot_compiler.c
@@ -16,6 +16,7 @@
 #include "aot_emit_parametric.h"
 #include "aot_emit_table.h"
 #include "aot_emit_gc.h"
+#include "aot_stack_frame_comp.h"
 #include "simd/simd_access_lanes.h"
 #include "simd/simd_bitmask_extracts.h"
 #include "simd/simd_bit_shifts.h"
@@ -253,6 +254,13 @@ store_value(AOTCompContext *comp_ctx, LLVMValueRef value, uint8 value_type,
     return true;
 }
 
+void
+aot_call_stack_features_init_default(AOTCallStackFeatures *features)
+{
+    memset(features, 1, sizeof(*features)); /* every byte set to 1 */
+    features->frame_per_function = false;   /* the only field cleared */
+}
+
 bool
 aot_frame_store_value(AOTCompContext *comp_ctx, LLVMValueRef value,
                       uint8 value_type, LLVMValueRef cur_frame, uint32 offset)
@@ -337,6 +345,10 @@ aot_gen_commit_values(AOTCompFrame *frame)
     LLVMValueRef value;
     uint32 n;
 
+    if (!frame->comp_ctx->call_stack_features.values) {
+        return true;
+    }
+
     /* First, commit reference flags
      * For LLVM JIT, iterate all local and stack ref flags
      * For AOT, ignore local(params + locals) ref flags */
@@ -569,6 +581,64 @@ aot_gen_commit_values(AOTCompFrame *frame)
     return true;
 }
 
+static bool
+aot_standard_frame_gen_commit_ip(AOTCompContext *comp_ctx,
+                                 AOTFuncContext *func_ctx,
+                                 LLVMValueRef ip_value, bool is_64bit)
+{
+    LLVMValueRef ip_offset, ip_addr, ip_slot;
+    /* Byte offset of the ip field in the frame; AOT and JIT layouts differ */
+    uint32 offset_ip = comp_ctx->is_jit_mode ? offsetof(WASMInterpFrame, ip)
+                                             : comp_ctx->pointer_size * 4;
+
+    ip_offset = I32_CONST(offset_ip);
+    if (!ip_offset) {
+        aot_set_last_error("llvm build const failed");
+        return false;
+    }
+
+    /* Address of the ip field, as an i8-granular offset from cur_frame */
+    ip_addr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
+                                    func_ctx->cur_frame, &ip_offset, 1,
+                                    "ip_addr");
+    if (!ip_addr) {
+        aot_set_last_error("llvm build in bounds gep failed");
+        return false;
+    }
+
+    ip_slot = LLVMBuildBitCast(comp_ctx->builder, ip_addr,
+                               is_64bit ? INT64_PTR_TYPE : INT32_PTR_TYPE,
+                               "ip_ptr");
+    if (!ip_slot) {
+        aot_set_last_error("llvm build bit cast failed");
+        return false;
+    }
+
+    if (!LLVMBuildStore(comp_ctx->builder, ip_value, ip_slot)) {
+        aot_set_last_error("llvm build store failed");
+        return false;
+    }
+
+    return true;
+}
+
+bool
+aot_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                  LLVMValueRef ip_value, bool is_64bit)
+{
+    /* Dispatch on the aux stack frame type selected for this compilation */
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD) {
+        return aot_standard_frame_gen_commit_ip(comp_ctx, func_ctx, ip_value,
+                                                is_64bit);
+    }
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_TINY) {
+        return aot_tiny_frame_gen_commit_ip(comp_ctx, func_ctx, ip_value);
+    }
+
+    aot_set_last_error("unsupported mode when generating commit_ip code");
+    return false;
+}
+
 bool
 aot_gen_commit_sp_ip(AOTCompFrame *frame, bool commit_sp, bool commit_ip)
 {
@@ -577,40 +647,19 @@ aot_gen_commit_sp_ip(AOTCompFrame *frame, bool commit_sp, bool commit_ip)
     LLVMValueRef cur_frame = func_ctx->cur_frame;
     LLVMValueRef value_offset, value_addr, value_ptr, value;
     LLVMTypeRef int8_ptr_ptr_type;
-    uint32 offset_ip, offset_sp, n;
+    uint32 offset_sp, n;
     bool is_64bit = (comp_ctx->pointer_size == sizeof(uint64)) ? true : false;
     const AOTValueSlot *sp = frame->sp;
     const uint8 *ip = frame->frame_ip;
 
     if (!comp_ctx->is_jit_mode) {
-        offset_ip = frame->comp_ctx->pointer_size * 4;
         offset_sp = frame->comp_ctx->pointer_size * 5;
     }
     else {
-        offset_ip = offsetof(WASMInterpFrame, ip);
         offset_sp = offsetof(WASMInterpFrame, sp);
     }
 
-    if (commit_ip) {
-        if (!(value_offset = I32_CONST(offset_ip))) {
-            aot_set_last_error("llvm build const failed");
-            return false;
-        }
-
-        if (!(value_addr =
-                  LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE, cur_frame,
-                                        &value_offset, 1, "ip_addr"))) {
-            aot_set_last_error("llvm build in bounds gep failed");
-            return false;
-        }
-
-        if (!(value_ptr = LLVMBuildBitCast(
-                  comp_ctx->builder, value_addr,
-                  is_64bit ? INT64_PTR_TYPE : INT32_PTR_TYPE, "ip_ptr"))) {
-            aot_set_last_error("llvm build bit cast failed");
-            return false;
-        }
-
+    if (commit_ip && comp_ctx->call_stack_features.ip) {
         if (!comp_ctx->is_jit_mode) {
             WASMModule *module = comp_ctx->comp_data->wasm_module;
             if (is_64bit)
@@ -630,13 +679,12 @@ aot_gen_commit_sp_ip(AOTCompFrame *frame, bool commit_sp, bool commit_ip)
             return false;
         }
 
-        if (!LLVMBuildStore(comp_ctx->builder, value, value_ptr)) {
-            aot_set_last_error("llvm build store failed");
+        if (!aot_gen_commit_ip(comp_ctx, func_ctx, value, is_64bit)) {
             return false;
         }
     }
 
-    if (commit_sp) {
+    if (commit_sp && comp_ctx->call_stack_features.values) {
         n = (uint32)(sp - frame->lp);
         value = I32_CONST(offset_of_local(comp_ctx, n));
         if (!value) {
@@ -940,6 +988,7 @@ static bool
 aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
 {
     AOTFuncContext *func_ctx = comp_ctx->func_ctxes[func_index];
+    LLVMValueRef func_index_ref;
     uint8 *frame_ip = func_ctx->aot_func->code, opcode, *p_f32, *p_f64;
     uint8 *frame_ip_end = frame_ip + func_ctx->aot_func->code_size;
     uint8 *param_types = NULL;
@@ -962,16 +1011,27 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
     LLVMMetadataRef location;
 #endif
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    /* Start to translate the opcodes */
+    LLVMPositionBuilderAtEnd(
+        comp_ctx->builder,
+        func_ctx->block_stack.block_list_head->llvm_entry_block);
+
+    if (comp_ctx->aux_stack_frame_type
+        && comp_ctx->call_stack_features.frame_per_function) {
+        INT_CONST(func_index_ref,
+                  func_index + comp_ctx->comp_data->import_func_count, I32_TYPE,
+                  true);
+        if (!aot_alloc_frame_per_function_frame_for_aot_func(comp_ctx, func_ctx,
+                                                             func_index_ref)) {
+            return false;
+        }
+    }
+    if (comp_ctx->aux_stack_frame_type) {
         if (!init_comp_frame(comp_ctx, func_ctx, func_index)) {
             return false;
         }
     }
 
-    /* Start to translate the opcodes */
-    LLVMPositionBuilderAtEnd(
-        comp_ctx->builder,
-        func_ctx->block_stack.block_list_head->llvm_entry_block);
     while (frame_ip < frame_ip_end) {
         opcode = *frame_ip++;
 
diff --git a/core/iwasm/compilation/aot_compiler.h b/core/iwasm/compilation/aot_compiler.h
index ab74b7cb6..895d2416b 100644
--- a/core/iwasm/compilation/aot_compiler.h
+++ b/core/iwasm/compilation/aot_compiler.h
@@ -195,6 +195,15 @@ aot_gen_commit_values(AOTCompFrame *frame);
 bool
 aot_gen_commit_sp_ip(AOTCompFrame *frame, bool commit_sp, bool commit_ip);
 
+/**
+ * Generate instructions to commit IP pointer to the frame.
+ *
+ * @param ip_value the IP value to store into the current frame
+ */
+bool
+aot_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                  LLVMValueRef ip_value, bool is_64bit);
+
 bool
 aot_frame_store_value(AOTCompContext *comp_ctx, LLVMValueRef value,
                       uint8 value_type, LLVMValueRef cur_frame, uint32 offset);
@@ -652,6 +661,15 @@ set_local_gc_ref(AOTCompFrame *frame, int n, LLVMValueRef value, uint8 ref_type)
 #define F64_CONST(v) LLVMConstReal(F64_TYPE, v)
 #define I8_CONST(v) LLVMConstInt(INT8_TYPE, v, true)
 
+#define INT_CONST(variable, value, type, is_signed)        \
+    do { /* on error: enclosing function returns false */  \
+        variable = LLVMConstInt(type, value, is_signed);   \
+        if (!variable) {                                   \
+            aot_set_last_error("llvm build const failed"); \
+            return false;                                  \
+        }                                                  \
+    } while (0)
+
 #define LLVM_CONST(name) (comp_ctx->llvm_consts.name)
 #define I1_ZERO LLVM_CONST(i1_zero)
 #define I1_ONE LLVM_CONST(i1_one)
diff --git a/core/iwasm/compilation/aot_emit_aot_file.c b/core/iwasm/compilation/aot_emit_aot_file.c
index e05f83b09..20f29057c 100644
--- a/core/iwasm/compilation/aot_emit_aot_file.c
+++ b/core/iwasm/compilation/aot_emit_aot_file.c
@@ -4433,6 +4433,12 @@ aot_obj_data_create(AOTCompContext *comp_ctx)
     if (comp_ctx->enable_gc) {
         obj_data->target_info.feature_flags |= WASM_FEATURE_GARBAGE_COLLECTION;
     }
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_TINY) {
+        obj_data->target_info.feature_flags |= WASM_FEATURE_TINY_STACK_FRAME;
+    }
+    if (comp_ctx->call_stack_features.frame_per_function) {
+        obj_data->target_info.feature_flags |= WASM_FEATURE_FRAME_PER_FUNCTION;
+    }
 
     bh_print_time("Begin to resolve object file info");
 
diff --git a/core/iwasm/compilation/aot_emit_control.c b/core/iwasm/compilation/aot_emit_control.c
index 7d73d8d90..945f63952 100644
--- a/core/iwasm/compilation/aot_emit_control.c
+++ b/core/iwasm/compilation/aot_emit_control.c
@@ -6,6 +6,7 @@
 #include "aot_emit_control.h"
 #include "aot_compiler.h"
 #include "aot_emit_exception.h"
+#include "aot_stack_frame_comp.h"
 #if WASM_ENABLE_GC != 0
 #include "aot_emit_gc.h"
 #endif
@@ -38,13 +39,24 @@ format_block_name(char *name, uint32 name_size, uint32 block_index,
         snprintf(name, name_size, "%s", "func_end");
 }
 
-#define CREATE_BLOCK(new_llvm_block, name)                      \
-    do {                                                        \
-        if (!(new_llvm_block = LLVMAppendBasicBlockInContext(   \
-                  comp_ctx->context, func_ctx->func, name))) {  \
-            aot_set_last_error("add LLVM basic block failed."); \
-            goto fail;                                          \
-        }                                                       \
+#define CREATE_BLOCK(new_llvm_block, name)                                   \
+    do {                                                                     \
+        if (!(new_llvm_block = LLVMAppendBasicBlockInContext(                \
+                  comp_ctx->context, func_ctx->func, name))) {               \
+            aot_set_last_error("add LLVM basic block failed.");              \
+            goto fail;                                                       \
+        }                                                                    \
+        if (!strcmp(name, "func_end") && comp_ctx->aux_stack_frame_type      \
+            && comp_ctx->call_stack_features.frame_per_function) {           \
+            LLVMBasicBlockRef cur_block =                                    \
+                LLVMGetInsertBlock(comp_ctx->builder);                       \
+            SET_BUILDER_POS(new_llvm_block);                                 \
+            if (!aot_free_frame_per_function_frame_for_aot_func(comp_ctx,    \
+                                                                func_ctx)) { \
+                goto fail;                                                   \
+            }                                                                \
+            SET_BUILDER_POS(cur_block);                                      \
+        }                                                                    \
     } while (0)
 
 #define CURR_BLOCK() LLVMGetInsertBlock(comp_ctx->builder)
@@ -93,6 +105,11 @@ format_block_name(char *name, uint32 name_size, uint32 block_index,
                 goto fail;                                                  \
             }                                                               \
             SET_BUILDER_POS(block->llvm_end_block);                         \
+            LLVMValueRef first_instr =                                      \
+                get_first_non_phi(block->llvm_end_block);                   \
+            if (first_instr) {                                              \
+                LLVMPositionBuilderBefore(comp_ctx->builder, first_instr);  \
+            }                                                               \
             for (_i = 0; _i < block->result_count; _i++) {                  \
                 if (!(block->result_phis[_i] = LLVMBuildPhi(                \
                           comp_ctx->builder,                                \
@@ -158,6 +175,18 @@ get_target_block(AOTFuncContext *func_ctx, uint32 br_depth)
     return block;
 }
 
+/* Return the first instruction of `block` that is not a PHI node, or NULL
+ * when the block is empty or holds nothing but PHI nodes. */
+LLVMValueRef
+get_first_non_phi(LLVMBasicBlockRef block)
+{
+    LLVMValueRef cur;
+    for (cur = LLVMGetFirstInstruction(block); cur && LLVMIsAPHINode(cur);
+         cur = LLVMGetNextInstruction(cur)) {
+    }
+    return cur;
+}
+
 static void
 clear_frame_locals(AOTCompFrame *aot_frame)
 {
@@ -1361,6 +1390,13 @@ aot_compile_op_return(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         (*p_frame_ip - 1) - comp_ctx->comp_data->wasm_module->buf_code);
 #endif
 
+    if (comp_ctx->aux_stack_frame_type
+        && comp_ctx->call_stack_features.frame_per_function
+        && !aot_free_frame_per_function_frame_for_aot_func(comp_ctx,
+                                                           func_ctx)) {
+        return false;
+    }
+
     if (block_func->result_count) {
         /* Store extra result values to function parameters */
         for (i = 0; i < block_func->result_count - 1; i++) {
diff --git a/core/iwasm/compilation/aot_emit_exception.c b/core/iwasm/compilation/aot_emit_exception.c
index d3dcf719d..1527e83e5 100644
--- a/core/iwasm/compilation/aot_emit_exception.c
+++ b/core/iwasm/compilation/aot_emit_exception.c
@@ -4,49 +4,10 @@
  */
 
 #include "aot_emit_exception.h"
+#include "aot_compiler.h"
 #include "../interpreter/wasm_runtime.h"
 #include "../aot/aot_runtime.h"
 
-static bool
-commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-          LLVMValueRef exce_ip, bool is_64bit)
-{
-    LLVMValueRef cur_frame = func_ctx->cur_frame;
-    LLVMValueRef value_offset, value_addr, value_ptr;
-    uint32 offset_ip;
-
-    if (!comp_ctx->is_jit_mode)
-        offset_ip = comp_ctx->pointer_size * 4;
-    else
-        offset_ip = offsetof(WASMInterpFrame, ip);
-
-    if (!(value_offset = I32_CONST(offset_ip))) {
-        aot_set_last_error("llvm build const failed");
-        return false;
-    }
-
-    if (!(value_addr =
-              LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE, cur_frame,
-                                    &value_offset, 1, "ip_addr"))) {
-        aot_set_last_error("llvm build in bounds gep failed");
-        return false;
-    }
-
-    if (!(value_ptr = LLVMBuildBitCast(
-              comp_ctx->builder, value_addr,
-              is_64bit ? INT64_PTR_TYPE : INT32_PTR_TYPE, "ip_ptr"))) {
-        aot_set_last_error("llvm build bit cast failed");
-        return false;
-    }
-
-    if (!LLVMBuildStore(comp_ctx->builder, exce_ip, value_ptr)) {
-        aot_set_last_error("llvm build store failed");
-        return false;
-    }
-
-    return true;
-}
-
 bool
 aot_emit_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                    int32 exception_id, bool is_cond_br, LLVMValueRef cond_br_if,
@@ -80,7 +41,7 @@ aot_emit_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             return false;
         }
 
-        if (comp_ctx->aot_frame) {
+        if (comp_ctx->aot_frame && comp_ctx->call_stack_features.trap_ip) {
             /* Create exception ip phi */
             if (!(func_ctx->exception_ip_phi = LLVMBuildPhi(
                       comp_ctx->builder, is_64bit ? I64_TYPE : I32_TYPE,
@@ -90,8 +51,8 @@ aot_emit_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             }
 
             /* Commit ip to current frame */
-            if (!commit_ip(comp_ctx, func_ctx, func_ctx->exception_ip_phi,
-                           is_64bit)) {
+            if (!aot_gen_commit_ip(comp_ctx, func_ctx,
+                                   func_ctx->exception_ip_phi, is_64bit)) {
                 return false;
             }
         }
@@ -173,7 +134,7 @@ aot_emit_exception(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* Add phi incoming value to got_exception block */
     LLVMAddIncoming(func_ctx->exception_id_phi, &exce_id, &block_curr, 1);
 
-    if (comp_ctx->aot_frame) {
+    if (comp_ctx->aot_frame && comp_ctx->call_stack_features.trap_ip) {
         const uint8 *ip = comp_ctx->aot_frame->frame_ip;
         LLVMValueRef exce_ip = NULL;
 
diff --git a/core/iwasm/compilation/aot_emit_function.c b/core/iwasm/compilation/aot_emit_function.c
index 8f6e3e456..fbef02e20 100644
--- a/core/iwasm/compilation/aot_emit_function.c
+++ b/core/iwasm/compilation/aot_emit_function.c
@@ -7,6 +7,7 @@
 #include "aot_emit_exception.h"
 #include "aot_emit_control.h"
 #include "aot_emit_table.h"
+#include "aot_stack_frame_comp.h"
 #include "../aot/aot_runtime.h"
 #if WASM_ENABLE_GC != 0
 #include "aot_emit_gc.h"
@@ -682,24 +683,29 @@ alloc_frame_for_aot_func(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     new_frame = wasm_stack_top;
 
-    if (!(check_wasm_stack_succ = LLVMAppendBasicBlockInContext(
-              comp_ctx->context, func_ctx->func, "check_wasm_stack_succ"))) {
-        aot_set_last_error("llvm add basic block failed.");
-        return false;
-    }
+    if (comp_ctx->call_stack_features.bounds_checks) {
+        if (!(check_wasm_stack_succ = LLVMAppendBasicBlockInContext(
+                  comp_ctx->context, func_ctx->func,
+                  "check_wasm_stack_succ"))) {
+            aot_set_last_error("llvm add basic block failed.");
+            return false;
+        }
 
-    LLVMMoveBasicBlockAfter(check_wasm_stack_succ,
-                            LLVMGetInsertBlock(comp_ctx->builder));
+        LLVMMoveBasicBlockAfter(check_wasm_stack_succ,
+                                LLVMGetInsertBlock(comp_ctx->builder));
 
-    if (!(cmp = LLVMBuildICmp(comp_ctx->builder, LLVMIntUGT, wasm_stack_top_max,
-                              wasm_stack_top_bound, "cmp"))) {
-        aot_set_last_error("llvm build icmp failed");
-        return false;
-    }
+        if (!(cmp = LLVMBuildICmp(comp_ctx->builder, LLVMIntUGT,
+                                  wasm_stack_top_max, wasm_stack_top_bound,
+                                  "cmp"))) {
+            aot_set_last_error("llvm build icmp failed");
+            return false;
+        }
 
-    if (!(aot_emit_exception(comp_ctx, func_ctx, EXCE_OPERAND_STACK_OVERFLOW,
-                             true, cmp, check_wasm_stack_succ))) {
-        return false;
+        if (!(aot_emit_exception(comp_ctx, func_ctx,
+                                 EXCE_OPERAND_STACK_OVERFLOW, true, cmp,
+                                 check_wasm_stack_succ))) {
+            return false;
+        }
     }
 
 #if WASM_ENABLE_GC != 0
@@ -1285,6 +1291,10 @@ commit_params_to_frame_of_import_func(AOTCompContext *comp_ctx,
 {
     uint32 i, n;
 
+    if (!comp_ctx->call_stack_features.values) {
+        return true;
+    }
+
     for (i = 0, n = 0; i < func_type->param_count; i++, n++) {
         switch (func_type->types[i]) {
             case VALUE_TYPE_I32:
@@ -1394,6 +1404,7 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef *param_values = NULL, value_ret = NULL, func;
     LLVMValueRef import_func_idx, res;
     LLVMValueRef ext_ret, ext_ret_ptr, ext_ret_idx;
+    LLVMValueRef func_idx_ref;
     int32 i, j = 0, param_count, result_count, ext_ret_count;
     uint64 total_size;
     uint8 wasm_ret_type;
@@ -1438,12 +1449,28 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             return false;
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        if (!alloc_frame_for_aot_func(comp_ctx, func_ctx, func_idx))
-            return false;
-#endif
+    if (comp_ctx->aux_stack_frame_type) {
+        if (func_idx < import_func_count
+            && comp_ctx->call_stack_features.frame_per_function) {
+            INT_CONST(func_idx_ref, func_idx, I32_TYPE, true);
+            if (!aot_alloc_frame_per_function_frame_for_aot_func(
+                    comp_ctx, func_ctx, func_idx_ref)) {
+                return false;
+            }
+        }
+        else if (!comp_ctx->call_stack_features.frame_per_function) {
+            if (comp_ctx->aux_stack_frame_type
+                != AOT_STACK_FRAME_TYPE_STANDARD) {
+                aot_set_last_error("unsupported mode");
+                return false;
+            }
+            if (!alloc_frame_for_aot_func(comp_ctx, func_ctx, func_idx)) {
+                return false;
+            }
+        }
     }
+#endif
 
     /* Get param cell number */
     param_cell_num = func_type->param_cell_num;
@@ -1513,7 +1540,7 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     }
 
     if (func_idx < import_func_count) {
-        if (comp_ctx->enable_aux_stack_frame
+        if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
             && !commit_params_to_frame_of_import_func(
                 comp_ctx, func_ctx, func_type, param_values + 1)) {
             goto fail;
@@ -1804,12 +1831,26 @@ aot_compile_op_call(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         }
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
-        if (!free_frame_for_aot_func(comp_ctx, func_ctx))
-            goto fail;
-#endif
+    if (comp_ctx->aux_stack_frame_type) {
+        if (func_idx < import_func_count
+            && comp_ctx->call_stack_features.frame_per_function) {
+            if (!aot_free_frame_per_function_frame_for_aot_func(comp_ctx,
+                                                                func_ctx))
+                goto fail;
+        }
+        else if (!comp_ctx->call_stack_features.frame_per_function) {
+            /* Only standard frames are supported here; fail as alloc does */
+            if (comp_ctx->aux_stack_frame_type
+                != AOT_STACK_FRAME_TYPE_STANDARD) {
+                aot_set_last_error("unsupported mode");
+                goto fail;
+            }
+            if (!free_frame_for_aot_func(comp_ctx, func_ctx))
+                goto fail;
+        }
     }
+#endif
 
     /* Insert suspend check point */
     if (comp_ctx->enable_thread_mgr) {
@@ -2430,7 +2471,8 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         goto fail;
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         /*  TODO: use current frame instead of allocating new frame
                   for WASM_OP_RETURN_CALL_INDIRECT */
@@ -2499,7 +2541,13 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* Translate call import block */
     LLVMPositionBuilderAtEnd(comp_ctx->builder, block_call_import);
 
-    if (comp_ctx->enable_aux_stack_frame
+    if (comp_ctx->aot_frame && comp_ctx->call_stack_features.frame_per_function
+        && !aot_alloc_frame_per_function_frame_for_aot_func(comp_ctx, func_ctx,
+                                                            func_idx)) {
+        goto fail;
+    }
+
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
         && !commit_params_to_frame_of_import_func(comp_ctx, func_ctx, func_type,
                                                   param_values + 1)) {
         goto fail;
@@ -2536,6 +2584,12 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         && !check_call_return(comp_ctx, func_ctx, res))
         goto fail;
 
+    if (comp_ctx->aot_frame && comp_ctx->call_stack_features.frame_per_function
+        && !aot_free_frame_per_function_frame_for_aot_func(comp_ctx,
+                                                           func_ctx)) {
+        goto fail;
+    }
+
     block_curr = LLVMGetInsertBlock(comp_ctx->builder);
     for (i = 0; i < func_result_count; i++) {
         LLVMAddIncoming(result_phis[i], &value_rets[i], &block_curr, 1);
@@ -2620,7 +2674,8 @@ aot_compile_op_call_indirect(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         PUSH(result_phis[i], func_type->types[func_param_count + i]);
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         if (!free_frame_for_aot_func(comp_ctx, func_ctx))
             goto fail;
@@ -2927,7 +2982,8 @@ aot_compile_op_call_ref(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         goto fail;
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         /*  TODO: use current frame instead of allocating new frame
                   for WASM_OP_RETURN_CALL_REF */
@@ -2996,7 +3052,7 @@ aot_compile_op_call_ref(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     /* Translate call import block */
     LLVMPositionBuilderAtEnd(comp_ctx->builder, block_call_import);
 
-    if (comp_ctx->enable_aux_stack_frame
+    if (comp_ctx->aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
         && !commit_params_to_frame_of_import_func(comp_ctx, func_ctx, func_type,
                                                   param_values + 1)) {
         goto fail;
@@ -3124,7 +3180,8 @@ aot_compile_op_call_ref(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         PUSH(result_phis[i], func_type->types[func_param_count + i]);
     }
 
-    if (comp_ctx->enable_aux_stack_frame) {
+    if (comp_ctx->aux_stack_frame_type
+        && !comp_ctx->call_stack_features.frame_per_function) {
 #if WASM_ENABLE_AOT_STACK_FRAME != 0
         if (!free_frame_for_aot_func(comp_ctx, func_ctx))
             goto fail;
diff --git a/core/iwasm/compilation/aot_llvm.c b/core/iwasm/compilation/aot_llvm.c
index d738cfc0e..820a55e96 100644
--- a/core/iwasm/compilation/aot_llvm.c
+++ b/core/iwasm/compilation/aot_llvm.c
@@ -1771,7 +1771,7 @@ aot_create_func_context(const AOTCompData *comp_data, AOTCompContext *comp_ctx,
         goto fail;
     }
 
-    if (comp_ctx->enable_aux_stack_frame
+    if (comp_ctx->aux_stack_frame_type
         && !create_aux_stack_frame(comp_ctx, func_ctx)) {
         goto fail;
     }
@@ -2577,8 +2577,8 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
     if (option->enable_ref_types)
         comp_ctx->enable_ref_types = true;
 
-    if (option->enable_aux_stack_frame)
-        comp_ctx->enable_aux_stack_frame = true;
+    comp_ctx->aux_stack_frame_type = option->aux_stack_frame_type;
+    comp_ctx->call_stack_features = option->call_stack_features;
 
     if (option->enable_perf_profiling)
         comp_ctx->enable_perf_profiling = true;
@@ -2790,6 +2790,15 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
                 bh_assert(vendor_sys);
                 bh_memcpy_s(default_arch, sizeof(default_arch), default_triple,
                             (uint32)(vendor_sys - default_triple));
+                /**
+                 * On Mac M[1-9]+ LLVM will report arm64 as the
+                 * architecture, for the purposes of wamr this is the
+                 * same as aarch64v8 so we'll normalize it here.
+                 */
+                if (!strcmp(default_arch, "arm64")) {
+                    bh_strcpy_s(default_arch, sizeof(default_arch),
+                                "aarch64v8");
+                }
                 arch1 = default_arch;
 
                 LLVMDisposeMessage(default_triple);
@@ -2960,12 +2969,12 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
                                     sizeof(comp_ctx->target_arch));
 
         if (option->bounds_checks == 1 || option->bounds_checks == 0) {
-            /* Set by user */
+            /* Set by the user */
             comp_ctx->enable_bound_check =
                 (option->bounds_checks == 1) ? true : false;
         }
         else {
-            /* Unset by user, use default value */
+            /* Unset by the user, use the default value */
             if (strstr(comp_ctx->target_arch, "64")
                 && !option->is_sgx_platform) {
                 comp_ctx->enable_bound_check = false;
@@ -2975,17 +2984,17 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option)
             }
         }
 
-        if (comp_ctx->enable_bound_check) {
-            /* Always enable stack boundary check if `bounds-checks`
-               is enabled */
-            comp_ctx->enable_stack_bound_check = true;
-        }
-        else {
-            /* When `bounds-checks` is disabled, we set stack boundary
-               check status according to the input option */
+        if (option->stack_bounds_checks == 1
+            || option->stack_bounds_checks == 0) {
+            /* Set by the user */
             comp_ctx->enable_stack_bound_check =
                 (option->stack_bounds_checks == 1) ? true : false;
         }
+        else {
+            /* Unset by the user, use the default value, it will be the same
+             * value as the bound check */
+            comp_ctx->enable_stack_bound_check = comp_ctx->enable_bound_check;
+        }
 
         if ((comp_ctx->enable_stack_bound_check
              || comp_ctx->enable_stack_estimation)
diff --git a/core/iwasm/compilation/aot_llvm.h b/core/iwasm/compilation/aot_llvm.h
index 270e5ae45..43212e502 100644
--- a/core/iwasm/compilation/aot_llvm.h
+++ b/core/iwasm/compilation/aot_llvm.h
@@ -410,7 +410,10 @@ typedef struct AOTCompContext {
     bool enable_aux_stack_check;
 
     /* Generate auxiliary stack frame */
-    bool enable_aux_stack_frame;
+    AOTStackFrameType aux_stack_frame_type;
+
+    /* Auxiliary call stack features */
+    AOTCallStackFeatures call_stack_features;
 
     /* Function performance profiling */
     bool enable_perf_profiling;
diff --git a/core/iwasm/compilation/aot_stack_frame.h b/core/iwasm/compilation/aot_stack_frame.h
new file mode 100644
index 000000000..6155ee6e9
--- /dev/null
+++ b/core/iwasm/compilation/aot_stack_frame.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2024 Amazon Inc.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _AOT_STACK_FRAME_H_
+#define _AOT_STACK_FRAME_H_
+
+#include "platform_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    /* The non-imported function index of current function */
+    uint32 func_index;
+
+    /* Instruction pointer: offset to the bytecode array */
+    uint32 ip_offset;
+} AOTTinyFrame;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/core/iwasm/compilation/aot_stack_frame_comp.c b/core/iwasm/compilation/aot_stack_frame_comp.c
new file mode 100644
index 000000000..342dfe806
--- /dev/null
+++ b/core/iwasm/compilation/aot_stack_frame_comp.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2024 Amazon Inc.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+#include "aot_stack_frame_comp.h"
+#include "aot_emit_exception.h"
+/* IR helpers: on failure, set last error and make the caller return false */
+#define ADD_IN_BOUNDS_GEP(variable, type, pointer, indices, num_indices)     \
+    do {                                                                     \
+        if (!(variable =                                                     \
+                  LLVMBuildInBoundsGEP2(comp_ctx->builder, type, pointer,    \
+                                        indices, num_indices, #variable))) { \
+            aot_set_last_error("llvm build in bounds gep failed");           \
+            return false;                                                    \
+        }                                                                    \
+    } while (0)
+/* Emit a store of `value` to `pointer` */
+#define ADD_STORE(value, pointer)                                 \
+    do {                                                          \
+        if (!LLVMBuildStore(comp_ctx->builder, value, pointer)) { \
+            aot_set_last_error("llvm build store failed");        \
+            return false;                                         \
+        }                                                         \
+    } while (0)
+/* Emit a load of `type` from `pointer`; the IR value is named after `value` */
+#define ADD_LOAD(value, type, pointer)                                         \
+    do {                                                                       \
+        if (!(value =                                                          \
+                  LLVMBuildLoad2(comp_ctx->builder, type, pointer, #value))) { \
+            aot_set_last_error("llvm build load failed");                      \
+            return false;                                                      \
+        }                                                                      \
+    } while (0)
+
+static bool
+aot_alloc_tiny_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                  AOTFuncContext *func_ctx,
+                                  LLVMValueRef func_index)
+{
+    LLVMValueRef wasm_stack_top_ptr = func_ctx->wasm_stack_top_ptr,
+                 wasm_stack_top_bound = func_ctx->wasm_stack_top_bound,
+                 wasm_stack_top, cmp;
+    LLVMBasicBlockRef check_wasm_stack_succ;
+    LLVMValueRef offset;
+    /* Push an AOTTinyFrame: check bound, store func_index, advance the top */
+    ADD_LOAD(wasm_stack_top, INT8_PTR_TYPE, wasm_stack_top_ptr);
+    /* When enabled, raise a stack overflow if the top has reached the bound */
+    if (comp_ctx->call_stack_features.bounds_checks) {
+        if (!(check_wasm_stack_succ = LLVMAppendBasicBlockInContext(
+                  comp_ctx->context, func_ctx->func,
+                  "check_wasm_stack_succ"))) {
+            aot_set_last_error("llvm add basic block failed.");
+            return false;
+        }
+
+        LLVMMoveBasicBlockAfter(check_wasm_stack_succ,
+                                LLVMGetInsertBlock(comp_ctx->builder));
+
+        if (!(cmp = LLVMBuildICmp(comp_ctx->builder, LLVMIntUGE, wasm_stack_top,
+                                  wasm_stack_top_bound, "cmp"))) {
+            aot_set_last_error("llvm build icmp failed");
+            return false;
+        }
+
+        if (!(aot_emit_exception(comp_ctx, func_ctx,
+                                 EXCE_OPERAND_STACK_OVERFLOW, true, cmp,
+                                 check_wasm_stack_succ))) {
+            return false;
+        }
+    }
+
+    /* Save the func_idx on the top of the stack */
+    ADD_STORE(func_index, wasm_stack_top);
+
+    /* increment the stack pointer */
+    INT_CONST(offset, sizeof(AOTTinyFrame), I32_TYPE, true);
+    ADD_IN_BOUNDS_GEP(wasm_stack_top, INT8_TYPE, wasm_stack_top, &offset, 1);
+    ADD_STORE(wasm_stack_top, wasm_stack_top_ptr);
+
+    return true;
+}
+
+/* Pop the newest AOTTinyFrame: move the aux stack top back by its size. */
+static bool
+aot_free_tiny_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                 AOTFuncContext *func_ctx)
+{
+    LLVMValueRef wasm_stack_top_ptr = func_ctx->wasm_stack_top_ptr,
+                 wasm_stack_top, offset;
+
+    ADD_LOAD(wasm_stack_top, INT8_PTR_TYPE, wasm_stack_top_ptr);
+    /* Negate as signed 64-bit: `-sizeof()` is an unsigned size_t and would
+     * be zero-extended to a huge offset where size_t is 32 bits wide */
+    INT_CONST(offset, -(int64)sizeof(AOTTinyFrame),
+              comp_ctx->pointer_size == 8 ? I64_TYPE : I32_TYPE, true);
+    ADD_IN_BOUNDS_GEP(wasm_stack_top, INT8_TYPE, wasm_stack_top, &offset, 1);
+    ADD_STORE(wasm_stack_top, wasm_stack_top_ptr);
+    return true;
+}
+
+bool
+aot_tiny_frame_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                             LLVMValueRef ip_value)
+{
+    LLVMValueRef wasm_stack_top_ptr = func_ctx->wasm_stack_top_ptr,
+                 wasm_stack_top;
+    LLVMValueRef offset, ip_addr;
+
+    bh_assert(ip_value);
+    /* Store ip_value into the ip_offset field of the newest tiny frame */
+    ADD_LOAD(wasm_stack_top, INT8_PTR_TYPE, wasm_stack_top_ptr);
+    /* ip_offset is the last uint32 of AOTTinyFrame, i.e. 4 bytes below top */
+    INT_CONST(offset, -4, comp_ctx->pointer_size == 8 ? I64_TYPE : I32_TYPE,
+              true);
+    ADD_IN_BOUNDS_GEP(ip_addr, INT8_TYPE, wasm_stack_top, &offset, 1);
+
+    ADD_STORE(ip_value, ip_addr);
+
+    return true;
+}
+
+/* Emit frame allocation for frame-per-function mode; only the tiny frame
+ * type is implemented, any other type sets an error and fails. */
+bool
+aot_alloc_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                                AOTFuncContext *func_ctx,
+                                                LLVMValueRef func_index)
+{
+    if (comp_ctx->aux_stack_frame_type != AOT_STACK_FRAME_TYPE_TINY) {
+        aot_set_last_error("unsupported mode");
+        return false;
+    }
+    return aot_alloc_tiny_frame_for_aot_func(comp_ctx, func_ctx,
+                                             func_index);
+}
+
+/* Emit frame release for frame-per-function mode (tiny frames only). */
+bool
+aot_free_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                               AOTFuncContext *func_ctx)
+{
+    if (comp_ctx->aux_stack_frame_type != AOT_STACK_FRAME_TYPE_TINY) {
+        aot_set_last_error("unsupported mode");
+        return false;
+    }
+
+    return aot_free_tiny_frame_for_aot_func(comp_ctx, func_ctx);
+}
diff --git a/core/iwasm/compilation/aot_stack_frame_comp.h b/core/iwasm/compilation/aot_stack_frame_comp.h
new file mode 100644
index 000000000..7980b8c08
--- /dev/null
+++ b/core/iwasm/compilation/aot_stack_frame_comp.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2024 Amazon Inc.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _AOT_STACK_FRAME_COMP_H_
+#define _AOT_STACK_FRAME_COMP_H_
+
+#include "aot_stack_frame.h"
+#include "aot_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool
+aot_alloc_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                                AOTFuncContext *func_ctx,
+                                                LLVMValueRef func_index);
+
+bool
+aot_free_frame_per_function_frame_for_aot_func(AOTCompContext *comp_ctx,
+                                               AOTFuncContext *func_ctx);
+
+bool
+aot_tiny_frame_gen_commit_ip(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
+                             LLVMValueRef ip_value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/core/iwasm/include/aot_comp_option.h b/core/iwasm/include/aot_comp_option.h
index 617b68f97..67ec81cd3 100644
--- a/core/iwasm/include/aot_comp_option.h
+++ b/core/iwasm/include/aot_comp_option.h
@@ -6,6 +6,39 @@
 #ifndef __AOT_COMP_OPTION_H__
 #define __AOT_COMP_OPTION_H__
 
+typedef struct {
+    /* Enables or disables bounds checks when a stack frame is allocated.
+     * When enabled, the AOT compiler generates code that compares the stack
+     * top against the stack bound and traps when the bound is reached. */
+    bool bounds_checks;
+
+    /* Enables or disables instruction pointer (IP) tracking. */
+    bool ip;
+
+    /* Enables or disables tracking the instruction pointer of a trap. Only
+     * takes effect when `ip` is enabled. */
+    bool trap_ip;
+
+    /* Enables or disables storing parameters, locals and stack operands. */
+    bool values;
+
+    /* If enabled, a stack frame is generated at the beginning of each
+     * function (frame-per-function mode). Otherwise, a stack frame is
+     * generated before each call of a function (frame-per-call mode). */
+    bool frame_per_function;
+} AOTCallStackFeatures;
+
+void
+aot_call_stack_features_init_default(AOTCallStackFeatures *features);
+
+typedef enum {
+    AOT_STACK_FRAME_OFF = 0,
+    /* Use a small stack frame data structure (AOTTinyFrame) */
+    AOT_STACK_FRAME_TYPE_TINY,
+    /* Use a regular stack frame data structure (AOTFrame) */
+    AOT_STACK_FRAME_TYPE_STANDARD,
+} AOTStackFrameType;
+
 typedef struct AOTCompOption {
     bool is_jit_mode;
     bool is_indirect_mode;
@@ -21,7 +54,8 @@ typedef struct AOTCompOption {
     bool enable_ref_types;
     bool enable_gc;
     bool enable_aux_stack_check;
-    bool enable_aux_stack_frame;
+    AOTStackFrameType aux_stack_frame_type;
+    AOTCallStackFeatures call_stack_features;
     bool enable_perf_profiling;
     bool enable_memory_profiling;
     bool disable_llvm_intrinsics;
diff --git a/core/iwasm/interpreter/wasm_interp_classic.c b/core/iwasm/interpreter/wasm_interp_classic.c
index 4a8ba4e2c..67f8c2d45 100644
--- a/core/iwasm/interpreter/wasm_interp_classic.c
+++ b/core/iwasm/interpreter/wasm_interp_classic.c
@@ -5739,6 +5739,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
                         /* use memmove when memory64 is enabled since len
                            may be larger than UINT32_MAX */
                         memmove(mdst, msrc, len);
+                        (void)dlen;
 #endif
                         break;
                     }
diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c
index 51963759f..1d7ca8f90 100644
--- a/core/iwasm/interpreter/wasm_interp_fast.c
+++ b/core/iwasm/interpreter/wasm_interp_fast.c
@@ -6030,7 +6030,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module,
 
 #if WASM_ENABLE_LABELS_AS_VALUES != 0
 void **
-wasm_interp_get_handle_table()
+wasm_interp_get_handle_table(void)
 {
     WASMModuleInstance module;
     memset(&module, 0, sizeof(WASMModuleInstance));
diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c
index 13947ac82..3a21b1fc6 100644
--- a/core/iwasm/interpreter/wasm_loader.c
+++ b/core/iwasm/interpreter/wasm_loader.c
@@ -2474,7 +2474,8 @@ wasm_loader_resolve_tag(const char *module_name, const char *tag_name,
     }
 
     /* check function type */
-    if (!wasm_type_equal(expected_tag_type, tag->tag_type)) {
+    if (!wasm_type_equal(expected_tag_type, tag->tag_type, module->types,
+                         module->type_count)) {
         LOG_DEBUG("%s.%s failed the type check", module_name, tag_name);
         set_error_buf(error_buf, error_buf_size, "incompatible import type");
         return NULL;
@@ -5406,7 +5407,8 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     option.enable_aux_stack_check = true;
 #if WASM_ENABLE_PERF_PROFILING != 0 || WASM_ENABLE_DUMP_CALL_STACK != 0 \
     || WASM_ENABLE_AOT_STACK_FRAME != 0
-    option.enable_aux_stack_frame = true;
+    option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
+    aot_call_stack_features_init_default(&option.call_stack_features);
 #endif
 #if WASM_ENABLE_PERF_PROFILING != 0
     option.enable_perf_profiling = true;
@@ -5750,7 +5752,7 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func,
 
 #if WASM_ENABLE_FAST_INTERP != 0 && WASM_ENABLE_LABELS_AS_VALUES != 0
 void **
-wasm_interp_get_handle_table();
+wasm_interp_get_handle_table(void);
 
 static void **handle_table;
 #endif
diff --git a/core/iwasm/interpreter/wasm_mini_loader.c b/core/iwasm/interpreter/wasm_mini_loader.c
index 8826f98db..968eaf009 100644
--- a/core/iwasm/interpreter/wasm_mini_loader.c
+++ b/core/iwasm/interpreter/wasm_mini_loader.c
@@ -2148,7 +2148,8 @@ init_llvm_jit_functions_stage1(WASMModule *module, char *error_buf,
     option.enable_aux_stack_check = true;
 #if WASM_ENABLE_PERF_PROFILING != 0 || WASM_ENABLE_DUMP_CALL_STACK != 0 \
     || WASM_ENABLE_AOT_STACK_FRAME != 0
-    option.enable_aux_stack_frame = true;
+    option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
+    aot_call_stack_features_init_default(&option.call_stack_features);
 #endif
 #if WASM_ENABLE_PERF_PROFILING != 0
     option.enable_perf_profiling = true;
@@ -2531,7 +2532,7 @@ wasm_loader_prepare_bytecode(WASMModule *module, WASMFunction *func,
 
 #if WASM_ENABLE_FAST_INTERP != 0 && WASM_ENABLE_LABELS_AS_VALUES != 0
 void **
-wasm_interp_get_handle_table();
+wasm_interp_get_handle_table(void);
 
 static void **handle_table;
 #endif
diff --git a/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c b/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
index aeaafced7..c9512fb43 100644
--- a/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
+++ b/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
@@ -29,7 +29,7 @@ typedef struct {
 } ThreadStartArg;
 
 static int32
-allocate_thread_id()
+allocate_thread_id(void)
 {
     os_mutex_lock(&thread_id_lock);
     int32 id = tid_allocator_get_tid(&tid_allocator);
diff --git a/core/iwasm/libraries/thread-mgr/thread_manager.h b/core/iwasm/libraries/thread-mgr/thread_manager.h
index f5ca1eaed..7ad6c772a 100644
--- a/core/iwasm/libraries/thread-mgr/thread_manager.h
+++ b/core/iwasm/libraries/thread-mgr/thread_manager.h
@@ -64,10 +64,10 @@ void
 wasm_cluster_set_max_thread_num(uint32 num);
 
 bool
-thread_manager_init();
+thread_manager_init(void);
 
 void
-thread_manager_destroy();
+thread_manager_destroy(void);
 
 /* Create cluster */
 WASMCluster *
@@ -109,7 +109,7 @@ bool
 wasm_cluster_register_destroy_callback(void (*callback)(WASMCluster *));
 
 void
-wasm_cluster_cancel_all_callbacks();
+wasm_cluster_cancel_all_callbacks(void);
 
 void
 wasm_cluster_suspend_all(WASMCluster *cluster);
@@ -190,7 +190,7 @@ struct WASMCurrentEnvStatus {
 };
 
 WASMCurrentEnvStatus *
-wasm_cluster_create_exenv_status();
+wasm_cluster_create_exenv_status(void);
 
 void
 wasm_cluster_destroy_exenv_status(WASMCurrentEnvStatus *status);
diff --git a/core/iwasm/libraries/wasi-nn/README.md b/core/iwasm/libraries/wasi-nn/README.md
index e0d3a25ce..5536f6d57 100644
--- a/core/iwasm/libraries/wasi-nn/README.md
+++ b/core/iwasm/libraries/wasi-nn/README.md
@@ -4,7 +4,7 @@
 
 ### Host
 
-Enable WASI-NN in the WAMR by spefiying it in the cmake building configuration as follows,
+Enable WASI-NN in the WAMR by specifying it in the cmake building configuration as follows,
 
 ```cmake
 set (WAMR_BUILD_WASI_NN  1)
@@ -17,14 +17,15 @@ $ cmake -DWAMR_BUILD_WASI_NN=1 <other options> ...
 ```
 
 > ![Caution]
-> If enable `WAMR_BUID_WASI_NN`, iwasm will link a shared WAMR library instead of a static one. Wasi-nn backends will be loaded dynamically at runtime. Users shall specify the path of the backend library and register it to the iwasm runtime with `--native-lib=<path of backend library>`. All shared libraries should be placed in the `LD_LIBRARY_PATH`.
+> Enabling `WAMR_BUILD_WASI_NN` causes iwasm to link against a shared WAMR library instead of a static one. The WASI-NN backends are then loaded dynamically at run time. You must ensure that the directories containing all shared libraries are listed in `LD_LIBRARY_PATH`.
 
 #### Compilation options
 
-- `WAMR_BUILD_WASI_NN`. enable wasi-nn support. can't work alone. need to identify a backend. Match legacy wasi-nn spec naming convention. use `wasi_nn` as import module names.
-- `WAMR_BUILD_WASI_EPHEMERAL_NN`. Match latest wasi-nn spec naming convention. use `wasi_ephemeral_nn` as import module names.
-- `WAMR_BUILD_WASI_NN_TFLITE`. identify the backend as TensorFlow Lite.
-- `WAMR_BUILD_WASI_NN_OPENVINO`. identify the backend as OpenVINO.
+- `WAMR_BUILD_WASI_NN`. This option enables support for WASI-NN. It cannot function independently and requires specifying a backend. It follows the original WASI-NN specification for naming conventions and uses `wasi_nn` as the import module name.
+- `WAMR_BUILD_WASI_EPHEMERAL_NN`. This option adheres to the most recent WASI-NN specification for naming conventions and uses `wasi_ephemeral_nn` as the import module name.
+- `WAMR_BUILD_WASI_NN_TFLITE`. This option designates TensorFlow Lite as the backend.
+- `WAMR_BUILD_WASI_NN_OPENVINO`. This option designates OpenVINO as the backend.
+- `WAMR_BUILD_WASI_NN_LLAMACPP`. This option designates Llama.cpp as the backend.
 
 ### Wasm
 
@@ -44,7 +45,7 @@ typedef enum { fp16 = 0, fp32, up8, ip32 } tensor_type;
 
 It is required to recompile the Wasm application if you want to switch between the two sets of functions.
 
-#### Openvino
+#### Openvino installation
 
 If you're planning to use OpenVINO backends, the first step is to install OpenVINO on your computer. To do this correctly, please follow the official installation guide which you can find at this link: https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-archive-linux.html.
 
@@ -162,17 +163,9 @@ Supported:
 
 ### Testing with WasmEdge-WASINN Examples
 
-To ensure everything is set up correctly, use the examples from [WasmEdge-WASINN-examples](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master). These examples help verify that WASI-NN support in WAMR is functioning as expected.
+To make sure everything is configured properly, refer to the examples provided at [WasmEdge-WASINN-examples](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master). These examples are useful for confirming that the WASI-NN support in WAMR is working correctly.
 
-> Note: The repository contains two types of examples. Some use the [standard wasi-nn](https://github.com/WebAssembly/wasi-nn), while others use [WasmEdge's version of wasi-nn](https://github.com/second-state/wasmedge-wasi-nn), which is enhanced to meet specific customer needs.
-
-The examples test the following machine learning backends:
-
-- OpenVINO
-- PyTorch
-- TensorFlow Lite
-
-Due to the different requirements of each backend, we'll use a Docker container for a hassle-free testing environment.
+Because each backend has its own set of requirements, we recommend using a Docker container to create a straightforward testing environment without complications.
 
 #### Prepare the execution environment
 
@@ -186,9 +179,20 @@ $ docker build -t wasi-nn-smoke:v1.0 -f ./core/iwasm/libraries/wasi-nn/test/Dock
 #### Execute
 
 ```bash
+$ pwd
+/workspaces/wasm-micro-runtime/
 $ docker run --rm wasi-nn-smoke:v1.0
 ```
 
-### Testing with bytecodealliance wasi-nn
+Note that the qwen example is used as the default for the Llama.cpp backend because it uses a small model and is easy to run.
+
+```bash
+- openvino_mobile_image. PASS
+- openvino_mobile_raw. PASS
+- openvino_road_segmentation_adas. PASS
+- wasmedge_ggml_qwen. PASS
+```
+
+### Testing with bytecodealliance WASI-NN
 
 For another example, check out [classification-example](https://github.com/bytecodealliance/wasi-nn/tree/main/rust/examples/classification-example), which focuses on OpenVINO. You can run it using the same Docker container mentioned above.
diff --git a/core/iwasm/libraries/wasi-nn/cmake/Findcjson.cmake b/core/iwasm/libraries/wasi-nn/cmake/Findcjson.cmake
new file mode 100644
index 000000000..1136f41ad
--- /dev/null
+++ b/core/iwasm/libraries/wasi-nn/cmake/Findcjson.cmake
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+include(FetchContent)
+
+set(CJSON_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/cjson")
+
+FetchContent_Declare(
+  cjson
+  GIT_REPOSITORY https://github.com/DaveGamble/cJSON.git
+  GIT_TAG        v1.7.18
+  SOURCE_DIR     ${CJSON_SOURCE_DIR}
+)
+
+set(ENABLE_CJSON_TEST OFF CACHE INTERNAL "Turn off tests")
+set(ENABLE_CJSON_UNINSTALL OFF CACHE INTERNAL "Turn off uninstall to avoid targets conflict")
+FetchContent_MakeAvailable(cjson)
diff --git a/core/iwasm/libraries/wasi-nn/cmake/Findllamacpp.cmake b/core/iwasm/libraries/wasi-nn/cmake/Findllamacpp.cmake
new file mode 100644
index 000000000..431e15db5
--- /dev/null
+++ b/core/iwasm/libraries/wasi-nn/cmake/Findllamacpp.cmake
@@ -0,0 +1,18 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+include(FetchContent)
+
+set(LLAMA_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/llama.cpp")
+
+FetchContent_Declare(
+  llamacpp
+  GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
+  GIT_TAG        b3573
+  SOURCE_DIR     ${LLAMA_SOURCE_DIR}
+)
+
+set(LLAMA_BUILD_TESTS OFF)
+set(LLAMA_BUILD_EXAMPLES OFF)
+set(LLAMA_BUILD_SERVER OFF)
+FetchContent_MakeAvailable(llamacpp)
diff --git a/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake b/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake
index 052dd9804..39480741d 100644
--- a/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake
+++ b/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake
@@ -1,47 +1,25 @@
 # Copyright (C) 2019 Intel Corporation. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-find_library(TENSORFLOW_LITE
-  NAMES tensorflow-lite
-  HINTS ${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite
-  NO_DEFAULT_PATHS
+include(FetchContent)
+
+set(TFLITE_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
+
+FetchContent_Declare(
+  tensorflow_lite
+  GIT_REPOSITORY https://github.com/tensorflow/tensorflow.git 
+  GIT_TAG        v2.12.0 
+  GIT_SHALLOW    ON
+  GIT_PROGRESS   ON
+  SOURCE_DIR     ${TFLITE_SOURCE_DIR}
+  SOURCE_SUBDIR  tensorflow/lite
 )
 
-if(NOT TENSORFLOW_LITE)
-  if(NOT EXISTS "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
-    execute_process(
-      COMMAND "${WAMR_ROOT_DIR}/core/deps/install_tensorflow.sh"
-      RESULT_VARIABLE TENSORFLOW_RESULT
-    )
-  else()
-    message("Tensorflow is already downloaded.")
-  endif()
-
-  set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
-
-  if(WAMR_BUILD_WASI_NN_ENABLE_GPU EQUAL 1)
-    # Tensorflow specific:
-    # * https://www.tensorflow.org/lite/guide/build_cmake#available_options_to_build_tensorflow_lite
-    set (TFLITE_ENABLE_GPU ON)
-  endif()
-
-  if (CMAKE_SIZEOF_VOID_P EQUAL 4)
-    set (TFLITE_ENABLE_XNNPACK OFF)
-  endif()
-
-  add_subdirectory(
-    "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite"
-    "${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite"
-    EXCLUDE_FROM_ALL
-  )
-else ()
-  message(STATUS "TensorFlow Lite library found: ${TENSORFLOW_LITE}")
-  set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
+if(WAMR_BUILD_WASI_NN_ENABLE_GPU EQUAL 1)
+  set(TFLITE_ENABLE_GPU ON)
+endif()
+if (CMAKE_SIZEOF_VOID_P EQUAL 4)
+  set(TFLITE_ENABLE_XNNPACK OFF)
 endif()
 
-set(TENSORFLOW_LITE_INCLUDE_DIR "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite")
-set(FLATBUFFER_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers/include")
-
-include_directories(${TENSORFLOW_SOURCE_DIR})
-include_directories(${FLATBUFFER_INCLUDE_DIR})
-link_directories(${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite)
+FetchContent_MakeAvailable(tensorflow_lite)
diff --git a/core/iwasm/libraries/wasi-nn/cmake/wasi_nn.cmake b/core/iwasm/libraries/wasi-nn/cmake/wasi_nn.cmake
index e2ad257e0..a903f0af1 100644
--- a/core/iwasm/libraries/wasi-nn/cmake/wasi_nn.cmake
+++ b/core/iwasm/libraries/wasi-nn/cmake/wasi_nn.cmake
@@ -3,27 +3,6 @@
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR})
 
-if(WAMR_BUILD_WASI_NN_TFLITE EQUAL 1)
-  # Find tensorflow-lite
-  find_package(tensorflow_lite REQUIRED)
-endif()
-
-if(WAMR_BUILD_WASI_NN_OPENVINO EQUAL 1)
-  if(NOT DEFINED ENV{OpenVINO_DIR})
-    message(FATAL_ERROR
-        "OpenVINO_DIR is not defined. "
-        "Please follow https://docs.openvino.ai/2024/get-started/install-openvino.html,"
-        "install openvino, and set environment variable OpenVINO_DIR."
-        "Like OpenVINO_DIR=/usr/lib/openvino-2023.2/ cmake ..."
-        "Or OpenVINO_DIR=/opt/intel/openvino/ cmake ..."
-    )
-  endif()
-
-  list(APPEND CMAKE_MODULE_PATH $ENV{OpenVINO_DIR})
-  # Find OpenVINO
-  find_package(OpenVINO REQUIRED COMPONENTS Runtime)
-endif()
-
 #
 # wasi-nn general
 set(WASI_NN_ROOT ${CMAKE_CURRENT_LIST_DIR}/..)
@@ -42,22 +21,46 @@ add_compile_definitions(
 #
 # - tflite
 if(WAMR_BUILD_WASI_NN_TFLITE EQUAL 1)
+  find_package(tensorflow_lite REQUIRED)
+
   add_library(
     wasi_nn_tflite
     SHARED
       ${WASI_NN_ROOT}/src/wasi_nn_tensorflowlite.cpp
   )
 
+  target_include_directories(
+    wasi_nn_tflite
+    PUBLIC
+      ${tensorflow_lite_SOURCE_DIR}
+  )
+
   target_link_libraries(
     wasi_nn_tflite
     PUBLIC
       libiwasm
       tensorflow-lite
   )
+
+  install(TARGETS wasi_nn_tflite DESTINATION lib)
 endif()
 
 # - openvino
 if(WAMR_BUILD_WASI_NN_OPENVINO EQUAL 1)
+  if(NOT DEFINED ENV{OpenVINO_DIR})
+    message(FATAL_ERROR
+        "OpenVINO_DIR is not defined. "
+        "Please follow https://docs.openvino.ai/2024/get-started/install-openvino.html,"
+        "install openvino, and set environment variable OpenVINO_DIR."
+        "Like OpenVINO_DIR=/usr/lib/openvino-2023.2/ cmake ..."
+        "Or OpenVINO_DIR=/opt/intel/openvino/ cmake ..."
+    )
+  endif()
+
+  list(APPEND CMAKE_MODULE_PATH $ENV{OpenVINO_DIR})
+  # Find OpenVINO
+  find_package(OpenVINO REQUIRED COMPONENTS Runtime)
+
   add_library(
     wasi_nn_openvino
     SHARED
@@ -71,4 +74,37 @@ if(WAMR_BUILD_WASI_NN_OPENVINO EQUAL 1)
       openvino::runtime
       openvino::runtime::c
   )
-endif()
\ No newline at end of file
+
+  install(TARGETS wasi_nn_openvino DESTINATION lib)
+endif()
+
+# - llamacpp
+
+if(WAMR_BUILD_WASI_NN_LLAMACPP EQUAL 1)
+  find_package(cjson REQUIRED)
+  find_package(llamacpp REQUIRED)
+
+  add_library(
+    wasi_nn_llamacpp
+    SHARED
+      ${WASI_NN_ROOT}/src/wasi_nn_llamacpp.c
+  )
+
+  target_include_directories(
+    wasi_nn_llamacpp
+    PUBLIC
+      ${cjson_SOURCE_DIR}
+  )
+
+  target_link_libraries(
+    wasi_nn_llamacpp
+    PUBLIC
+      libiwasm
+      cjson
+      common
+      ggml
+      llama
+  )
+
+  install(TARGETS wasi_nn_llamacpp DESTINATION lib)
+endif()
diff --git a/core/iwasm/libraries/wasi-nn/include/wasi_nn_types.h b/core/iwasm/libraries/wasi-nn/include/wasi_nn_types.h
index d36f5977c..3ac694fc9 100644
--- a/core/iwasm/libraries/wasi-nn/include/wasi_nn_types.h
+++ b/core/iwasm/libraries/wasi-nn/include/wasi_nn_types.h
@@ -43,6 +43,11 @@ typedef enum {
     security,
     // The operation failed for an unspecified reason.
     unknown,
+    // for WasmEdge-wasi-nn
+    end_of_sequence = 100,  // End of Sequence Found.
+    context_full = 101,     // Context Full.
+    prompt_tool_long = 102, // Prompt Too Long.
+    model_not_found = 103,  // Model Not Found.
 } wasi_nn_error;
 
 /**
@@ -140,6 +145,9 @@ typedef uint32_t graph_execution_context;
 typedef wasi_nn_error (*LOAD)(void *, graph_builder_array *, graph_encoding,
                               execution_target, graph *);
 typedef wasi_nn_error (*LOAD_BY_NAME)(void *, const char *, uint32_t, graph *);
+typedef wasi_nn_error (*LOAD_BY_NAME_WITH_CONFIG)(void *, const char *,
+                                                  uint32_t, void *, uint32_t,
+                                                  graph *);
 typedef wasi_nn_error (*INIT_EXECUTION_CONTEXT)(void *, graph,
                                                 graph_execution_context *);
 typedef wasi_nn_error (*SET_INPUT)(void *, graph_execution_context, uint32_t,
@@ -154,6 +162,7 @@ typedef wasi_nn_error (*BACKEND_DEINITIALIZE)(void *);
 typedef struct {
     LOAD load;
     LOAD_BY_NAME load_by_name;
+    LOAD_BY_NAME_WITH_CONFIG load_by_name_with_config;
     INIT_EXECUTION_CONTEXT init_execution_context;
     SET_INPUT set_input;
     COMPUTE compute;
diff --git a/core/iwasm/libraries/wasi-nn/src/wasi_nn.c b/core/iwasm/libraries/wasi-nn/src/wasi_nn.c
index 0d56981fc..4697e931b 100644
--- a/core/iwasm/libraries/wasi-nn/src/wasi_nn.c
+++ b/core/iwasm/libraries/wasi-nn/src/wasi_nn.c
@@ -29,7 +29,7 @@
 struct backends_api_functions {
     void *backend_handle;
     api_function functions;
-} lookup[autodetect] = { 0 };
+} lookup[autodetect + 1] = { 0 };
 
 #define call_wasi_nn_func(backend_encoding, func, wasi_error, ...)         \
     do {                                                                   \
@@ -168,14 +168,7 @@ wasi_nn_destroy()
             lookup[i].backend_handle = NULL;
         }
 
-        lookup[i].functions.init = NULL;
-        lookup[i].functions.deinit = NULL;
-        lookup[i].functions.load = NULL;
-        lookup[i].functions.load_by_name = NULL;
-        lookup[i].functions.init_execution_context = NULL;
-        lookup[i].functions.set_input = NULL;
-        lookup[i].functions.compute = NULL;
-        lookup[i].functions.get_output = NULL;
+        memset(&lookup[i].functions, 0, sizeof(api_function));
     }
 }
 
@@ -208,6 +201,10 @@ choose_a_backend()
         return ggml;
     }
 
+#ifndef NDEBUG
+    NN_WARN_PRINTF("%s", dlerror());
+#endif
+
     handle = dlopen(OPENVINO_BACKEND_LIB, RTLD_LAZY);
     if (handle) {
         NN_INFO_PRINTF("Using openvino backend");
@@ -215,6 +212,10 @@ choose_a_backend()
         return openvino;
     }
 
+#ifndef NDEBUG
+    NN_WARN_PRINTF("%s", dlerror());
+#endif
+
     handle = dlopen(TFLITE_BACKEND_LIB, RTLD_LAZY);
     if (handle) {
         NN_INFO_PRINTF("Using tflite backend");
@@ -222,6 +223,11 @@ choose_a_backend()
         return tensorflowlite;
     }
 
+#ifndef NDEBUG
+    NN_WARN_PRINTF("%s", dlerror());
+#endif
+
+    NN_WARN_PRINTF("No backend found");
     return unknown_backend;
 }
 
@@ -257,6 +263,14 @@ register_backend(void *handle, api_function *functions)
     }
     functions->load_by_name = load_by_name;
 
+    LOAD_BY_NAME_WITH_CONFIG load_by_name_with_config =
+        (LOAD_BY_NAME_WITH_CONFIG)dlsym(handle, "load_by_name_with_config");
+    if (!load_by_name_with_config) {
+        NN_WARN_PRINTF("load_by_name_with_config() not found");
+        // not fatal: only the llama.cpp backend needs to support this function
+    }
+    functions->load_by_name_with_config = load_by_name_with_config;
+
     INIT_EXECUTION_CONTEXT init_execution_context =
         (INIT_EXECUTION_CONTEXT)dlsym(handle, "init_execution_context");
     if (!init_execution_context) {
@@ -329,21 +343,23 @@ graph_encoding_to_backend_lib_name(graph_encoding encoding)
 static bool
 detect_and_load_backend(graph_encoding backend_hint,
                         struct backends_api_functions *backends,
-                        graph_encoding *loaded_backed)
+                        graph_encoding *loaded_backend)
 {
-    if (backend_hint >= autodetect)
+    if (backend_hint > autodetect)
         return false;
 
     if (backend_hint == autodetect)
         backend_hint = choose_a_backend();
 
-    /* if already loaded */
-    if (lookup[backend_hint].backend_handle) {
-        *loaded_backed = backend_hint;
-        return true;
-    }
+    if (backend_hint == unknown_backend)
+        return false;
+
+    *loaded_backend = backend_hint;
+
+    /* if already loaded */
+    if (lookup[backend_hint].backend_handle)
+        return true;
 
-    *loaded_backed = backend_hint;
     const char *backend_lib_name =
         graph_encoding_to_backend_lib_name(backend_hint);
     if (!backend_lib_name)
@@ -353,6 +369,7 @@ detect_and_load_backend(graph_encoding backend_hint,
 }
 
 /* WASI-NN implementation */
+
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
 wasi_nn_error
 wasi_nn_load(wasm_exec_env_t exec_env, graph_builder_wasm *builder,
@@ -392,15 +409,15 @@ wasi_nn_load(wasm_exec_env_t exec_env, graph_builder_array_wasm *builder,
         goto fail;
     }
 
-    graph_encoding loaded_backed = autodetect;
-    if (!detect_and_load_backend(encoding, lookup, &loaded_backed)) {
+    graph_encoding loaded_backend = autodetect;
+    if (!detect_and_load_backend(encoding, lookup, &loaded_backend)) {
         res = invalid_encoding;
         NN_ERR_PRINTF("load backend failed");
         goto fail;
     }
 
     WASINNContext *wasi_nn_ctx = wasm_runtime_get_wasi_nn_ctx(instance);
-    wasi_nn_ctx->backend = loaded_backed;
+    wasi_nn_ctx->backend = loaded_backend;
 
     /* init() the backend */
     call_wasi_nn_func(wasi_nn_ctx->backend, init, res,
@@ -413,7 +430,6 @@ wasi_nn_load(wasm_exec_env_t exec_env, graph_builder_array_wasm *builder,
     if (res != success)
         goto fail;
 
-    wasi_nn_ctx->backend = loaded_backed;
     wasi_nn_ctx->is_model_loaded = true;
 
 fail:
@@ -428,8 +444,6 @@ wasi_nn_error
 wasi_nn_load_by_name(wasm_exec_env_t exec_env, char *name, uint32_t name_len,
                      graph *g)
 {
-    NN_DBG_PRINTF("[WASI NN] LOAD_BY_NAME %s...", name);
-
     wasm_module_inst_t instance = wasm_runtime_get_module_inst(exec_env);
     if (!instance) {
         return runtime_error;
@@ -446,15 +460,23 @@ wasi_nn_load_by_name(wasm_exec_env_t exec_env, char *name, uint32_t name_len,
         return invalid_argument;
     }
 
-    graph_encoding loaded_backed = autodetect;
-    if (detect_and_load_backend(autodetect, lookup, &loaded_backed)) {
+    if (name_len == 0 || name[name_len] != '\0') {
+        NN_ERR_PRINTF("Invalid filename");
+        return invalid_argument;
+    }
+
+    NN_DBG_PRINTF("[WASI NN] LOAD_BY_NAME %s...", name);
+
+    graph_encoding loaded_backend = autodetect;
+    if (!detect_and_load_backend(autodetect, lookup, &loaded_backend)) {
         NN_ERR_PRINTF("load backend failed");
         return invalid_encoding;
     }
 
     WASINNContext *wasi_nn_ctx = wasm_runtime_get_wasi_nn_ctx(instance);
-    wasi_nn_error res;
+    wasi_nn_ctx->backend = loaded_backend;
 
+    wasi_nn_error res;
     /* init() the backend */
     call_wasi_nn_func(wasi_nn_ctx->backend, init, res,
                       &wasi_nn_ctx->backend_ctx);
@@ -466,7 +488,67 @@ wasi_nn_load_by_name(wasm_exec_env_t exec_env, char *name, uint32_t name_len,
     if (res != success)
         return res;
 
-    wasi_nn_ctx->backend = loaded_backed;
+    wasi_nn_ctx->backend = loaded_backend;
+    wasi_nn_ctx->is_model_loaded = true;
+    return success;
+}
+
+wasi_nn_error
+wasi_nn_load_by_name_with_config(wasm_exec_env_t exec_env, char *name,
+                                 int32_t name_len, char *config,
+                                 int32_t config_len, graph *g)
+{
+    wasm_module_inst_t instance = wasm_runtime_get_module_inst(exec_env);
+    if (!instance) {
+        return runtime_error;
+    }
+
+    if (!wasm_runtime_validate_native_addr(instance, name, name_len)) {
+        NN_ERR_PRINTF("name is invalid");
+        return invalid_argument;
+    }
+
+    if (!wasm_runtime_validate_native_addr(instance, g,
+                                           (uint64)sizeof(graph))) {
+        NN_ERR_PRINTF("graph is invalid");
+        return invalid_argument;
+    }
+
+    if (name_len == 0 || name[name_len] != '\0') {
+        NN_ERR_PRINTF("Invalid filename");
+        return invalid_argument;
+    }
+
+    if (!config || config_len == 0 || config[config_len] != '\0') {
+        NN_ERR_PRINTF("Invalid config");
+        return invalid_argument;
+    }
+
+    NN_DBG_PRINTF("[WASI NN] LOAD_BY_NAME_WITH_CONFIG %s %s...", name, config);
+
+    graph_encoding loaded_backend = autodetect;
+    if (!detect_and_load_backend(autodetect, lookup, &loaded_backend)) {
+        NN_ERR_PRINTF("load backend failed");
+        return invalid_encoding;
+    }
+
+    WASINNContext *wasi_nn_ctx = wasm_runtime_get_wasi_nn_ctx(instance);
+    wasi_nn_ctx->backend = loaded_backend;
+
+    wasi_nn_error res;
+    /* init() the backend */
+    call_wasi_nn_func(wasi_nn_ctx->backend, init, res,
+                      &wasi_nn_ctx->backend_ctx);
+    if (res != success)
+        return res;
+
+    call_wasi_nn_func(wasi_nn_ctx->backend, load_by_name_with_config, res,
+                      wasi_nn_ctx->backend_ctx, name, name_len, config,
+                      config_len, g);
+    if (res != success)
+        return res;
+
+    wasi_nn_ctx->backend = loaded_backend;
     wasi_nn_ctx->is_model_loaded = true;
     return success;
 }
@@ -608,6 +690,7 @@ static NativeSymbol native_symbols_wasi_nn[] = {
 #if WASM_ENABLE_WASI_EPHEMERAL_NN != 0
     REG_NATIVE_FUNC(load, "(*iii*)i"),
     REG_NATIVE_FUNC(load_by_name, "(*i*)i"),
+    REG_NATIVE_FUNC(load_by_name_with_config, "(*i*i*)i"),
     REG_NATIVE_FUNC(init_execution_context, "(i*)i"),
     REG_NATIVE_FUNC(set_input, "(ii*)i"),
     REG_NATIVE_FUNC(compute, "(i)i"),
diff --git a/core/iwasm/libraries/wasi-nn/src/wasi_nn_llamacpp.c b/core/iwasm/libraries/wasi-nn/src/wasi_nn_llamacpp.c
new file mode 100644
index 000000000..58d29163c
--- /dev/null
+++ b/core/iwasm/libraries/wasi-nn/src/wasi_nn_llamacpp.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (C) 2019 Intel Corporation.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+#include "wasi_nn_types.h"
+#include "utils/logger.h"
+#include "llama.h"
+#include "ggml.h"
+#include "cJSON.h"
+
+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
+
+// Configuration options, kept compatible with WasmEdge; see:
+// https://github.com/second-state/WasmEdge-WASINN-examples/blob/master/wasmedge-ggml/README.md#parameters
+// https://github.com/WasmEdge/WasmEdge/blob/master/plugins/wasi_nn/ggml.cpp
+struct wasi_nn_llama_config {
+    // Backend (called "plugin" in WasmEdge) parameters:
+    bool enable_log;
+    bool enable_debug_log;
+    bool stream_stdout;
+    // embedding mode
+    bool embedding;
+    // TODO: can it be -1?
+    // must not be bigger than ctx_size
+    int32_t n_predict;
+    char *reverse_prompt;
+
+    // Used by LLaVA
+    // presumably the multimodal projector ("mmproj") file -- TODO confirm
+    char *mmproj;
+    char *image;
+
+    // Model parameters (the model needs to be reloaded if they are updated):
+    // aligned with the definition of struct llama_model_params
+    int32_t n_gpu_layers;
+    int32_t main_gpu;
+    // size is limited to llama_max_devices()
+    float *tensor_split;
+    bool use_mmap;
+
+    // Context parameters (used by the llama context):
+    uint32_t ctx_size;
+    uint32_t batch_size;
+    uint32_t ubatch_size;
+    uint32_t threads;
+
+    // Sampling parameters (used by the llama sampling context).
+    float temp;
+    float topP;
+    float repeat_penalty;
+    float presence_penalty;
+    float frequency_penalty;
+};
+
+struct LlamaContext {
+    struct llama_context *ctx;
+    struct llama_model *model;
+    llama_token *prompt;
+    size_t prompt_len;
+    llama_token *generation;
+    size_t generation_len;
+    struct wasi_nn_llama_config config;
+};
+
+static void
+wasm_edge_llama_default_configuration(struct wasi_nn_llama_config *output)
+{
+    output->enable_log = false;
+    output->enable_debug_log = false;
+    output->stream_stdout = false;
+    output->embedding = false;
+    output->n_predict = 512;
+    output->reverse_prompt = NULL;
+
+    output->mmproj = NULL;
+    output->image = NULL;
+
+    output->main_gpu = 0;
+    output->n_gpu_layers = 0;
+    output->tensor_split = NULL;
+    output->use_mmap = true;
+
+    // 0 = from model
+    output->ctx_size = 0;
+    output->batch_size = 512;
+    output->ubatch_size = output->batch_size;
+    output->threads = 1;
+
+    output->temp = 0.80;
+    output->topP = 0.95;
+    output->repeat_penalty = 1.10;
+    output->presence_penalty = 0.0;
+    output->frequency_penalty = 0.0;
+}
+
+static void
+wasm_edge_llama_apply_configuration(const char *config_json,
+                                    struct wasi_nn_llama_config *output)
+{
+    cJSON *root = cJSON_Parse(config_json);
+    if (root == NULL) {
+        const char *error_ptr = cJSON_GetErrorPtr();
+        if (error_ptr != NULL) {
+            NN_WARN_PRINTF("Error before: %s\n", error_ptr);
+        }
+        else {
+            NN_WARN_PRINTF("Failed to parse JSON");
+        }
+        return;
+    }
+
+    // Booleans are set from cJSON_IsTrue(); numeric keys are applied only
+    // when the item is a JSON number, otherwise *output keeps its value.
+    cJSON *item = cJSON_GetObjectItem(root, "enable-log");
+    if (item != NULL) {
+        output->enable_log = cJSON_IsTrue(item);
+        NN_DBG_PRINTF("apply enable-log %d", output->enable_log);
+    }
+
+    item = cJSON_GetObjectItem(root, "enable-debug-log");
+    if (item != NULL) {
+        output->enable_debug_log = cJSON_IsTrue(item);
+        NN_DBG_PRINTF("apply enable-debug-log %d", output->enable_debug_log);
+    }
+
+    item = cJSON_GetObjectItem(root, "stream-stdout");
+    if (item != NULL) {
+        output->stream_stdout = cJSON_IsTrue(item);
+        NN_DBG_PRINTF("apply stream-stdout %d", output->stream_stdout);
+    }
+
+    item = cJSON_GetObjectItem(root, "embedding");
+    if (item != NULL) {
+        output->embedding = cJSON_IsTrue(item);
+        NN_DBG_PRINTF("apply embedding %d", output->embedding);
+    }
+
+    item = cJSON_GetObjectItem(root, "n-predict");
+    if (cJSON_IsNumber(item)) { // NaN from a non-number must not be cast
+        output->n_predict = (int32_t)cJSON_GetNumberValue(item);
+        NN_DBG_PRINTF("apply n-predict %d", output->n_predict);
+    }
+
+    item = cJSON_GetObjectItem(root, "n-gpu-layers");
+    if (cJSON_IsNumber(item)) {
+        output->n_gpu_layers = (int32_t)cJSON_GetNumberValue(item);
+        NN_DBG_PRINTF("apply n_gpu_layers %d", output->n_gpu_layers);
+    }
+
+    item = cJSON_GetObjectItem(root, "ctx-size");
+    if (cJSON_IsNumber(item)) {
+        output->ctx_size = (uint32_t)cJSON_GetNumberValue(item);
+        NN_DBG_PRINTF("apply ctx-size %u", output->ctx_size);
+    }
+
+    // more ...
+
+    cJSON_Delete(root);
+}
+
+static struct llama_model_params
+llama_model_params_from_wasi_nn_llama_config(
+    struct wasi_nn_llama_config *config)
+{
+    struct llama_model_params result = llama_model_default_params();
+
+    // TODO: support more
+    result.main_gpu = config->main_gpu;
+    result.n_gpu_layers = config->n_gpu_layers;
+    result.use_mmap = config->use_mmap;
+
+    return result;
+}
+
+static struct llama_context_params
+llama_context_params_from_wasi_nn_llama_config(
+    struct wasi_nn_llama_config *config)
+{
+    struct llama_context_params result = llama_context_default_params();
+
+    // TODO: support more
+    result.n_ctx = config->ctx_size;
+    // result.embeddings = config->embedding;
+
+    return result;
+}
+
+static void
+llama_batch_clear(struct llama_batch *batch)
+{
+    batch->n_tokens = 0;
+}
+
+static void
+llama_batch_add(struct llama_batch *batch, llama_token id, llama_pos pos,
+                llama_seq_id *seq_ids, size_t seq_ids_len, bool logits)
+{
+    batch->token[batch->n_tokens] = id;
+    batch->pos[batch->n_tokens] = pos;
+    batch->n_seq_id[batch->n_tokens] = seq_ids_len;
+    for (size_t i = 0; i < seq_ids_len; ++i) {
+        batch->seq_id[batch->n_tokens][i] = seq_ids[i];
+    }
+    batch->logits[batch->n_tokens] = logits;
+
+    batch->n_tokens++;
+}
+
+// always output ERROR and WARN
+// INFO needs enable_log
+// DEBUG needs enable_debug_log
+static void
+llama_log_callback_local(enum ggml_log_level level, const char *text,
+                         void *user_data)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)user_data;
+
+    if (level == GGML_LOG_LEVEL_DEBUG && !backend_ctx->config.enable_debug_log)
+        return;
+
+    if (level == GGML_LOG_LEVEL_INFO && !backend_ctx->config.enable_log)
+        return;
+
+    printf("%s", text);
+}
+
+static void
+llama_build_output_metadata(const struct LlamaContext *backend_ctx,
+                            char *output_buf, size_t output_buf_size)
+{
+    snprintf(output_buf, output_buf_size,
+             "{\"input_tokens\":%zu, \"output_tokens\":%zu, " /* size_t */
+             "\"llama_build_number\":%d,"
+             "\"llama_commit\":\"%s\"}",
+             backend_ctx->prompt_len, backend_ctx->generation_len,
+             LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+}
+
+__attribute__((visibility("default"))) wasi_nn_error
+init_backend(void **ctx)
+{
+    struct LlamaContext *backend_ctx = calloc(1, sizeof(struct LlamaContext));
+    if (!backend_ctx) {
+        NN_ERR_PRINTF("Allocate for LlamaContext failed");
+        return runtime_error;
+    }
+
+    llama_backend_init();
+    // llama_numa_init();
+    llama_log_set(llama_log_callback_local, backend_ctx);
+
+#ifndef NDEBUG
+    NN_INFO_PRINTF("llama_build_number: %d, llama_commit: %s, llama_compiler: "
+                   "%s, llama_build_target: %s",
+                   LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER,
+                   LLAMA_BUILD_TARGET);
+#endif
+
+    *ctx = (void *)backend_ctx;
+    return success;
+}
+
+__attribute__((visibility("default"))) wasi_nn_error
+deinit_backend(void *ctx)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
+
+    if (!backend_ctx)
+        return invalid_argument;
+
+    if (backend_ctx->generation)
+        free(backend_ctx->generation);
+
+    if (backend_ctx->prompt)
+        free(backend_ctx->prompt);
+
+    if (backend_ctx->ctx)
+        llama_free(backend_ctx->ctx);
+
+    if (backend_ctx->model)
+        llama_free_model(backend_ctx->model);
+
+    llama_backend_free();
+
+    free(backend_ctx); /* allocated with calloc() in init_backend() */
+    return success;
+}
+
+__attribute__((visibility("default"))) wasi_nn_error
+load(void *ctx, graph_builder_array *builder, graph_encoding encoding,
+     execution_target target, graph *g)
+{
+    return unsupported_operation; // models are loaded via load_by_name*()
+}
+
+// Load the model file using parameters derived from backend_ctx->config.
+static wasi_nn_error
+__load_by_name_with_configuration(void *ctx, const char *filename, graph *g)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
+
+    // make sure backend_ctx->config is initialized before calling this: it is
+    // read right here to build the llama.cpp model parameters
+    struct llama_model_params model_params =
+        llama_model_params_from_wasi_nn_llama_config(&backend_ctx->config);
+    struct llama_model *model =
+        llama_load_model_from_file(filename, model_params);
+    if (model == NULL) {
+        NN_ERR_PRINTF("Failed to load model from file %s", filename);
+        return runtime_error;
+    }
+
+#ifndef NDEBUG
+    char buf[128] = { 0 };
+    llama_model_desc(model, buf, 127);
+    NN_INFO_PRINTF("Model desc %s", buf);
+#endif
+
+    backend_ctx->model = model;
+    return success;
+}
+
+__attribute__((visibility("default"))) wasi_nn_error
+load_by_name(void *ctx, const char *filename, uint32_t filename_len, graph *g)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
+
+    // no config is supplied on this path: reset to the default params first
+    wasm_edge_llama_default_configuration(&backend_ctx->config);
+    return __load_by_name_with_configuration(ctx, filename, g);
+}
+
+__attribute__((visibility("default"))) wasi_nn_error
+load_by_name_with_config(void *ctx, const char *filename, uint32_t filename_len,
+                         const char *config, uint32_t config_len, graph *g)
+{
+    struct LlamaContext *llama_backend = ctx;
+
+    // begin with the defaults; a supplied config is then applied on top
+    wasm_edge_llama_default_configuration(&llama_backend->config);
+    if (config == NULL) {
+        NN_INFO_PRINTF("No configuration provided, use default");
+    }
+    else {
+        // parse the WasmEdge-style configuration string
+        wasm_edge_llama_apply_configuration(config, &llama_backend->config);
+    }
+
+    return __load_by_name_with_configuration(ctx, filename, g);
+}
+
+// It is assumed that model params shouldn't be changed in Config stage.
+// We only load the model once in the Load stage.
+__attribute__((visibility("default"))) wasi_nn_error
+init_execution_context(void *ctx, graph g, graph_execution_context *exec_ctx)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
+    struct llama_context_params ctx_params =
+        llama_context_params_from_wasi_nn_llama_config(&backend_ctx->config);
+    struct llama_context *llama_ctx =
+        llama_new_context_with_model(backend_ctx->model, ctx_params);
+    if (llama_ctx == NULL) {
+        NN_ERR_PRINTF("Failed to create context for model");
+        return runtime_error;
+    }
+
+    if (backend_ctx->ctx)
+        llama_free(backend_ctx->ctx); // don't leak the previous context
+    backend_ctx->ctx = llama_ctx;
+    NN_INFO_PRINTF("n_predict = %d, n_ctx = %d", backend_ctx->config.n_predict,
+                   llama_n_ctx(backend_ctx->ctx));
+    return success;
+}
+
+// Tokenize the prompt held by the tensor; the old prompt survives any error.
+__attribute__((visibility("default"))) wasi_nn_error
+set_input(void *ctx, graph_execution_context exec_ctx, uint32_t index,
+          tensor *wasi_nn_tensor)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
+    // tensor->data is the prompt string. ends with \0
+    char *prompt_text = (char *)wasi_nn_tensor->data;
+
+#ifndef NDEBUG
+    NN_DBG_PRINTF("--------------------------------------------------");
+    NN_DBG_PRINTF("prompt_text: %s", prompt_text);
+    NN_DBG_PRINTF("--------------------------------------------------");
+#endif
+
+    // use a fresh buffer sized for the current n_ctx, never a stale one
+    uint32_t n_token_max = llama_n_ctx(backend_ctx->ctx);
+    uint32_t prompt_text_len = strlen(prompt_text);
+    llama_token *tokens = calloc(n_token_max, sizeof(llama_token));
+    if (tokens == NULL) {
+        NN_ERR_PRINTF("Failed to allocate tokens_list");
+        return runtime_error;
+    }
+
+    int32_t n_tokens =
+        llama_tokenize(backend_ctx->model, prompt_text, prompt_text_len,
+                       tokens, n_token_max, true, false);
+    if (n_tokens < 0) {
+        NN_ERR_PRINTF("Failed to tokenize prompt text");
+        free(tokens);
+        return runtime_error;
+    }
+
+    // the KV cache must hold the prompt plus every generated token
+    int n_kv_req = n_tokens + (backend_ctx->config.n_predict - n_tokens);
+    if (n_kv_req < 0 || (uint32_t)n_kv_req > n_token_max) {
+        NN_ERR_PRINTF("the required KV cache size is not big enough, either "
+                      "reduce n_predict or increase n_ctx");
+        free(tokens);
+        return runtime_error;
+    }
+
+    free(backend_ctx->prompt);
+    backend_ctx->prompt = tokens;
+    backend_ctx->prompt_len = n_tokens;
+    return success;
+}
+
+// Greedy generation for the prompt stored by set_input(): decode the prompt,
+// then sample token by token until end-of-generation or n_predict is reached.
+__attribute__((visibility("default"))) wasi_nn_error
+compute(void *ctx, graph_execution_context exec_ctx)
+{
+    struct LlamaContext *backend_ctx = (struct LlamaContext *)ctx;
+    wasi_nn_error ret = runtime_error;
+    llama_token_data *candidates = NULL;
+    int32_t n_cur, n_vocab;
+
+    // reset the generation buffer, sized for the current n_predict
+    free(backend_ctx->generation);
+    backend_ctx->generation_len = 0;
+    backend_ctx->generation =
+        calloc(backend_ctx->config.n_predict, sizeof(llama_token));
+    if (backend_ctx->generation == NULL) {
+        NN_ERR_PRINTF("Failed to allocate generation");
+        return runtime_error;
+    }
+
+    // check KV cache
+    uint32_t n_ctx = llama_n_ctx(backend_ctx->ctx);
+    if (n_ctx <= backend_ctx->generation_len) {
+        NN_ERR_PRINTF(
+            "ctx_size(%u) is not big enough(<%ld), please increase it", n_ctx,
+            backend_ctx->generation_len);
+        return context_full;
+    }
+
+    // an empty prompt would index batch.logits[-1] below; an oversized one
+    // would overflow the batch_size slots reserved by llama_batch_init()
+    if (backend_ctx->prompt_len == 0
+        || backend_ctx->prompt_len > (size_t)backend_ctx->config.batch_size) {
+        NN_ERR_PRINTF("Prompt is empty or larger than batch_size");
+        return invalid_argument;
+    }
+
+    // prepare the batch; every exit from here on frees it at `fail`
+    struct llama_batch batch =
+        llama_batch_init(backend_ctx->config.batch_size, 0, 1);
+
+    // evaluate the initial prompt
+    llama_seq_id seq_ids[1] = { 0 };
+    for (size_t i = 0; i < backend_ctx->prompt_len; i++) {
+        llama_batch_add(&batch, backend_ctx->prompt[i], i, seq_ids,
+                        sizeof(seq_ids) / sizeof(seq_ids[0]), false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (batch.n_tokens > backend_ctx->config.n_predict) {
+        NN_DBG_PRINTF("n_predict(%d) is not big enough(%d), please increase it",
+                      backend_ctx->config.n_predict, batch.n_tokens);
+        ret = prompt_tool_long;
+        goto fail;
+    }
+
+    if (llama_decode(backend_ctx->ctx, batch) != 0) {
+        NN_ERR_PRINTF("First decode failed");
+        goto fail;
+    }
+
+    // main loop
+    n_cur = batch.n_tokens;
+    n_vocab = llama_n_vocab(backend_ctx->model);
+    candidates = calloc(n_vocab, sizeof(llama_token_data));
+    if (candidates == NULL) {
+        NN_ERR_PRINTF("Failed to allocate candidates");
+        goto fail;
+    }
+
+    while (n_cur <= backend_ctx->config.n_predict) {
+        // logits of the last decoded token feed the greedy sampler
+        float *logits =
+            llama_get_logits_ith(backend_ctx->ctx, batch.n_tokens - 1);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates[token_id].id = token_id;
+            candidates[token_id].logit = logits[token_id];
+            candidates[token_id].p = 0.0f;
+        }
+
+        llama_token_data_array candidates_p = { candidates, n_vocab, false };
+        llama_token new_token_id =
+            llama_sample_token_greedy(backend_ctx->ctx, &candidates_p);
+
+        backend_ctx->generation[backend_ctx->generation_len++] = new_token_id;
+
+#ifndef NDEBUG
+        {
+            char buf[128] = { 0 };
+            llama_token_to_piece(backend_ctx->model, new_token_id, buf, 120, 0,
+                                 true);
+            printf("%d(%s),", new_token_id, buf);
+        }
+#endif
+
+        // is it an end of generation?
+        if (llama_token_is_eog(backend_ctx->model, new_token_id)) {
+            printf("\n");
+            NN_INFO_PRINTF("reach the end of generation");
+            break;
+        }
+
+        // queue the new token as the only entry of the next batch
+        llama_batch_clear(&batch);
+        llama_batch_add(&batch, new_token_id, n_cur, seq_ids,
+                        sizeof(seq_ids) / sizeof(seq_ids[0]), true);
+        n_cur++;
+
+        if (llama_decode(backend_ctx->ctx, batch) != 0) {
+            NN_ERR_PRINTF("Secondary decode failed");
+            goto fail;
+        }
+    }
+
+    printf("\n");
+    ret = success;
+fail:
+    llama_batch_free(batch);
+    free(candidates);
+    return ret;
+}
+
+// Copy generated text (index 0) or JSON metadata (index 1) to output_tensor.
+// NOTE(review): the copies are not bounded by the caller's buffer size.
+__attribute__((visibility("default"))) wasi_nn_error
+get_output(void *ctx, graph_execution_context exec_ctx, uint32_t index,
+           tensor_data output_tensor, uint32_t *output_tensor_size)
+{
+    struct LlamaContext *backend = ctx;
+
+    // Compatibility with WasmEdge
+    if (index > 1) {
+        NN_ERR_PRINTF("Invalid output index %d", index);
+        return invalid_argument;
+    }
+
+    // Index 1 is for the metadata of the outputs.
+    if (index == 1) {
+        char metadata[128] = { 0 };
+        llama_build_output_metadata(backend, metadata, 127);
+        size_t metadata_len = strlen(metadata);
+
+        if (backend->config.stream_stdout)
+            printf("%s\n", metadata);
+
+        memcpy(output_tensor, metadata, metadata_len);
+        *output_tensor_size = metadata_len;
+        return success;
+    }
+
+    // token -> piece -> output_tensor
+    if (backend->config.stream_stdout)
+        printf("\n");
+
+    size_t written = 0;
+    for (size_t i = 0; i < backend->generation_len; i++) {
+        char piece[128] = { 0 };
+        llama_token_to_piece(backend->model, backend->generation[i], piece,
+                             120, 0, true);
+        size_t piece_len = strlen(piece);
+
+        if (backend->config.stream_stdout)
+            printf("%s", piece);
+
+        memcpy(output_tensor + written, piece, piece_len);
+        written += piece_len;
+    }
+
+    if (backend->config.stream_stdout)
+        printf("\n");
+
+    *output_tensor_size = written;
+    return success;
+}
diff --git a/core/iwasm/libraries/wasi-nn/test/Dockerfile.wasi-nn-smoke b/core/iwasm/libraries/wasi-nn/test/Dockerfile.wasi-nn-smoke
index 261c77261..fe3a8c512 100644
--- a/core/iwasm/libraries/wasi-nn/test/Dockerfile.wasi-nn-smoke
+++ b/core/iwasm/libraries/wasi-nn/test/Dockerfile.wasi-nn-smoke
@@ -63,21 +63,35 @@ WORKDIR /workspaces/wasmedge-wasinn-examples
 RUN git clone --depth 1 https://github.com/second-state/WasmEdge-WASINN-examples.git .
 COPY core/iwasm/libraries/wasi-nn/test/bump_wasi_nn_to_0_6_0.patch .
 RUN git apply ./bump_wasi_nn_to_0_6_0.patch
-# recompile with wasi-nn 0.6.0
-RUN cd openvino-mobilenet-image/rust && cargo build --target=wasm32-wasi
-RUN cd openvino-mobilenet-raw/rust && cargo build --target=wasm32-wasi
-RUN cd openvino-road-segmentation-adas/openvino-road-seg-adas && cargo build --target=wasm32-wasi
-RUN cd tflite-birds_v1-image/rust && cargo build --target=wasm32-wasi
 
-# preparation
-RUN cd openvino-mobilenet-image \
+# recompile with wasi-nn 0.6.0
+WORKDIR /workspaces/wasmedge-wasinn-examples/openvino-mobilenet-image/
+RUN pushd rust \
+  && cargo build --target=wasm32-wasi \
+  && popd \
   && ./download_mobilenet.sh . \
   && ls -l mobilenet.xml mobilenet.bin
 
-RUN cd openvino-mobilenet-raw \
+WORKDIR /workspaces/wasmedge-wasinn-examples/openvino-mobilenet-raw/
+RUN pushd rust \
+  && cargo build --target=wasm32-wasi \
+  && popd \
   && ./download_mobilenet.sh . \
   && ls -l mobilenet.xml mobilenet.bin tensor-1x224x224x3-f32.bgr
 
+WORKDIR /workspaces/wasmedge-wasinn-examples/openvino-road-segmentation-adas/
+RUN pushd openvino-road-seg-adas \
+  && cargo build --target=wasm32-wasi
+
+WORKDIR /workspaces/wasmedge-wasinn-examples/tflite-birds_v1-image/
+RUN pushd rust \
+  && cargo build --target=wasm32-wasi
+
+# mount models when running
+WORKDIR /workspaces/wasmedge-wasinn-examples/wasmedge-ggml/qwen
+RUN wget --progress=dot:giga https://www.modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GGUF/resolve/master/qwen1_5-0_5b-chat-q2_k.gguf
+RUN cargo build --target=wasm32-wasi
+
 #
 # iwasm. build from source
 WORKDIR /workspaces/wamr
@@ -88,15 +102,16 @@ WORKDIR /workspaces/wamr/product-mini/platforms/linux
 RUN OpenVINO_DIR=/usr/lib/openvino-2023.2.0 \
     cmake -S . -B build \
     -DWAMR_BUILD_WASI_NN=1 -DWAMR_BUILD_WASI_EPHEMERAL_NN=1 \
-    -DWAMR_BUILD_WASI_NN_OPENVINO=1 -DWAMR_BUILD_WASI_NN_TFLITE=1 \
-  && cmake --build build
-
-ENV PATH=/workspaces/wamr/product-mini/platforms/linux/build:${PATH}
-ENV LD_LIBRARY_PATH=/workspaces/wamr/product-mini/platforms/linux/build
+    -DWAMR_BUILD_WASI_NN_OPENVINO=1 \
+    -DWAMR_BUILD_WASI_NN_TFLITE=1 \
+    -DWAMR_BUILD_WASI_NN_LLAMACPP=1 \
+  && cmake --build build \
+  && cmake --install build
+ 
+ENV LD_LIBRARY_PATH=/usr/local/lib
 
 # add smoke test script
 COPY core/iwasm/libraries/wasi-nn/test/run_smoke_test.py /
 
-#
 WORKDIR /workspaces/wasmedge-wasinn-examples
 CMD ["python3", "/run_smoke_test.py"]
diff --git a/core/iwasm/libraries/wasi-nn/test/run_smoke_test.py b/core/iwasm/libraries/wasi-nn/test/run_smoke_test.py
index a62d9cb7a..304b0c977 100644
--- a/core/iwasm/libraries/wasi-nn/test/run_smoke_test.py
+++ b/core/iwasm/libraries/wasi-nn/test/run_smoke_test.py
@@ -260,6 +260,63 @@ def execute_openvino_road_segmentation_adas(
     print("------------------------------------------------------------")
 
 
+def execute_wasmedge_ggml_qwen(iwasm_bin: str, wasmedge_bin: str, cwd: Path):
+    """Chat with the Qwen GGUF model through iwasm and check the second reply.
+
+    Prints PASS when the reply to the fixed prompt contains "Karachi",
+    otherwise prints FAILED followed by the actual and the expected text.
+    """
+    iwasm_args = ["--dir=."]
+    wasm_file = ["./target/wasm32-wasi/debug/wasmedge-ggml-qwen.wasm"]
+    wasm_args = ["./qwen1_5-0_5b-chat-q2_k.gguf"]
+    cmd = [iwasm_bin, *iwasm_args, *wasm_file, *wasm_args]
+
+    prompt = "what is the capital of Pakistan"
+
+    with subprocess.Popen(
+        cmd,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        # stderr is never read here, so an unread pipe could stall the child
+        stderr=subprocess.DEVNULL,
+        cwd=cwd,
+    ) as p:
+        # USER
+        p.stdout.readline()
+
+        p.stdin.write(b"hi\n")
+        p.stdin.flush()
+        # ASSISTANT
+        p.stdout.readline()
+        # reply text (ignored)
+        p.stdout.readline()
+        # USER
+        p.stdout.readline()
+
+        p.stdin.write(prompt.encode())
+        p.stdin.write(b"\n")
+        p.stdin.flush()
+        # ASSISTANT
+        p.stdout.readline()
+        # reply text
+        answer = p.stdout.readline().decode("utf-8")
+        # USER
+        p.stdout.readline()
+
+        p.terminate()
+
+    if "Karachi" in answer:
+        print("- wasmedge_ggml_qwen. PASS")
+        return
+
+    print("- wasmedge_ggml_qwen. FAILED")
+    print("------------------------------------------------------------")
+    pprint(answer)
+    print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
+    pprint("Karachi")
+    print("------------------------------------------------------------")
+
+
 def execute_wasmedge_wasinn_examples(iwasm_bin: str, wasmedge_bin: str):
     assert Path.cwd().name == "wasmedge-wasinn-examples"
     assert shutil.which(iwasm_bin)
@@ -282,6 +339,9 @@ def execute_wasmedge_wasinn_examples(iwasm_bin: str, wasmedge_bin: str):
         iwasm_bin, wasmedge_bin, openvino_road_segmentation_adas_dir
     )
 
+    wasmedge_ggml_qwem_dir = Path.cwd().joinpath("./wasmedge-ggml/qwen")
+    execute_wasmedge_ggml_qwen(iwasm_bin, wasmedge_bin, wasmedge_ggml_qwem_dir)
+
 
 if __name__ == "__main__":
     execute_wasmedge_wasinn_examples("iwasm", "wasmedge")
diff --git a/core/shared/platform/common/posix/posix_memmap.c b/core/shared/platform/common/posix/posix_memmap.c
index c76abf137..1d972f5fa 100644
--- a/core/shared/platform/common/posix/posix_memmap.c
+++ b/core/shared/platform/common/posix/posix_memmap.c
@@ -138,18 +138,25 @@ os_mmap(void *hint, size_t size, int prot, int flags, os_file_handle file)
 
     /* memory hasn't been mapped or was mapped failed previously */
     if (addr == MAP_FAILED) {
-        /* try 5 times */
-        for (i = 0; i < 5; i++) {
+        /* try 5 times on EAGAIN or ENOMEM, and keep retrying on EINTR */
+        i = 0;
+        while (i < 5) {
             addr = mmap(hint, request_size, map_prot, map_flags, file, 0);
             if (addr != MAP_FAILED)
                 break;
+            if (errno == EINTR)
+                continue;
+            if (errno != EAGAIN && errno != ENOMEM) {
+                break;
+            }
+            i++;
         }
     }
 
     if (addr == MAP_FAILED) {
-#if BH_ENABLE_TRACE_MMAP != 0
-        os_printf("mmap failed\n");
-#endif
+        os_printf("mmap failed with errno: %d, hint: %p, size: %" PRIu64
+                  ", prot: %d, flags: %d",
+                  errno, hint, request_size, map_prot, map_flags);
         return NULL;
     }
 
diff --git a/core/shared/platform/include/platform_api_extension.h b/core/shared/platform/include/platform_api_extension.h
index b1c3b4f4a..37b8399b8 100644
--- a/core/shared/platform/include/platform_api_extension.h
+++ b/core/shared/platform/include/platform_api_extension.h
@@ -379,19 +379,19 @@ os_sem_unlink(const char *name);
  * Initialize process-global state for os_wakeup_blocking_op.
  */
 int
-os_blocking_op_init();
+os_blocking_op_init(void);
 
 /**
  * Start accepting os_wakeup_blocking_op requests for the calling thread.
  */
 void
-os_begin_blocking_op();
+os_begin_blocking_op(void);
 
 /**
  * Stop accepting os_wakeup_blocking_op requests for the calling thread.
  */
 void
-os_end_blocking_op();
+os_end_blocking_op(void);
 
 /**
  * Wake up the specified thread.
@@ -1586,7 +1586,7 @@ os_closedir(os_dir_stream dir_stream);
  * @return the invalid directory stream
  */
 os_dir_stream
-os_get_invalid_dir_stream();
+os_get_invalid_dir_stream(void);
 
 /**
  * Checks whether the given directory stream is valid. An invalid directory
@@ -1605,7 +1605,7 @@ os_is_dir_stream_valid(os_dir_stream *dir_stream);
  * @return the invalid handle
  */
 os_file_handle
-os_get_invalid_handle();
+os_get_invalid_handle(void);
 
 /**
  * Checks whether the given file handle is valid. An invalid handle is
diff --git a/core/shared/platform/nuttx/platform_internal.h b/core/shared/platform/nuttx/platform_internal.h
index 0b54d85a9..fef2122da 100644
--- a/core/shared/platform/nuttx/platform_internal.h
+++ b/core/shared/platform/nuttx/platform_internal.h
@@ -137,7 +137,7 @@ typedef DIR *os_dir_stream;
 typedef int os_raw_file_handle;
 
 static inline os_file_handle
-os_get_invalid_handle()
+os_get_invalid_handle(void)
 {
     return -1;
 }
diff --git a/product-mini/platforms/common/libc_wasi.c b/product-mini/platforms/common/libc_wasi.c
index 84e133bc0..2f0b35125 100644
--- a/product-mini/platforms/common/libc_wasi.c
+++ b/product-mini/platforms/common/libc_wasi.c
@@ -28,7 +28,7 @@ typedef enum {
 } libc_wasi_parse_result_t;
 
 static void
-libc_wasi_print_help()
+libc_wasi_print_help(void)
 {
     printf("  --env=<env>              Pass wasi environment variables with "
            "\"key=value\"\n");
diff --git a/product-mini/platforms/nuttx/CMakeLists.txt b/product-mini/platforms/nuttx/CMakeLists.txt
index ac6c47b91..ca07a5d55 100644
--- a/product-mini/platforms/nuttx/CMakeLists.txt
+++ b/product-mini/platforms/nuttx/CMakeLists.txt
@@ -197,7 +197,7 @@ include(${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
 # NuttX wamr lib complie required: `WAMR_SOURCES` `WAMR_CFLAGS` `WAMR_INCDIRS`
 # `WAMR_DEFINITIONS`
 set(WAMR_SOURCES ${WAMR_RUNTIME_LIB_SOURCE})
-set(WAMR_CFLAGS -Wno-strict-prototypes -Wno-shadow -Wno-unused-variable
+set(WAMR_CFLAGS -Wno-shadow -Wno-unused-variable
                 -Wno-int-conversion -Wno-implicit-function-declaration)
 get_directory_property(WAMR_INCDIRS INCLUDE_DIRECTORIES)
 get_directory_property(WAMR_DEFINITIONS COMPILE_DEFINITIONS)
diff --git a/product-mini/platforms/nuttx/wamr.mk b/product-mini/platforms/nuttx/wamr.mk
index 38553e863..0ee76c7dd 100644
--- a/product-mini/platforms/nuttx/wamr.mk
+++ b/product-mini/platforms/nuttx/wamr.mk
@@ -400,7 +400,7 @@ CFLAGS += -DWASM_ENABLE_EXCE_HANDLING=0
 CFLAGS += -DWASM_ENABLE_TAGS=0
 endif
 
-CFLAGS += -Wno-strict-prototypes -Wno-shadow -Wno-unused-variable
+CFLAGS += -Wno-shadow -Wno-unused-variable
 CFLAGS += -Wno-int-conversion -Wno-implicit-function-declaration
 
 CFLAGS += -I${CORE_ROOT} \
diff --git a/product-mini/platforms/posix/main.c b/product-mini/platforms/posix/main.c
index c1ba169d5..14dc01f6b 100644
--- a/product-mini/platforms/posix/main.c
+++ b/product-mini/platforms/posix/main.c
@@ -27,7 +27,7 @@ static char **app_argv;
 
 /* clang-format off */
 static int
-print_help()
+print_help(void)
 {
     printf("Usage: iwasm [-options] wasm_file [args...]\n");
     printf("options:\n");
diff --git a/tests/wamr-test-suites/test_wamr.sh b/tests/wamr-test-suites/test_wamr.sh
index 8254cc712..87e156865 100755
--- a/tests/wamr-test-suites/test_wamr.sh
+++ b/tests/wamr-test-suites/test_wamr.sh
@@ -511,7 +511,7 @@ function spec_test()
         pushd spec
 
         # Reset to commit: "Merge pull request #48 from backes/specify-memcpy-immediate-order"
-        git reset --hard 48e69f394869c55b7bbe14ac963c09f4605490b6
+        git reset --hard fbc99efd7a788db300aec3dd62a14577ec404f1b
         git checkout 044d0d2e77bdcbe891f7e0b9dd2ac01d56435f0b -- test/core/elem.wast
         git apply ../../spec-test-script/multi_memory_ignore_cases.patch || exit 1
         if [[ ${RUNNING_MODE} == "aot" ]]; then
diff --git a/wamr-compiler/main.c b/wamr-compiler/main.c
index b3e731e53..53c75c84e 100644
--- a/wamr-compiler/main.c
+++ b/wamr-compiler/main.c
@@ -142,9 +142,7 @@ print_help()
     printf("                              with a runtime without the hardware bounds checks.\n");
     printf("  --stack-bounds-checks=1/0 Enable or disable the bounds checks for native stack:\n");
     printf("                              if the option isn't set, the status is same as `--bounds-check`,\n");
-    printf("                              if the option is set:\n");
-    printf("                                (1) it is always enabled when `--bounds-checks` is enabled,\n");
-    printf("                                (2) else it is enabled/disabled according to the option value\n");
+    printf("                              if the option is set, the status is same as the option value\n");
     printf("  --stack-usage=<file>      Generate a stack-usage file.\n");
     printf("                              Similarly to `clang -fstack-usage`.\n");
     printf("  --format=<format>         Specifies the format of the output file\n");
@@ -164,6 +162,12 @@ print_help()
     printf("                              GC is enabled\n");
     printf("  --disable-aux-stack-check Disable auxiliary stack overflow/underflow check\n");
     printf("  --enable-dump-call-stack  Enable stack trace feature\n");
+    printf("  --call-stack-features=<features>\n");
+    printf("                            A comma-separated list of features when generating call stacks.\n");
+    printf("                            By default, all features are enabled. To disable all features,\n");
+    printf("                            provide an empty list (i.e. --call-stack-features=). This flag\n");
+    printf("                            only only takes effect when --enable-dump-call-stack is set.\n");
+    printf("                            Available features: bounds-checks, ip, trap-ip, values.\n");
     printf("  --enable-perf-profiling   Enable function performance profiling\n");
     printf("  --enable-memory-profiling Enable memory usage profiling\n");
     printf("  --xip                     A shorthand of --enable-indirect-mode --disable-llvm-intrinsics\n");
@@ -261,6 +265,55 @@ split_string(char *str, int *count, const char *delimer)
     return res;
 }
 
+/* Set the flag in *out_features for each name in the comma-separated list;
+ * prints the name and returns false when an unknown one is met. */
+static bool
+parse_call_stack_features(char *features_str,
+                          AOTCallStackFeatures *out_features)
+{
+    char **names;
+    int count = 0, i;
+
+    bh_assert(features_str);
+    bh_assert(out_features);
+
+    /* non-empty feature list */
+    names = split_string(features_str, &count, ",");
+    if (!names)
+        return false;
+
+    for (i = count - 1; i >= 0; i--) { /* last entry first */
+        const char *name = names[i];
+        if (strcmp(name, "bounds-checks") == 0) {
+            out_features->bounds_checks = true;
+        }
+        else if (strcmp(name, "ip") == 0) {
+            out_features->ip = true;
+        }
+        else if (strcmp(name, "trap-ip") == 0) {
+            out_features->trap_ip = true;
+        }
+        else if (strcmp(name, "values") == 0) {
+            out_features->values = true;
+        }
+        else {
+            printf("Unsupported feature %s\n", name);
+            free(names);
+            return false;
+        }
+    }
+
+    free(names);
+    return true;
+}
+
+static bool
+can_enable_tiny_frame(const AOTCompOption *opt)
+{
+    return !(opt->call_stack_features.values || opt->enable_gc
+             || opt->enable_perf_profiling); /* any one of these forbids it */
+}
+
 static uint32
 resolve_segue_flags(char *str_flags)
 {
@@ -357,6 +410,7 @@ main(int argc, char *argv[])
     option.enable_bulk_memory = true;
     option.enable_ref_types = true;
     option.enable_gc = false;
+    aot_call_stack_features_init_default(&option.call_stack_features);
 
     /* Process options */
     for (argc--, argv++; argc > 0 && argv[0][0] == '-'; argc--, argv++) {
@@ -470,10 +524,23 @@ main(int argc, char *argv[])
             option.enable_aux_stack_check = false;
         }
         else if (!strcmp(argv[0], "--enable-dump-call-stack")) {
-            option.enable_aux_stack_frame = true;
+            option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
+        }
+        else if (!strncmp(argv[0], "--call-stack-features=", 22)) {
+            /* Reset all the features, only enable the user-defined ones */
+            memset(&option.call_stack_features, 0,
+                   sizeof(AOTCallStackFeatures));
+
+            if (argv[0][22] != '\0') {
+                if (!parse_call_stack_features(argv[0] + 22,
+                                               &option.call_stack_features)) {
+                    printf("Failed to parse call-stack-features\n");
+                    PRINT_HELP_AND_EXIT();
+                }
+            }
         }
         else if (!strcmp(argv[0], "--enable-perf-profiling")) {
-            option.enable_aux_stack_frame = true;
+            option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
             option.enable_perf_profiling = true;
         }
         else if (!strcmp(argv[0], "--enable-memory-profiling")) {
@@ -488,7 +555,7 @@ main(int argc, char *argv[])
             option.is_indirect_mode = true;
         }
         else if (!strcmp(argv[0], "--enable-gc")) {
-            option.enable_aux_stack_frame = true;
+            option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_STANDARD;
             option.enable_gc = true;
         }
         else if (!strcmp(argv[0], "--disable-llvm-intrinsics")) {
@@ -590,6 +657,14 @@ main(int argc, char *argv[])
     if (!use_dummy_wasm && (argc == 0 || !out_file_name))
         PRINT_HELP_AND_EXIT();
 
+    if (option.aux_stack_frame_type == AOT_STACK_FRAME_TYPE_STANDARD
+        && can_enable_tiny_frame(&option)) {
+        LOG_VERBOSE("Use tiny frame mode for stack frames");
+        option.aux_stack_frame_type = AOT_STACK_FRAME_TYPE_TINY;
+        /* for now we only enable frame per function for a TINY frame mode */
+        option.call_stack_features.frame_per_function = true;
+    }
+
     if (!size_level_set) {
         /**
          * Set opt level to 1 by default for Windows and MacOS as
@@ -601,8 +676,8 @@ main(int argc, char *argv[])
             LOG_VERBOSE("Set size level to 1 for Windows AOT file");
             option.size_level = 1;
         }
-#if defined(_WIN32) || defined(_WIN32_) || defined(__APPLE__) \
-    || defined(__MACH__)
+#if defined(_WIN32) || defined(_WIN32_) \
+    || ((defined(__APPLE__) || defined(__MACH__)) && !defined(__arm64__))
         if (!option.target_arch && !option.target_abi) {
             LOG_VERBOSE("Set size level to 1 for Windows or MacOS AOT file");
             option.size_level = 1;
@@ -610,6 +685,12 @@ main(int argc, char *argv[])
 #endif
     }
 
+    if (option.enable_gc && !option.call_stack_features.values) {
+        LOG_WARNING("Call stack feature 'values' must be enabled for GC. The "
+                    "feature will be enabled automatically.");
+        option.call_stack_features.values = true;
+    }
+
     if (sgx_mode) {
         option.size_level = 1;
         option.is_sgx_platform = true;