diff --git a/.github/workflows/compilation_on_android_ubuntu.yml b/.github/workflows/compilation_on_android_ubuntu.yml
index aa366833b..2a57f6219 100644
--- a/.github/workflows/compilation_on_android_ubuntu.yml
+++ b/.github/workflows/compilation_on_android_ubuntu.yml
@@ -10,7 +10,8 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_android_ubuntu.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +27,8 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_android_ubuntu.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -338,7 +340,9 @@ jobs:
       ]
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
+        sanitizer: ["", "ubsan"]
         make_options: [
             # Running mode
             $AOT_BUILD_OPTIONS,
@@ -363,6 +367,7 @@ jobs:
             llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2004.outputs.cache_key }}
           - os: ubuntu-22.04
             llvm_cache_key: ${{ needs.build_llvm_libraries_on_ubuntu_2204.outputs.cache_key }}
+
     steps:
       - name: checkout
         uses: actions/checkout@v3
@@ -395,15 +400,16 @@ jobs:
         if: (!endsWith(matrix.make_options, '_INTERP_BUILD_OPTIONS'))
         run: |
           mkdir build && cd build
-          cmake ..
+          cmake -DSANITIZER="${{matrix.sanitizer}}" ..
           cmake --build . --config Release --parallel 4
         working-directory: wamr-compiler
 
       - name: Build Sample [wasm-c-api]
         run: |
-          cmake -S . -B build ${{ matrix.make_options }}
+          export VERBOSE=1 # exported so the cmake --build child process sees it and prints full compile commands
+          cmake -S . -B build ${{ matrix.make_options }} -DSANITIZER="${{matrix.sanitizer}}"
           cmake --build build --config Release --parallel 4
-          ctest --test-dir build
+          ctest --test-dir build --output-on-failure
         working-directory: samples/wasm-c-api
 
   build_samples_others:
diff --git a/.github/workflows/compilation_on_macos.yml b/.github/workflows/compilation_on_macos.yml
index ec81773f4..aaa97d038 100644
--- a/.github/workflows/compilation_on_macos.yml
+++ b/.github/workflows/compilation_on_macos.yml
@@ -10,7 +10,8 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_macos.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +27,8 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_macos.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
diff --git a/.github/workflows/compilation_on_nuttx.yml b/.github/workflows/compilation_on_nuttx.yml
index c8553a123..f338c8dea 100644
--- a/.github/workflows/compilation_on_nuttx.yml
+++ b/.github/workflows/compilation_on_nuttx.yml
@@ -10,7 +10,7 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_nuttx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +26,7 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_nuttx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
diff --git a/.github/workflows/compilation_on_sgx.yml b/.github/workflows/compilation_on_sgx.yml
index 985674f6e..f17261118 100644
--- a/.github/workflows/compilation_on_sgx.yml
+++ b/.github/workflows/compilation_on_sgx.yml
@@ -10,7 +10,8 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_sgx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +27,8 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/build_llvm_libraries.yml"
+      - ".github/workflows/compilation_on_sgx.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
diff --git a/.github/workflows/compilation_on_windows.yml b/.github/workflows/compilation_on_windows.yml
index 1cf06d626..0d38e8ae5 100644
--- a/.github/workflows/compilation_on_windows.yml
+++ b/.github/workflows/compilation_on_windows.yml
@@ -10,7 +10,7 @@ on:
       - opened
       - synchronize
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_windows.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
@@ -26,7 +26,7 @@ on:
       - main
       - "dev/**"
     paths:
-      - ".github/**"
+      - ".github/workflows/compilation_on_windows.yml"
       - "build-scripts/**"
       - "core/**"
       - "!core/deps/**"
diff --git a/README.md b/README.md
index 6d7d4778b..8cbdcf495 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 
 **[Guide](https://wamr.gitbook.io/)**&emsp;&emsp;**[Website](https://bytecodealliance.github.io/wamr.dev)**&emsp;&emsp;**[Chat](https://bytecodealliance.zulipchat.com/#narrow/stream/290350-wamr)**
 
-[Build WAMR](./doc/build_wamr.md) | [Build AOT Compiler](./wamr-compiler/README.md) | [Embed WAMR](./doc/embed_wamr.md) | [Export Native API](./doc/export_native_api.md) | [Build Wasm Apps](./doc/build_wasm_app.md) | [Samples](./README.md#samples)
+[Build WAMR](./doc/build_wamr.md) | [Build AOT Compiler](./wamr-compiler/README.md) | [Embed WAMR](./doc/embed_wamr.md) | [Export Native API](./doc/export_native_api.md) | [Build Wasm Apps](./doc/build_wasm_app.md) | [Samples](./samples/README.md)
 
 WebAssembly Micro Runtime (WAMR) is a lightweight standalone WebAssembly (Wasm) runtime with small footprint, high performance and highly configurable features for applications cross from embedded, IoT, edge to Trusted Execution Environment (TEE), smart contract, cloud native and so on. It includes a few parts as below:
 - [**VMcore**](./core/iwasm/): A set of runtime libraries for loading and running Wasm modules. It supports several execution modes including interpreter, Ahead-of-Time compilation(AoT) and Just-in-Time compilation (JIT). The WAMR supports two JIT tiers - Fast JIT, LLVM JIT, and dynamic tier-up from Fast JIT to LLVM JIT.
diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake
index 3e2111db6..bfdbe3aab 100644
--- a/build-scripts/config_common.cmake
+++ b/build-scripts/config_common.cmake
@@ -345,6 +345,13 @@ if (WAMR_BUILD_WASI_NN EQUAL 1)
       message ("     WASI-NN: GPU enabled")
       add_definitions (-DWASI_NN_ENABLE_GPU=1)
   endif ()
+  if (WAMR_BUILD_WASI_NN_ENABLE_EXT EQUAL 1)
+      message ("     WASI-NN: External Delegation enabled")
+      add_definitions (-DWASI_NN_ENABLE_EXTERNAL_DELEGATE=1)
+  endif ()
+  if (DEFINED WASI_NN_EXT_DELEGATE_PATH)
+      add_definitions (-DWASI_NN_EXT_DELEGATE_PATH="${WASI_NN_EXT_DELEGATE_PATH}")
+  endif ()
 endif ()
 if (WAMR_BUILD_ALLOC_WITH_USER_DATA EQUAL 1)
   add_definitions(-DWASM_MEM_ALLOC_WITH_USER_DATA=1)
diff --git a/build-scripts/runtime_lib.cmake b/build-scripts/runtime_lib.cmake
index 80ca85c25..6931ece74 100644
--- a/build-scripts/runtime_lib.cmake
+++ b/build-scripts/runtime_lib.cmake
@@ -101,26 +101,6 @@ if (WAMR_BUILD_LIB_PTHREAD_SEMAPHORE EQUAL 1)
 endif ()
 
 if (WAMR_BUILD_WASI_NN EQUAL 1)
-    if (NOT EXISTS "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
-        execute_process(COMMAND ${WAMR_ROOT_DIR}/core/deps/install_tensorflow.sh
-                        RESULT_VARIABLE TENSORFLOW_RESULT
-        )
-    else ()
-        message("Tensorflow is already downloaded.")
-    endif()
-    set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
-
-    if (WASI_NN_ENABLE_GPU EQUAL 1)
-        # Tensorflow specific:
-        # * https://www.tensorflow.org/lite/guide/build_cmake#available_options_to_build_tensorflow_lite
-        set (TFLITE_ENABLE_GPU ON)
-    endif ()
-
-    include_directories (${CMAKE_CURRENT_BINARY_DIR}/flatbuffers/include)
-    include_directories (${TENSORFLOW_SOURCE_DIR})
-    add_subdirectory(
-        "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite"
-        "${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite" EXCLUDE_FROM_ALL)
     include (${IWASM_DIR}/libraries/wasi-nn/wasi_nn.cmake)
 endif ()
 
diff --git a/core/iwasm/common/wasm_c_api.c b/core/iwasm/common/wasm_c_api.c
index 639980ca2..7b8cf4779 100644
--- a/core/iwasm/common/wasm_c_api.c
+++ b/core/iwasm/common/wasm_c_api.c
@@ -23,6 +23,9 @@
 #if WASM_ENABLE_WASM_CACHE != 0
 #include <openssl/sha.h>
 #endif
+#if WASM_ENABLE_THREAD_MGR != 0
+#include "thread_manager.h"
+#endif
 
 /*
  * Thread Model:
@@ -3315,7 +3318,17 @@ wasm_func_call(const wasm_func_t *func, const wasm_val_vec_t *params,
         goto failed;
     }
 
-    exec_env = wasm_runtime_get_exec_env_singleton(func->inst_comm_rt);
+#ifdef OS_ENABLE_HW_BOUND_CHECK
+    exec_env = wasm_runtime_get_exec_env_tls();
+#endif
+#if WASM_ENABLE_THREAD_MGR != 0
+    if (!exec_env) {
+        exec_env = wasm_clusters_search_exec_env(func->inst_comm_rt);
+    }
+#endif
+    if (!exec_env) {
+        exec_env = wasm_runtime_get_exec_env_singleton(func->inst_comm_rt);
+    }
     if (!exec_env) {
         goto failed;
     }
diff --git a/core/iwasm/compilation/aot_llvm.c b/core/iwasm/compilation/aot_llvm.c
index 27550560f..dc3fe7f59 100644
--- a/core/iwasm/compilation/aot_llvm.c
+++ b/core/iwasm/compilation/aot_llvm.c
@@ -4,6 +4,7 @@
  */
 
 #include "aot_llvm.h"
+#include "aot_llvm_extra2.h"
 #include "aot_compiler.h"
 #include "aot_emit_exception.h"
 #include "../aot/aot_runtime.h"
@@ -2055,9 +2056,10 @@ aot_create_comp_context(AOTCompData *comp_data, aot_comp_option_t option)
             code_model = LLVMCodeModelSmall;
 
         /* Create the target machine */
-        if (!(comp_ctx->target_machine = LLVMCreateTargetMachine(
+        if (!(comp_ctx->target_machine = LLVMCreateTargetMachineWithOpts(
                   target, triple_norm, cpu, features, opt_level,
-                  LLVMRelocStatic, code_model))) {
+                  LLVMRelocStatic, code_model, false,
+                  option->stack_usage_file))) {
             aot_set_last_error("create LLVM target machine failed.");
             goto fail;
         }
diff --git a/core/iwasm/compilation/aot_llvm.h b/core/iwasm/compilation/aot_llvm.h
index b982e8083..2a1564019 100644
--- a/core/iwasm/compilation/aot_llvm.h
+++ b/core/iwasm/compilation/aot_llvm.h
@@ -415,6 +415,7 @@ typedef struct AOTCompOption {
     uint32 stack_bounds_checks;
     char **custom_sections;
     uint32 custom_sections_count;
+    const char *stack_usage_file;
 } AOTCompOption, *aot_comp_option_t;
 
 bool
diff --git a/core/iwasm/compilation/aot_llvm_extra2.cpp b/core/iwasm/compilation/aot_llvm_extra2.cpp
new file mode 100644
index 000000000..9bd44bbff
--- /dev/null
+++ b/core/iwasm/compilation/aot_llvm_extra2.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c)2023 YAMAMOTO Takashi.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <llvm-c/TargetMachine.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include "bh_assert.h"
+
+#include "aot_llvm_extra2.h"
+
+static llvm::Optional<llvm::Reloc::Model>
+convert(LLVMRelocMode reloc_mode)
+{
+    switch (reloc_mode) {
+        case LLVMRelocDefault:
+            return llvm::None;
+        case LLVMRelocStatic:
+            return llvm::Reloc::Static;
+        case LLVMRelocPIC:
+            return llvm::Reloc::PIC_;
+        case LLVMRelocDynamicNoPic:
+            return llvm::Reloc::DynamicNoPIC;
+        case LLVMRelocROPI:
+            return llvm::Reloc::ROPI;
+        case LLVMRelocRWPI:
+            return llvm::Reloc::RWPI;
+        case LLVMRelocROPI_RWPI:
+            return llvm::Reloc::ROPI_RWPI;
+    }
+    bh_assert(0);
+    return llvm::None;
+}
+
+static llvm::CodeGenOpt::Level
+convert(LLVMCodeGenOptLevel opt_level)
+{
+    switch (opt_level) {
+        case LLVMCodeGenLevelNone:
+            return llvm::CodeGenOpt::None;
+        case LLVMCodeGenLevelLess:
+            return llvm::CodeGenOpt::Less;
+        case LLVMCodeGenLevelDefault:
+            return llvm::CodeGenOpt::Default;
+        case LLVMCodeGenLevelAggressive:
+            return llvm::CodeGenOpt::Aggressive;
+    }
+    bh_assert(0);
+    return llvm::CodeGenOpt::None;
+}
+
+static llvm::Optional<llvm::CodeModel::Model>
+convert(LLVMCodeModel code_model, bool *jit)
+{
+    *jit = false;
+    switch (code_model) {
+        case LLVMCodeModelDefault:
+            return llvm::None;
+        case LLVMCodeModelJITDefault:
+            *jit = true;
+            return llvm::None;
+        case LLVMCodeModelTiny:
+            return llvm::CodeModel::Tiny;
+        case LLVMCodeModelSmall:
+            return llvm::CodeModel::Small;
+        case LLVMCodeModelKernel:
+            return llvm::CodeModel::Kernel;
+        case LLVMCodeModelMedium:
+            return llvm::CodeModel::Medium;
+        case LLVMCodeModelLarge:
+            return llvm::CodeModel::Large;
+    }
+    bh_assert(0);
+    return llvm::None;
+}
+
+LLVMTargetMachineRef
+LLVMCreateTargetMachineWithOpts(LLVMTargetRef ctarget, const char *triple,
+                                const char *cpu, const char *features,
+                                LLVMCodeGenOptLevel opt_level,
+                                LLVMRelocMode reloc_mode,
+                                LLVMCodeModel code_model,
+                                bool EmitStackSizeSection,
+                                const char *StackUsageOutput)
+{
+    llvm::TargetOptions opts;
+
+    // -fstack-size-section equiv
+    // emit it to ".stack_sizes" section in case of ELF
+    // you can read it with "llvm-readobj --stack-sizes"
+    opts.EmitStackSizeSection = EmitStackSizeSection;
+
+    // -fstack-usage equiv
+    if (StackUsageOutput != NULL) {
+        opts.StackUsageOutput = StackUsageOutput;
+    }
+
+    auto target = reinterpret_cast<llvm::Target *>(ctarget);
+    auto rm = convert(reloc_mode);
+    auto ol = convert(opt_level);
+    bool jit;
+    auto cm = convert(code_model, &jit);
+    auto targetmachine = target->createTargetMachine(triple, cpu, features,
+                                                     opts, rm, cm, ol, jit);
+    return reinterpret_cast<LLVMTargetMachineRef>(targetmachine);
+}
diff --git a/core/iwasm/compilation/aot_llvm_extra2.h b/core/iwasm/compilation/aot_llvm_extra2.h
new file mode 100644
index 000000000..ef99622a4
--- /dev/null
+++ b/core/iwasm/compilation/aot_llvm_extra2.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c)2023 YAMAMOTO Takashi.  All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <llvm-c/TargetMachine.h>
+
+LLVM_C_EXTERN_C_BEGIN
+LLVMTargetMachineRef
+LLVMCreateTargetMachineWithOpts(LLVMTargetRef ctarget, const char *triple,
+                                const char *cpu, const char *features,
+                                LLVMCodeGenOptLevel opt_level,
+                                LLVMRelocMode reloc_mode,
+                                LLVMCodeModel code_model,
+                                bool EmitStackSizeSection,
+                                const char *StackUsageOutput);
+LLVM_C_EXTERN_C_END
diff --git a/core/iwasm/include/aot_export.h b/core/iwasm/include/aot_export.h
index 792a4baa9..e58873bfd 100644
--- a/core/iwasm/include/aot_export.h
+++ b/core/iwasm/include/aot_export.h
@@ -63,6 +63,7 @@ typedef struct AOTCompOption {
     uint32_t stack_bounds_checks;
     char **custom_sections;
     uint32_t custom_sections_count;
+    const char *stack_usage_file;
 } AOTCompOption, *aot_comp_option_t;
 
 bool
diff --git a/core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c b/core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c
index fc82e969e..ae1fd94f7 100644
--- a/core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c
+++ b/core/iwasm/libraries/lib-pthread/lib_pthread_wrapper.c
@@ -594,45 +594,8 @@ pthread_create_wrapper(wasm_exec_env_t exec_env,
         wasm_runtime_set_wasi_ctx(new_module_inst, wasi_ctx);
 #endif
 
-    /* workaround about passing instantiate-linking information */
-    {
-        CApiFuncImport *c_api_func_imports;
-        uint32 import_func_count = 0;
-        uint32 size_in_bytes = 0;
-
-#if WASM_ENABLE_INTERP != 0
-        if (module_inst->module_type == Wasm_Module_Bytecode) {
-            new_c_api_func_imports = &(
-                ((WASMModuleInstance *)new_module_inst)->e->c_api_func_imports);
-            c_api_func_imports =
-                ((WASMModuleInstance *)module_inst)->e->c_api_func_imports;
-            import_func_count = ((WASMModule *)module)->import_function_count;
-        }
-#endif
-#if WASM_ENABLE_AOT != 0
-        if (module_inst->module_type == Wasm_Module_AoT) {
-            AOTModuleInstanceExtra *e =
-                (AOTModuleInstanceExtra *)((AOTModuleInstance *)new_module_inst)
-                    ->e;
-            new_c_api_func_imports = &(e->c_api_func_imports);
-
-            e = (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst)->e;
-            c_api_func_imports = e->c_api_func_imports;
-
-            import_func_count = ((AOTModule *)module)->import_func_count;
-        }
-#endif
-
-        if (import_func_count != 0 && c_api_func_imports) {
-            size_in_bytes = sizeof(CApiFuncImport *) * import_func_count;
-            *new_c_api_func_imports = wasm_runtime_malloc(size_in_bytes);
-            if (!(*new_c_api_func_imports))
-                goto fail;
-
-            bh_memcpy_s(*new_c_api_func_imports, size_in_bytes,
-                        c_api_func_imports, size_in_bytes);
-        }
-    }
+    if (!(wasm_cluster_dup_c_api_imports(new_module_inst, module_inst)))
+        goto fail;
 
     if (!(info_node = wasm_runtime_malloc(sizeof(ThreadInfoNode))))
         goto fail;
diff --git a/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c b/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
index 30d66076d..6b36c9073 100644
--- a/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
+++ b/core/iwasm/libraries/lib-wasi-threads/lib_wasi_threads_wrapper.c
@@ -96,6 +96,9 @@ thread_spawn_wrapper(wasm_exec_env_t exec_env, uint32 start_arg)
     wasm_runtime_set_custom_data_internal(
         new_module_inst, wasm_runtime_get_custom_data(module_inst));
 
+    if (!(wasm_cluster_dup_c_api_imports(new_module_inst, module_inst)))
+        goto thread_preparation_fail;
+
 #if WASM_ENABLE_LIBC_WASI != 0
     wasi_ctx = wasm_runtime_get_wasi_ctx(module_inst);
     if (wasi_ctx)
diff --git a/core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.c b/core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.c
new file mode 100644
index 000000000..69e125d40
--- /dev/null
+++ b/core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2023 Amazon.com Inc. or its affiliates. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef __wasi__
+#error This example only compiles to WASM/WASI target
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "wasi_thread_start.h"
+
+enum CONSTANTS {
+    SECOND = 1000 * 1000 * 1000, /* 1 second */
+    TIMEOUT = 1LL * SECOND
+};
+
+typedef struct {
+    start_args_t base;
+} shared_t;
+
+void
+__wasi_thread_start_C(int thread_id, int *start_arg)
+{
+    /* Wait so that the exception is raised after the main thread has finished
+     * already */
+    __builtin_wasm_memory_atomic_wait32(NULL, 0, TIMEOUT);
+    __builtin_trap();
+}
+
+int
+main(int argc, char **argv)
+{
+    shared_t data = { 0 };
+
+    assert(start_args_init(&data.base));
+    int thread_id = __wasi_thread_spawn(&data);
+    assert(thread_id > 0 && "Thread creation failed");
+
+    return EXIT_SUCCESS;
+}
diff --git a/core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.json b/core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.json
new file mode 100644
index 000000000..9dc1e30d2
--- /dev/null
+++ b/core/iwasm/libraries/lib-wasi-threads/test/trap_after_main_thread_finishes.json
@@ -0,0 +1,3 @@
+{
+  "exit_code": 1
+}
diff --git a/core/iwasm/libraries/libc-wasi/libc_wasi_wrapper.c b/core/iwasm/libraries/libc-wasi/libc_wasi_wrapper.c
index ab2808c6e..afb11925a 100644
--- a/core/iwasm/libraries/libc-wasi/libc_wasi_wrapper.c
+++ b/core/iwasm/libraries/libc-wasi/libc_wasi_wrapper.c
@@ -56,8 +56,14 @@ typedef struct WASIContext *wasi_ctx_t;
 wasi_ctx_t
 wasm_runtime_get_wasi_ctx(wasm_module_inst_t module_inst);
 
-static inline size_t
-min(size_t a, size_t b)
+static inline uint64_t
+min_uint64(uint64_t a, uint64_t b)
+{
+    return a > b ? b : a;
+}
+
+static inline uint32_t
+min_uint32(uint32_t a, uint32_t b)
 {
     return a > b ? b : a;
 }
@@ -962,7 +968,7 @@ get_timeout_for_poll_oneoff(const wasi_subscription_t *in,
         const __wasi_subscription_t *s = &in[i];
         if (s->u.type == __WASI_EVENTTYPE_CLOCK
             && (s->u.u.clock.flags & __WASI_SUBSCRIPTION_CLOCK_ABSTIME) == 0) {
-            timeout = min(timeout, s->u.u.clock.timeout);
+            timeout = min_uint64(timeout, s->u.u.clock.timeout);
         }
     }
     return timeout;
@@ -1016,8 +1022,8 @@ execute_interruptible_poll_oneoff(
 
     while (timeout == (__wasi_timestamp_t)-1 || elapsed <= timeout) {
         /* update timeout for clock subscription events */
-        update_clock_subscription_data(in_copy, nsubscriptions,
-                                       min(time_quant, timeout - elapsed));
+        update_clock_subscription_data(
+            in_copy, nsubscriptions, min_uint64(time_quant, timeout - elapsed));
         err = wasmtime_ssp_poll_oneoff(curfds, in_copy, out, nsubscriptions,
                                        nevents);
         elapsed += time_quant;
@@ -1999,7 +2005,7 @@ copy_buffer_to_iovec_app(wasm_module_inst_t module_inst, uint8 *buf_begin,
          * only copy the amount in the app buffer. Otherwise, we fill the iovec
          * buffer and reduce size to copy on the next iteration
          */
-        size_to_copy_into_iovec = min(data->buf_len, size_to_copy);
+        size_to_copy_into_iovec = min_uint32(data->buf_len, size_to_copy);
 
         native_addr = (void *)addr_app_to_native(data->buf_offset);
         bh_memcpy_s(native_addr, size_to_copy_into_iovec, buf,
diff --git a/core/iwasm/libraries/thread-mgr/thread_manager.c b/core/iwasm/libraries/thread-mgr/thread_manager.c
index bfb89c089..9303eb3f5 100644
--- a/core/iwasm/libraries/thread-mgr/thread_manager.c
+++ b/core/iwasm/libraries/thread-mgr/thread_manager.c
@@ -733,6 +733,55 @@ fail1:
     return -1;
 }
 
+bool
+wasm_cluster_dup_c_api_imports(WASMModuleInstanceCommon *module_inst_dst,
+                               const WASMModuleInstanceCommon *module_inst_src)
+{
+    /* workaround about passing instantiate-linking information */
+    CApiFuncImport **new_c_api_func_imports = NULL;
+    CApiFuncImport *c_api_func_imports;
+    uint32 import_func_count = 0;
+    uint32 size_in_bytes = 0;
+
+#if WASM_ENABLE_INTERP != 0
+    if (module_inst_src->module_type == Wasm_Module_Bytecode) {
+        new_c_api_func_imports =
+            &(((WASMModuleInstance *)module_inst_dst)->e->c_api_func_imports);
+        c_api_func_imports = ((const WASMModuleInstance *)module_inst_src)
+                                 ->e->c_api_func_imports;
+        import_func_count =
+            ((WASMModule *)(((const WASMModuleInstance *)module_inst_src)
+                                ->module))
+                ->import_function_count;
+    }
+#endif
+#if WASM_ENABLE_AOT != 0
+    if (module_inst_src->module_type == Wasm_Module_AoT) {
+        AOTModuleInstanceExtra *e =
+            (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst_dst)->e;
+        new_c_api_func_imports = &(e->c_api_func_imports);
+
+        e = (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst_src)->e;
+        c_api_func_imports = e->c_api_func_imports;
+
+        import_func_count =
+            ((AOTModule *)(((AOTModuleInstance *)module_inst_src)->module))
+                ->import_func_count;
+    }
+#endif
+
+    if (import_func_count != 0 && c_api_func_imports) {
+        size_in_bytes = sizeof(CApiFuncImport) * import_func_count;
+        *new_c_api_func_imports = wasm_runtime_malloc(size_in_bytes);
+        if (!(*new_c_api_func_imports))
+            return false;
+
+        bh_memcpy_s(*new_c_api_func_imports, size_in_bytes, c_api_func_imports,
+                    size_in_bytes);
+    }
+    return true;
+}
+
 #if WASM_ENABLE_DEBUG_INTERP != 0
 WASMCurrentEnvStatus *
 wasm_cluster_create_exenv_status()
diff --git a/core/iwasm/libraries/thread-mgr/thread_manager.h b/core/iwasm/libraries/thread-mgr/thread_manager.h
index 899a4cc21..2060869c2 100644
--- a/core/iwasm/libraries/thread-mgr/thread_manager.h
+++ b/core/iwasm/libraries/thread-mgr/thread_manager.h
@@ -74,6 +74,11 @@ wasm_cluster_destroy(WASMCluster *cluster);
 WASMCluster *
 wasm_exec_env_get_cluster(WASMExecEnv *exec_env);
 
+/* Forward registered functions to a new thread */
+bool
+wasm_cluster_dup_c_api_imports(WASMModuleInstanceCommon *module_inst_dst,
+                               const WASMModuleInstanceCommon *module_inst_src);
+
 int32
 wasm_cluster_create_thread(WASMExecEnv *exec_env,
                            wasm_module_inst_t module_inst, bool alloc_aux_stack,
diff --git a/core/iwasm/libraries/wasi-nn/README.md b/core/iwasm/libraries/wasi-nn/README.md
index c5762618d..ac737c281 100644
--- a/core/iwasm/libraries/wasi-nn/README.md
+++ b/core/iwasm/libraries/wasi-nn/README.md
@@ -24,6 +24,7 @@ Build the runtime image for your execution target type.
 `EXECUTION_TYPE` can be:
 * `cpu`
 * `nvidia-gpu`
+* `vx-delegate`
 
 ```
 EXECUTION_TYPE=cpu
@@ -71,6 +72,18 @@ docker run \
     /assets/test_tensorflow.wasm
 ```
 
+* vx-delegate for NPU (x86 simulator)
+
+```
+docker run \
+    -v $PWD/core/iwasm/libraries/wasi-nn/test:/assets wasi-nn-vx-delegate \
+    --dir=/assets \
+    --env="TARGET=gpu" \
+    /assets/test_tensorflow.wasm
+```
+
+
+
 Requirements:
 * [NVIDIA docker](https://github.com/NVIDIA/nvidia-docker).
 
diff --git a/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake b/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake
new file mode 100644
index 000000000..bbeac3b14
--- /dev/null
+++ b/core/iwasm/libraries/wasi-nn/cmake/Findtensorflow_lite.cmake
@@ -0,0 +1,41 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+find_library(TENSORFLOW_LITE 
+     NAMES tensorflow-lite
+)
+
+if(NOT EXISTS ${TENSORFLOW_LITE})
+    if (NOT EXISTS "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
+        execute_process(COMMAND ${WAMR_ROOT_DIR}/core/deps/install_tensorflow.sh
+                        RESULT_VARIABLE TENSORFLOW_RESULT
+        )
+    else ()
+        message("Tensorflow is already downloaded.")
+    endif()
+    set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")
+
+    if (WASI_NN_ENABLE_GPU EQUAL 1)
+    # Tensorflow specific:
+    # * https://www.tensorflow.org/lite/guide/build_cmake#available_options_to_build_tensorflow_lite
+    set (TFLITE_ENABLE_GPU ON)
+    endif ()
+
+    include_directories (${CMAKE_CURRENT_BINARY_DIR}/flatbuffers/include)
+    include_directories (${TENSORFLOW_SOURCE_DIR})
+    add_subdirectory(
+        "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite"
+        "${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite" EXCLUDE_FROM_ALL) 
+
+else()
+    find_path(TENSORFLOW_LITE_INCLUDE_DIR
+    NAMES tensorflow/lite/interpreter.h
+    )
+    find_path(FLATBUFFER_INCLUDE_DIR
+    NAMES flatbuffers/flatbuffers.h
+    )
+    include_directories (${TENSORFLOW_LITE_INCLUDE_DIR})
+    include_directories (${FLATBUFFER_INCLUDE_DIR})    
+endif()
+
diff --git a/core/iwasm/libraries/wasi-nn/src/wasi_nn_tensorflowlite.cpp b/core/iwasm/libraries/wasi-nn/src/wasi_nn_tensorflowlite.cpp
index 0fe156381..dfd21787c 100644
--- a/core/iwasm/libraries/wasi-nn/src/wasi_nn_tensorflowlite.cpp
+++ b/core/iwasm/libraries/wasi-nn/src/wasi_nn_tensorflowlite.cpp
@@ -21,6 +21,10 @@
 #include <tensorflow/lite/delegates/gpu/delegate.h>
 #endif
 
+#if defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+#include <tensorflow/lite/delegates/external/external_delegate.h>
+#endif
+
 /* Maximum number of graphs per WASM instance */
 #define MAX_GRAPHS_PER_INST 10
 /* Maximum number of graph execution context per WASM instance*/
@@ -42,6 +46,7 @@ typedef struct {
     uint32_t current_interpreters;
     Interpreter interpreters[MAX_GRAPH_EXEC_CONTEXTS_PER_INST];
     korp_mutex g_lock;
+    TfLiteDelegate *delegate;
 } TFLiteContext;
 
 /* Utils */
@@ -194,18 +199,40 @@ tensorflowlite_init_execution_context(void *tflite_ctx, graph g,
 #if defined(WASI_NN_ENABLE_GPU)
             NN_WARN_PRINTF("GPU enabled.");
             // https://www.tensorflow.org/lite/performance/gpu
-            auto options = TfLiteGpuDelegateOptionsV2Default();
+            TfLiteGpuDelegateOptionsV2 options =
+                TfLiteGpuDelegateOptionsV2Default();
             options.inference_preference =
                 TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
             options.inference_priority1 =
                 TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
-            auto *delegate = TfLiteGpuDelegateV2Create(&options);
+            tfl_ctx->delegate = TfLiteGpuDelegateV2Create(&options);
+            if (tfl_ctx->delegate == NULL) {
+                NN_ERR_PRINTF("Error when generating GPU delegate.");
+                use_default = true;
+                return missing_memory;
+            }
             if (tfl_ctx->interpreters[*ctx]
-                    .interpreter->ModifyGraphWithDelegate(delegate)
+                    .interpreter->ModifyGraphWithDelegate(tfl_ctx->delegate)
                 != kTfLiteOk) {
                 NN_ERR_PRINTF("Error when enabling GPU delegate.");
                 use_default = true;
             }
+#elif defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+            NN_WARN_PRINTF("external delegation enabled.");
+            TfLiteExternalDelegateOptions options =
+                TfLiteExternalDelegateOptionsDefault(WASI_NN_EXT_DELEGATE_PATH);
+            tfl_ctx->delegate = TfLiteExternalDelegateCreate(&options);
+            if (tfl_ctx->delegate == NULL) {
+                NN_ERR_PRINTF("Error when generating External delegate.");
+                use_default = true;
+                return missing_memory;
+            }
+            if (tfl_ctx->interpreters[*ctx]
+                    .interpreter->ModifyGraphWithDelegate(tfl_ctx->delegate)
+                != kTfLiteOk) {
+                NN_ERR_PRINTF("Error when enabling External delegate.");
+                use_default = true;
+            }
 #else
             NN_WARN_PRINTF("GPU not enabled.");
             use_default = true;
@@ -350,6 +377,8 @@ tensorflowlite_initialize(void **tflite_ctx)
         NN_ERR_PRINTF("Error while initializing the lock");
     }
 
+    tfl_ctx->delegate = NULL;
+
     *tflite_ctx = (void *)tfl_ctx;
 }
 
@@ -364,6 +393,14 @@ tensorflowlite_destroy(void *tflite_ctx)
     */
     TFLiteContext *tfl_ctx = (TFLiteContext *)tflite_ctx;
 
+    if (tfl_ctx->delegate != NULL) {
+#if defined(WASI_NN_ENABLE_GPU)
+        TfLiteGpuDelegateV2Delete(tfl_ctx->delegate);
+#elif defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+        TfLiteExternalDelegateDelete(tfl_ctx->delegate);
+#endif
+    }
+
     NN_DBG_PRINTF("Freeing memory.");
     for (int i = 0; i < MAX_GRAPHS_PER_INST; ++i) {
         tfl_ctx->models[i].model.reset();
diff --git a/core/iwasm/libraries/wasi-nn/test/Dockerfile.vx-delegate b/core/iwasm/libraries/wasi-nn/test/Dockerfile.vx-delegate
new file mode 100644
index 000000000..89cc1a9de
--- /dev/null
+++ b/core/iwasm/libraries/wasi-nn/test/Dockerfile.vx-delegate
@@ -0,0 +1,99 @@
+# Copyright (C) 2019 Intel Corporation.  All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+FROM ubuntu:20.04 AS base
+# Keep apt/dpkg from prompting for input during the image build.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install the compiler toolchain, cmake, git and the other packages used by this image.
+RUN apt-get update && apt-get install -y \
+    cmake build-essential git curl libssl-dev python3
+
+
+# Fetch sources for the TensorFlow Lite VX delegate stack (default build targets the x86-64 simulator)
+WORKDIR /tmp
+RUN git clone https://github.com/VeriSilicon/TIM-VX.git tim-vx
+RUN git clone https://github.com/VeriSilicon/tflite-vx-delegate.git
+RUN git clone https://github.com/tensorflow/tensorflow.git
+
+
+# Build TIM-VX out-of-tree in host_build and install it under /usr/local
+WORKDIR /tmp/tim-vx/host_build
+RUN cmake -DCMAKE_INSTALL_PREFIX=/usr/local  ../
+RUN make -j$(grep -c ^processor /proc/cpuinfo)
+RUN make install
+
+WORKDIR /tmp/tim-vx
+#RUN mkdir -p prebuilt-sdk/x86_64_linux/lib/include
+#RUN cp prebuilt-sdk/x86_64_linux/include/CL prebuilt-sdk/x86_64_linux/lib/include -fr
+
+
+# Build TensorFlow Lite as shared libraries and install libs and headers under /usr/local
+WORKDIR /tmp/tensorflow/build
+RUN cmake \
+  -DBUILD_SHARED_LIBS=ON \
+  -DTFLITE_ENABLE_RUY=on \
+  -DTFLITE_ENABLE_NNAPI=off \
+  -DTFLITE_ENABLE_XNNPACK=on \
+  -DTFLITE_ENABLE_EXTERNAL_DELEGATE=on \
+  ../tensorflow/lite/
+RUN make -j$(grep -c ^processor /proc/cpuinfo)
+RUN make install
+RUN cp --no-preserve=ownership -d lib*.so* /usr/local/lib
+RUN cp -r --no-preserve=ownership -d flatbuffers/include/flatbuffers /usr/local/include
+# Install the TensorFlow Lite header files, preserving their directory layout
+RUN install -d /usr/local/include/tensorflow/lite && \
+    cd /tmp/tensorflow/tensorflow/lite && \
+    cp --parents \
+        $(find . -name "*.h*") \
+        /usr/local/include/tensorflow/lite
+# Install version.h from tensorflow/core/public
+RUN install -d /usr/local/include/tensorflow/core/public && \
+    cp /tmp/tensorflow/tensorflow/core/public/version.h /usr/local/include/tensorflow/core/public
+
+
+# Build the VX delegate (default build targets the x86-64 simulator) against the installed TFLite and TIM-VX
+WORKDIR /tmp/tflite-vx-delegate/build
+RUN cmake \
+   -DBUILD_SHARED_LIBS=ON \
+   -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=/tmp/tensorflow \
+   -DTFLITE_LIB_LOC=/usr/local/lib/libtensorflow-lite.so \
+   -DTIM_VX_INSTALL=/usr/local \
+   -DCMAKE_INSTALL_PREFIX=/usr/  \
+   ../
+RUN make vx_delegate -j$(grep -c ^processor /proc/cpuinfo)
+RUN make install
+RUN cp --no-preserve=ownership -d lib*.so* /usr/lib
+# Install the delegate's header files under /usr/local/include/tensorflow-lite-vx-delegate
+RUN install -d /usr/local/include/tensorflow-lite-vx-delegate && \
+    cd  /tmp/tflite-vx-delegate/ && \
+    cp --parents \
+        $(find . -name "*.h*") \
+        /usr/local/include/tensorflow-lite-vx-delegate
+
+ENV VIVANTE_SDK_DIR=/tmp/tim-vx/prebuilt-sdk/x86_64_linux/
+ENV VSIMULATOR_CONFIG=czl
+# No earlier ENV sets LD_LIBRARY_PATH, so it is not appended: an empty trailing entry would make the loader search the current directory.
+ENV LD_LIBRARY_PATH=/tmp/tim-vx/prebuilt-sdk/x86_64_linux/lib:/usr/local/lib:/lib/x86_64-linux-gnu/:/lib64/:/usr/lib
+
+
+# Build WASI-NN: copy the build context into /home/wamr and build the wasi-nn test iwasm
+WORKDIR /home/wamr
+
+COPY . .
+
+WORKDIR /home/wamr/core/iwasm/libraries/wasi-nn/test/build
+# CMake path lists are ';'-separated, so pass the /usr/local dirs without a ':' prefix.
+RUN cmake \
+    -DCMAKE_LIBRARY_PATH=/usr/local/lib/ \
+    -DCMAKE_INCLUDE_PATH=/usr/local/include/ \
+    -DWAMR_BUILD_WASI_NN=1 \
+    -DWAMR_BUILD_WASI_NN_ENABLE_EXT=1 \
+    -DWASI_NN_EXT_DELEGATE_PATH="/usr/lib/libvx_delegate.so" \
+    ..
+
+RUN make -j $(grep -c ^processor /proc/cpuinfo)
+
+RUN cp /home/wamr/core/iwasm/libraries/wasi-nn/test/build/iwasm /run/iwasm
+
+ENTRYPOINT [ "/run/iwasm" ]
diff --git a/core/iwasm/libraries/wasi-nn/wasi_nn.cmake b/core/iwasm/libraries/wasi-nn/wasi_nn.cmake
index 1bb53c086..019782c2e 100644
--- a/core/iwasm/libraries/wasi-nn/wasi_nn.cmake
+++ b/core/iwasm/libraries/wasi-nn/wasi_nn.cmake
@@ -1,6 +1,11 @@
 # Copyright (C) 2019 Intel Corporation.  All rights reserved.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake)
+
+# Find tensorflow-lite
+find_package(tensorflow_lite REQUIRED)
+
 set (WASI_NN_DIR ${CMAKE_CURRENT_LIST_DIR})
 
 include_directories (${WASI_NN_DIR})
diff --git a/core/shared/mem-alloc/ems/ems_alloc.c b/core/shared/mem-alloc/ems/ems_alloc.c
index 6f03fa58f..5c2a628a2 100644
--- a/core/shared/mem-alloc/ems/ems_alloc.c
+++ b/core/shared/mem-alloc/ems/ems_alloc.c
@@ -25,7 +25,7 @@ static bool
 remove_tree_node(gc_heap_t *heap, hmu_tree_node_t *p)
 {
     hmu_tree_node_t *q = NULL, **slot = NULL, *parent;
-    hmu_tree_node_t *root = &heap->kfc_tree_root;
+    hmu_tree_node_t *root = heap->kfc_tree_root;
     gc_uint8 *base_addr = heap->base_addr;
     gc_uint8 *end_addr = base_addr + heap->current_size;
 
@@ -38,13 +38,17 @@ remove_tree_node(gc_heap_t *heap, hmu_tree_node_t *p)
         goto fail;
     }
 
-    /* get the slot which holds pointer to node p*/
+    /* get the slot which holds pointer to node p */
     if (p == p->parent->right) {
-        slot = &p->parent->right;
+        /* Don't use `slot = &p->parent->right` to avoid compiler warning */
+        slot = (hmu_tree_node_t **)((uint8 *)p->parent
+                                    + offsetof(hmu_tree_node_t, right));
     }
     else if (p == p->parent->left) {
-        /* p should be a child of its parent*/
-        slot = &p->parent->left;
+        /* p should be a child of its parent */
+        /* Don't use `slot = &p->parent->left` to avoid compiler warning */
+        slot = (hmu_tree_node_t **)((uint8 *)p->parent
+                                    + offsetof(hmu_tree_node_t, left));
     }
     else {
         goto fail;
@@ -241,7 +245,7 @@ gci_add_fc(gc_heap_t *heap, hmu_t *hmu, gc_size_t size)
     node->left = node->right = node->parent = NULL;
 
     /* find proper node to link this new node to */
-    root = &heap->kfc_tree_root;
+    root = heap->kfc_tree_root;
     tp = root;
     bh_assert(tp->size < size);
     while (1) {
@@ -289,6 +293,7 @@ alloc_hmu(gc_heap_t *heap, gc_size_t size)
     uint32 node_idx = 0, init_node_idx = 0;
     hmu_tree_node_t *root = NULL, *tp = NULL, *last_tp = NULL;
     hmu_t *next, *rest;
+    uintptr_t tp_ret;
 
     bh_assert(gci_is_heap_valid(heap));
     bh_assert(size > 0 && !(size & 7));
@@ -354,7 +359,7 @@ alloc_hmu(gc_heap_t *heap, gc_size_t size)
     }
 
     /* need to find a node in tree*/
-    root = &heap->kfc_tree_root;
+    root = heap->kfc_tree_root;
 
     /* find the best node*/
     bh_assert(root);
@@ -402,7 +407,8 @@ alloc_hmu(gc_heap_t *heap, gc_size_t size)
             heap->highmark_size = heap->current_size - heap->total_free_size;
 
         hmu_set_size((hmu_t *)last_tp, size);
-        return (hmu_t *)last_tp;
+        tp_ret = (uintptr_t)last_tp;
+        return (hmu_t *)tp_ret;
     }
 
     return NULL;
diff --git a/core/shared/mem-alloc/ems/ems_gc_internal.h b/core/shared/mem-alloc/ems/ems_gc_internal.h
index 39b1ff8f1..e1ff9d61d 100644
--- a/core/shared/mem-alloc/ems/ems_gc_internal.h
+++ b/core/shared/mem-alloc/ems/ems_gc_internal.h
@@ -204,13 +204,47 @@ set_hmu_normal_node_next(hmu_normal_node_t *node, hmu_normal_node_t *next)
     }
 }
 
+/**
+ * hmu_tree_node is defined as a packed struct: a node starts at a 4-byte
+ * aligned (but not 8-byte aligned) address and its hmu header is 4 bytes,
+ * so on 64-bit targets the left/right/parent fields land on 8-byte
+ * aligned addresses and can be accessed directly.
+ */
+#if UINTPTR_MAX == UINT64_MAX
+#if defined(_MSC_VER)
+__pragma(pack(push, 1));
+#define __attr_packed
+#elif defined(__GNUC__) || defined(__clang__)
+#define __attr_packed __attribute__((packed))
+#else
+#error "packed attribute isn't used to define struct hmu_tree_node"
+#endif
+#else /* else of UINTPTR_MAX == UINT64_MAX */
+#define __attr_packed
+#endif
+
 typedef struct hmu_tree_node {
     hmu_t hmu_header;
-    gc_size_t size;
     struct hmu_tree_node *left;
     struct hmu_tree_node *right;
     struct hmu_tree_node *parent;
-} hmu_tree_node_t;
+    gc_size_t size;
+} __attr_packed hmu_tree_node_t;
+
+#if UINTPTR_MAX == UINT64_MAX
+#if defined(_MSC_VER)
+__pragma(pack(pop));
+#endif
+#endif
+
+bh_static_assert(sizeof(hmu_tree_node_t) == 8 + 3 * sizeof(void *));
+bh_static_assert(offsetof(hmu_tree_node_t, left) == 4);
+
+#define ASSERT_TREE_NODE_ALIGNED_ACCESS(tree_node)                            \
+    do { /* check that the `left` field address is pointer-size aligned */    \
+        bh_assert((((uintptr_t)&(tree_node)->left) & (sizeof(uintptr_t) - 1)) \
+                  == 0);                                                      \
+    } while (0)
 
 typedef struct gc_heap_struct {
     /* for double checking*/
@@ -223,8 +257,16 @@ typedef struct gc_heap_struct {
 
     hmu_normal_list_t kfc_normal_list[HMU_NORMAL_NODE_CNT];
 
-    /* order in kfc_tree is: size[left] <= size[cur] < size[right]*/
-    hmu_tree_node_t kfc_tree_root;
+#if UINTPTR_MAX == UINT64_MAX
+    /* make kfc_tree_root_buf 4-byte aligned and not 8-byte aligned,
+       so kfc_tree_root's left/right/parent fields are 8-byte aligned
+       and we can access them directly */
+    uint32 __padding;
+#endif
+    uint8 kfc_tree_root_buf[sizeof(hmu_tree_node_t)];
+    /* point to kfc_tree_root_buf, the order in kfc_tree is:
+         size[left] <= size[cur] < size[right] */
+    hmu_tree_node_t *kfc_tree_root;
 
     /* whether heap is corrupted, e.g. the hmu nodes are modified
        by user */
diff --git a/core/shared/mem-alloc/ems/ems_kfc.c b/core/shared/mem-alloc/ems/ems_kfc.c
index fe7732533..80d202679 100644
--- a/core/shared/mem-alloc/ems/ems_kfc.c
+++ b/core/shared/mem-alloc/ems/ems_kfc.c
@@ -27,7 +27,7 @@ gc_init_internal(gc_heap_t *heap, char *base_addr, gc_size_t heap_max_size)
     heap->total_free_size = heap->current_size;
     heap->highmark_size = 0;
 
-    root = &heap->kfc_tree_root;
+    root = heap->kfc_tree_root = (hmu_tree_node_t *)heap->kfc_tree_root_buf;
     memset(root, 0, sizeof *root);
     root->size = sizeof *root;
     hmu_set_ut(&root->hmu_header, HMU_FC);
@@ -38,6 +38,9 @@ gc_init_internal(gc_heap_t *heap, char *base_addr, gc_size_t heap_max_size)
     hmu_set_ut(&q->hmu_header, HMU_FC);
     hmu_set_size(&q->hmu_header, heap->current_size);
 
+    ASSERT_TREE_NODE_ALIGNED_ACCESS(q);
+    ASSERT_TREE_NODE_ALIGNED_ACCESS(root);
+
     hmu_mark_pinuse(&q->hmu_header);
     root->right = q;
     q->parent = root;
@@ -165,6 +168,7 @@ gc_migrate(gc_handle_t handle, char *pool_buf_new, gc_size_t pool_buf_size)
     intptr_t offset = (uint8 *)base_addr_new - (uint8 *)heap->base_addr;
     hmu_t *cur = NULL, *end = NULL;
     hmu_tree_node_t *tree_node;
+    uint8 **p_left, **p_right, **p_parent;
     gc_size_t heap_max_size, size;
 
     if ((((uintptr_t)pool_buf_new) & 7) != 0) {
@@ -188,9 +192,18 @@ gc_migrate(gc_handle_t handle, char *pool_buf_new, gc_size_t pool_buf_size)
     }
 
     heap->base_addr = (uint8 *)base_addr_new;
-    adjust_ptr((uint8 **)&heap->kfc_tree_root.left, offset);
-    adjust_ptr((uint8 **)&heap->kfc_tree_root.right, offset);
-    adjust_ptr((uint8 **)&heap->kfc_tree_root.parent, offset);
+
+    ASSERT_TREE_NODE_ALIGNED_ACCESS(heap->kfc_tree_root);
+
+    p_left = (uint8 **)((uint8 *)heap->kfc_tree_root
+                        + offsetof(hmu_tree_node_t, left));
+    p_right = (uint8 **)((uint8 *)heap->kfc_tree_root
+                         + offsetof(hmu_tree_node_t, right));
+    p_parent = (uint8 **)((uint8 *)heap->kfc_tree_root
+                          + offsetof(hmu_tree_node_t, parent));
+    adjust_ptr(p_left, offset);
+    adjust_ptr(p_right, offset);
+    adjust_ptr(p_parent, offset);
 
     cur = (hmu_t *)heap->base_addr;
     end = (hmu_t *)((char *)heap->base_addr + heap->current_size);
@@ -206,12 +219,21 @@ gc_migrate(gc_handle_t handle, char *pool_buf_new, gc_size_t pool_buf_size)
 
         if (hmu_get_ut(cur) == HMU_FC && !HMU_IS_FC_NORMAL(size)) {
             tree_node = (hmu_tree_node_t *)cur;
-            adjust_ptr((uint8 **)&tree_node->left, offset);
-            adjust_ptr((uint8 **)&tree_node->right, offset);
-            if (tree_node->parent != &heap->kfc_tree_root)
+
+            ASSERT_TREE_NODE_ALIGNED_ACCESS(tree_node);
+
+            p_left = (uint8 **)((uint8 *)tree_node
+                                + offsetof(hmu_tree_node_t, left));
+            p_right = (uint8 **)((uint8 *)tree_node
+                                 + offsetof(hmu_tree_node_t, right));
+            p_parent = (uint8 **)((uint8 *)tree_node
+                                  + offsetof(hmu_tree_node_t, parent));
+            adjust_ptr(p_left, offset);
+            adjust_ptr(p_right, offset);
+            if (tree_node->parent != heap->kfc_tree_root)
                 /* The root node belongs to heap structure,
                    it is fixed part and isn't changed. */
-                adjust_ptr((uint8 **)&tree_node->parent, offset);
+                adjust_ptr(p_parent, offset);
         }
         cur = (hmu_t *)((char *)cur + size);
     }
diff --git a/doc/embed_wamr.md b/doc/embed_wamr.md
index ae3fe4181..050384027 100644
--- a/doc/embed_wamr.md
+++ b/doc/embed_wamr.md
@@ -1,7 +1,7 @@
 Embedding WAMR guideline
 =====================================
 
-**Note**: This document is about how to embed WAMR into C/C++ host applications, for other languages, please refer to: [Embed WAMR into Python](../language-bindings/go), [Embed WAMR into Go](../language-bindings/go).
+**Note**: This document is about how to embed WAMR into C/C++ host applications, for other languages, please refer to: [Embed WAMR into Python](../language-bindings/python), [Embed WAMR into Go](../language-bindings/go).
 
 All the embedding APIs supported by the runtime are defined under folder [core/iwasm/include](../core/iwasm/include). The API details are available in the header files.
 
diff --git a/language-bindings/python/README.md b/language-bindings/python/README.md
index 9e504a9c0..ec82ee191 100644
--- a/language-bindings/python/README.md
+++ b/language-bindings/python/README.md
@@ -30,5 +30,5 @@ import wamr.wasmcapi.ffi as ffi
 
 For more information:
 
-* [WAMR API](./wamr_api)
-* [WASM-C-API](./wasm_c_api)
+* [WAMR API](./wamr-api)
+* [WASM-C-API](./wasm-c-api)
diff --git a/product-mini/README.md b/product-mini/README.md
index 1499440e0..5847b2468 100644
--- a/product-mini/README.md
+++ b/product-mini/README.md
@@ -18,14 +18,17 @@ Note that all ESP-IDF toolchain files live under `$IDF_PATH/tools/cmake/`.
 ## Linux
 
 First of all please install the dependent packages.
-Run command below in Ubuntu-18.04:
-
+Run command below in Ubuntu-22.04:
 ``` Bash
-sudo apt install build-essential cmake g++-multilib libgcc-8-dev lib32gcc-8-dev
+sudo apt install build-essential cmake g++-multilib libgcc-11-dev lib32gcc-11-dev ccache
 ```
-Or in Ubuntu-16.04:
+Or in Ubuntu-20.04
 ``` Bash
-sudo apt install build-essential cmake g++-multilib libgcc-5-dev lib32gcc-5-dev
+sudo apt install build-essential cmake g++-multilib libgcc-9-dev lib32gcc-9-dev ccache
+```
+Or in Ubuntu-18.04:
+``` Bash
+sudo apt install build-essential cmake g++-multilib libgcc-8-dev lib32gcc-8-dev ccache
 ```
 Or in Fedora:
 ``` Bash
diff --git a/product-mini/platforms/posix/main.c b/product-mini/platforms/posix/main.c
index 8727ed389..2e96ccddd 100644
--- a/product-mini/platforms/posix/main.c
+++ b/product-mini/platforms/posix/main.c
@@ -745,8 +745,12 @@ main(int argc, char *argv[])
 
 #if WASM_ENABLE_LIBC_WASI != 0
     if (ret == 0) {
-        /* propagate wasi exit code. */
+        /* wait for threads to finish and propagate wasi exit code. */
         ret = wasm_runtime_get_wasi_exit_code(wasm_module_inst);
+        if (wasm_runtime_get_exception(wasm_module_inst)) {
+            /* got an exception in spawned thread */
+            ret = 1;
+        }
     }
 #endif
 
diff --git a/product-mini/platforms/windows/main.c b/product-mini/platforms/windows/main.c
index 05647b5db..26fa7dcc9 100644
--- a/product-mini/platforms/windows/main.c
+++ b/product-mini/platforms/windows/main.c
@@ -549,8 +549,12 @@ main(int argc, char *argv[])
 
 #if WASM_ENABLE_LIBC_WASI != 0
     if (ret == 0) {
-        /* propagate wasi exit code. */
+        /* wait for threads to finish and propagate wasi exit code. */
         ret = wasm_runtime_get_wasi_exit_code(wasm_module_inst);
+        if (wasm_runtime_get_exception(wasm_module_inst)) {
+            /* got an exception in spawned thread */
+            ret = 1;
+        }
     }
 #endif
 
diff --git a/samples/wasm-c-api/CMakeLists.txt b/samples/wasm-c-api/CMakeLists.txt
index e2be3b8f4..c528fe16d 100644
--- a/samples/wasm-c-api/CMakeLists.txt
+++ b/samples/wasm-c-api/CMakeLists.txt
@@ -24,7 +24,7 @@ if (APPLE)
   add_definitions(-DBH_PLATFORM_DARWIN)
 endif ()
 
-# Resetdefault linker flags
+# Reset default linker flags
 set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "")
 set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
 
@@ -87,6 +87,15 @@ endif()
 set(WAMR_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
 include (${WAMR_ROOT_DIR}/build-scripts/runtime_lib.cmake)
 
+if (NOT DEFINED SANITIZER)  # no sanitizer requested: default to an empty value
+  set(SANITIZER "")
+elseif (SANITIZER STREQUAL "ubsan")  # UBSan build; the alignment check is turned off
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment" )
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
+elseif (NOT (SANITIZER STREQUAL "") )  # any other non-empty value is rejected
+  message(SEND_ERROR "Unsupported sanitizer: ${SANITIZER}")
+endif()
+
 add_library(vmlib STATIC ${WAMR_RUNTIME_LIB_SOURCE})
 if (MSVC)
   target_compile_definitions(vmlib PRIVATE WASM_API_EXTERN=)
diff --git a/tests/benchmarks/polybench/build.sh b/tests/benchmarks/polybench/build.sh
index 43cd3321a..bc7bf4c10 100755
--- a/tests/benchmarks/polybench/build.sh
+++ b/tests/benchmarks/polybench/build.sh
@@ -34,7 +34,8 @@ do
                 utilities/polybench.c ${file}                       \
                 -Wl,--export=__heap_base -Wl,--export=__data_end    \
                 -Wl,--export=malloc -Wl,--export=free               \
-                -DPOLYBENCH_TIME -o ${OUT_DIR}/${file_name%.*}.wasm
+                -DPOLYBENCH_TIME -o ${OUT_DIR}/${file_name%.*}.wasm \
+                -D_WASI_EMULATED_PROCESS_CLOCKS
 
         echo "Compile ${file_name%.*}.wasm into ${file_name%.*}.aot"
         ${WAMRC_CMD} -o ${OUT_DIR}/${file_name%.*}.aot \
diff --git a/wamr-compiler/README.md b/wamr-compiler/README.md
index a3a791f99..b9e566af2 100644
--- a/wamr-compiler/README.md
+++ b/wamr-compiler/README.md
@@ -1,7 +1,15 @@
 
 ### Build wamrc AOT compiler
 
-Both wasm binary file and AOT file are supported by iwasm. The wamrc AOT compiler is to compile wasm binary file to AOT file which can also be run by iwasm. Execute following commands to build **wamrc** compiler for Linux:
+Both wasm binary files and AOT files are supported by iwasm. The wamrc AOT compiler compiles a wasm binary file into an AOT file, which can also be run by iwasm. Execute the following commands to build the **wamrc** compiler:
+
+For **Linux**(Ubuntu 20.04 as an example):
+
+First, make sure the necessary dependencies are installed:
+
+```shell
+sudo apt-get install git build-essential cmake g++-multilib libgcc-9-dev lib32gcc-9-dev ccache 
+```
 
 ```shell
 cd wamr-compiler
@@ -13,6 +21,7 @@ make
 ```
 
 For **Windows**：
+
 ```shell
 cd wamr-compiler
 python build_llvm.py
@@ -20,4 +29,4 @@ mkdir build && cd build
 cmake ..
 cmake --build . --config Release
 # wamrc.exe is generated under .\Release directory
-```
\ No newline at end of file
+```
diff --git a/wamr-compiler/main.c b/wamr-compiler/main.c
index f185a17b7..bd8691c4b 100644
--- a/wamr-compiler/main.c
+++ b/wamr-compiler/main.c
@@ -42,6 +42,8 @@ print_help()
     printf("                              if the option is set:\n");
     printf("                                (1) it is always enabled when `--bounds-checks` is enabled,\n");
     printf("                                (2) else it is enabled/disabled according to the option value\n");
+    printf("  --stack-usage=<file>      Generate a stack-usage file.\n");
+    printf("                              Similarly to `clang -fstack-usage`.\n");
     printf("  --format=<format>         Specifies the format of the output file\n");
     printf("                            The format supported:\n");
     printf("                              aot (default)  AoT file\n");
@@ -204,6 +206,9 @@ main(int argc, char *argv[])
         else if (!strncmp(argv[0], "--stack-bounds-checks=", 22)) {
             option.stack_bounds_checks = (atoi(argv[0] + 22) == 1) ? 1 : 0;
         }
+        else if (!strncmp(argv[0], "--stack-usage=", 14)) {
+            option.stack_usage_file = argv[0] + 14;
+        }
         else if (!strncmp(argv[0], "--format=", 9)) {
             if (argv[0][9] == '\0')
                 PRINT_HELP_AND_EXIT();