diff --git a/.github/workflows/compilation_on_android_ubuntu.yml b/.github/workflows/compilation_on_android_ubuntu.yml index 057082ebc..83cd154af 100644 --- a/.github/workflows/compilation_on_android_ubuntu.yml +++ b/.github/workflows/compilation_on_android_ubuntu.yml @@ -158,6 +158,7 @@ jobs: "-DWAMR_BUILD_PERF_PROFILING=1", "-DWAMR_BUILD_REF_TYPES=1", "-DWAMR_BUILD_SIMD=1", + "-DWAMR_BUILD_LIB_SIMDE=1", "-DWAMR_BUILD_TAIL_CALL=1", "-DWAMR_DISABLE_HW_BOUND_CHECK=1", "-DWAMR_BUILD_MEMORY64=1", @@ -178,11 +179,9 @@ jobs: make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1" - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1" - # SIMD only on JIT/AOT mode + # SIMD only on JIT/AOT/fast interpreter mode - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS make_options_feature: "-DWAMR_BUILD_SIMD=1" - - make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS - make_options_feature: "-DWAMR_BUILD_SIMD=1" # DEBUG_INTERP only on CLASSIC INTERP mode - make_options_run_mode: $AOT_BUILD_OPTIONS make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1" @@ -649,11 +648,9 @@ jobs: test_option: $WAMR_COMPILER_TEST_OPTIONS exclude: # incompatible modes and features - # classic-interp and fast-interp don't support simd + # classic-interp doesn't support simd - running_mode: "classic-interp" test_option: $SIMD_TEST_OPTIONS - - running_mode: "fast-interp" - test_option: $SIMD_TEST_OPTIONS # llvm jit doesn't support multi module - running_mode: "jit" test_option: $MULTI_MODULES_TEST_OPTIONS diff --git a/.github/workflows/compilation_on_sgx.yml b/.github/workflows/compilation_on_sgx.yml index 70597c366..b865a59fe 100644 --- a/.github/workflows/compilation_on_sgx.yml +++ b/.github/workflows/compilation_on_sgx.yml @@ -49,7 +49,7 @@ env: # ref types enabled in wamrc by default, so we need to enable it for iwasm in AOT mode AOT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_REF_TYPES=1" CLASSIC_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0" - FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0" + FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_SIMD=0" FAST_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=1" LLVM_LAZY_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1" LLVM_EAGER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=0" @@ -97,7 +97,7 @@ jobs: "-DWAMR_BUILD_PERF_PROFILING=1", "-DWAMR_BUILD_REF_TYPES=1", # doesn't support - # "-DWAMR_BUILD_SIMD=1", + "-DWAMR_BUILD_SIMD=0", "-DWAMR_BUILD_TAIL_CALL=1", "-DWAMR_DISABLE_HW_BOUND_CHECK=1", "-DWAMR_BUILD_SGX_IPFS=1", diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake index 312944be5..2c0bf3c7c 100644 --- a/build-scripts/config_common.cmake +++ b/build-scripts/config_common.cmake @@ -300,6 +300,9 @@ endif () if (WAMR_BUILD_LIB_RATS EQUAL 1) message (" Lib rats enabled") endif() +if ((WAMR_BUILD_LIB_SIMDE EQUAL 1)) + message (" Lib simde enabled") +endif() 
################## WAMR features ################## if (WAMR_BUILD_MULTI_MODULE EQUAL 1) add_definitions (-DWASM_ENABLE_MULTI_MODULE=1) @@ -371,11 +374,17 @@ else () message (" Wakeup of blocking operations enabled") endif () if (WAMR_BUILD_SIMD EQUAL 1) - if (NOT WAMR_BUILD_TARGET MATCHES "RISCV64.*") - add_definitions (-DWASM_ENABLE_SIMD=1) - else () + if (WAMR_BUILD_FAST_INTERP EQUAL 1 AND WAMR_BUILD_SIMDE EQUAL 0) + set(SIMD_ENABLED 0) + message(" SIMD disabled for fast-interp as simde is not being built") + elseif (WAMR_BUILD_TARGET MATCHES "RISCV64.*") + set(SIMD_ENABLED 0) message (" SIMD disabled due to not supported on target RISCV64") + else() + set(SIMD_ENABLED 1) + message (" SIMD enabled") endif () + add_definitions(-DWASM_ENABLE_SIMD=${SIMD_ENABLED}) endif () if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1) add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1) diff --git a/build-scripts/runtime_lib.cmake b/build-scripts/runtime_lib.cmake index c3edfe503..994414ffa 100644 --- a/build-scripts/runtime_lib.cmake +++ b/build-scripts/runtime_lib.cmake @@ -155,6 +155,16 @@ if (WAMR_BUILD_LIB_RATS EQUAL 1) include (${IWASM_DIR}/libraries/lib-rats/lib_rats.cmake) endif () +if (WAMR_BUILD_SIMD EQUAL 1 AND WAMR_BUILD_FAST_INTERP EQUAL 1) + if (WAMR_BUILD_PLATFORM STREQUAL "windows") + message(STATUS "SIMDe doesn't support platform " ${WAMR_BUILD_PLATFORM}) + set(WAMR_BUILD_SIMDE 0) + else() + include (${IWASM_DIR}/libraries/simde/simde.cmake) + set (WAMR_BUILD_SIMDE 1) + endif() +endif () + if (WAMR_BUILD_WASM_CACHE EQUAL 1) include (${WAMR_ROOT_DIR}/build-scripts/involve_boringssl.cmake) endif () diff --git a/core/config.h b/core/config.h index d71aaca39..cb1189c96 100644 --- a/core/config.h +++ b/core/config.h @@ -322,6 +322,12 @@ #define WASM_ENABLE_SIMD 0 #endif +/* Disable SIMDe (used in the fast interpreter for SIMD opcodes) +unless it is enabled elsewhere */ +#ifndef WASM_ENABLE_SIMDE +#define WASM_ENABLE_SIMDE 0 +#endif + /* GC performance profiling */ #ifndef WASM_ENABLE_GC_PERF_PROFILING #define WASM_ENABLE_GC_PERF_PROFILING 0 diff --git a/core/iwasm/common/wasm_loader_common.c b/core/iwasm/common/wasm_loader_common.c index 97ea5efd1..6018f90a6 100644 --- a/core/iwasm/common/wasm_loader_common.c +++ b/core/iwasm/common/wasm_loader_common.c @@ -151,7 +151,8 @@ is_valid_value_type(uint8 type) bool is_valid_value_type_for_interpreter(uint8 value_type) { -#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) +#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) \ + && (WASM_ENABLE_FAST_INTERP == 0) /* * Note: regardless of WASM_ENABLE_SIMD, our interpreters don't have * SIMD implemented.
It's safer to reject v128, especially for the diff --git a/core/iwasm/common/wasm_runtime_common.h b/core/iwasm/common/wasm_runtime_common.h index c6425af20..8ac032bf8 100644 --- a/core/iwasm/common/wasm_runtime_common.h +++ b/core/iwasm/common/wasm_runtime_common.h @@ -37,6 +37,10 @@ extern "C" { do { \ *(int64 *)(addr) = (int64)(value); \ } while (0) +#define PUT_V128_TO_ADDR(addr, value) \ + do { \ + *(V128 *)(addr) = (value); \ + } while (0) #define PUT_F64_TO_ADDR(addr, value) \ do { \ *(float64 *)(addr) = (float64)(value); \ @@ -49,6 +53,7 @@ extern "C" { #define GET_I64_FROM_ADDR(addr) (*(int64 *)(addr)) #define GET_F64_FROM_ADDR(addr) (*(float64 *)(addr)) #define GET_REF_FROM_ADDR(addr) (*(void **)(addr)) +#define GET_V128_FROM_ADDR(addr) (*(V128 *)(addr)) /* For STORE opcodes */ #define STORE_I64 PUT_I64_TO_ADDR @@ -68,6 +73,12 @@ STORE_U8(void *addr, uint8_t value) *(uint8 *)addr = value; } +static inline void +STORE_V128(void *addr, V128 value) +{ + *(V128 *)addr = value; +} + /* For LOAD opcodes */ #define LOAD_I64(addr) (*(int64 *)(addr)) #define LOAD_F64(addr) (*(float64 *)(addr)) @@ -75,6 +86,7 @@ STORE_U8(void *addr, uint8_t value) #define LOAD_U32(addr) (*(uint32 *)(addr)) #define LOAD_I16(addr) (*(int16 *)(addr)) #define LOAD_U16(addr) (*(uint16 *)(addr)) +#define LOAD_V128(addr) (*(V128 *)(addr)) #define STORE_PTR(addr, ptr) \ do { \ @@ -83,6 +95,15 @@ STORE_U8(void *addr, uint8_t value) #else /* WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 */ +#define PUT_V128_TO_ADDR(addr, value) \ + do { \ + uint32 *addr_u32 = (uint32 *)(addr); \ + addr_u32[0] = (value).i32x4[0]; \ + addr_u32[1] = (value).i32x4[1]; \ + addr_u32[2] = (value).i32x4[2]; \ + addr_u32[3] = (value).i32x4[3]; \ + } while (0) + #define PUT_I64_TO_ADDR(addr, value) \ do { \ uint32 *addr_u32 = (uint32 *)(addr); \ @@ -124,6 +145,17 @@ STORE_U8(void *addr, uint8_t value) } while (0) #endif +static inline V128 +GET_V128_FROM_ADDR(uint32 *addr) +{ + V128 ret; + ret.i32x4[0] = addr[0]; + ret.i32x4[1] = addr[1]; + ret.i32x4[2] = addr[2]; + ret.i32x4[3] = addr[3]; + return ret; +} + static inline int64 GET_I64_FROM_ADDR(uint32 *addr) { @@ -239,7 +271,94 @@ STORE_U16(void *addr, uint16_t value) ((uint8_t *)(addr))[0] = u.u8[0]; ((uint8_t *)(addr))[1] = u.u8[1]; } + +static inline void +STORE_V128(void *addr, V128 value) +{ + uintptr_t addr_ = (uintptr_t)(addr); + union { + V128 val; + uint64 u64[2]; + uint32 u32[4]; + uint16 u16[8]; + uint8 u8[16]; + } u; + + if ((addr_ & (uintptr_t)15) == 0) { + *(V128 *)addr = value; + } + else if ((addr_ & (uintptr_t)7) == 0) { + u.val = value; + ((uint64 *)(addr))[0] = u.u64[0]; + ((uint64 *)(addr))[1] = u.u64[1]; + } + else if ((addr_ & (uintptr_t)3) == 0) { + u.val = value; + ((uint32 *)addr)[0] = u.u32[0]; + ((uint32 *)addr)[1] = u.u32[1]; + ((uint32 *)addr)[2] = u.u32[2]; + ((uint32 *)addr)[3] = u.u32[3]; + } + else if ((addr_ & (uintptr_t)1) == 0) { + u.val = value; + ((uint16 *)addr)[0] = u.u16[0]; + ((uint16 *)addr)[1] = u.u16[1]; + ((uint16 *)addr)[2] = u.u16[2]; + ((uint16 *)addr)[3] = u.u16[3]; + ((uint16 *)addr)[4] = u.u16[4]; + ((uint16 *)addr)[5] = u.u16[5]; + ((uint16 *)addr)[6] = u.u16[6]; + ((uint16 *)addr)[7] = u.u16[7]; + } + else { + u.val = value; + for (int i = 0; i < 16; i++) + ((uint8 *)addr)[i] = u.u8[i]; + } +} + /* For LOAD opcodes */ +static inline V128 +LOAD_V128(void *addr) +{ + uintptr_t addr1 = (uintptr_t)addr; + union { + V128 val; + uint64 u64[2]; + uint32 u32[4]; + uint16 u16[8]; + uint8 u8[16]; + } u; + if ((addr1 & (uintptr_t)15) == 0) + return 
*(V128 *)addr; + + if ((addr1 & (uintptr_t)7) == 0) { + u.u64[0] = ((uint64 *)addr)[0]; + u.u64[1] = ((uint64 *)addr)[1]; + } + else if ((addr1 & (uintptr_t)3) == 0) { + u.u32[0] = ((uint32 *)addr)[0]; + u.u32[1] = ((uint32 *)addr)[1]; + u.u32[2] = ((uint32 *)addr)[2]; + u.u32[3] = ((uint32 *)addr)[3]; + } + else if ((addr1 & (uintptr_t)1) == 0) { + u.u16[0] = ((uint16 *)addr)[0]; + u.u16[1] = ((uint16 *)addr)[1]; + u.u16[2] = ((uint16 *)addr)[2]; + u.u16[3] = ((uint16 *)addr)[3]; + u.u16[4] = ((uint16 *)addr)[4]; + u.u16[5] = ((uint16 *)addr)[5]; + u.u16[6] = ((uint16 *)addr)[6]; + u.u16[7] = ((uint16 *)addr)[7]; + } + else { + for (int i = 0; i < 16; i++) + u.u8[i] = ((uint8 *)addr)[i]; + } + return u.val; +} + static inline int64 LOAD_I64(void *addr) { diff --git a/core/iwasm/interpreter/wasm_interp_fast.c b/core/iwasm/interpreter/wasm_interp_fast.c index 96b9ba2b2..21554953d 100644 --- a/core/iwasm/interpreter/wasm_interp_fast.c +++ b/core/iwasm/interpreter/wasm_interp_fast.c @@ -21,6 +21,10 @@ #include "../common/wasm_shared_memory.h" #endif +#if WASM_ENABLE_SIMDE != 0 +#include "simde/wasm/simd128.h" +#endif + typedef int32 CellType_I32; typedef int64 CellType_I64; typedef float32 CellType_F32; @@ -454,6 +458,8 @@ wasm_interp_get_frame_ref(WASMInterpFrame *frame) (type) GET_I64_FROM_ADDR(frame_lp + *(int16 *)(frame_ip + off)) #define GET_OPERAND_F64(type, off) \ (type) GET_F64_FROM_ADDR(frame_lp + *(int16 *)(frame_ip + off)) +#define GET_OPERAND_V128(off) \ + GET_V128_FROM_ADDR(frame_lp + *(int16 *)(frame_ip + off)) #define GET_OPERAND_REF(type, off) \ (type) GET_REF_FROM_ADDR(frame_lp + *(int16 *)(frame_ip + off)) @@ -504,6 +510,8 @@ wasm_interp_get_frame_ref(WASMInterpFrame *frame) #define POP_I64() (GET_I64_FROM_ADDR(frame_lp + GET_OFFSET())) +#define POP_V128() (GET_V128_FROM_ADDR(frame_lp + GET_OFFSET())) + #define POP_F64() (GET_F64_FROM_ADDR(frame_lp + GET_OFFSET())) #define POP_REF() \ @@ -1692,6 +1700,11 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, GET_OPERAND(uint64, I64, off)); ret_offset += 2; } + else if (ret_types[ret_idx] == VALUE_TYPE_V128) { + PUT_V128_TO_ADDR(prev_frame->lp + ret_offset, + GET_OPERAND_V128(off)); + ret_offset += 4; + } #if WASM_ENABLE_GC != 0 else if (wasm_is_type_reftype(ret_types[ret_idx])) { PUT_REF_TO_ADDR(prev_frame->lp + ret_offset, @@ -3531,6 +3544,24 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, HANDLE_OP_END(); } +#if WASM_ENABLE_SIMD != 0 + HANDLE_OP(EXT_OP_SET_LOCAL_FAST_V128) + HANDLE_OP(EXT_OP_TEE_LOCAL_FAST_V128) + { + /* clang-format off */ +#if WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 + local_offset = *frame_ip++; +#else + local_offset = *frame_ip; + frame_ip += 2; +#endif + /* clang-format on */ + PUT_V128_TO_ADDR((uint32 *)(frame_lp + local_offset), + GET_OPERAND_V128(0)); + frame_ip += 2; + HANDLE_OP_END(); + } +#endif HANDLE_OP(WASM_OP_GET_GLOBAL) { global_idx = read_uint32(frame_ip); @@ -3567,7 +3598,19 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, GET_I64_FROM_ADDR((uint32 *)global_addr)); HANDLE_OP_END(); } - +#if WASM_ENABLE_SIMD != 0 + HANDLE_OP(WASM_OP_GET_GLOBAL_V128) + { + global_idx = read_uint32(frame_ip); + bh_assert(global_idx < module->e->global_count); + global = globals + global_idx; + global_addr = get_global_addr(global_data, global); + addr_ret = GET_OFFSET(); + PUT_V128_TO_ADDR(frame_lp + addr_ret, + GET_V128_FROM_ADDR((uint32 *)global_addr)); + HANDLE_OP_END(); + } +#endif HANDLE_OP(WASM_OP_SET_GLOBAL) { global_idx = read_uint32(frame_ip); @@ -3634,6 
+3677,19 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, GET_I64_FROM_ADDR(frame_lp + addr1)); HANDLE_OP_END(); } +#if WASM_ENABLE_SIMDE != 0 + HANDLE_OP(WASM_OP_SET_GLOBAL_V128) + { + global_idx = read_uint32(frame_ip); + bh_assert(global_idx < module->e->global_count); + global = globals + global_idx; + global_addr = get_global_addr(global_data, global); + addr1 = GET_OFFSET(); + PUT_V128_TO_ADDR((uint32 *)global_addr, + GET_V128_FROM_ADDR(frame_lp + addr1)); + HANDLE_OP_END(); + } +#endif /* memory load instructions */ HANDLE_OP(WASM_OP_I32_LOAD) @@ -4879,6 +4935,28 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, HANDLE_OP_END(); } +#if WASM_ENABLE_SIMD != 0 + HANDLE_OP(EXT_OP_COPY_STACK_TOP_V128) + { + addr1 = GET_OFFSET(); + addr2 = GET_OFFSET(); + + PUT_V128_TO_ADDR(frame_lp + addr2, + GET_V128_FROM_ADDR(frame_lp + addr1)); + +#if WASM_ENABLE_GC != 0 + /* Ignore constants because they are not reference */ + if (addr1 >= 0) { + if (*FRAME_REF(addr1)) { + CLEAR_FRAME_REF(addr1); + SET_FRAME_REF(addr2); + } + } +#endif + + HANDLE_OP_END(); + } +#endif HANDLE_OP(EXT_OP_COPY_STACK_VALUES) { @@ -5737,6 +5815,1571 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, #endif goto call_func_from_entry; } +#if WASM_ENABLE_SIMDE != 0 +#define SIMD_V128_TO_SIMDE_V128(v) \ + ({ \ + bh_assert(sizeof(V128) == sizeof(simde_v128_t)); \ + simde_v128_t result; \ + bh_memcpy_s(&result, sizeof(simde_v128_t), &(v), sizeof(V128)); \ + result; \ + }) + +#define SIMDE_V128_TO_SIMD_V128(sv, v) \ + do { \ + bh_assert(sizeof(V128) == sizeof(simde_v128_t)); \ + bh_memcpy_s(&(v), sizeof(V128), &(sv), sizeof(simde_v128_t)); \ + } while (0) + + HANDLE_OP(WASM_OP_SIMD_PREFIX) + { + GET_OPCODE(); + + switch (opcode) { + /* Memory */ + case SIMD_v128_load: + { + uint32 offset, addr; + offset = read_uint32(frame_ip); + addr = POP_I32(); + addr_ret = GET_OFFSET(); + CHECK_MEMORY_OVERFLOW(16); + PUT_V128_TO_ADDR(frame_lp + addr_ret, LOAD_V128(maddr)); + break; + } +#define SIMD_LOAD_OP(simde_func) \ + do { \ + uint32 offset, addr; \ + offset = read_uint32(frame_ip); \ + addr = POP_I32(); \ + addr_ret = GET_OFFSET(); \ + CHECK_MEMORY_OVERFLOW(8); \ + \ + simde_v128_t simde_result = simde_func(maddr); \ + \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + \ + } while (0) + case SIMD_v128_load8x8_s: + { + SIMD_LOAD_OP(simde_wasm_i16x8_load8x8); + break; + } + case SIMD_v128_load8x8_u: + { + SIMD_LOAD_OP(simde_wasm_u16x8_load8x8); + break; + } + case SIMD_v128_load16x4_s: + { + SIMD_LOAD_OP(simde_wasm_i32x4_load16x4); + break; + } + case SIMD_v128_load16x4_u: + { + SIMD_LOAD_OP(simde_wasm_u32x4_load16x4); + break; + } + case SIMD_v128_load32x2_s: + { + SIMD_LOAD_OP(simde_wasm_i64x2_load32x2); + break; + } + case SIMD_v128_load32x2_u: + { + SIMD_LOAD_OP(simde_wasm_u64x2_load32x2); + break; + } +#define SIMD_LOAD_SPLAT_OP(simde_func, width) \ + do { \ + uint32 offset, addr; \ + offset = read_uint32(frame_ip); \ + addr = POP_I32(); \ + addr_ret = GET_OFFSET(); \ + CHECK_MEMORY_OVERFLOW(width / 8); \ + \ + simde_v128_t simde_result = simde_func(maddr); \ + \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + + case SIMD_v128_load8_splat: + { + SIMD_LOAD_SPLAT_OP(simde_wasm_v128_load8_splat, 8); + break; + } + case SIMD_v128_load16_splat: + { + SIMD_LOAD_SPLAT_OP(simde_wasm_v128_load16_splat, 16); + break; + } + case 
SIMD_v128_load32_splat: + { + SIMD_LOAD_SPLAT_OP(simde_wasm_v128_load32_splat, 32); + break; + } + case SIMD_v128_load64_splat: + { + SIMD_LOAD_SPLAT_OP(simde_wasm_v128_load64_splat, 64); + break; + } + case SIMD_v128_store: + { + uint32 offset, addr; + offset = read_uint32(frame_ip); + V128 data = POP_V128(); + addr = POP_I32(); + + CHECK_MEMORY_OVERFLOW(16); + STORE_V128(maddr, data); + break; + } + + /* Basic */ + case SIMD_v128_const: + { + uint8 *orig_ip = frame_ip; + + frame_ip += sizeof(V128); + addr_ret = GET_OFFSET(); + + PUT_V128_TO_ADDR(frame_lp + addr_ret, *(V128 *)orig_ip); + break; + } + /* TODO: Add a faster SIMD implementation */ + case SIMD_v8x16_shuffle: + { + V128 indices; + bh_memcpy_s(&indices, sizeof(V128), frame_ip, + sizeof(V128)); + frame_ip += sizeof(V128); + + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + addr_ret = GET_OFFSET(); + + V128 result; + for (int i = 0; i < 16; i++) { + uint8_t index = indices.i8x16[i]; + if (index < 16) { + result.i8x16[i] = v1.i8x16[index]; + } + else { + result.i8x16[i] = v2.i8x16[index - 16]; + } + } + + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_v8x16_swizzle: + { + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + addr_ret = GET_OFFSET(); + simde_v128_t simde_result = simde_wasm_i8x16_swizzle( + SIMD_V128_TO_SIMDE_V128(v1), + SIMD_V128_TO_SIMDE_V128(v2)); + + V128 result; + SIMDE_V128_TO_SIMD_V128(simde_result, result); + + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + + /* Splat */ +#define SIMD_SPLAT_OP(simde_func, pop_func, val_type) \ + do { \ + val_type val = pop_func(); \ + addr_ret = GET_OFFSET(); \ + \ + simde_v128_t simde_result = simde_func(val); \ + \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + +#define SIMD_SPLAT_OP_I32(simde_func) SIMD_SPLAT_OP(simde_func, POP_I32, uint32) +#define SIMD_SPLAT_OP_I64(simde_func) SIMD_SPLAT_OP(simde_func, POP_I64, uint64) +#define SIMD_SPLAT_OP_F32(simde_func) \ + SIMD_SPLAT_OP(simde_func, POP_F32, float32) +#define SIMD_SPLAT_OP_F64(simde_func) \ + SIMD_SPLAT_OP(simde_func, POP_F64, float64) + + case SIMD_i8x16_splat: + { + uint32 val = POP_I32(); + addr_ret = GET_OFFSET(); + + simde_v128_t simde_result = simde_wasm_i8x16_splat(val); + + V128 result; + SIMDE_V128_TO_SIMD_V128(simde_result, result); + + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_i16x8_splat: + { + SIMD_SPLAT_OP_I32(simde_wasm_i16x8_splat); + break; + } + case SIMD_i32x4_splat: + { + SIMD_SPLAT_OP_I32(simde_wasm_i32x4_splat); + break; + } + case SIMD_i64x2_splat: + { + SIMD_SPLAT_OP_I64(simde_wasm_i64x2_splat); + break; + } + case SIMD_f32x4_splat: + { + SIMD_SPLAT_OP_F32(simde_wasm_f32x4_splat); + break; + } + case SIMD_f64x2_splat: + { + SIMD_SPLAT_OP_F64(simde_wasm_f64x2_splat); + break; + } +#if WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 +#define SIMD_LANE_HANDLE_UNALIGNED_ACCESS() +#else +#define SIMD_LANE_HANDLE_UNALIGNED_ACCESS() *frame_ip++; +#endif +#define SIMD_EXTRACT_LANE_OP(register, return_type, push_elem) \ + do { \ + uint8 lane = *frame_ip++; \ + SIMD_LANE_HANDLE_UNALIGNED_ACCESS(); \ + V128 v = POP_V128(); \ + push_elem((return_type)(v.register[lane])); \ + } while (0) +#define SIMD_REPLACE_LANE_OP(register, return_type, pop_elem) \ + do { \ + uint8 lane = *frame_ip++; \ + SIMD_LANE_HANDLE_UNALIGNED_ACCESS(); \ + return_type replacement = pop_elem(); \ + V128 v = POP_V128(); \ + v.register[lane] = replacement; \ + addr_ret = 
GET_OFFSET(); \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, v); \ + } while (0) + case SIMD_i8x16_extract_lane_s: + { + SIMD_EXTRACT_LANE_OP(i8x16, int8, PUSH_I32); + break; + } + case SIMD_i8x16_extract_lane_u: + { + SIMD_EXTRACT_LANE_OP(i8x16, uint8, PUSH_I32); + break; + } + case SIMD_i8x16_replace_lane: + { + SIMD_REPLACE_LANE_OP(i8x16, int8, POP_I32); + break; + } + case SIMD_i16x8_extract_lane_s: + { + SIMD_EXTRACT_LANE_OP(i16x8, int16, PUSH_I32); + break; + } + case SIMD_i16x8_extract_lane_u: + { + SIMD_EXTRACT_LANE_OP(i16x8, uint16, PUSH_I32); + break; + } + case SIMD_i16x8_replace_lane: + { + SIMD_REPLACE_LANE_OP(i16x8, int16, POP_I32); + break; + } + case SIMD_i32x4_extract_lane: + { + SIMD_EXTRACT_LANE_OP(i32x4, int32, PUSH_I32); + break; + } + case SIMD_i32x4_replace_lane: + { + SIMD_REPLACE_LANE_OP(i32x4, int32, POP_I32); + break; + } + case SIMD_i64x2_extract_lane: + { + SIMD_EXTRACT_LANE_OP(i64x2, int64, PUSH_I64); + break; + } + case SIMD_i64x2_replace_lane: + { + SIMD_REPLACE_LANE_OP(i64x2, int64, POP_I64); + break; + } + case SIMD_f32x4_extract_lane: + { + SIMD_EXTRACT_LANE_OP(f32x4, float32, PUSH_F32); + break; + } + case SIMD_f32x4_replace_lane: + { + SIMD_REPLACE_LANE_OP(f32x4, float32, POP_F32); + break; + } + case SIMD_f64x2_extract_lane: + { + SIMD_EXTRACT_LANE_OP(f64x2, float64, PUSH_F64); + break; + } + case SIMD_f64x2_replace_lane: + { + SIMD_REPLACE_LANE_OP(f64x2, float64, POP_F64); + break; + } + +#define SIMD_DOUBLE_OP(simde_func) \ + do { \ + V128 v2 = POP_V128(); \ + V128 v1 = POP_V128(); \ + addr_ret = GET_OFFSET(); \ + \ + simde_v128_t simde_result = simde_func(SIMD_V128_TO_SIMDE_V128(v1), \ + SIMD_V128_TO_SIMDE_V128(v2)); \ + \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + + /* i8x16 comparison operations */ + case SIMD_i8x16_eq: + { + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + addr_ret = GET_OFFSET(); + + simde_v128_t simde_result = + simde_wasm_i8x16_eq(SIMD_V128_TO_SIMDE_V128(v1), + SIMD_V128_TO_SIMDE_V128(v2)); + + V128 result; + SIMDE_V128_TO_SIMD_V128(simde_result, result); + + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_i8x16_ne: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_ne); + break; + } + case SIMD_i8x16_lt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_lt); + break; + } + case SIMD_i8x16_lt_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_lt); + break; + } + case SIMD_i8x16_gt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_gt); + break; + } + case SIMD_i8x16_gt_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_gt); + break; + } + case SIMD_i8x16_le_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_le); + break; + } + case SIMD_i8x16_le_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_le); + break; + } + case SIMD_i8x16_ge_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_ge); + break; + } + case SIMD_i8x16_ge_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_ge); + break; + } + + /* i16x8 comparison operations */ + case SIMD_i16x8_eq: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_eq); + break; + } + case SIMD_i16x8_ne: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_ne); + break; + } + case SIMD_i16x8_lt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_lt); + break; + } + case SIMD_i16x8_lt_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_lt); + break; + } + case SIMD_i16x8_gt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_gt); + break; + } + case SIMD_i16x8_gt_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_gt); + break; + } + case SIMD_i16x8_le_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_le); + break; + } + case SIMD_i16x8_le_u: + { + 
SIMD_DOUBLE_OP(simde_wasm_u16x8_le); + break; + } + case SIMD_i16x8_ge_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_ge); + break; + } + case SIMD_i16x8_ge_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_ge); + break; + } + + /* i32x4 comparison operations */ + case SIMD_i32x4_eq: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_eq); + break; + } + case SIMD_i32x4_ne: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_ne); + break; + } + case SIMD_i32x4_lt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_lt); + break; + } + case SIMD_i32x4_lt_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_lt); + break; + } + case SIMD_i32x4_gt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_gt); + break; + } + case SIMD_i32x4_gt_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_gt); + break; + } + case SIMD_i32x4_le_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_le); + break; + } + case SIMD_i32x4_le_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_le); + break; + } + case SIMD_i32x4_ge_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_ge); + break; + } + case SIMD_i32x4_ge_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_ge); + break; + } + + /* f32x4 comparison operations */ + case SIMD_f32x4_eq: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_eq); + break; + } + case SIMD_f32x4_ne: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_ne); + break; + } + case SIMD_f32x4_lt: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_lt); + break; + } + case SIMD_f32x4_gt: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_gt); + break; + } + case SIMD_f32x4_le: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_le); + break; + } + case SIMD_f32x4_ge: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_ge); + break; + } + + /* f64x2 comparison operations */ + case SIMD_f64x2_eq: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_eq); + break; + } + case SIMD_f64x2_ne: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_ne); + break; + } + case SIMD_f64x2_lt: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_lt); + break; + } + case SIMD_f64x2_gt: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_gt); + break; + } + case SIMD_f64x2_le: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_le); + break; + } + case SIMD_f64x2_ge: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_ge); + break; + } + + /* v128 bitwise operations */ +#define SIMD_V128_BITWISE_OP_COMMON(result_expr_0, result_expr_1) \ + do { \ + V128 result; \ + result.i64x2[0] = (result_expr_0); \ + result.i64x2[1] = (result_expr_1); \ + addr_ret = GET_OFFSET(); \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + + case SIMD_v128_not: + { + V128 value = POP_V128(); + SIMD_V128_BITWISE_OP_COMMON(~value.i64x2[0], + ~value.i64x2[1]); + break; + } + case SIMD_v128_and: + { + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + SIMD_V128_BITWISE_OP_COMMON(v1.i64x2[0] & v2.i64x2[0], + v1.i64x2[1] & v2.i64x2[1]); + break; + } + case SIMD_v128_andnot: + { + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + SIMD_V128_BITWISE_OP_COMMON( + v1.i64x2[0] & (~v2.i64x2[0]), + v1.i64x2[1] & (~v2.i64x2[1])); + break; + } + case SIMD_v128_or: + { + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + SIMD_V128_BITWISE_OP_COMMON(v1.i64x2[0] | v2.i64x2[0], + v1.i64x2[1] | v2.i64x2[1]); + break; + } + case SIMD_v128_xor: + { + V128 v2 = POP_V128(); + V128 v1 = POP_V128(); + SIMD_V128_BITWISE_OP_COMMON(v1.i64x2[0] ^ v2.i64x2[0], + v1.i64x2[1] ^ v2.i64x2[1]); + break; + } + case SIMD_v128_bitselect: + { + V128 v1 = POP_V128(); + V128 v2 = POP_V128(); + V128 v3 = POP_V128(); + addr_ret = GET_OFFSET(); + + simde_v128_t simde_result = simde_wasm_v128_bitselect( + SIMD_V128_TO_SIMDE_V128(v3), + SIMD_V128_TO_SIMDE_V128(v2), + SIMD_V128_TO_SIMDE_V128(v1)); + + V128 result; + SIMDE_V128_TO_SIMD_V128(simde_result, result); + + 
PUT_V128_TO_ADDR(frame_lp + addr_ret, result); + break; + } + case SIMD_v128_any_true: + { + V128 value = POP_V128(); + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = + value.i64x2[0] != 0 || value.i64x2[1] != 0; + break; + } + +#define SIMD_LOAD_LANE_COMMON(vec, register, lane, width) \ + do { \ + addr_ret = GET_OFFSET(); \ + CHECK_MEMORY_OVERFLOW(width / 8); \ + if (width == 64) { \ + vec.register[lane] = GET_I64_FROM_ADDR(maddr); \ + } \ + else { \ + vec.register[lane] = *(uint##width *)(maddr); \ + } \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, vec); \ + } while (0) + +#define SIMD_LOAD_LANE_OP(register, width) \ + do { \ + uint32 offset, addr; \ + offset = read_uint32(frame_ip); \ + V128 vec = POP_V128(); \ + addr = POP_I32(); \ + int lane = *frame_ip++; \ + SIMD_LANE_HANDLE_UNALIGNED_ACCESS(); \ + SIMD_LOAD_LANE_COMMON(vec, register, lane, width); \ + } while (0) + + case SIMD_v128_load8_lane: + { + SIMD_LOAD_LANE_OP(i8x16, 8); + break; + } + case SIMD_v128_load16_lane: + { + SIMD_LOAD_LANE_OP(i16x8, 16); + break; + } + case SIMD_v128_load32_lane: + { + SIMD_LOAD_LANE_OP(i32x4, 32); + break; + } + case SIMD_v128_load64_lane: + { + SIMD_LOAD_LANE_OP(i64x2, 64); + break; + } +#define SIMD_STORE_LANE_OP(register, width) \ + do { \ + uint32 offset, addr; \ + offset = read_uint32(frame_ip); \ + V128 vec = POP_V128(); \ + addr = POP_I32(); \ + int lane = *frame_ip++; \ + SIMD_LANE_HANDLE_UNALIGNED_ACCESS(); \ + CHECK_MEMORY_OVERFLOW(width / 8); \ + if (width == 64) { \ + STORE_I64(maddr, vec.register[lane]); \ + } \ + else { \ + *(uint##width *)(maddr) = vec.register[lane]; \ + } \ + } while (0) + + case SIMD_v128_store8_lane: + { + SIMD_STORE_LANE_OP(i8x16, 8); + break; + } + + case SIMD_v128_store16_lane: + { + SIMD_STORE_LANE_OP(i16x8, 16); + break; + } + + case SIMD_v128_store32_lane: + { + SIMD_STORE_LANE_OP(i32x4, 32); + break; + } + + case SIMD_v128_store64_lane: + { + SIMD_STORE_LANE_OP(i64x2, 64); + break; + } +#define SIMD_LOAD_ZERO_OP(register, width) \ + do { \ + uint32 offset, addr; \ + offset = read_uint32(frame_ip); \ + addr = POP_I32(); \ + int32 lane = 0; \ + V128 vec = { 0 }; \ + SIMD_LOAD_LANE_COMMON(vec, register, lane, width); \ + } while (0) + + case SIMD_v128_load32_zero: + { + SIMD_LOAD_ZERO_OP(i32x4, 32); + break; + } + case SIMD_v128_load64_zero: + { + SIMD_LOAD_ZERO_OP(i64x2, 64); + break; + } + +#define SIMD_SINGLE_OP(simde_func) \ + do { \ + V128 v1 = POP_V128(); \ + addr_ret = GET_OFFSET(); \ + \ + simde_v128_t simde_result = simde_func(SIMD_V128_TO_SIMDE_V128(v1)); \ + \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + + /* Float conversion */ + case SIMD_f32x4_demote_f64x2_zero: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_demote_f64x2_zero); + break; + } + case SIMD_f64x2_promote_low_f32x4_zero: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_promote_low_f32x4); + break; + } + + /* i8x16 operations */ + case SIMD_i8x16_abs: + { + SIMD_SINGLE_OP(simde_wasm_i8x16_abs); + break; + } + case SIMD_i8x16_neg: + { + SIMD_SINGLE_OP(simde_wasm_i8x16_neg); + break; + } + case SIMD_i8x16_popcnt: + { + SIMD_SINGLE_OP(simde_wasm_i8x16_popcnt); + break; + } + case SIMD_i8x16_all_true: + { + V128 v1 = POP_V128(); + + bool result = simde_wasm_i8x16_all_true( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + + case SIMD_i8x16_bitmask: + { + V128 v1 = POP_V128(); + + uint32_t result = simde_wasm_i8x16_bitmask( + SIMD_V128_TO_SIMDE_V128(v1)); + + 
addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + case SIMD_i8x16_narrow_i16x8_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_narrow_i16x8); + break; + } + case SIMD_i8x16_narrow_i16x8_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_narrow_i16x8); + break; + } + case SIMD_f32x4_ceil: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_ceil); + break; + } + case SIMD_f32x4_floor: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_floor); + break; + } + case SIMD_f32x4_trunc: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_trunc); + break; + } + case SIMD_f32x4_nearest: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_nearest); + break; + } +#define SIMD_LANE_SHIFT(simde_func) \ + do { \ + int32 count = POP_I32(); \ + V128 v1 = POP_V128(); \ + addr_ret = GET_OFFSET(); \ + \ + simde_v128_t simde_result = \ + simde_func(SIMD_V128_TO_SIMDE_V128(v1), count); \ + \ + V128 result; \ + SIMDE_V128_TO_SIMD_V128(simde_result, result); \ + \ + PUT_V128_TO_ADDR(frame_lp + addr_ret, result); \ + } while (0) + case SIMD_i8x16_shl: + { + SIMD_LANE_SHIFT(simde_wasm_i8x16_shl); + break; + } + case SIMD_i8x16_shr_s: + { + SIMD_LANE_SHIFT(simde_wasm_i8x16_shr); + break; + } + case SIMD_i8x16_shr_u: + { + SIMD_LANE_SHIFT(simde_wasm_u8x16_shr); + break; + } + case SIMD_i8x16_add: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_add); + break; + } + case SIMD_i8x16_add_sat_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_add_sat); + break; + } + case SIMD_i8x16_add_sat_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_add_sat); + break; + } + case SIMD_i8x16_sub: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_sub); + break; + } + case SIMD_i8x16_sub_sat_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_sub_sat); + break; + } + case SIMD_i8x16_sub_sat_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_sub_sat); + break; + } + case SIMD_f64x2_ceil: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_ceil); + break; + } + case SIMD_f64x2_floor: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_floor); + break; + } + case SIMD_i8x16_min_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_min); + break; + } + case SIMD_i8x16_min_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_min); + break; + } + case SIMD_i8x16_max_s: + { + SIMD_DOUBLE_OP(simde_wasm_i8x16_max); + break; + } + case SIMD_i8x16_max_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_max); + break; + } + case SIMD_f64x2_trunc: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_trunc); + break; + } + case SIMD_i8x16_avgr_u: + { + SIMD_DOUBLE_OP(simde_wasm_u8x16_avgr); + break; + } + case SIMD_i16x8_extadd_pairwise_i8x16_s: + { + SIMD_SINGLE_OP(simde_wasm_i16x8_extadd_pairwise_i8x16); + break; + } + case SIMD_i16x8_extadd_pairwise_i8x16_u: + { + SIMD_SINGLE_OP(simde_wasm_u16x8_extadd_pairwise_u8x16); + break; + } + case SIMD_i32x4_extadd_pairwise_i16x8_s: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_extadd_pairwise_i16x8); + break; + } + case SIMD_i32x4_extadd_pairwise_i16x8_u: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_extadd_pairwise_u16x8); + break; + } + + /* i16x8 operations */ + case SIMD_i16x8_abs: + { + SIMD_SINGLE_OP(simde_wasm_i16x8_abs); + break; + } + case SIMD_i16x8_neg: + { + SIMD_SINGLE_OP(simde_wasm_i16x8_neg); + break; + } + case SIMD_i16x8_q15mulr_sat_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_q15mulr_sat); + break; + } + case SIMD_i16x8_all_true: + { + V128 v1 = POP_V128(); + + bool result = simde_wasm_i16x8_all_true( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + case SIMD_i16x8_bitmask: + { + V128 v1 = POP_V128(); + + uint32_t result = simde_wasm_i16x8_bitmask( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; 
+ break; + } + case SIMD_i16x8_narrow_i32x4_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_narrow_i32x4); + break; + } + case SIMD_i16x8_narrow_i32x4_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_narrow_i32x4); + break; + } + case SIMD_i16x8_extend_low_i8x16_s: + { + SIMD_SINGLE_OP(simde_wasm_i16x8_extend_low_i8x16); + break; + } + case SIMD_i16x8_extend_high_i8x16_s: + { + SIMD_SINGLE_OP(simde_wasm_i16x8_extend_high_i8x16); + break; + } + case SIMD_i16x8_extend_low_i8x16_u: + { + SIMD_SINGLE_OP(simde_wasm_u16x8_extend_low_u8x16); + break; + } + case SIMD_i16x8_extend_high_i8x16_u: + { + SIMD_SINGLE_OP(simde_wasm_u16x8_extend_high_u8x16); + break; + } + case SIMD_i16x8_shl: + { + SIMD_LANE_SHIFT(simde_wasm_i16x8_shl); + break; + } + case SIMD_i16x8_shr_s: + { + SIMD_LANE_SHIFT(simde_wasm_i16x8_shr); + break; + } + case SIMD_i16x8_shr_u: + { + SIMD_LANE_SHIFT(simde_wasm_u16x8_shr); + break; + } + case SIMD_i16x8_add: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_add); + break; + } + case SIMD_i16x8_add_sat_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_add_sat); + break; + } + case SIMD_i16x8_add_sat_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_add_sat); + break; + } + case SIMD_i16x8_sub: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_sub); + break; + } + case SIMD_i16x8_sub_sat_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_sub_sat); + break; + } + case SIMD_i16x8_sub_sat_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_sub_sat); + break; + } + case SIMD_f64x2_nearest: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_nearest); + break; + } + case SIMD_i16x8_mul: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_mul); + break; + } + case SIMD_i16x8_min_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_min); + break; + } + case SIMD_i16x8_min_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_min); + break; + } + case SIMD_i16x8_max_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_max); + break; + } + case SIMD_i16x8_max_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_max); + break; + } + case SIMD_i16x8_avgr_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_avgr); + break; + } + case SIMD_i16x8_extmul_low_i8x16_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_extmul_low_i8x16); + break; + } + case SIMD_i16x8_extmul_high_i8x16_s: + { + SIMD_DOUBLE_OP(simde_wasm_i16x8_extmul_high_i8x16); + break; + } + case SIMD_i16x8_extmul_low_i8x16_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_extmul_low_u8x16); + break; + } + case SIMD_i16x8_extmul_high_i8x16_u: + { + SIMD_DOUBLE_OP(simde_wasm_u16x8_extmul_high_u8x16); + break; + } + + /* i32x4 operations */ + case SIMD_i32x4_abs: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_abs); + break; + } + case SIMD_i32x4_neg: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_neg); + break; + } + case SIMD_i32x4_all_true: + { + V128 v1 = POP_V128(); + + bool result = simde_wasm_i32x4_all_true( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + case SIMD_i32x4_bitmask: + { + V128 v1 = POP_V128(); + + uint32_t result = simde_wasm_i32x4_bitmask( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + case SIMD_i32x4_extend_low_i16x8_s: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_extend_low_i16x8); + break; + } + case SIMD_i32x4_extend_high_i16x8_s: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_extend_high_i16x8); + break; + } + case SIMD_i32x4_extend_low_i16x8_u: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_extend_low_u16x8); + break; + } + case SIMD_i32x4_extend_high_i16x8_u: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_extend_high_u16x8); + break; + } + case SIMD_i32x4_shl: + { + SIMD_LANE_SHIFT(simde_wasm_i32x4_shl); + break; + } + 
case SIMD_i32x4_shr_s: + { + SIMD_LANE_SHIFT(simde_wasm_i32x4_shr); + break; + } + case SIMD_i32x4_shr_u: + { + SIMD_LANE_SHIFT(simde_wasm_u32x4_shr); + break; + } + case SIMD_i32x4_add: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_add); + break; + } + case SIMD_i32x4_sub: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_sub); + break; + } + case SIMD_i32x4_mul: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_mul); + break; + } + case SIMD_i32x4_min_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_min); + break; + } + case SIMD_i32x4_min_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_min); + break; + } + case SIMD_i32x4_max_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_max); + break; + } + case SIMD_i32x4_max_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_max); + break; + } + case SIMD_i32x4_dot_i16x8_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_dot_i16x8); + break; + } + case SIMD_i32x4_extmul_low_i16x8_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_extmul_low_i16x8); + break; + } + case SIMD_i32x4_extmul_high_i16x8_s: + { + SIMD_DOUBLE_OP(simde_wasm_i32x4_extmul_high_i16x8); + break; + } + case SIMD_i32x4_extmul_low_i16x8_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_extmul_low_u16x8); + break; + } + case SIMD_i32x4_extmul_high_i16x8_u: + { + SIMD_DOUBLE_OP(simde_wasm_u32x4_extmul_high_u16x8); + break; + } + + /* i64x2 operations */ + case SIMD_i64x2_abs: + { + SIMD_SINGLE_OP(simde_wasm_i64x2_abs); + break; + } + case SIMD_i64x2_neg: + { + SIMD_SINGLE_OP(simde_wasm_i64x2_neg); + break; + } + case SIMD_i64x2_all_true: + { + V128 v1 = POP_V128(); + + bool result = simde_wasm_i64x2_all_true( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + case SIMD_i64x2_bitmask: + { + V128 v1 = POP_V128(); + + uint32_t result = simde_wasm_i64x2_bitmask( + SIMD_V128_TO_SIMDE_V128(v1)); + + addr_ret = GET_OFFSET(); + frame_lp[addr_ret] = result; + break; + } + case SIMD_i64x2_extend_low_i32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_i64x2_extend_low_i32x4); + break; + } + case SIMD_i64x2_extend_high_i32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_i64x2_extend_high_i32x4); + break; + } + case SIMD_i64x2_extend_low_i32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_u64x2_extend_low_u32x4); + break; + } + case SIMD_i64x2_extend_high_i32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_u64x2_extend_high_u32x4); + break; + } + case SIMD_i64x2_shl: + { + SIMD_LANE_SHIFT(simde_wasm_i64x2_shl); + break; + } + case SIMD_i64x2_shr_s: + { + SIMD_LANE_SHIFT(simde_wasm_i64x2_shr); + break; + } + case SIMD_i64x2_shr_u: + { + SIMD_LANE_SHIFT(simde_wasm_u64x2_shr); + break; + } + case SIMD_i64x2_add: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_add); + break; + } + case SIMD_i64x2_sub: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_sub); + break; + } + case SIMD_i64x2_mul: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_mul); + break; + } + case SIMD_i64x2_eq: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_eq); + break; + } + case SIMD_i64x2_ne: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_ne); + break; + } + case SIMD_i64x2_lt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_lt); + break; + } + case SIMD_i64x2_gt_s: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_gt); + break; + } + case SIMD_i64x2_le_s: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_le); + break; + } + case SIMD_i64x2_ge_s: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_ge); + break; + } + case SIMD_i64x2_extmul_low_i32x4_s: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_extmul_low_i32x4); + break; + } + case SIMD_i64x2_extmul_high_i32x4_s: + { + SIMD_DOUBLE_OP(simde_wasm_i64x2_extmul_high_i32x4); + break; + } + case SIMD_i64x2_extmul_low_i32x4_u: + { + 
SIMD_DOUBLE_OP(simde_wasm_u64x2_extmul_low_u32x4); + break; + } + case SIMD_i64x2_extmul_high_i32x4_u: + { + SIMD_DOUBLE_OP(simde_wasm_u64x2_extmul_high_u32x4); + break; + } + + /* f32x4 operations */ + case SIMD_f32x4_abs: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_abs); + break; + } + case SIMD_f32x4_neg: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_neg); + break; + } + case SIMD_f32x4_sqrt: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_sqrt); + break; + } + case SIMD_f32x4_add: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_add); + break; + } + case SIMD_f32x4_sub: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_sub); + break; + } + case SIMD_f32x4_mul: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_mul); + break; + } + case SIMD_f32x4_div: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_div); + break; + } + case SIMD_f32x4_min: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_min); + break; + } + case SIMD_f32x4_max: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_max); + break; + } + case SIMD_f32x4_pmin: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_pmin); + break; + } + case SIMD_f32x4_pmax: + { + SIMD_DOUBLE_OP(simde_wasm_f32x4_pmax); + break; + } + + /* f64x2 operations */ + case SIMD_f64x2_abs: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_abs); + break; + } + case SIMD_f64x2_neg: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_neg); + break; + } + case SIMD_f64x2_sqrt: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_sqrt); + break; + } + case SIMD_f64x2_add: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_add); + break; + } + case SIMD_f64x2_sub: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_sub); + break; + } + case SIMD_f64x2_mul: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_mul); + break; + } + case SIMD_f64x2_div: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_div); + break; + } + case SIMD_f64x2_min: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_min); + break; + } + case SIMD_f64x2_max: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_max); + break; + } + case SIMD_f64x2_pmin: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_pmin); + break; + } + case SIMD_f64x2_pmax: + { + SIMD_DOUBLE_OP(simde_wasm_f64x2_pmax); + break; + } + + /* Conversion operations */ + case SIMD_i32x4_trunc_sat_f32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_trunc_sat_f32x4); + break; + } + case SIMD_i32x4_trunc_sat_f32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_trunc_sat_f32x4); + break; + } + case SIMD_f32x4_convert_i32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_convert_i32x4); + break; + } + case SIMD_f32x4_convert_i32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_f32x4_convert_u32x4); + break; + } + case SIMD_i32x4_trunc_sat_f64x2_s_zero: + { + SIMD_SINGLE_OP(simde_wasm_i32x4_trunc_sat_f64x2_zero); + break; + } + case SIMD_i32x4_trunc_sat_f64x2_u_zero: + { + SIMD_SINGLE_OP(simde_wasm_u32x4_trunc_sat_f64x2_zero); + break; + } + case SIMD_f64x2_convert_low_i32x4_s: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_convert_low_i32x4); + break; + } + case SIMD_f64x2_convert_low_i32x4_u: + { + SIMD_SINGLE_OP(simde_wasm_f64x2_convert_low_u32x4); + break; + } + + default: + wasm_set_exception(module, "unsupported SIMD opcode"); + } + HANDLE_OP_END(); + } +#endif HANDLE_OP(WASM_OP_CALL) { @@ -5845,7 +7488,7 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, #if WASM_ENABLE_LABELS_AS_VALUES == 0 continue; #else - FETCH_OPCODE_AND_DISPATCH(); + FETCH_OPCODE_AND_DISPATCH(); #endif #if WASM_ENABLE_TAIL_CALL != 0 || WASM_ENABLE_GC != 0 @@ -5914,8 +7557,14 @@ wasm_interp_call_func_bytecode(WASMModuleInstance *module, } for (i = 0; i < cur_func->param_count; i++) { - if (cur_func->param_types[i] == VALUE_TYPE_I64 - || cur_func->param_types[i] == VALUE_TYPE_F64) { + if (cur_func->param_types[i] ==
VALUE_TYPE_V128) { + PUT_V128_TO_ADDR( + outs_area->lp, + GET_OPERAND_V128(2 * (cur_func->param_count - i - 1))); + outs_area->lp += 4; + } + else if (cur_func->param_types[i] == VALUE_TYPE_I64 + || cur_func->param_types[i] == VALUE_TYPE_F64) { PUT_I64_TO_ADDR( outs_area->lp, GET_OPERAND(uint64, I64, diff --git a/core/iwasm/interpreter/wasm_loader.c b/core/iwasm/interpreter/wasm_loader.c index 31424d21e..29813d875 100644 --- a/core/iwasm/interpreter/wasm_loader.c +++ b/core/iwasm/interpreter/wasm_loader.c @@ -319,7 +319,8 @@ is_byte_a_type(uint8 type) } #if WASM_ENABLE_SIMD != 0 -#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) +#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \ + || (WASM_ENABLE_FAST_INTERP != 0) static V128 read_i8x16(uint8 *p_buf, char *error_buf, uint32 error_buf_size) { @@ -332,7 +333,8 @@ read_i8x16(uint8 *p_buf, char *error_buf, uint32 error_buf_size) return result; } -#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ +#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \ + (WASM_ENABLE_FAST_INTERP != 0) */ #endif /* end of WASM_ENABLE_SIMD */ static void * @@ -725,7 +727,8 @@ load_init_expr(WASMModule *module, const uint8 **p_buf, const uint8 *buf_end, goto fail; break; #if WASM_ENABLE_SIMD != 0 -#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) +#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \ + || (WASM_ENABLE_FAST_INTERP != 0) /* v128.const */ case INIT_EXPR_TYPE_V128_CONST: { @@ -754,7 +757,8 @@ load_init_expr(WASMModule *module, const uint8 **p_buf, const uint8 *buf_end, #endif break; } -#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ +#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \ + (WASM_ENABLE_FAST_INTERP != 0) */ #endif /* end of WASM_ENABLE_SIMD */ #if WASM_ENABLE_REF_TYPES != 0 || WASM_ENABLE_GC != 0 @@ -4174,7 +4178,8 @@ load_export_section(const uint8 *buf, const uint8 *buf_end, WASMModule *module, return false; } #if WASM_ENABLE_SIMD != 0 -#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) +#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \ + || (WASM_ENABLE_FAST_INTERP != 0) /* TODO: check func type, if it has v128 param or result, report error */ #endif @@ -7347,6 +7352,10 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, case WASM_OP_SET_GLOBAL: case WASM_OP_GET_GLOBAL_64: case WASM_OP_SET_GLOBAL_64: +#if WASM_ENABLE_SIMDE != 0 + case WASM_OP_GET_GLOBAL_V128: + case WASM_OP_SET_GLOBAL_V128: +#endif case WASM_OP_SET_GLOBAL_AUX_STACK: skip_leb_uint32(p, p_end); /* local index */ break; @@ -7723,7 +7732,8 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, } #if WASM_ENABLE_SIMD != 0 -#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) +#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \ + || (WASM_ENABLE_FAST_INTERP != 0) case WASM_OP_SIMD_PREFIX: { uint32 opcode1; @@ -7816,7 +7826,8 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache, } break; } -#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ +#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \ + (WASM_ENABLE_FAST_INTERP != 0) */ #endif /* end of WASM_ENABLE_SIMD */ #if WASM_ENABLE_SHARED_MEMORY != 0 @@ -7991,6 +8002,10 @@ typedef struct WASMLoaderContext { int32 *i32_consts; uint32 i32_const_max_num; uint32 i32_const_num; 
+ /* const buffer for V128 */ + V128 *v128_consts; + uint32 v128_const_max_num; + uint32 v128_const_num; /* processed code */ uint8 *p_code_compiled; @@ -8224,6 +8239,8 @@ wasm_loader_ctx_destroy(WASMLoaderContext *ctx) wasm_runtime_free(ctx->i64_consts); if (ctx->i32_consts) wasm_runtime_free(ctx->i32_consts); + if (ctx->v128_consts) + wasm_runtime_free(ctx->v128_consts); #endif wasm_runtime_free(ctx); } @@ -8281,6 +8298,11 @@ wasm_loader_ctx_init(WASMFunction *func, char *error_buf, uint32 error_buf_size) loader_malloc(sizeof(int32) * loader_ctx->i32_const_max_num, error_buf, error_buf_size))) goto fail; + loader_ctx->v128_const_max_num = 8; + if (!(loader_ctx->v128_consts = + loader_malloc(sizeof(V128) * loader_ctx->v128_const_max_num, + error_buf, error_buf_size))) + goto fail; if (func->param_cell_num >= (int32)INT16_MAX - func->local_cell_num) { set_error_buf(error_buf, error_buf_size, @@ -9139,6 +9161,7 @@ preserve_referenced_local(WASMLoaderContext *loader_ctx, uint8 opcode, bool *preserved, char *error_buf, uint32 error_buf_size) { + uint32 i = 0; int16 preserved_offset = (int16)local_index; @@ -9162,6 +9185,13 @@ preserve_referenced_local(WASMLoaderContext *loader_ctx, uint8 opcode, loader_ctx->preserved_local_offset++; emit_label(EXT_OP_COPY_STACK_TOP); } +#if WASM_ENABLE_SIMDE != 0 + else if (local_type == VALUE_TYPE_V128) { + if (loader_ctx->p_code_compiled) + loader_ctx->preserved_local_offset += 4; + emit_label(EXT_OP_COPY_STACK_TOP_V128); + } +#endif else { if (loader_ctx->p_code_compiled) loader_ctx->preserved_local_offset += 2; @@ -9174,10 +9204,15 @@ preserve_referenced_local(WASMLoaderContext *loader_ctx, uint8 opcode, loader_ctx->frame_offset_bottom[i] = preserved_offset; } - if (is_32bit_type(cur_type)) + if (cur_type == VALUE_TYPE_V128) { + i += 4; + } + else if (is_32bit_type(cur_type)) { i++; - else + } + else { i += 2; + } } (void)error_buf; @@ -9206,7 +9241,10 @@ preserve_local_for_block(WASMLoaderContext *loader_ctx, uint8 opcode, return false; } - if (is_32bit_type(cur_type)) { + if (cur_type == VALUE_TYPE_V128) { + i += 4; + } + else if (is_32bit_type(cur_type)) { i++; } else { @@ -9545,6 +9583,15 @@ cmp_i32_const(const void *p_i32_const1, const void *p_i32_const2) return (i32_const1 < i32_const2) ? -1 : (i32_const1 > i32_const2) ? 
1 : 0; } +static int +cmp_v128_const(const void *p_v128_const1, const void *p_v128_const2) +{ + V128 v128_const1 = *(V128 *)p_v128_const1; + V128 v128_const2 = *(V128 *)p_v128_const2; + + return memcmp(&v128_const1, &v128_const2, sizeof(V128)); +} + static bool wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value, int16 *offset, char *error_buf, @@ -9578,6 +9625,32 @@ wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value, } ctx->i64_consts[ctx->i64_const_num++] = *(int64 *)value; } + else if (type == VALUE_TYPE_V128) { + /* No slot left, emit const instead */ + if (ctx->v128_const_num * 4 > INT16_MAX - 2) { + *offset = 0; + return true; + } + + /* Traverse the list if the const num is small */ + if (ctx->v128_const_num < 10) { + for (uint32 i = 0; i < ctx->v128_const_num; i++) { + if (memcmp(&ctx->v128_consts[i], value, sizeof(V128)) + == 0) { + *offset = -1; + return true; + } + } + } + + if (ctx->v128_const_num >= ctx->v128_const_max_num) { + MEM_REALLOC(ctx->v128_consts, + sizeof(V128) * ctx->v128_const_max_num, + sizeof(V128) * (ctx->v128_const_max_num * 2)); + ctx->v128_const_max_num *= 2; + } + ctx->v128_consts[ctx->v128_const_num++] = *(V128 *)value; + } else { /* Treat i32 and f32 as the same by reading i32 value from the raw bytes */ @@ -9623,6 +9696,17 @@ wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value, *offset = -(uint32)(ctx->i64_const_num * 2 + ctx->i32_const_num) + (uint32)(i64_const - ctx->i64_consts) * 2; } + else if (type == VALUE_TYPE_V128) { + V128 key = *(V128 *)value, *v128_const; + v128_const = bsearch(&key, ctx->v128_consts, ctx->v128_const_num, + sizeof(V128), cmp_v128_const); + if (!v128_const) { /* not found, emit const instead */ + *offset = 0; + return true; + } + *offset = -(uint32)(ctx->v128_const_num) + + (uint32)(v128_const - ctx->v128_consts); + } else { int32 key = *(int32 *)value, *i32_const; i32_const = bsearch(&key, ctx->i32_consts, ctx->i32_const_num, @@ -9819,17 +9903,23 @@ reserve_block_ret(WASMLoaderContext *loader_ctx, uint8 opcode, block_type, &return_types, &reftype_maps, &reftype_map_count); #endif - /* If there is only one return value, use EXT_OP_COPY_STACK_TOP/_I64 instead - * of EXT_OP_COPY_STACK_VALUES for interpreter performance. */ + /* If there is only one return value, use EXT_OP_COPY_STACK_TOP/_I64/V128 + * instead of EXT_OP_COPY_STACK_VALUES for interpreter performance. */ if (return_count == 1) { uint8 cell = (uint8)wasm_value_type_cell_num(return_types[0]); - if (cell <= 2 /* V128 isn't supported whose cell num is 4 */ - && block->dynamic_offset != *(loader_ctx->frame_offset - cell)) { + if (block->dynamic_offset != *(loader_ctx->frame_offset - cell)) { /* insert op_copy before else opcode */ if (opcode == WASM_OP_ELSE) skip_label(); - emit_label(cell == 1 ? EXT_OP_COPY_STACK_TOP - : EXT_OP_COPY_STACK_TOP_I64); +#if WASM_ENABLE_SIMDE != 0 + if (cell == 4) { + emit_label(EXT_OP_COPY_STACK_TOP_V128); + } +#endif + if (cell <= 2) { + emit_label(cell == 1 ? 
EXT_OP_COPY_STACK_TOP + : EXT_OP_COPY_STACK_TOP_I64); + } emit_operand(loader_ctx, *(loader_ctx->frame_offset - cell)); emit_operand(loader_ctx, block->dynamic_offset); @@ -9864,11 +9954,37 @@ reserve_block_ret(WASMLoaderContext *loader_ctx, uint8 opcode, for (i = (int32)return_count - 1; i >= 0; i--) { uint8 cells = (uint8)wasm_value_type_cell_num(return_types[i]); - frame_offset -= cells; - dynamic_offset -= cells; - if (dynamic_offset != *frame_offset) { - value_count++; - total_cel_num += cells; + if (frame_offset - cells < loader_ctx->frame_offset_bottom) { + set_error_buf(error_buf, error_buf_size, "frame offset underflow"); + goto fail; + } + + if (cells == 4) { + bool needs_copy = false; + int16 v128_dynamic = dynamic_offset - cells; + + for (int j = 0; j < 4; j++) { + if (*(frame_offset - j - 1) != (v128_dynamic + j)) { + needs_copy = true; + break; + } + } + + if (needs_copy) { + value_count++; + total_cel_num += cells; + } + + frame_offset -= cells; + dynamic_offset = v128_dynamic; + } + else { + frame_offset -= cells; + dynamic_offset -= cells; + if (dynamic_offset != *frame_offset) { + value_count++; + total_cel_num += cells; + } } } @@ -9904,19 +10020,50 @@ reserve_block_ret(WASMLoaderContext *loader_ctx, uint8 opcode, dynamic_offset = dynamic_offset_org; for (i = (int32)return_count - 1, j = 0; i >= 0; i--) { uint8 cell = (uint8)wasm_value_type_cell_num(return_types[i]); - frame_offset -= cell; - dynamic_offset -= cell; - if (dynamic_offset != *frame_offset) { - /* cell num */ - cells[j] = cell; - /* src offset */ - src_offsets[j] = *frame_offset; - /* dst offset */ - dst_offsets[j] = dynamic_offset; - j++; + + if (cell == 4) { + bool needs_copy = false; + int16 v128_dynamic = dynamic_offset - cell; + + for (int k = 0; k < 4; k++) { + if (*(frame_offset - k - 1) != (v128_dynamic + k)) { + needs_copy = true; + break; + } + } + + if (needs_copy) { + cells[j] = cell; + src_offsets[j] = *(frame_offset - cell); + dst_offsets[j] = v128_dynamic; + j++; + } + + frame_offset -= cell; + dynamic_offset = v128_dynamic; } + else { + frame_offset -= cell; + dynamic_offset -= cell; + if (dynamic_offset != *frame_offset) { + cells[j] = cell; + /* src offset */ + src_offsets[j] = *frame_offset; + /* dst offset */ + dst_offsets[j] = dynamic_offset; + j++; + } + } + if (opcode == WASM_OP_ELSE) { - *frame_offset = dynamic_offset; + if (cell == 4) { + for (int k = 0; k < cell; k++) { + *(frame_offset + k) = dynamic_offset + k; + } + } + else { + *frame_offset = dynamic_offset; + } } else { loader_ctx->frame_offset = frame_offset; @@ -10075,7 +10222,8 @@ check_memory_access_align(uint8 opcode, uint32 align, char *error_buf, } #if WASM_ENABLE_SIMD != 0 -#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) +#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \ + || (WASM_ENABLE_FAST_INTERP != 0) static bool check_simd_memory_access_align(uint8 opcode, uint32 align, char *error_buf, uint32 error_buf_size) @@ -11172,6 +11320,39 @@ re_scan: } } + if (loader_ctx->v128_const_num > 0) { + V128 *v128_consts_old = loader_ctx->v128_consts; + + /* Sort the v128 consts */ + qsort(v128_consts_old, loader_ctx->v128_const_num, sizeof(V128), + cmp_v128_const); + + /* Remove the duplicated v128 consts */ + uint32 k = 1; + for (i = 1; i < loader_ctx->v128_const_num; i++) { + if (!(memcmp(&v128_consts_old[i], &v128_consts_old[i - 1], + sizeof(V128)) + == 0)) { + v128_consts_old[k++] = v128_consts_old[i]; + } + } + + if (k < loader_ctx->v128_const_num) { + V128 *v128_consts_new; + /* Try to 
+                /* Try to reallocate memory with a smaller size */
+                if ((v128_consts_new =
+                         wasm_runtime_malloc((uint32)sizeof(V128) * k))) {
+                    bh_memcpy_s(v128_consts_new, (uint32)sizeof(V128) * k,
+                                v128_consts_old, (uint32)sizeof(V128) * k);
+                    /* Free the old memory */
+                    wasm_runtime_free(v128_consts_old);
+                    loader_ctx->v128_consts = v128_consts_new;
+                    loader_ctx->v128_const_max_num = k;
+                }
+                loader_ctx->v128_const_num = k;
+            }
+        }
+
         if (loader_ctx->i32_const_num > 0) {
             int32 *i32_consts_old = loader_ctx->i32_consts;
@@ -12492,10 +12673,20 @@ re_scan:
 #endif
                 }
 #if WASM_ENABLE_SIMD != 0
-#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
+    || (WASM_ENABLE_FAST_INTERP != 0)
                 else if (*(loader_ctx->frame_ref - 1) == VALUE_TYPE_V128) {
                     loader_ctx->frame_ref -= 4;
                     loader_ctx->stack_cell_num -= 4;
+#if WASM_ENABLE_FAST_INTERP != 0
+                    skip_label();
+                    loader_ctx->frame_offset -= 4;
+                    if ((*(loader_ctx->frame_offset)
+                         > loader_ctx->start_dynamic_offset)
+                        && (*(loader_ctx->frame_offset)
+                            < loader_ctx->max_dynamic_offset))
+                        loader_ctx->dynamic_offset -= 4;
+#endif
                 }
 #endif
 #endif
@@ -12582,10 +12773,12 @@ re_scan:
 #endif /* end of WASM_ENABLE_FAST_INTERP */
                 break;
 #if WASM_ENABLE_SIMD != 0
-#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
+    || (WASM_ENABLE_FAST_INTERP != 0)
             case VALUE_TYPE_V128:
                 break;
-#endif /* (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
+          (WASM_ENABLE_FAST_INTERP != 0) */
 #endif /* WASM_ENABLE_SIMD != 0 */
             default:
             {
@@ -12680,8 +12873,9 @@ re_scan:
                 uint8 opcode_tmp = WASM_OP_SELECT;
 
                 if (type == VALUE_TYPE_V128) {
-#if (WASM_ENABLE_SIMD == 0) \
-    || ((WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0))
+#if (WASM_ENABLE_SIMD == 0)                                         \
+    || ((WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) \
+        && (WASM_ENABLE_FAST_INTERP == 0))
                     set_error_buf(error_buf, error_buf_size,
                                   "SIMD v128 type isn't supported");
                     goto fail;
@@ -13177,10 +13371,21 @@ re_scan:
                         emit_label(EXT_OP_SET_LOCAL_FAST);
                         emit_byte(loader_ctx, (uint8)local_offset);
                     }
-                    else {
+                    else if (is_64bit_type(local_type)) {
                         emit_label(EXT_OP_SET_LOCAL_FAST_I64);
                         emit_byte(loader_ctx, (uint8)local_offset);
                     }
+#if WASM_ENABLE_SIMDE != 0
+                    else if (local_type == VALUE_TYPE_V128) {
+                        emit_label(EXT_OP_SET_LOCAL_FAST_V128);
+                        emit_byte(loader_ctx, (uint8)local_offset);
+                    }
+#endif
+                    else {
+                        set_error_buf(error_buf, error_buf_size,
+                                      "unknown local type");
+                        goto fail;
+                    }
                     POP_OFFSET_TYPE(local_type);
                 }
             }
@@ -13253,6 +13458,12 @@ re_scan:
                         emit_label(EXT_OP_TEE_LOCAL_FAST);
                         emit_byte(loader_ctx, (uint8)local_offset);
                     }
+#if WASM_ENABLE_SIMDE != 0
+                    else if (local_type == VALUE_TYPE_V128) {
+                        emit_label(EXT_OP_TEE_LOCAL_FAST_V128);
+                        emit_byte(loader_ctx, (uint8)local_offset);
+                    }
+#endif
                     else {
                         emit_label(EXT_OP_TEE_LOCAL_FAST_I64);
                         emit_byte(loader_ctx, (uint8)local_offset);
@@ -13341,12 +13552,18 @@ re_scan:
 #endif
                     *p_org = WASM_OP_GET_GLOBAL_64;
                 }
-#else  /* else of WASM_ENABLE_FAST_INTERP */
+#else /* else of WASM_ENABLE_FAST_INTERP */
                 if (global_type == VALUE_TYPE_I64
                     || global_type == VALUE_TYPE_F64) {
                     skip_label();
                     emit_label(WASM_OP_GET_GLOBAL_64);
                 }
+#if WASM_ENABLE_SIMDE != 0
+                if (global_type == VALUE_TYPE_V128) {
+                    skip_label();
+                    emit_label(WASM_OP_GET_GLOBAL_V128);
+                }
+#endif /* end of WASM_ENABLE_SIMDE */
                 emit_uint32(loader_ctx, global_idx);
                 PUSH_OFFSET_TYPE(global_type);
 #endif /* end of WASM_ENABLE_FAST_INTERP */
@@ -13430,7 +13647,7 @@ re_scan:
                     func->has_op_set_global_aux_stack = true;
 #endif
                 }
-#else  /* else of WASM_ENABLE_FAST_INTERP */
+#else /* else of WASM_ENABLE_FAST_INTERP */
                 if (global_type == VALUE_TYPE_I64
                     || global_type == VALUE_TYPE_F64) {
                     skip_label();
@@ -13441,6 +13658,12 @@ re_scan:
                     skip_label();
                     emit_label(WASM_OP_SET_GLOBAL_AUX_STACK);
                 }
+#if WASM_ENABLE_SIMDE != 0
+                else if (global_type == VALUE_TYPE_V128) {
+                    skip_label();
+                    emit_label(WASM_OP_SET_GLOBAL_V128);
+                }
+#endif /* end of WASM_ENABLE_SIMDE */
                 emit_uint32(loader_ctx, global_idx);
                 POP_OFFSET_TYPE(global_type);
 #endif /* end of WASM_ENABLE_FAST_INTERP */
@@ -15285,7 +15508,8 @@ re_scan:
             }
 
 #if WASM_ENABLE_SIMD != 0
-#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0)
+#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
+    || (WASM_ENABLE_FAST_INTERP != 0)
             case WASM_OP_SIMD_PREFIX:
             {
                 uint32 opcode1;
@@ -15297,6 +15521,10 @@ re_scan:
 
                 pb_read_leb_uint32(p, p_end, opcode1);
 
+#if WASM_ENABLE_FAST_INTERP != 0
+                emit_byte(loader_ctx, opcode1);
+#endif
+
                 /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h */
                 switch (opcode1) {
@@ -15324,6 +15552,10 @@ re_scan:
                         pb_read_leb_mem_offset(p, p_end,
                                                mem_offset); /* offset */
 
+#if WASM_ENABLE_FAST_INTERP != 0
+                        emit_uint32(loader_ctx, mem_offset);
+#endif
+
                         POP_AND_PUSH(mem_offset_type, VALUE_TYPE_V128);
 #if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0
                         func->has_memory_operations = true;
@@ -15344,6 +15576,10 @@ re_scan:
                         pb_read_leb_mem_offset(p, p_end,
                                                mem_offset); /* offset */
 
+#if WASM_ENABLE_FAST_INTERP != 0
+                        emit_uint32(loader_ctx, mem_offset);
+#endif
+
                        POP_V128();
                         POP_MEM_OFFSET();
 #if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0
@@ -15355,7 +15591,13 @@ re_scan:
                     /* basic operation */
                     case SIMD_v128_const:
                     {
+                        uint64 high, low;
                         CHECK_BUF1(p, p_end, 16);
+#if WASM_ENABLE_FAST_INTERP != 0
+                        wasm_runtime_read_v128(p, &high, &low);
+                        emit_uint64(loader_ctx, high);
+                        emit_uint64(loader_ctx, low);
+#endif
                         p += 16;
                         PUSH_V128();
                         break;
@@ -15367,12 +15609,17 @@ re_scan:
                         CHECK_BUF1(p, p_end, 16);
                         mask = read_i8x16(p, error_buf, error_buf_size);
-                        p += 16;
                         if (!check_simd_shuffle_mask(mask, error_buf,
                                                      error_buf_size)) {
                             goto fail;
                         }
-
+#if WASM_ENABLE_FAST_INTERP != 0
+                        uint64 high, low;
+                        wasm_runtime_read_v128(p, &high, &low);
+                        emit_uint64(loader_ctx, high);
+                        emit_uint64(loader_ctx, low);
+#endif
+                        p += 16;
                         POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
                         break;
                     }
@@ -15443,14 +15690,25 @@ re_scan:
                                                  error_buf_size)) {
                             goto fail;
                         }
-
+#if WASM_ENABLE_FAST_INTERP != 0
+                        emit_byte(loader_ctx, lane);
+#endif
                         if (replace[opcode1 - SIMD_i8x16_extract_lane_s]) {
+#if WASM_ENABLE_FAST_INTERP != 0
+                            if (!(wasm_loader_pop_frame_ref_offset(
+                                    loader_ctx,
+                                    replace[opcode1
+                                            - SIMD_i8x16_extract_lane_s],
+                                    error_buf, error_buf_size)))
+                                goto fail;
+#else
                             if (!(wasm_loader_pop_frame_ref(
                                     loader_ctx,
                                     replace[opcode1
                                             - SIMD_i8x16_extract_lane_s],
                                     error_buf, error_buf_size)))
                                 goto fail;
+#endif /* end of WASM_ENABLE_FAST_INTERP != 0 */
                         }
 
                         POP_AND_PUSH(
@@ -15569,9 +15827,14 @@ re_scan:
                                                  error_buf_size)) {
                             goto fail;
                         }
-
+#if WASM_ENABLE_FAST_INTERP != 0
+                        emit_uint32(loader_ctx, mem_offset);
+#endif
                         POP_V128();
                         POP_MEM_OFFSET();
+#if WASM_ENABLE_FAST_INTERP != 0
+                        emit_byte(loader_ctx, lane);
+#endif
                         if (opcode1 < SIMD_v128_store8_lane) {
                             PUSH_V128();
                         }
@@ -15594,7 +15857,9 @@ re_scan:
                         pb_read_leb_mem_offset(p, p_end,
                                                mem_offset); /* offset */
-
+#if WASM_ENABLE_FAST_INTERP != 0
+                        emit_uint32(loader_ctx, mem_offset);
+#endif
                         POP_AND_PUSH(mem_offset_type, VALUE_TYPE_V128);
 #if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0
                         func->has_memory_operations = true;
@@ -15943,7 +16208,8 @@ re_scan:
                         }
                         break;
                     }
-#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */
+#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
+          (WASM_ENABLE_FAST_INTERP != 0) */
 #endif /* end of WASM_ENABLE_SIMD */
 
 #if WASM_ENABLE_SHARED_MEMORY != 0
@@ -16123,8 +16389,9 @@ re_scan:
     if (loader_ctx->p_code_compiled == NULL)
         goto re_scan;
 
-    func->const_cell_num =
-        loader_ctx->i64_const_num * 2 + loader_ctx->i32_const_num;
+    func->const_cell_num = loader_ctx->i64_const_num * 2
+                           + loader_ctx->v128_const_num * 4
+                           + loader_ctx->i32_const_num;
     if (func->const_cell_num > 0) {
         if (!(func->consts = loader_malloc((uint64)sizeof(uint32)
                                                * func->const_cell_num,
@@ -16143,6 +16410,12 @@ re_scan:
                         loader_ctx->i32_consts,
                         (uint32)sizeof(int32) * loader_ctx->i32_const_num);
         }
+        if (loader_ctx->v128_const_num > 0) {
+            bh_memcpy_s(func->consts,
+                        (uint32)sizeof(V128) * loader_ctx->v128_const_num,
+                        loader_ctx->v128_consts,
+                        (uint32)sizeof(V128) * loader_ctx->v128_const_num);
+        }
     }
 
     func->max_stack_cell_num = loader_ctx->preserved_local_offset
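The loader changes above pool `v128.const` values the same way the existing i32/i64 constant pools work: collect during the first scan, then sort, de-duplicate, and `bsearch` during the second scan so each constant resolves to a negative offset into the function's constant area. The standalone sketch below illustrates only that sort/dedup/bsearch pattern; the `v128_t` type, the helper names, and the simplified offset math are illustrative stand-ins, not WAMR's actual code (which also accounts for the 4-cell width of a v128 slot).

``` C
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for WAMR's V128 value type. */
typedef struct {
    uint8_t bytes[16];
} v128_t;

static int
cmp_v128(const void *a, const void *b)
{
    return memcmp(a, b, sizeof(v128_t));
}

/* Between the two loader passes: sort and strip duplicates so the pool
   can be searched with bsearch() on the second pass. Returns the new size. */
static uint32_t
dedup_pool(v128_t *pool, uint32_t num)
{
    if (num < 2)
        return num;
    qsort(pool, num, sizeof(v128_t), cmp_v128);
    uint32_t k = 1;
    for (uint32_t i = 1; i < num; i++) {
        if (memcmp(&pool[i], &pool[i - 1], sizeof(v128_t)) != 0)
            pool[k++] = pool[i];
    }
    return k;
}

/* Second pass: look the constant up and express it as a negative offset
   relative to the end of the pool; 0 means "not pooled, keep the immediate". */
static int32_t
const_offset(const v128_t *pool, uint32_t num, const v128_t *key)
{
    const v128_t *hit = bsearch(key, pool, num, sizeof(v128_t), cmp_v128);
    if (!hit)
        return 0;
    return -(int32_t)num + (int32_t)(hit - pool);
}

int
main(void)
{
    v128_t pool[4] = { { { 1 } }, { { 2 } }, { { 1 } }, { { 3 } } };
    uint32_t num = dedup_pool(pool, 4);
    v128_t key = { { 2 } };
    printf("pool size %u, offset %d\n", (unsigned)num,
           (int)const_offset(pool, num, &key));
    return 0;
}
```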
diff --git a/core/iwasm/interpreter/wasm_opcode.h b/core/iwasm/interpreter/wasm_opcode.h
index 76647454b..9660bb123 100644
--- a/core/iwasm/interpreter/wasm_opcode.h
+++ b/core/iwasm/interpreter/wasm_opcode.h
@@ -278,6 +278,15 @@ typedef enum WASMOpcode {
     DEBUG_OP_BREAK = 0xdc, /* debug break point */
 #endif
 
+#if WASM_ENABLE_JIT != 0 \
+    || WASM_ENABLE_FAST_INTERP != 0 && WASM_ENABLE_SIMD != 0
+    EXT_OP_SET_LOCAL_FAST_V128 = 0xdd,
+    EXT_OP_TEE_LOCAL_FAST_V128 = 0xde,
+    EXT_OP_COPY_STACK_TOP_V128 = 0xdf,
+    WASM_OP_GET_GLOBAL_V128 = 0xe0,
+    WASM_OP_SET_GLOBAL_V128 = 0xe1,
+#endif
+
     /* Post-MVP extend op prefix */
     WASM_OP_GC_PREFIX = 0xfb,
     WASM_OP_MISC_PREFIX = 0xfc,
@@ -779,16 +788,27 @@ typedef enum WASMAtomicEXTOpcode {
 #else
 #define DEF_DEBUG_BREAK_HANDLE()
 #endif
-
 #define SET_GOTO_TABLE_ELEM(opcode) [opcode] = HANDLE_OPCODE(opcode)
 
-#if WASM_ENABLE_JIT != 0 && WASM_ENABLE_SIMD != 0
+#if (WASM_ENABLE_JIT != 0 || WASM_ENABLE_FAST_INTERP != 0) \
+    && WASM_ENABLE_SIMD != 0
 #define SET_GOTO_TABLE_SIMD_PREFIX_ELEM() \
     SET_GOTO_TABLE_ELEM(WASM_OP_SIMD_PREFIX),
 #else
 #define SET_GOTO_TABLE_SIMD_PREFIX_ELEM()
 #endif
 
+#if (WASM_ENABLE_FAST_INTERP != 0) && WASM_ENABLE_SIMD != 0
+#define DEF_EXT_V128_HANDLE()                                   \
+    SET_GOTO_TABLE_ELEM(EXT_OP_SET_LOCAL_FAST_V128), /* 0xdd */ \
+    SET_GOTO_TABLE_ELEM(EXT_OP_TEE_LOCAL_FAST_V128), /* 0xde */ \
+    SET_GOTO_TABLE_ELEM(EXT_OP_COPY_STACK_TOP_V128), /* 0xdf */ \
+    SET_GOTO_TABLE_ELEM(WASM_OP_GET_GLOBAL_V128),    /* 0xe0 */ \
+    SET_GOTO_TABLE_ELEM(WASM_OP_SET_GLOBAL_V128),    /* 0xe1 */
+
+#else
+#define DEF_EXT_V128_HANDLE()
+#endif
 
 /*
  * Macro used to generate computed goto tables for the C interpreter.
  */
@@ -1020,7 +1040,7 @@ typedef enum WASMAtomicEXTOpcode {
         SET_GOTO_TABLE_ELEM(WASM_OP_MISC_PREFIX),   /* 0xfc */ \
         SET_GOTO_TABLE_SIMD_PREFIX_ELEM()           /* 0xfd */ \
         SET_GOTO_TABLE_ELEM(WASM_OP_ATOMIC_PREFIX), /* 0xfe */ \
-        DEF_DEBUG_BREAK_HANDLE()                               \
+        DEF_DEBUG_BREAK_HANDLE() DEF_EXT_V128_HANDLE()         \
     };
 
 #ifdef __cplusplus
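The five extended opcodes above (0xdd–0xe1) exist only in the fast interpreter's internal bytecode, and `DEF_EXT_V128_HANDLE()` simply appends their handler entries to the computed-goto table built from `SET_GOTO_TABLE_ELEM()`. The toy dispatcher below shows that labels-as-values technique in isolation (a GCC/Clang extension); the opcode numbers and handler names here are invented for the illustration and are not the interpreter's real handlers.

``` C
#include <stdint.h>
#include <stdio.h>

enum { OP_END = 0x0b, OP_SET_LOCAL_V128 = 0xdd };

static int64_t
run(const uint8_t *ip)
{
    /* One handler label per opcode byte, analogous to SET_GOTO_TABLE_ELEM();
       DEF_EXT_V128_HANDLE() just contributes extra entries like the
       OP_SET_LOCAL_V128 one below. */
    static const void *handle_table[256] = {
        [OP_END] = &&handle_end,
        [OP_SET_LOCAL_V128] = &&handle_set_local_v128,
    };
    int64_t handled = 0;

#define DISPATCH() goto *handle_table[*ip++]
    DISPATCH();

handle_set_local_v128:
    /* A real handler would copy the 4 stack cells of the v128 value here. */
    handled++;
    DISPATCH();

handle_end:
    return handled;
#undef DISPATCH
}

int
main(void)
{
    const uint8_t code[] = { OP_SET_LOCAL_V128, OP_SET_LOCAL_V128, OP_END };
    printf("handled %d v128 ops\n", (int)run(code));
    return 0;
}
```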
diff --git a/core/iwasm/libraries/simde/simde.cmake b/core/iwasm/libraries/simde/simde.cmake
new file mode 100644
index 000000000..eeb0e8d1f
--- /dev/null
+++ b/core/iwasm/libraries/simde/simde.cmake
@@ -0,0 +1,21 @@
+# Copyright (C) 2024 Amazon Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# simde is a header-only library
+
+set (LIB_SIMDE_DIR ${CMAKE_CURRENT_LIST_DIR})
+
+add_definitions (-DWASM_ENABLE_SIMDE=1)
+
+include_directories(${LIB_SIMDE_DIR} ${LIB_SIMDE_DIR}/simde)
+
+include(FetchContent)
+
+FetchContent_Declare(
+  simde
+  GIT_REPOSITORY https://github.com/simd-everywhere/simde
+  GIT_TAG v0.8.2
+)
+
+message("-- Fetching simde ..")
+FetchContent_MakeAvailable(simde)
+include_directories("${simde_SOURCE_DIR}")
diff --git a/doc/build_wamr.md b/doc/build_wamr.md
index b8cfcc989..36b15e568 100644
--- a/doc/build_wamr.md
+++ b/doc/build_wamr.md
@@ -132,7 +132,11 @@ cmake -DWAMR_BUILD_PLATFORM=linux -DWAMR_BUILD_TARGET=ARM
 
 ### **Enable 128-bit SIMD feature**
 - **WAMR_BUILD_SIMD**=1/0, default to enable if not set
-> Note: only supported in AOT mode x86-64 target.
+> Note: supported in AOT mode, JIT mode, and fast-interpreter mode with the SIMDe library.
+
+### **Enable SIMDe library for SIMD in fast interpreter**
+- **WAMR_BUILD_LIB_SIMDE**=1/0, default to disable if not set
+> Note: If enabled, the SIMDe (SIMD Everywhere) library will be used to implement SIMD operations in fast interpreter mode.
 
 ### **Enable Exception Handling**
 - **WAMR_BUILD_EXCE_HANDLING**=1/0, default to disable if not set
@@ -335,4 +339,11 @@ Or if we want to enable interpreter, disable AOT and WASI, and build as X86_32,
 ``` Bash
 cmake .. -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_AOT=0 -DWAMR_BUILD_LIBC_WASI=0 -DWAMR_BUILD_TARGET=X86_32
-```
\ No newline at end of file
+```
+
+When enabling SIMD for fast interpreter mode, you'll need to enable both SIMD and the SIMDe library:
+
+``` Bash
+
+cmake .. -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_SIMD=1 -DWAMR_BUILD_LIB_SIMDE=1
+```
diff --git a/tests/wamr-test-suites/test_wamr.sh b/tests/wamr-test-suites/test_wamr.sh
index 31f8b3746..183033751 100755
--- a/tests/wamr-test-suites/test_wamr.sh
+++ b/tests/wamr-test-suites/test_wamr.sh
@@ -913,8 +913,8 @@ function do_execute_in_running_mode()
         fi
     fi
 
     if [[ ${ENABLE_SIMD} -eq 1 ]]; then
-        if [[ "${RUNNING_MODE}" != "jit" && "${RUNNING_MODE}" != "aot" ]]; then
-            echo "support simd in llvm-jit mode and aot mode"
+        if [[ "${RUNNING_MODE}" != "jit" && "${RUNNING_MODE}" != "aot" && "${RUNNING_MODE}" != "fast-interp" ]]; then
+            echo "support simd in llvm-jit, aot and fast-interp mode"
             return 0;
         fi
    fi
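For context on the dependency fetched above: SIMDe is a header-only library that lets plain C code use x86-style intrinsic names on any target it supports, falling back to scalar emulation where native SIMD is unavailable, which is what allows the fast interpreter to handle v128 opcodes without per-architecture assembly. The sketch below only illustrates that style of usage, assuming a SIMDe checkout (such as the one FetchContent downloads) is on the include path; it is not WAMR's actual opcode handler code.

``` C
#include <stdint.h>
#include <stdio.h>
#include <simde/x86/sse2.h>

/* i8x16.add over two 16-byte lanes, the way a v128 handler might do it:
   the simde_mm_* calls compile to SSE2 where available and to portable
   fallback code elsewhere. */
static void
i8x16_add(const uint8_t *a, const uint8_t *b, uint8_t *out)
{
    simde__m128i va = simde_mm_loadu_si128((const simde__m128i *)a);
    simde__m128i vb = simde_mm_loadu_si128((const simde__m128i *)b);
    simde_mm_storeu_si128((simde__m128i *)out, simde_mm_add_epi8(va, vb));
}

int
main(void)
{
    uint8_t a[16], b[16], out[16];
    for (int i = 0; i < 16; i++) {
        a[i] = (uint8_t)i;
        b[i] = 1;
    }
    i8x16_add(a, b, out);
    printf("lane 0 = %u, lane 15 = %u\n", out[0], out[15]); /* 1 and 16 */
    return 0;
}
```

Under those assumptions it can be compiled with something like `cc -I/path/to/simde example.c`, mirroring how the runtime itself only needs the SIMDe headers on its include path once WAMR_BUILD_LIB_SIMDE is enabled.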