Merge dev/simd for fast-interp (#4131)

* Implement the first few SIMD opcodes for fast interpreter (v128.const, v128.any_true) (#3818)

Tested on the following code:
```
(module
  (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32)))
  (memory (export "memory") 1)

  ;; WASI entry point
  (func $main (export "_start")
    v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    v128.any_true
    if
      unreachable
    end
    
    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15     
    v128.any_true
    i32.const 0
    i32.eq
    if
      unreachable
    end

    i32.const 0
    call $proc_exit
  )
)
```

* implement POP_V128()

This simplifies the SIMD implementation for the fast interpreter.

* Add all SIMD operations into wasm_interp_fast switch

* Add V128 comparison operations

Tested using
```
(module
  (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32)))

  (memory (export "memory") 1)

  (func $assert_true (param v128)
    local.get 0
    v128.any_true
    i32.eqz
    if
      unreachable
    end
  )

  (func $main (export "_start")
    ;; Test v128.not
    v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    v128.not
    v128.const i8x16 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
    i8x16.eq
    call $assert_true

    ;; Test v128.and
    v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0
    v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0
    v128.and
    v128.const i8x16 255 255 0 0 0 0 0 0 255 255 0 0 0 0 0 0
    i8x16.eq
    call $assert_true

    ;; Test v128.andnot
    v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0
    v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0
    v128.andnot
    v128.const i8x16 0 0 255 255 0 0 0 0 0 0 255 255 0 0 0 0
    i8x16.eq
    call $assert_true

    ;; Test v128.or
    v128.const i8x16 255 255 0 0 0 0 255 255 255 255 0 0 0 0 255 0
    v128.const i8x16 0 0 255 255 255 255 0 0 0 0 255 255 255 255 0 0
    v128.or
    v128.const i8x16 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 0
    i8x16.eq
    call $assert_true

    ;; Test v128.xor
    v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0
    v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0
    v128.xor
    v128.const i8x16 0 0 255 255 255 255 0 0 0 0 255 255 255 255 0 0
    i8x16.eq
    call $assert_true

    i32.const 0
    call $proc_exit
  )
)
```

* Add first NEON SIMD opcode implementations to fast interpreter (#3859)

Add some implementations of SIMD opcodes using NEON instructions.
Tested using:
```wast
(module
  (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32)))
  (memory (export "memory") 1)

  (func $assert_true (param v128)
    local.get 0
    v128.any_true 
    i32.eqz
    if
      unreachable
    end
  )
  (func $main (export "_start")
    i32.const 0
    i32.const 32
    memory.grow
    drop

    i32.const 0
    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    v128.store

    i32.const 0
    v128.load

    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    i8x16.eq
    call $assert_true

    i32.const 16
    v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    v128.store

    i32.const 16
    v128.load
    v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    i8x16.eq
    call $assert_true

    i32.const 0
    v128.load
    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    i8x16.eq
    call $assert_true
    drop

    i32.const 0
    i32.const 1
    memory.grow
    drop

    i32.const 0
    i64.const 0x7F80FF017E02FE80
    i64.store

    i32.const 0
    v128.load8x8_s

    v128.const i16x8 127 -128 -1 1 126 2 -2 -128

    i16x8.eq
    call $assert_true

    i32.const 0
    i64.const 0x80FE027E01FF807F
    i64.store

    i32.const 0
    v128.load8x8_u

    v128.const i16x8 128 254 2 126 1 255 128 127

    i16x8.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000FFFE7FFF0001
    i64.store

    i32.const 0
    v128.load16x4_s

    v128.const i32x4 -32768 -2 32767 1

    i32x4.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000FFFE7FFF0001 
    i64.store

    i32.const 0
    v128.load16x4_u

    v128.const i32x4 32768 65534 32767 1   

    i32x4.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000000000000001
    i64.store

    i32.const 0
    v128.load32x2_s

    v128.const i64x2 -2147483648 1 

    i64x2.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000000000000001
    i64.store

    i32.const 0
    v128.load32x2_u

    v128.const i64x2 2147483648 1

    i64x2.eq
    call $assert_true

    call $proc_exit
  )
)
```

* Emit imm for lane extract and replace (#3906)

* Fix replacement value not being correct (#3919)

* Implement load lanes opcodes for wasm (#3942)

* Implement final SIMD opcodes: store lane (#4001)

* Fix load/store (#4054)

* Correctly use unsigned functions (#4055)

* implement local and function calls for v128 in the fast interpreter

* Fix splat opcodes, add V128 handling in preserve_referenced_local and reserve_block_ret

* Fix incorrect memory overflow values + SIMD ifdefs

* Fix load/load_splat macros

* Correct #endif in the wasm loader

* Update core/iwasm/interpreter/wasm_opcode.h

* Fix spec tests when WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS is 0

* Resolve merge conflicts arising from main -> dev/simd_for_interp and implement fast interpreter const offset loader support for V128

* Enable SIMDe tests on CI

* Document WAMR_BUILD_LIB_SIMDE

---------

Co-authored-by: James Marsh <mrshnja@amazon.co.uk>
Co-authored-by: jammar1 <108334558+jammar1@users.noreply.github.com>
Co-authored-by: Maks Litskevich <makslit@amazon.com>
Co-authored-by: Marcin Kolny <marcin.kolny@gmail.com>
Co-authored-by: Wenyong Huang <wenyong.huang@intel.com>
This commit is contained in:
Marcin Kolny 2025-03-20 06:23:20 +00:00 committed by GitHub
parent c30e65ba5d
commit efa8019bdb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 2189 additions and 73 deletions

View File

@ -158,6 +158,7 @@ jobs:
"-DWAMR_BUILD_PERF_PROFILING=1", "-DWAMR_BUILD_PERF_PROFILING=1",
"-DWAMR_BUILD_REF_TYPES=1", "-DWAMR_BUILD_REF_TYPES=1",
"-DWAMR_BUILD_SIMD=1", "-DWAMR_BUILD_SIMD=1",
"-DWAMR_BUILD_LIB_SIMDE=1",
"-DWAMR_BUILD_TAIL_CALL=1", "-DWAMR_BUILD_TAIL_CALL=1",
"-DWAMR_DISABLE_HW_BOUND_CHECK=1", "-DWAMR_DISABLE_HW_BOUND_CHECK=1",
"-DWAMR_BUILD_MEMORY64=1", "-DWAMR_BUILD_MEMORY64=1",
@ -178,11 +179,9 @@ jobs:
make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1" make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
- make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS - make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1" make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
# SIMD only on JIT/AOT mode # SIMD only on JIT/AOT/fast interpreter mode
- make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS - make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
make_options_feature: "-DWAMR_BUILD_SIMD=1" make_options_feature: "-DWAMR_BUILD_SIMD=1"
- make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
make_options_feature: "-DWAMR_BUILD_SIMD=1"
# DEBUG_INTERP only on CLASSIC INTERP mode # DEBUG_INTERP only on CLASSIC INTERP mode
- make_options_run_mode: $AOT_BUILD_OPTIONS - make_options_run_mode: $AOT_BUILD_OPTIONS
make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1" make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
@ -649,11 +648,9 @@ jobs:
test_option: $WAMR_COMPILER_TEST_OPTIONS test_option: $WAMR_COMPILER_TEST_OPTIONS
exclude: exclude:
# incompatible modes and features # incompatible modes and features
# classic-interp and fast-interp don't support simd # classic-interp doesn't support simd
- running_mode: "classic-interp" - running_mode: "classic-interp"
test_option: $SIMD_TEST_OPTIONS test_option: $SIMD_TEST_OPTIONS
- running_mode: "fast-interp"
test_option: $SIMD_TEST_OPTIONS
# llvm jit doesn't support multi module # llvm jit doesn't support multi module
- running_mode: "jit" - running_mode: "jit"
test_option: $MULTI_MODULES_TEST_OPTIONS test_option: $MULTI_MODULES_TEST_OPTIONS

View File

@ -49,7 +49,7 @@ env:
# ref types enabled in wamrc by default, so we need to enable it for iwasm in AOT mode # ref types enabled in wamrc by default, so we need to enable it for iwasm in AOT mode
AOT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_REF_TYPES=1" AOT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_REF_TYPES=1"
CLASSIC_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0" CLASSIC_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0" FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_SIMD=0"
FAST_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=1" FAST_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=1"
LLVM_LAZY_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1" LLVM_LAZY_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1"
LLVM_EAGER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=0" LLVM_EAGER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=0"
@ -97,7 +97,7 @@ jobs:
"-DWAMR_BUILD_PERF_PROFILING=1", "-DWAMR_BUILD_PERF_PROFILING=1",
"-DWAMR_BUILD_REF_TYPES=1", "-DWAMR_BUILD_REF_TYPES=1",
# doesn't support # doesn't support
# "-DWAMR_BUILD_SIMD=1", "-DWAMR_BUILD_SIMD=0",
"-DWAMR_BUILD_TAIL_CALL=1", "-DWAMR_BUILD_TAIL_CALL=1",
"-DWAMR_DISABLE_HW_BOUND_CHECK=1", "-DWAMR_DISABLE_HW_BOUND_CHECK=1",
"-DWAMR_BUILD_SGX_IPFS=1", "-DWAMR_BUILD_SGX_IPFS=1",

View File

@ -300,6 +300,9 @@ endif ()
if (WAMR_BUILD_LIB_RATS EQUAL 1) if (WAMR_BUILD_LIB_RATS EQUAL 1)
message (" Lib rats enabled") message (" Lib rats enabled")
endif() endif()
if ((WAMR_BUILD_LIB_SIMDE EQUAL 1))
message (" Lib simde enabled")
endif()
################## WAMR features ################## ################## WAMR features ##################
if (WAMR_BUILD_MULTI_MODULE EQUAL 1) if (WAMR_BUILD_MULTI_MODULE EQUAL 1)
add_definitions (-DWASM_ENABLE_MULTI_MODULE=1) add_definitions (-DWASM_ENABLE_MULTI_MODULE=1)
@ -371,11 +374,17 @@ else ()
message (" Wakeup of blocking operations enabled") message (" Wakeup of blocking operations enabled")
endif () endif ()
if (WAMR_BUILD_SIMD EQUAL 1) if (WAMR_BUILD_SIMD EQUAL 1)
if (NOT WAMR_BUILD_TARGET MATCHES "RISCV64.*") if (WAMR_BUILD_FAST_INTERP EQUAL 1 AND WAMR_BUILD_SIMDE EQUAL 0)
add_definitions (-DWASM_ENABLE_SIMD=1) set(SIMD_ENABLED 0)
else () message(" SIMD disabled for fast-interp as simde is not being built")
elseif (WAMR_BUILD_TARGET MATCHES "RISCV64.*")
set(SIMD_ENABLED 0)
message (" SIMD disabled due to not supported on target RISCV64") message (" SIMD disabled due to not supported on target RISCV64")
else()
set(SIMD_ENABLED 1)
message (" SIMD enabled")
endif () endif ()
add_definitions(-DWASM_ENABLE_SIMD=${SIMD_ENABLED})
endif () endif ()
if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1) if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1)
add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1) add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1)

View File

@ -155,6 +155,16 @@ if (WAMR_BUILD_LIB_RATS EQUAL 1)
include (${IWASM_DIR}/libraries/lib-rats/lib_rats.cmake) include (${IWASM_DIR}/libraries/lib-rats/lib_rats.cmake)
endif () endif ()
if (WAMR_BUILD_SIMD EQUAL 1 AND WAMR_BUILD_FAST_INTERP EQUAL 1)
if (WAMR_BUILD_PLATFORM STREQUAL "windows")
message(STATUS "SIMDe doesnt support platform " ${WAMR_BUILD_PLATFORM})
set(WAMR_BUILD_SIMDE 0)
else()
include (${IWASM_DIR}/libraries/simde/simde.cmake)
set (WAMR_BUILD_SIMDE 1)
endif()
endif ()
if (WAMR_BUILD_WASM_CACHE EQUAL 1) if (WAMR_BUILD_WASM_CACHE EQUAL 1)
include (${WAMR_ROOT_DIR}/build-scripts/involve_boringssl.cmake) include (${WAMR_ROOT_DIR}/build-scripts/involve_boringssl.cmake)
endif () endif ()

View File

@ -322,6 +322,12 @@
#define WASM_ENABLE_SIMD 0 #define WASM_ENABLE_SIMD 0
#endif #endif
/* Disable SIMDe (used in the fast interpreter for SIMD opcodes)
unless used elsewhere */
#ifndef WASM_ENABLE_SIMDE
#define WASM_ENABLE_SIMDE 0
#endif
/* GC performance profiling */ /* GC performance profiling */
#ifndef WASM_ENABLE_GC_PERF_PROFILING #ifndef WASM_ENABLE_GC_PERF_PROFILING
#define WASM_ENABLE_GC_PERF_PROFILING 0 #define WASM_ENABLE_GC_PERF_PROFILING 0

View File

@ -151,7 +151,8 @@ is_valid_value_type(uint8 type)
bool bool
is_valid_value_type_for_interpreter(uint8 value_type) is_valid_value_type_for_interpreter(uint8 value_type)
{ {
#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) #if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) \
&& (WASM_ENABLE_FAST_INTERP == 0)
/* /*
* Note: regardless of WASM_ENABLE_SIMD, our interpreters don't have * Note: regardless of WASM_ENABLE_SIMD, our interpreters don't have
* SIMD implemented. It's safer to reject v128, especially for the * SIMD implemented. It's safer to reject v128, especially for the

View File

@ -37,6 +37,10 @@ extern "C" {
do { \ do { \
*(int64 *)(addr) = (int64)(value); \ *(int64 *)(addr) = (int64)(value); \
} while (0) } while (0)
/* Write a V128 value to addr with a single V128-typed assignment.
   NOTE(review): this definition appears to sit in the
   WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 branch, so addr is not
   required to be 16-byte aligned here - confirm against the enclosing #if. */
#define PUT_V128_TO_ADDR(addr, value) \
do { \
*(V128 *)(addr) = (value); \
} while (0)
#define PUT_F64_TO_ADDR(addr, value) \ #define PUT_F64_TO_ADDR(addr, value) \
do { \ do { \
*(float64 *)(addr) = (float64)(value); \ *(float64 *)(addr) = (float64)(value); \
@ -49,6 +53,7 @@ extern "C" {
#define GET_I64_FROM_ADDR(addr) (*(int64 *)(addr)) #define GET_I64_FROM_ADDR(addr) (*(int64 *)(addr))
#define GET_F64_FROM_ADDR(addr) (*(float64 *)(addr)) #define GET_F64_FROM_ADDR(addr) (*(float64 *)(addr))
#define GET_REF_FROM_ADDR(addr) (*(void **)(addr)) #define GET_REF_FROM_ADDR(addr) (*(void **)(addr))
/* Read a V128 value from addr through a direct V128-typed dereference */
#define GET_V128_FROM_ADDR(addr) (*(V128 *)(addr))
/* For STORE opcodes */ /* For STORE opcodes */
#define STORE_I64 PUT_I64_TO_ADDR #define STORE_I64 PUT_I64_TO_ADDR
@ -68,6 +73,12 @@ STORE_U8(void *addr, uint8_t value)
*(uint8 *)addr = value; *(uint8 *)addr = value;
} }
/* Store a 128-bit vector at addr using one V128-typed write. */
static inline void
STORE_V128(void *addr, V128 value)
{
    V128 *dst = (V128 *)addr;

    *dst = value;
}
/* For LOAD opcodes */ /* For LOAD opcodes */
#define LOAD_I64(addr) (*(int64 *)(addr)) #define LOAD_I64(addr) (*(int64 *)(addr))
#define LOAD_F64(addr) (*(float64 *)(addr)) #define LOAD_F64(addr) (*(float64 *)(addr))
@ -75,6 +86,7 @@ STORE_U8(void *addr, uint8_t value)
#define LOAD_U32(addr) (*(uint32 *)(addr)) #define LOAD_U32(addr) (*(uint32 *)(addr))
#define LOAD_I16(addr) (*(int16 *)(addr)) #define LOAD_I16(addr) (*(int16 *)(addr))
#define LOAD_U16(addr) (*(uint16 *)(addr)) #define LOAD_U16(addr) (*(uint16 *)(addr))
/* Load a V128 value from addr through a direct V128-typed dereference */
#define LOAD_V128(addr) (*(V128 *)(addr))
#define STORE_PTR(addr, ptr) \ #define STORE_PTR(addr, ptr) \
do { \ do { \
@ -83,6 +95,15 @@ STORE_U8(void *addr, uint8_t value)
#else /* WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 */ #else /* WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 */
/* Write a V128 value to addr as four separate 32-bit stores, one per
   i32x4 lane, instead of a single 128-bit store. addr is accessed as a
   uint32 array, so it only has to be suitable for 32-bit accesses. */
#define PUT_V128_TO_ADDR(addr, value) \
do { \
uint32 *addr_u32 = (uint32 *)(addr); \
addr_u32[0] = (value).i32x4[0]; \
addr_u32[1] = (value).i32x4[1]; \
addr_u32[2] = (value).i32x4[2]; \
addr_u32[3] = (value).i32x4[3]; \
} while (0)
#define PUT_I64_TO_ADDR(addr, value) \ #define PUT_I64_TO_ADDR(addr, value) \
do { \ do { \
uint32 *addr_u32 = (uint32 *)(addr); \ uint32 *addr_u32 = (uint32 *)(addr); \
@ -124,6 +145,17 @@ STORE_U8(void *addr, uint8_t value)
} while (0) } while (0)
#endif #endif
/* Assemble a V128 from the four consecutive 32-bit words starting at
   addr, filling one i32x4 lane per word. */
static inline V128
GET_V128_FROM_ADDR(uint32 *addr)
{
    V128 result;
    uint32 lane;

    for (lane = 0; lane < 4; lane++)
        result.i32x4[lane] = addr[lane];

    return result;
}
static inline int64 static inline int64
GET_I64_FROM_ADDR(uint32 *addr) GET_I64_FROM_ADDR(uint32 *addr)
{ {
@ -239,7 +271,94 @@ STORE_U16(void *addr, uint16_t value)
((uint8_t *)(addr))[0] = u.u8[0]; ((uint8_t *)(addr))[0] = u.u8[0];
((uint8_t *)(addr))[1] = u.u8[1]; ((uint8_t *)(addr))[1] = u.u8[1];
} }
/* Store a 128-bit vector at addr, which may not be 16-byte aligned.
   The widest access size the address is aligned for is chosen
   (16, 8, 4, 2, then 1 bytes) and the value is written out in pieces
   of that size. */
static inline void
STORE_V128(void *addr, V128 value)
{
    union {
        V128 val;
        uint64 u64[2];
        uint32 u32[4];
        uint16 u16[8];
        uint8 u8[16];
    } pieces;
    const uintptr_t dst = (uintptr_t)addr;
    int idx;

    if (!(dst & (uintptr_t)15)) {
        /* 16-byte aligned: a single vector-sized write is enough */
        *(V128 *)addr = value;
        return;
    }

    /* Reinterpret the vector as narrower elements for the piecewise paths */
    pieces.val = value;

    if (!(dst & (uintptr_t)7)) {
        uint64 *out = (uint64 *)addr;

        out[0] = pieces.u64[0];
        out[1] = pieces.u64[1];
    }
    else if (!(dst & (uintptr_t)3)) {
        uint32 *out = (uint32 *)addr;

        for (idx = 0; idx < 4; idx++)
            out[idx] = pieces.u32[idx];
    }
    else if (!(dst & (uintptr_t)1)) {
        uint16 *out = (uint16 *)addr;

        for (idx = 0; idx < 8; idx++)
            out[idx] = pieces.u16[idx];
    }
    else {
        uint8 *out = (uint8 *)addr;

        for (idx = 0; idx < 16; idx++)
            out[idx] = pieces.u8[idx];
    }
}
/* For LOAD opcodes */ /* For LOAD opcodes */
/* Load a 128-bit vector from addr, which may not be 16-byte aligned.
   The widest access size the address is aligned for is chosen
   (16, 8, 4, 2, then 1 bytes) and the value is read in pieces of
   that size. */
static inline V128
LOAD_V128(void *addr)
{
    union {
        V128 val;
        uint64 u64[2];
        uint32 u32[4];
        uint16 u16[8];
        uint8 u8[16];
    } pieces;
    const uintptr_t src = (uintptr_t)addr;
    int idx;

    /* 16-byte aligned: a single vector-sized read is enough */
    if (!(src & (uintptr_t)15))
        return *(V128 *)addr;

    if (!(src & (uintptr_t)7)) {
        const uint64 *in = (uint64 *)addr;

        pieces.u64[0] = in[0];
        pieces.u64[1] = in[1];
    }
    else if (!(src & (uintptr_t)3)) {
        const uint32 *in = (uint32 *)addr;

        for (idx = 0; idx < 4; idx++)
            pieces.u32[idx] = in[idx];
    }
    else if (!(src & (uintptr_t)1)) {
        const uint16 *in = (uint16 *)addr;

        for (idx = 0; idx < 8; idx++)
            pieces.u16[idx] = in[idx];
    }
    else {
        const uint8 *in = (uint8 *)addr;

        for (idx = 0; idx < 16; idx++)
            pieces.u8[idx] = in[idx];
    }

    return pieces.val;
}
static inline int64 static inline int64
LOAD_I64(void *addr) LOAD_I64(void *addr)
{ {

File diff suppressed because it is too large Load Diff

View File

@ -319,7 +319,8 @@ is_byte_a_type(uint8 type)
} }
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
static V128 static V128
read_i8x16(uint8 *p_buf, char *error_buf, uint32 error_buf_size) read_i8x16(uint8 *p_buf, char *error_buf, uint32 error_buf_size)
{ {
@ -332,7 +333,8 @@ read_i8x16(uint8 *p_buf, char *error_buf, uint32 error_buf_size)
return result; return result;
} }
#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ #endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
(WASM_ENABLE_FAST_INTERP != 0) */
#endif /* end of WASM_ENABLE_SIMD */ #endif /* end of WASM_ENABLE_SIMD */
static void * static void *
@ -725,7 +727,8 @@ load_init_expr(WASMModule *module, const uint8 **p_buf, const uint8 *buf_end,
goto fail; goto fail;
break; break;
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
/* v128.const */ /* v128.const */
case INIT_EXPR_TYPE_V128_CONST: case INIT_EXPR_TYPE_V128_CONST:
{ {
@ -754,7 +757,8 @@ load_init_expr(WASMModule *module, const uint8 **p_buf, const uint8 *buf_end,
#endif #endif
break; break;
} }
#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ #endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
(WASM_ENABLE_FAST_INTERP != 0) */
#endif /* end of WASM_ENABLE_SIMD */ #endif /* end of WASM_ENABLE_SIMD */
#if WASM_ENABLE_REF_TYPES != 0 || WASM_ENABLE_GC != 0 #if WASM_ENABLE_REF_TYPES != 0 || WASM_ENABLE_GC != 0
@ -4174,7 +4178,8 @@ load_export_section(const uint8 *buf, const uint8 *buf_end, WASMModule *module,
return false; return false;
} }
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
/* TODO: check func type, if it has v128 param or result, /* TODO: check func type, if it has v128 param or result,
report error */ report error */
#endif #endif
@ -7347,6 +7352,10 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache,
case WASM_OP_SET_GLOBAL: case WASM_OP_SET_GLOBAL:
case WASM_OP_GET_GLOBAL_64: case WASM_OP_GET_GLOBAL_64:
case WASM_OP_SET_GLOBAL_64: case WASM_OP_SET_GLOBAL_64:
#if WASM_ENABLE_SIMDE != 0
case WASM_OP_GET_GLOBAL_V128:
case WASM_OP_SET_GLOBAL_V128:
#endif
case WASM_OP_SET_GLOBAL_AUX_STACK: case WASM_OP_SET_GLOBAL_AUX_STACK:
skip_leb_uint32(p, p_end); /* local index */ skip_leb_uint32(p, p_end); /* local index */
break; break;
@ -7723,7 +7732,8 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache,
} }
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
case WASM_OP_SIMD_PREFIX: case WASM_OP_SIMD_PREFIX:
{ {
uint32 opcode1; uint32 opcode1;
@ -7816,7 +7826,8 @@ wasm_loader_find_block_addr(WASMExecEnv *exec_env, BlockAddr *block_addr_cache,
} }
break; break;
} }
#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ #endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
(WASM_ENABLE_FAST_INTERP != 0) */
#endif /* end of WASM_ENABLE_SIMD */ #endif /* end of WASM_ENABLE_SIMD */
#if WASM_ENABLE_SHARED_MEMORY != 0 #if WASM_ENABLE_SHARED_MEMORY != 0
@ -7991,6 +8002,10 @@ typedef struct WASMLoaderContext {
int32 *i32_consts; int32 *i32_consts;
uint32 i32_const_max_num; uint32 i32_const_max_num;
uint32 i32_const_num; uint32 i32_const_num;
/* const buffer for V128 */
V128 *v128_consts;
uint32 v128_const_max_num;
uint32 v128_const_num;
/* processed code */ /* processed code */
uint8 *p_code_compiled; uint8 *p_code_compiled;
@ -8224,6 +8239,8 @@ wasm_loader_ctx_destroy(WASMLoaderContext *ctx)
wasm_runtime_free(ctx->i64_consts); wasm_runtime_free(ctx->i64_consts);
if (ctx->i32_consts) if (ctx->i32_consts)
wasm_runtime_free(ctx->i32_consts); wasm_runtime_free(ctx->i32_consts);
if (ctx->v128_consts)
wasm_runtime_free(ctx->v128_consts);
#endif #endif
wasm_runtime_free(ctx); wasm_runtime_free(ctx);
} }
@ -8281,6 +8298,11 @@ wasm_loader_ctx_init(WASMFunction *func, char *error_buf, uint32 error_buf_size)
loader_malloc(sizeof(int32) * loader_ctx->i32_const_max_num, loader_malloc(sizeof(int32) * loader_ctx->i32_const_max_num,
error_buf, error_buf_size))) error_buf, error_buf_size)))
goto fail; goto fail;
loader_ctx->v128_const_max_num = 8;
if (!(loader_ctx->v128_consts =
loader_malloc(sizeof(V128) * loader_ctx->v128_const_max_num,
error_buf, error_buf_size)))
goto fail;
if (func->param_cell_num >= (int32)INT16_MAX - func->local_cell_num) { if (func->param_cell_num >= (int32)INT16_MAX - func->local_cell_num) {
set_error_buf(error_buf, error_buf_size, set_error_buf(error_buf, error_buf_size,
@ -9139,6 +9161,7 @@ preserve_referenced_local(WASMLoaderContext *loader_ctx, uint8 opcode,
bool *preserved, char *error_buf, bool *preserved, char *error_buf,
uint32 error_buf_size) uint32 error_buf_size)
{ {
uint32 i = 0; uint32 i = 0;
int16 preserved_offset = (int16)local_index; int16 preserved_offset = (int16)local_index;
@ -9162,6 +9185,13 @@ preserve_referenced_local(WASMLoaderContext *loader_ctx, uint8 opcode,
loader_ctx->preserved_local_offset++; loader_ctx->preserved_local_offset++;
emit_label(EXT_OP_COPY_STACK_TOP); emit_label(EXT_OP_COPY_STACK_TOP);
} }
#if WASM_ENABLE_SIMDE != 0
else if (local_type == VALUE_TYPE_V128) {
if (loader_ctx->p_code_compiled)
loader_ctx->preserved_local_offset += 4;
emit_label(EXT_OP_COPY_STACK_TOP_V128);
}
#endif
else { else {
if (loader_ctx->p_code_compiled) if (loader_ctx->p_code_compiled)
loader_ctx->preserved_local_offset += 2; loader_ctx->preserved_local_offset += 2;
@ -9174,10 +9204,15 @@ preserve_referenced_local(WASMLoaderContext *loader_ctx, uint8 opcode,
loader_ctx->frame_offset_bottom[i] = preserved_offset; loader_ctx->frame_offset_bottom[i] = preserved_offset;
} }
if (is_32bit_type(cur_type)) if (cur_type == VALUE_TYPE_V128) {
i += 4;
}
else if (is_32bit_type(cur_type)) {
i++; i++;
else }
else {
i += 2; i += 2;
}
} }
(void)error_buf; (void)error_buf;
@ -9206,7 +9241,10 @@ preserve_local_for_block(WASMLoaderContext *loader_ctx, uint8 opcode,
return false; return false;
} }
if (is_32bit_type(cur_type)) { if (cur_type == VALUE_TYPE_V128) {
i += 4;
}
else if (is_32bit_type(cur_type)) {
i++; i++;
} }
else { else {
@ -9545,6 +9583,15 @@ cmp_i32_const(const void *p_i32_const1, const void *p_i32_const2)
return (i32_const1 < i32_const2) ? -1 : (i32_const1 > i32_const2) ? 1 : 0; return (i32_const1 < i32_const2) ? -1 : (i32_const1 > i32_const2) ? 1 : 0;
} }
/* Comparator used with qsort()/bsearch() on the V128 const buffer:
   orders two V128 values by a byte-wise memcmp() of their contents. */
static int
cmp_v128_const(const void *p_v128_const1, const void *p_v128_const2)
{
    return memcmp(p_v128_const1, p_v128_const2, sizeof(V128));
}
static bool static bool
wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value, wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value,
int16 *offset, char *error_buf, int16 *offset, char *error_buf,
@ -9578,6 +9625,32 @@ wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value,
} }
ctx->i64_consts[ctx->i64_const_num++] = *(int64 *)value; ctx->i64_consts[ctx->i64_const_num++] = *(int64 *)value;
} }
else if (type == VALUE_TYPE_V128) {
/* No slot left, emit const instead */
if (ctx->v128_const_num * 4 > INT16_MAX - 2) {
*offset = 0;
return true;
}
/* Traverse the list if the const num is small */
if (ctx->v128_const_num < 10) {
for (uint32 i = 0; i < ctx->v128_const_num; i++) {
if (memcmp(&ctx->v128_consts[i], value, sizeof(V128))
== 0) {
*offset = -1;
return true;
}
}
}
if (ctx->v128_const_num >= ctx->v128_const_max_num) {
MEM_REALLOC(ctx->v128_consts,
sizeof(V128) * ctx->v128_const_max_num,
sizeof(V128) * (ctx->v128_const_max_num * 2));
ctx->v128_const_max_num *= 2;
}
ctx->v128_consts[ctx->v128_const_num++] = *(V128 *)value;
}
else { else {
/* Treat i32 and f32 as the same by reading i32 value from /* Treat i32 and f32 as the same by reading i32 value from
the raw bytes */ the raw bytes */
@ -9623,6 +9696,17 @@ wasm_loader_get_const_offset(WASMLoaderContext *ctx, uint8 type, void *value,
*offset = -(uint32)(ctx->i64_const_num * 2 + ctx->i32_const_num) *offset = -(uint32)(ctx->i64_const_num * 2 + ctx->i32_const_num)
+ (uint32)(i64_const - ctx->i64_consts) * 2; + (uint32)(i64_const - ctx->i64_consts) * 2;
} }
else if (type == VALUE_TYPE_V128) {
V128 key = *(V128 *)value, *v128_const;
v128_const = bsearch(&key, ctx->v128_consts, ctx->v128_const_num,
sizeof(V128), cmp_v128_const);
if (!v128_const) { /* not found, emit const instead */
*offset = 0;
return true;
}
*offset = -(uint32)(ctx->v128_const_num)
+ (uint32)(v128_const - ctx->v128_consts);
}
else { else {
int32 key = *(int32 *)value, *i32_const; int32 key = *(int32 *)value, *i32_const;
i32_const = bsearch(&key, ctx->i32_consts, ctx->i32_const_num, i32_const = bsearch(&key, ctx->i32_consts, ctx->i32_const_num,
@ -9819,17 +9903,23 @@ reserve_block_ret(WASMLoaderContext *loader_ctx, uint8 opcode,
block_type, &return_types, &reftype_maps, &reftype_map_count); block_type, &return_types, &reftype_maps, &reftype_map_count);
#endif #endif
/* If there is only one return value, use EXT_OP_COPY_STACK_TOP/_I64 instead /* If there is only one return value, use EXT_OP_COPY_STACK_TOP/_I64/V128
* of EXT_OP_COPY_STACK_VALUES for interpreter performance. */ * instead of EXT_OP_COPY_STACK_VALUES for interpreter performance. */
if (return_count == 1) { if (return_count == 1) {
uint8 cell = (uint8)wasm_value_type_cell_num(return_types[0]); uint8 cell = (uint8)wasm_value_type_cell_num(return_types[0]);
if (cell <= 2 /* V128 isn't supported whose cell num is 4 */ if (block->dynamic_offset != *(loader_ctx->frame_offset - cell)) {
&& block->dynamic_offset != *(loader_ctx->frame_offset - cell)) {
/* insert op_copy before else opcode */ /* insert op_copy before else opcode */
if (opcode == WASM_OP_ELSE) if (opcode == WASM_OP_ELSE)
skip_label(); skip_label();
emit_label(cell == 1 ? EXT_OP_COPY_STACK_TOP #if WASM_ENABLE_SIMDE != 0
: EXT_OP_COPY_STACK_TOP_I64); if (cell == 4) {
emit_label(EXT_OP_COPY_STACK_TOP_V128);
}
#endif
if (cell <= 2) {
emit_label(cell == 1 ? EXT_OP_COPY_STACK_TOP
: EXT_OP_COPY_STACK_TOP_I64);
}
emit_operand(loader_ctx, *(loader_ctx->frame_offset - cell)); emit_operand(loader_ctx, *(loader_ctx->frame_offset - cell));
emit_operand(loader_ctx, block->dynamic_offset); emit_operand(loader_ctx, block->dynamic_offset);
@ -9864,11 +9954,37 @@ reserve_block_ret(WASMLoaderContext *loader_ctx, uint8 opcode,
for (i = (int32)return_count - 1; i >= 0; i--) { for (i = (int32)return_count - 1; i >= 0; i--) {
uint8 cells = (uint8)wasm_value_type_cell_num(return_types[i]); uint8 cells = (uint8)wasm_value_type_cell_num(return_types[i]);
frame_offset -= cells; if (frame_offset - cells < loader_ctx->frame_offset_bottom) {
dynamic_offset -= cells; set_error_buf(error_buf, error_buf_size, "frame offset underflow");
if (dynamic_offset != *frame_offset) { goto fail;
value_count++; }
total_cel_num += cells;
if (cells == 4) {
bool needs_copy = false;
int16 v128_dynamic = dynamic_offset - cells;
for (int j = 0; j < 4; j++) {
if (*(frame_offset - j - 1) != (v128_dynamic + j)) {
needs_copy = true;
break;
}
}
if (needs_copy) {
value_count++;
total_cel_num += cells;
}
frame_offset -= cells;
dynamic_offset = v128_dynamic;
}
else {
frame_offset -= cells;
dynamic_offset -= cells;
if (dynamic_offset != *frame_offset) {
value_count++;
total_cel_num += cells;
}
} }
} }
@ -9904,19 +10020,50 @@ reserve_block_ret(WASMLoaderContext *loader_ctx, uint8 opcode,
dynamic_offset = dynamic_offset_org; dynamic_offset = dynamic_offset_org;
for (i = (int32)return_count - 1, j = 0; i >= 0; i--) { for (i = (int32)return_count - 1, j = 0; i >= 0; i--) {
uint8 cell = (uint8)wasm_value_type_cell_num(return_types[i]); uint8 cell = (uint8)wasm_value_type_cell_num(return_types[i]);
frame_offset -= cell;
dynamic_offset -= cell; if (cell == 4) {
if (dynamic_offset != *frame_offset) { bool needs_copy = false;
/* cell num */ int16 v128_dynamic = dynamic_offset - cell;
cells[j] = cell;
/* src offset */ for (int k = 0; k < 4; k++) {
src_offsets[j] = *frame_offset; if (*(frame_offset - k - 1) != (v128_dynamic + k)) {
/* dst offset */ needs_copy = true;
dst_offsets[j] = dynamic_offset; break;
j++; }
}
if (needs_copy) {
cells[j] = cell;
src_offsets[j] = *(frame_offset - cell);
dst_offsets[j] = v128_dynamic;
j++;
}
frame_offset -= cell;
dynamic_offset = v128_dynamic;
} }
else {
frame_offset -= cell;
dynamic_offset -= cell;
if (dynamic_offset != *frame_offset) {
cells[j] = cell;
/* src offset */
src_offsets[j] = *frame_offset;
/* dst offset */
dst_offsets[j] = dynamic_offset;
j++;
}
}
if (opcode == WASM_OP_ELSE) { if (opcode == WASM_OP_ELSE) {
*frame_offset = dynamic_offset; if (cell == 4) {
for (int k = 0; k < cell; k++) {
*(frame_offset + k) = dynamic_offset + k;
}
}
else {
*frame_offset = dynamic_offset;
}
} }
else { else {
loader_ctx->frame_offset = frame_offset; loader_ctx->frame_offset = frame_offset;
@ -10075,7 +10222,8 @@ check_memory_access_align(uint8 opcode, uint32 align, char *error_buf,
} }
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
static bool static bool
check_simd_memory_access_align(uint8 opcode, uint32 align, char *error_buf, check_simd_memory_access_align(uint8 opcode, uint32 align, char *error_buf,
uint32 error_buf_size) uint32 error_buf_size)
@ -11172,6 +11320,39 @@ re_scan:
} }
} }
if (loader_ctx->v128_const_num > 0) {
V128 *v128_consts_old = loader_ctx->v128_consts;
/* Sort the v128 consts */
qsort(v128_consts_old, loader_ctx->v128_const_num, sizeof(V128),
cmp_v128_const);
/* Remove the duplicated v128 consts */
uint32 k = 1;
for (i = 1; i < loader_ctx->v128_const_num; i++) {
if (!(memcmp(&v128_consts_old[i], &v128_consts_old[i - 1],
sizeof(V128))
== 0)) {
v128_consts_old[k++] = v128_consts_old[i];
}
}
if (k < loader_ctx->v128_const_num) {
V128 *v128_consts_new;
/* Try to reallocate memory with a smaller size */
if ((v128_consts_new =
wasm_runtime_malloc((uint32)sizeof(V128) * k))) {
bh_memcpy_s(v128_consts_new, (uint32)sizeof(V128) * k,
v128_consts_old, (uint32)sizeof(V128) * k);
/* Free the old memory */
wasm_runtime_free(v128_consts_old);
loader_ctx->v128_consts = v128_consts_new;
loader_ctx->v128_const_max_num = k;
}
loader_ctx->v128_const_num = k;
}
}
if (loader_ctx->i32_const_num > 0) { if (loader_ctx->i32_const_num > 0) {
int32 *i32_consts_old = loader_ctx->i32_consts; int32 *i32_consts_old = loader_ctx->i32_consts;
@ -12492,10 +12673,20 @@ re_scan:
#endif #endif
} }
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
else if (*(loader_ctx->frame_ref - 1) == VALUE_TYPE_V128) { else if (*(loader_ctx->frame_ref - 1) == VALUE_TYPE_V128) {
loader_ctx->frame_ref -= 4; loader_ctx->frame_ref -= 4;
loader_ctx->stack_cell_num -= 4; loader_ctx->stack_cell_num -= 4;
#if WASM_ENABLE_FAST_INTERP != 0
skip_label();
loader_ctx->frame_offset -= 4;
if ((*(loader_ctx->frame_offset)
> loader_ctx->start_dynamic_offset)
&& (*(loader_ctx->frame_offset)
< loader_ctx->max_dynamic_offset))
loader_ctx->dynamic_offset -= 4;
#endif
} }
#endif #endif
#endif #endif
@ -12582,10 +12773,12 @@ re_scan:
#endif /* end of WASM_ENABLE_FAST_INTERP */ #endif /* end of WASM_ENABLE_FAST_INTERP */
break; break;
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
case VALUE_TYPE_V128: case VALUE_TYPE_V128:
break; break;
#endif /* (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ #endif /* (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
(WASM_ENABLE_FAST_INTERP != 0) */
#endif /* WASM_ENABLE_SIMD != 0 */ #endif /* WASM_ENABLE_SIMD != 0 */
default: default:
{ {
@ -12680,8 +12873,9 @@ re_scan:
uint8 opcode_tmp = WASM_OP_SELECT; uint8 opcode_tmp = WASM_OP_SELECT;
if (type == VALUE_TYPE_V128) { if (type == VALUE_TYPE_V128) {
#if (WASM_ENABLE_SIMD == 0) \ #if (WASM_ENABLE_SIMD == 0) \
|| ((WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0)) || ((WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) \
&& (WASM_ENABLE_FAST_INTERP == 0))
set_error_buf(error_buf, error_buf_size, set_error_buf(error_buf, error_buf_size,
"SIMD v128 type isn't supported"); "SIMD v128 type isn't supported");
goto fail; goto fail;
@ -13177,10 +13371,21 @@ re_scan:
emit_label(EXT_OP_SET_LOCAL_FAST); emit_label(EXT_OP_SET_LOCAL_FAST);
emit_byte(loader_ctx, (uint8)local_offset); emit_byte(loader_ctx, (uint8)local_offset);
} }
else { else if (is_64bit_type(local_type)) {
emit_label(EXT_OP_SET_LOCAL_FAST_I64); emit_label(EXT_OP_SET_LOCAL_FAST_I64);
emit_byte(loader_ctx, (uint8)local_offset); emit_byte(loader_ctx, (uint8)local_offset);
} }
#if WASM_ENABLE_SIMDE != 0
else if (local_type == VALUE_TYPE_V128) {
emit_label(EXT_OP_SET_LOCAL_FAST_V128);
emit_byte(loader_ctx, (uint8)local_offset);
}
#endif
else {
set_error_buf(error_buf, error_buf_size,
"unknown local type");
goto fail;
}
POP_OFFSET_TYPE(local_type); POP_OFFSET_TYPE(local_type);
} }
} }
@ -13253,6 +13458,12 @@ re_scan:
emit_label(EXT_OP_TEE_LOCAL_FAST); emit_label(EXT_OP_TEE_LOCAL_FAST);
emit_byte(loader_ctx, (uint8)local_offset); emit_byte(loader_ctx, (uint8)local_offset);
} }
#if WASM_ENABLE_SIMDE != 0
else if (local_type == VALUE_TYPE_V128) {
emit_label(EXT_OP_TEE_LOCAL_FAST_V128);
emit_byte(loader_ctx, (uint8)local_offset);
}
#endif
else { else {
emit_label(EXT_OP_TEE_LOCAL_FAST_I64); emit_label(EXT_OP_TEE_LOCAL_FAST_I64);
emit_byte(loader_ctx, (uint8)local_offset); emit_byte(loader_ctx, (uint8)local_offset);
@ -13341,12 +13552,18 @@ re_scan:
#endif #endif
*p_org = WASM_OP_GET_GLOBAL_64; *p_org = WASM_OP_GET_GLOBAL_64;
} }
#else /* else of WASM_ENABLE_FAST_INTERP */ #else /* else of WASM_ENABLE_FAST_INTERP */
if (global_type == VALUE_TYPE_I64 if (global_type == VALUE_TYPE_I64
|| global_type == VALUE_TYPE_F64) { || global_type == VALUE_TYPE_F64) {
skip_label(); skip_label();
emit_label(WASM_OP_GET_GLOBAL_64); emit_label(WASM_OP_GET_GLOBAL_64);
} }
#if WASM_ENABLE_SIMDE != 0
if (global_type == VALUE_TYPE_V128) {
skip_label();
emit_label(WASM_OP_GET_GLOBAL_V128);
}
#endif /* end of WASM_ENABLE_SIMDE */
emit_uint32(loader_ctx, global_idx); emit_uint32(loader_ctx, global_idx);
PUSH_OFFSET_TYPE(global_type); PUSH_OFFSET_TYPE(global_type);
#endif /* end of WASM_ENABLE_FAST_INTERP */ #endif /* end of WASM_ENABLE_FAST_INTERP */
@ -13430,7 +13647,7 @@ re_scan:
func->has_op_set_global_aux_stack = true; func->has_op_set_global_aux_stack = true;
#endif #endif
} }
#else /* else of WASM_ENABLE_FAST_INTERP */ #else /* else of WASM_ENABLE_FAST_INTERP */
if (global_type == VALUE_TYPE_I64 if (global_type == VALUE_TYPE_I64
|| global_type == VALUE_TYPE_F64) { || global_type == VALUE_TYPE_F64) {
skip_label(); skip_label();
@ -13441,6 +13658,12 @@ re_scan:
skip_label(); skip_label();
emit_label(WASM_OP_SET_GLOBAL_AUX_STACK); emit_label(WASM_OP_SET_GLOBAL_AUX_STACK);
} }
#if WASM_ENABLE_SIMDE != 0
else if (global_type == VALUE_TYPE_V128) {
skip_label();
emit_label(WASM_OP_SET_GLOBAL_V128);
}
#endif /* end of WASM_ENABLE_SIMDE */
emit_uint32(loader_ctx, global_idx); emit_uint32(loader_ctx, global_idx);
POP_OFFSET_TYPE(global_type); POP_OFFSET_TYPE(global_type);
#endif /* end of WASM_ENABLE_FAST_INTERP */ #endif /* end of WASM_ENABLE_FAST_INTERP */
@ -15285,7 +15508,8 @@ re_scan:
} }
#if WASM_ENABLE_SIMD != 0 #if WASM_ENABLE_SIMD != 0
#if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) #if (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) \
|| (WASM_ENABLE_FAST_INTERP != 0)
case WASM_OP_SIMD_PREFIX: case WASM_OP_SIMD_PREFIX:
{ {
uint32 opcode1; uint32 opcode1;
@ -15297,6 +15521,10 @@ re_scan:
pb_read_leb_uint32(p, p_end, opcode1); pb_read_leb_uint32(p, p_end, opcode1);
#if WASM_ENABLE_FAST_INTERP != 0
emit_byte(loader_ctx, opcode1);
#endif
/* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h /* follow the order of enum WASMSimdEXTOpcode in wasm_opcode.h
*/ */
switch (opcode1) { switch (opcode1) {
@ -15324,6 +15552,10 @@ re_scan:
pb_read_leb_mem_offset(p, p_end, pb_read_leb_mem_offset(p, p_end,
mem_offset); /* offset */ mem_offset); /* offset */
#if WASM_ENABLE_FAST_INTERP != 0
emit_uint32(loader_ctx, mem_offset);
#endif
POP_AND_PUSH(mem_offset_type, VALUE_TYPE_V128); POP_AND_PUSH(mem_offset_type, VALUE_TYPE_V128);
#if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0 #if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0
func->has_memory_operations = true; func->has_memory_operations = true;
@ -15344,6 +15576,10 @@ re_scan:
pb_read_leb_mem_offset(p, p_end, pb_read_leb_mem_offset(p, p_end,
mem_offset); /* offset */ mem_offset); /* offset */
#if WASM_ENABLE_FAST_INTERP != 0
emit_uint32(loader_ctx, mem_offset);
#endif
POP_V128(); POP_V128();
POP_MEM_OFFSET(); POP_MEM_OFFSET();
#if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0 #if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0
@ -15355,7 +15591,13 @@ re_scan:
/* basic operation */ /* basic operation */
case SIMD_v128_const: case SIMD_v128_const:
{ {
uint64 high, low;
CHECK_BUF1(p, p_end, 16); CHECK_BUF1(p, p_end, 16);
#if WASM_ENABLE_FAST_INTERP != 0
wasm_runtime_read_v128(p, &high, &low);
emit_uint64(loader_ctx, high);
emit_uint64(loader_ctx, low);
#endif
p += 16; p += 16;
PUSH_V128(); PUSH_V128();
break; break;
@ -15367,12 +15609,17 @@ re_scan:
CHECK_BUF1(p, p_end, 16); CHECK_BUF1(p, p_end, 16);
mask = read_i8x16(p, error_buf, error_buf_size); mask = read_i8x16(p, error_buf, error_buf_size);
p += 16;
if (!check_simd_shuffle_mask(mask, error_buf, if (!check_simd_shuffle_mask(mask, error_buf,
error_buf_size)) { error_buf_size)) {
goto fail; goto fail;
} }
#if WASM_ENABLE_FAST_INTERP != 0
uint64 high, low;
wasm_runtime_read_v128(p, &high, &low);
emit_uint64(loader_ctx, high);
emit_uint64(loader_ctx, low);
#endif
p += 16;
POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128); POP2_AND_PUSH(VALUE_TYPE_V128, VALUE_TYPE_V128);
break; break;
} }
@ -15443,14 +15690,25 @@ re_scan:
error_buf_size)) { error_buf_size)) {
goto fail; goto fail;
} }
#if WASM_ENABLE_FAST_INTERP != 0
emit_byte(loader_ctx, lane);
#endif
if (replace[opcode1 - SIMD_i8x16_extract_lane_s]) { if (replace[opcode1 - SIMD_i8x16_extract_lane_s]) {
#if WASM_ENABLE_FAST_INTERP != 0
if (!(wasm_loader_pop_frame_ref_offset(
loader_ctx,
replace[opcode1
- SIMD_i8x16_extract_lane_s],
error_buf, error_buf_size)))
goto fail;
#else
if (!(wasm_loader_pop_frame_ref( if (!(wasm_loader_pop_frame_ref(
loader_ctx, loader_ctx,
replace[opcode1 replace[opcode1
- SIMD_i8x16_extract_lane_s], - SIMD_i8x16_extract_lane_s],
error_buf, error_buf_size))) error_buf, error_buf_size)))
goto fail; goto fail;
#endif /* end of WASM_ENABLE_FAST_INTERP != 0 */
} }
POP_AND_PUSH( POP_AND_PUSH(
@ -15569,9 +15827,14 @@ re_scan:
error_buf_size)) { error_buf_size)) {
goto fail; goto fail;
} }
#if WASM_ENABLE_FAST_INTERP != 0
emit_uint32(loader_ctx, mem_offset);
#endif
POP_V128(); POP_V128();
POP_MEM_OFFSET(); POP_MEM_OFFSET();
#if WASM_ENABLE_FAST_INTERP != 0
emit_byte(loader_ctx, lane);
#endif
if (opcode1 < SIMD_v128_store8_lane) { if (opcode1 < SIMD_v128_store8_lane) {
PUSH_V128(); PUSH_V128();
} }
@ -15594,7 +15857,9 @@ re_scan:
pb_read_leb_mem_offset(p, p_end, pb_read_leb_mem_offset(p, p_end,
mem_offset); /* offset */ mem_offset); /* offset */
#if WASM_ENABLE_FAST_INTERP != 0
emit_uint32(loader_ctx, mem_offset);
#endif
POP_AND_PUSH(mem_offset_type, VALUE_TYPE_V128); POP_AND_PUSH(mem_offset_type, VALUE_TYPE_V128);
#if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0 #if WASM_ENABLE_JIT != 0 || WASM_ENABLE_WAMR_COMPILER != 0
func->has_memory_operations = true; func->has_memory_operations = true;
@ -15943,7 +16208,8 @@ re_scan:
} }
break; break;
} }
#endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) */ #endif /* end of (WASM_ENABLE_WAMR_COMPILER != 0) || (WASM_ENABLE_JIT != 0) || \
(WASM_ENABLE_FAST_INTERP != 0) */
#endif /* end of WASM_ENABLE_SIMD */ #endif /* end of WASM_ENABLE_SIMD */
#if WASM_ENABLE_SHARED_MEMORY != 0 #if WASM_ENABLE_SHARED_MEMORY != 0
@ -16123,8 +16389,9 @@ re_scan:
if (loader_ctx->p_code_compiled == NULL) if (loader_ctx->p_code_compiled == NULL)
goto re_scan; goto re_scan;
func->const_cell_num = func->const_cell_num = loader_ctx->i64_const_num * 2
loader_ctx->i64_const_num * 2 + loader_ctx->i32_const_num; + loader_ctx->v128_const_num * 4
+ loader_ctx->i32_const_num;
if (func->const_cell_num > 0) { if (func->const_cell_num > 0) {
if (!(func->consts = if (!(func->consts =
loader_malloc((uint64)sizeof(uint32) * func->const_cell_num, loader_malloc((uint64)sizeof(uint32) * func->const_cell_num,
@ -16143,6 +16410,12 @@ re_scan:
loader_ctx->i32_consts, loader_ctx->i32_consts,
(uint32)sizeof(int32) * loader_ctx->i32_const_num); (uint32)sizeof(int32) * loader_ctx->i32_const_num);
} }
if (loader_ctx->v128_const_num > 0) {
bh_memcpy_s(func->consts,
(uint32)sizeof(V128) * loader_ctx->v128_const_num,
loader_ctx->v128_consts,
(uint32)sizeof(V128) * loader_ctx->v128_const_num);
}
} }
func->max_stack_cell_num = loader_ctx->preserved_local_offset func->max_stack_cell_num = loader_ctx->preserved_local_offset

View File

@ -278,6 +278,15 @@ typedef enum WASMOpcode {
DEBUG_OP_BREAK = 0xdc, /* debug break point */ DEBUG_OP_BREAK = 0xdc, /* debug break point */
#endif #endif
/*
 * Opcode values (0xdd..0xe1) for the V128 variants of set/tee local,
 * copy stack top and get/set global.
 * `&&` binds tighter than `||`, so the condition below reads as
 * JIT || (FAST_INTERP && SIMD): the values exist in every JIT build and,
 * without JIT, only when both the fast interpreter and SIMD are enabled.
 * NOTE(review): the loader emits these opcodes under WASM_ENABLE_SIMDE
 * rather than WASM_ENABLE_SIMD - confirm the build guarantees that
 * SIMDE implies SIMD, otherwise the names may be undefined there.
 */
#if WASM_ENABLE_JIT != 0 \
|| WASM_ENABLE_FAST_INTERP != 0 && WASM_ENABLE_SIMD != 0
EXT_OP_SET_LOCAL_FAST_V128 = 0xdd,
EXT_OP_TEE_LOCAL_FAST_V128 = 0xde,
EXT_OP_COPY_STACK_TOP_V128 = 0xdf,
WASM_OP_GET_GLOBAL_V128 = 0xe0,
WASM_OP_SET_GLOBAL_V128 = 0xe1,
#endif
/* Post-MVP extend op prefix */ /* Post-MVP extend op prefix */
WASM_OP_GC_PREFIX = 0xfb, WASM_OP_GC_PREFIX = 0xfb,
WASM_OP_MISC_PREFIX = 0xfc, WASM_OP_MISC_PREFIX = 0xfc,
@ -779,16 +788,27 @@ typedef enum WASMAtomicEXTOpcode {
#else #else
#define DEF_DEBUG_BREAK_HANDLE() #define DEF_DEBUG_BREAK_HANDLE()
#endif #endif
#define SET_GOTO_TABLE_ELEM(opcode) [opcode] = HANDLE_OPCODE(opcode) #define SET_GOTO_TABLE_ELEM(opcode) [opcode] = HANDLE_OPCODE(opcode)
#if WASM_ENABLE_JIT != 0 && WASM_ENABLE_SIMD != 0 #if (WASM_ENABLE_JIT != 0 || WASM_ENABLE_FAST_INTERP != 0) \
&& WASM_ENABLE_SIMD != 0
#define SET_GOTO_TABLE_SIMD_PREFIX_ELEM() \ #define SET_GOTO_TABLE_SIMD_PREFIX_ELEM() \
SET_GOTO_TABLE_ELEM(WASM_OP_SIMD_PREFIX), SET_GOTO_TABLE_ELEM(WASM_OP_SIMD_PREFIX),
#else #else
#define SET_GOTO_TABLE_SIMD_PREFIX_ELEM() #define SET_GOTO_TABLE_SIMD_PREFIX_ELEM()
#endif #endif
/*
 * Goto-table entries for the five V128 extension opcodes (0xdd..0xe1).
 * Expands to nothing unless both the fast interpreter and SIMD are enabled.
 * SET_GOTO_TABLE_ELEM produces a designated initializer ([opcode] = ...),
 * so these entries are placed by opcode value, not by their position in
 * the table macro.
 */
#if (WASM_ENABLE_FAST_INTERP != 0) && WASM_ENABLE_SIMD != 0
#define DEF_EXT_V128_HANDLE() \
SET_GOTO_TABLE_ELEM(EXT_OP_SET_LOCAL_FAST_V128), /* 0xdd */ \
SET_GOTO_TABLE_ELEM(EXT_OP_TEE_LOCAL_FAST_V128), /* 0xde */ \
SET_GOTO_TABLE_ELEM(EXT_OP_COPY_STACK_TOP_V128), /* 0xdf */ \
SET_GOTO_TABLE_ELEM(WASM_OP_GET_GLOBAL_V128), /* 0xe0 */ \
SET_GOTO_TABLE_ELEM(WASM_OP_SET_GLOBAL_V128), /* 0xe1 */
#else
#define DEF_EXT_V128_HANDLE()
#endif
/* /*
* Macro used to generate computed goto tables for the C interpreter. * Macro used to generate computed goto tables for the C interpreter.
*/ */
@ -1020,7 +1040,7 @@ typedef enum WASMAtomicEXTOpcode {
SET_GOTO_TABLE_ELEM(WASM_OP_MISC_PREFIX), /* 0xfc */ \ SET_GOTO_TABLE_ELEM(WASM_OP_MISC_PREFIX), /* 0xfc */ \
SET_GOTO_TABLE_SIMD_PREFIX_ELEM() /* 0xfd */ \ SET_GOTO_TABLE_SIMD_PREFIX_ELEM() /* 0xfd */ \
SET_GOTO_TABLE_ELEM(WASM_OP_ATOMIC_PREFIX), /* 0xfe */ \ SET_GOTO_TABLE_ELEM(WASM_OP_ATOMIC_PREFIX), /* 0xfe */ \
DEF_DEBUG_BREAK_HANDLE() \ DEF_DEBUG_BREAK_HANDLE() DEF_EXT_V128_HANDLE() \
}; };
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -0,0 +1,21 @@
# Copyright (C) 2024 Amazon Inc. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# SIMDe is a header-only library: this script builds nothing, it only sets a
# compile definition and include paths, and fetches the SIMDe sources.
set (LIB_SIMDE_DIR ${CMAKE_CURRENT_LIST_DIR})

# Turns on the code paths guarded by WASM_ENABLE_SIMDE in the runtime sources.
add_definitions (-DWASM_ENABLE_SIMDE=1)

include_directories(${LIB_SIMDE_DIR} ${LIB_SIMDE_DIR}/simde)

include(FetchContent)

# Fetch SIMDe at the v0.8.2 release tag.
FetchContent_Declare(
    simde
    GIT_REPOSITORY https://github.com/simd-everywhere/simde
    GIT_TAG v0.8.2
)

# STATUS mode lets CMake add the "-- " prefix itself and keeps the message on
# the normal status channel instead of emitting it as a notice on stderr.
message(STATUS "Fetching simde ..")
FetchContent_MakeAvailable(simde)

# Expose the fetched SIMDe headers to the targets defined after this point.
include_directories("${simde_SOURCE_DIR}")

View File

@ -132,7 +132,11 @@ cmake -DWAMR_BUILD_PLATFORM=linux -DWAMR_BUILD_TARGET=ARM
### **Enable 128-bit SIMD feature** ### **Enable 128-bit SIMD feature**
- **WAMR_BUILD_SIMD**=1/0, default to enable if not set - **WAMR_BUILD_SIMD**=1/0, default to enable if not set
> Note: only supported in AOT mode x86-64 target. > Note: supported in AOT mode, JIT mode, and fast-interpreter mode with SIMDe library.
### **Enable SIMDe library for SIMD in fast interpreter**
- **WAMR_BUILD_LIB_SIMDE**=1/0, default to disable if not set
> Note: If enabled, the SIMDe (SIMD Everywhere) library is used to implement SIMD operations in fast interpreter mode. **WAMR_BUILD_SIMD** must be enabled as well.
### **Enable Exception Handling** ### **Enable Exception Handling**
- **WAMR_BUILD_EXCE_HANDLING**=1/0, default to disable if not set - **WAMR_BUILD_EXCE_HANDLING**=1/0, default to disable if not set
@ -335,4 +339,11 @@ Or if we want to enable interpreter, disable AOT and WASI, and build as X86_32,
``` Bash ``` Bash
cmake .. -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_AOT=0 -DWAMR_BUILD_LIBC_WASI=0 -DWAMR_BUILD_TARGET=X86_32 cmake .. -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_AOT=0 -DWAMR_BUILD_LIBC_WASI=0 -DWAMR_BUILD_TARGET=X86_32
``` ```
When enabling SIMD for fast interpreter mode, you'll need to enable both SIMD and the SIMDe library:
``` Bash
cmake .. -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_SIMD=1 -DWAMR_BUILD_LIB_SIMDE=1
```

View File

@ -913,8 +913,8 @@ function do_execute_in_running_mode()
fi fi
if [[ ${ENABLE_SIMD} -eq 1 ]]; then if [[ ${ENABLE_SIMD} -eq 1 ]]; then
if [[ "${RUNNING_MODE}" != "jit" && "${RUNNING_MODE}" != "aot" ]]; then if [[ "${RUNNING_MODE}" != "jit" && "${RUNNING_MODE}" != "aot" && "${RUNNING_MODE}" != "fast-interp" ]]; then
echo "support simd in llvm-jit mode and aot mode" echo "support simd in llvm-jit, aot and fast-interp mode"
return 0; return 0;
fi fi
fi fi