From 4c2af25aff9de6df2c23083840f1ff783e7ebbcb Mon Sep 17 00:00:00 2001
From: YAMAMOTO Takashi
Date: Sat, 22 Jun 2024 11:32:52 +0900
Subject: [PATCH] aot compiler: Use larger alignment for load/store when possible (#3552)

Consider the following wasm module:

```wast
(module
  (func (export "foo")
    i32.const 0x104
    i32.const 0x12345678
    i32.store
  )
  (memory 1 1)
)
```

While the address (0x104) is perfectly aligned for i32.store, our aot
compiler uses 1-byte alignment for load/store LLVM IR instructions, which
often produces inefficient machine code, especially on alignment-sensitive
targets. For example, the above "foo" function is compiled into the
following xtensa machine code.

```
0000002c :
      2c: 004136        entry   a1, 32
      2f: 07a182        movi    a8, 0x107
      32: 828a          add.n   a8, a2, a8
      34: 291c          movi.n  a9, 18
      36: 004892        s8i     a9, a8, 0
      39: 06a182        movi    a8, 0x106
      3c: 828a          add.n   a8, a2, a8
      3e: ffff91        l32r    a9, 3c (ff91828a )
                        3e: R_XTENSA_SLOT0_OP  .literal+0x8
      41: 004892        s8i     a9, a8, 0
      44: 05a182        movi    a8, 0x105
      47: 828a          add.n   a8, a2, a8
      49: ffff91        l32r    a9, 48 (ffff9182 )
                        49: R_XTENSA_SLOT0_OP  .literal+0xc
      4c: 41a890        srli    a10, a9, 8
      4f: 0048a2        s8i     a10, a8, 0
      52: 04a182        movi    a8, 0x104
      55: 828a          add.n   a8, a2, a8
      57: 004892        s8i     a9, a8, 0
      5a: f01d          retw.n
```

Note that each of the four bytes is stored separately using the one-byte
store instruction, s8i.

This commit uses larger alignments for load/store LLVM IR instructions
when possible. With this commit, the above example is compiled into the
following machine code, which looks much more reasonable.

```
0000002c :
      2c: 004136        entry   a1, 32
      2f: ffff81        l32r    a8, 2c (81004136 )
                        2f: R_XTENSA_SLOT0_OP  .literal+0x8
      32: 416282        s32i    a8, a2, 0x104
      35: f01d          retw.n
```

Note: this doesn't work well with --xip because
aot_load_const_from_table() hides the constness of the value. Maybe we
need our own mechanism to propagate the constness and the value.
---
 core/iwasm/common/wasm_memory.c               | 12 ++++
 core/iwasm/compilation/aot_emit_memory.c      | 62 ++++++++++++++-----
 core/iwasm/compilation/aot_emit_memory.h      |  3 +-
 core/iwasm/compilation/simd/simd_load_store.c |  4 +-
 .../unit/compilation/aot_emit_memory_test.cc  |  3 +-
 5 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/core/iwasm/common/wasm_memory.c b/core/iwasm/common/wasm_memory.c
index 82b771231..d86ea4a8b 100644
--- a/core/iwasm/common/wasm_memory.c
+++ b/core/iwasm/common/wasm_memory.c
@@ -883,6 +883,12 @@ wasm_enlarge_memory_internal(WASMModuleInstance *module, uint32 inc_page_count)
     }
 #endif /* end of WASM_MEM_ALLOC_WITH_USAGE */
 
+    /*
+     * AOT compiler assumes at least 8 byte alignment.
+     * see aot_check_memory_overflow.
+     */
+    bh_assert(((uintptr_t)memory->memory_data & 0x7) == 0);
+
     memory->num_bytes_per_page = num_bytes_per_page;
     memory->cur_page_count = total_page_count;
     memory->max_page_count = max_page_count;
@@ -1032,5 +1038,11 @@ wasm_allocate_linear_memory(uint8 **data, bool is_shared_memory,
 #endif
     }
 
+    /*
+     * AOT compiler assumes at least 8 byte alignment.
+     * see aot_check_memory_overflow.
+     */
+    bh_assert(((uintptr_t)*data & 0x7) == 0);
+
     return BHT_OK;
 }
diff --git a/core/iwasm/compilation/aot_emit_memory.c b/core/iwasm/compilation/aot_emit_memory.c
index 506467449..a74d32681 100644
--- a/core/iwasm/compilation/aot_emit_memory.c
+++ b/core/iwasm/compilation/aot_emit_memory.c
@@ -96,7 +96,8 @@ get_memory_curr_page_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          mem_offset_t offset, uint32 bytes, bool enable_segue)
+                          mem_offset_t offset, uint32 bytes, bool enable_segue,
+                          unsigned int *alignp)
 {
     LLVMValueRef offset_const =
         MEMORY64_COND_VALUE(I64_CONST(offset), I32_CONST(offset));
@@ -180,6 +181,26 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             comp_ctx->comp_data->memories[0].init_page_count;
         uint64 mem_data_size = (uint64)num_bytes_per_page * init_page_count;
 
+        if (alignp != NULL) {
+            /*
+             * A note about max_align below:
+             * the assumption here is the base address of a linear memory
+             * has the natural alignment. for platforms using mmap, it can
+             * be even larger. for now, use a conservative value.
+             */
+            const int max_align = 8;
+            int shift = ffs((int)(unsigned int)mem_offset);
+            if (shift == 0) {
+                *alignp = max_align;
+            }
+            else {
+                unsigned int align = 1 << (shift - 1);
+                if (align > max_align) {
+                    align = max_align;
+                }
+                *alignp = align;
+            }
+        }
         if (mem_offset + bytes <= mem_data_size) {
             /* inside memory space */
             if (comp_ctx->pointer_size == sizeof(uint64))
@@ -205,6 +226,9 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             return maddr;
         }
     }
+    else if (alignp != NULL) {
+        *alignp = 1;
+    }
 
     if (is_target_64bit) {
         if (!(offset_const = LLVMBuildZExt(comp_ctx->builder, offset_const,
@@ -324,7 +348,7 @@ fail:
            aot_set_last_error("llvm build load failed.");             \
            goto fail;                                                 \
        }                                                              \
-       LLVMSetAlignment(value, 1);                                    \
+       LLVMSetAlignment(value, known_align);                          \
    } while (0)
 
 #define BUILD_TRUNC(value, data_type)                                 \
@@ -343,7 +367,7 @@ fail:
            aot_set_last_error("llvm build store failed.");            \
            goto fail;                                                 \
        }                                                              \
-       LLVMSetAlignment(res, 1);                                      \
+       LLVMSetAlignment(res, known_align);                            \
    } while (0)
 
 #define BUILD_SIGN_EXT(dst_type)                                      \
@@ -445,8 +469,9 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMTypeRef data_type;
     bool enable_segue = comp_ctx->enable_segue_i32_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -515,8 +540,9 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMTypeRef data_type;
     bool enable_segue = comp_ctx->enable_segue_i64_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -591,8 +617,9 @@ aot_compile_op_f32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef maddr, value;
     bool enable_segue = comp_ctx->enable_segue_f32_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -614,8 +641,9 @@ aot_compile_op_f64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef maddr, value;
     bool enable_segue = comp_ctx->enable_segue_f64_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -640,8 +668,9 @@ aot_compile_op_i32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_I32(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -691,8 +720,9 @@ aot_compile_op_i64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_I64(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -748,8 +778,9 @@ aot_compile_op_f32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_F32(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -771,8 +802,9 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_F64(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -1302,7 +1334,7 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         POP_I64(value);
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1392,7 +1424,7 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
     }
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1505,7 +1537,7 @@ aot_compile_op_atomic_wait(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     CHECK_LLVM_CONST(is_wait64);
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            false)))
+                                            false, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1579,7 +1611,7 @@ aot_compiler_op_atomic_notify(AOTCompContext *comp_ctx,
     POP_I32(count);
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            false)))
+                                            false, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
diff --git a/core/iwasm/compilation/aot_emit_memory.h b/core/iwasm/compilation/aot_emit_memory.h
index f1828f17f..1eb95993c 100644
--- a/core/iwasm/compilation/aot_emit_memory.h
+++ b/core/iwasm/compilation/aot_emit_memory.h
@@ -53,7 +53,8 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          mem_offset_t offset, uint32 bytes, bool enable_segue);
+                          mem_offset_t offset, uint32 bytes, bool enable_segue,
+                          unsigned int *alignp);
 
 bool
 aot_compile_op_memory_size(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
diff --git a/core/iwasm/compilation/simd/simd_load_store.c b/core/iwasm/compilation/simd/simd_load_store.c
index 45829f080..3b5023937 100644
--- a/core/iwasm/compilation/simd/simd_load_store.c
+++ b/core/iwasm/compilation/simd/simd_load_store.c
@@ -19,7 +19,7 @@ simd_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
     LLVMValueRef maddr, data;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length, enable_segue))) {
+                                            data_length, enable_segue, NULL))) {
         HANDLE_FAILURE("aot_check_memory_overflow");
         return NULL;
     }
@@ -287,7 +287,7 @@ simd_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
     LLVMValueRef maddr, result;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length, enable_segue)))
+                                            data_length, enable_segue, NULL)))
         return false;
 
     if (!(maddr = LLVMBuildBitCast(comp_ctx->builder, maddr, value_ptr_type,
diff --git a/tests/unit/compilation/aot_emit_memory_test.cc b/tests/unit/compilation/aot_emit_memory_test.cc
index 26a6a79ba..0e8e86f21 100644
--- a/tests/unit/compilation/aot_emit_memory_test.cc
+++ b/tests/unit/compilation/aot_emit_memory_test.cc
@@ -100,7 +100,8 @@ TEST_F(compilation_aot_emit_memory_test, aot_check_memory_overflow)
 
     for (uint32 i = 0; i < DEFAULT_CYCLE_TIMES; i++) {
         offset = (1 + (rand() % (DEFAULT_MAX_RAND_NUM - 1 + 1)));
-        aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes, false);
+        aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes, false,
+                                  NULL);
     }
 }
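
For readers who want to sanity-check the alignment deduction outside of the
compiler: the logic added to aot_check_memory_overflow boils down to taking
the largest power of two that divides the constant memory offset, capped at
8 bytes because the runtime only guarantees 8-byte alignment of the linear
memory base (see the new bh_assert calls in wasm_memory.c). A minimal
standalone sketch of that calculation, using a hypothetical helper name
deduce_known_align that is not part of this patch:

```c
#include <assert.h>
#include <strings.h> /* ffs() */

/*
 * Hypothetical standalone helper mirroring the alignment deduction in
 * aot_check_memory_overflow: the known alignment of
 * (memory_base + mem_offset) is the largest power of two dividing
 * mem_offset, capped at 8 because the runtime only guarantees 8-byte
 * alignment of the linear memory base.
 */
static unsigned int
deduce_known_align(unsigned int mem_offset)
{
    const unsigned int max_align = 8;
    /* ffs() returns the 1-based index of the lowest set bit, 0 if none */
    int shift = ffs((int)mem_offset);
    if (shift == 0)
        /* mem_offset == 0: limited only by the base alignment */
        return max_align;
    unsigned int align = 1u << (shift - 1);
    return align > max_align ? max_align : align;
}

int
main(void)
{
    assert(deduce_known_align(0x104) == 4); /* the i32.store example above */
    assert(deduce_known_align(0x106) == 2);
    assert(deduce_known_align(0x107) == 1);
    assert(deduce_known_align(0x108) == 8); /* capped at max_align */
    assert(deduce_known_align(0) == 8);
    return 0;
}
```

Note that the patch only performs this deduction on the bounds-check-free
fast path (mem_offset + bytes <= mem_data_size); otherwise *alignp falls
back to 1, preserving the previous conservative behavior.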