From 4c2af25aff9de6df2c23083840f1ff783e7ebbcb Mon Sep 17 00:00:00 2001
From: YAMAMOTO Takashi
Date: Sat, 22 Jun 2024 11:32:52 +0900
Subject: [PATCH] aot compiler: Use larger alignment for load/store when possible (#3552)

Consider the following wasm module:

```wast
(module
  (func (export "foo")
    i32.const 0x104
    i32.const 0x12345678
    i32.store
  )
  (memory 1 1)
)
```

While the address (0x104) is perfectly aligned for i32.store, our aot
compiler uses 1-byte alignment for load/store LLVM IR instructions, which
often produces inefficient machine code, especially on alignment-sensitive
targets. For example, the above "foo" function is compiled into the
following xtensa machine code.

```
0000002c :
      2c: 004136        entry   a1, 32
      2f: 07a182        movi    a8, 0x107
      32: 828a          add.n   a8, a2, a8
      34: 291c          movi.n  a9, 18
      36: 004892        s8i     a9, a8, 0
      39: 06a182        movi    a8, 0x106
      3c: 828a          add.n   a8, a2, a8
      3e: ffff91        l32r    a9, 3c (ff91828a )
                        3e: R_XTENSA_SLOT0_OP  .literal+0x8
      41: 004892        s8i     a9, a8, 0
      44: 05a182        movi    a8, 0x105
      47: 828a          add.n   a8, a2, a8
      49: ffff91        l32r    a9, 48 (ffff9182 )
                        49: R_XTENSA_SLOT0_OP  .literal+0xc
      4c: 41a890        srli    a10, a9, 8
      4f: 0048a2        s8i     a10, a8, 0
      52: 04a182        movi    a8, 0x104
      55: 828a          add.n   a8, a2, a8
      57: 004892        s8i     a9, a8, 0
      5a: f01d          retw.n
```

Note that each of the four bytes is stored separately using the one-byte
store instruction, s8i.

This commit uses larger alignments for load/store LLVM IR instructions
when possible. With this commit, the above example is compiled into the
following machine code, which looks much more reasonable.

```
0000002c :
      2c: 004136        entry   a1, 32
      2f: ffff81        l32r    a8, 2c (81004136 )
                        2f: R_XTENSA_SLOT0_OP  .literal+0x8
      32: 416282        s32i    a8, a2, 0x104
      35: f01d          retw.n
```

Note: this doesn't work well with --xip because
aot_load_const_from_table() hides the constness of the value. Maybe we
need our own mechanism to propagate the constness and the value.
---
 core/iwasm/common/wasm_memory.c               | 12 ++++
 core/iwasm/compilation/aot_emit_memory.c      | 62 ++++++++++++++-----
 core/iwasm/compilation/aot_emit_memory.h      |  3 +-
 core/iwasm/compilation/simd/simd_load_store.c |  4 +-
 .../unit/compilation/aot_emit_memory_test.cc  |  3 +-
 5 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/core/iwasm/common/wasm_memory.c b/core/iwasm/common/wasm_memory.c
index 82b771231..d86ea4a8b 100644
--- a/core/iwasm/common/wasm_memory.c
+++ b/core/iwasm/common/wasm_memory.c
@@ -883,6 +883,12 @@ wasm_enlarge_memory_internal(WASMModuleInstance *module, uint32 inc_page_count)
     }
 #endif /* end of WASM_MEM_ALLOC_WITH_USAGE */
 
+    /*
+     * AOT compiler assumes at least 8 byte alignment.
+     * see aot_check_memory_overflow.
+     */
+    bh_assert(((uintptr_t)memory->memory_data & 0x7) == 0);
+
     memory->num_bytes_per_page = num_bytes_per_page;
     memory->cur_page_count = total_page_count;
     memory->max_page_count = max_page_count;
@@ -1032,5 +1038,11 @@ wasm_allocate_linear_memory(uint8 **data, bool is_shared_memory,
 #endif
     }
 
+    /*
+     * AOT compiler assumes at least 8 byte alignment.
+     * see aot_check_memory_overflow.
+     */
+    bh_assert(((uintptr_t)*data & 0x7) == 0);
+
     return BHT_OK;
 }
diff --git a/core/iwasm/compilation/aot_emit_memory.c b/core/iwasm/compilation/aot_emit_memory.c
index 506467449..a74d32681 100644
--- a/core/iwasm/compilation/aot_emit_memory.c
+++ b/core/iwasm/compilation/aot_emit_memory.c
@@ -96,7 +96,8 @@ get_memory_curr_page_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          mem_offset_t offset, uint32 bytes, bool enable_segue)
+                          mem_offset_t offset, uint32 bytes, bool enable_segue,
+                          unsigned int *alignp)
 {
     LLVMValueRef offset_const =
         MEMORY64_COND_VALUE(I64_CONST(offset), I32_CONST(offset));
@@ -180,6 +181,26 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             comp_ctx->comp_data->memories[0].init_page_count;
         uint64 mem_data_size = (uint64)num_bytes_per_page * init_page_count;
 
+        if (alignp != NULL) {
+            /*
+             * A note about max_align below:
+             * the assumption here is the base address of a linear memory
+             * has the natural alignment. for platforms using mmap, it can
+             * be even larger. for now, use a conservative value.
+             */
+            const int max_align = 8;
+            int shift = ffs((int)(unsigned int)mem_offset);
+            if (shift == 0) {
+                *alignp = max_align;
+            }
+            else {
+                unsigned int align = 1 << (shift - 1);
+                if (align > max_align) {
+                    align = max_align;
+                }
+                *alignp = align;
+            }
+        }
         if (mem_offset + bytes <= mem_data_size) {
             /* inside memory space */
             if (comp_ctx->pointer_size == sizeof(uint64))
@@ -205,6 +226,9 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             return maddr;
         }
     }
+    else if (alignp != NULL) {
+        *alignp = 1;
+    }
 
     if (is_target_64bit) {
         if (!(offset_const = LLVMBuildZExt(comp_ctx->builder, offset_const,
@@ -324,7 +348,7 @@ fail:
            aot_set_last_error("llvm build load failed.");             \
            goto fail;                                                 \
        }                                                              \
-       LLVMSetAlignment(value, 1);                                    \
+       LLVMSetAlignment(value, known_align);                          \
    } while (0)
 
 #define BUILD_TRUNC(value, data_type)                                 \
@@ -343,7 +367,7 @@ fail:
            aot_set_last_error("llvm build store failed.");            \
            goto fail;                                                 \
        }                                                              \
-       LLVMSetAlignment(res, 1);                                      \
+       LLVMSetAlignment(res, known_align);                            \
    } while (0)
 
 #define BUILD_SIGN_EXT(dst_type)                                      \
@@ -445,8 +469,9 @@ aot_compile_op_i32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMTypeRef data_type;
     bool enable_segue = comp_ctx->enable_segue_i32_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -515,8 +540,9 @@ aot_compile_op_i64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMTypeRef data_type;
     bool enable_segue = comp_ctx->enable_segue_i64_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -591,8 +617,9 @@ aot_compile_op_f32_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef maddr, value;
     bool enable_segue = comp_ctx->enable_segue_f32_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -614,8 +641,9 @@ aot_compile_op_f64_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     LLVMValueRef maddr, value;
     bool enable_segue = comp_ctx->enable_segue_f64_load;
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -640,8 +668,9 @@ aot_compile_op_i32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_I32(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -691,8 +720,9 @@ aot_compile_op_i64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_I64(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     switch (bytes) {
@@ -748,8 +778,9 @@ aot_compile_op_f32_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_F32(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 4,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -771,8 +802,9 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
     POP_F64(value);
 
+    unsigned int known_align;
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, 8,
-                                            enable_segue)))
+                                            enable_segue, &known_align)))
         return false;
 
     if (!enable_segue)
@@ -1302,7 +1334,7 @@ aot_compile_op_atomic_rmw(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         POP_I64(value);
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1392,7 +1424,7 @@ aot_compile_op_atomic_cmpxchg(AOTCompContext *comp_ctx,
     }
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            enable_segue)))
+                                            enable_segue, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1505,7 +1537,7 @@ aot_compile_op_atomic_wait(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     CHECK_LLVM_CONST(is_wait64);
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            false)))
+                                            false, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
@@ -1579,7 +1611,7 @@ aot_compiler_op_atomic_notify(AOTCompContext *comp_ctx,
     POP_I32(count);
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes,
-                                            false)))
+                                            false, NULL)))
         return false;
 
     if (!check_memory_alignment(comp_ctx, func_ctx, maddr, align))
diff --git a/core/iwasm/compilation/aot_emit_memory.h b/core/iwasm/compilation/aot_emit_memory.h
index f1828f17f..1eb95993c 100644
--- a/core/iwasm/compilation/aot_emit_memory.h
+++ b/core/iwasm/compilation/aot_emit_memory.h
@@ -53,7 +53,8 @@ aot_compile_op_f64_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 
 LLVMValueRef
 aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
-                          mem_offset_t offset, uint32 bytes, bool enable_segue);
+                          mem_offset_t offset, uint32 bytes, bool enable_segue,
+                          unsigned int *alignp);
 
 bool
 aot_compile_op_memory_size(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx);
diff --git a/core/iwasm/compilation/simd/simd_load_store.c b/core/iwasm/compilation/simd/simd_load_store.c
index 45829f080..3b5023937 100644
--- a/core/iwasm/compilation/simd/simd_load_store.c
+++ b/core/iwasm/compilation/simd/simd_load_store.c
@@ -19,7 +19,7 @@ simd_load(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
     LLVMValueRef maddr, data;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length, enable_segue))) {
+                                            data_length, enable_segue, NULL))) {
         HANDLE_FAILURE("aot_check_memory_overflow");
         return NULL;
     }
@@ -287,7 +287,7 @@ simd_store(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, uint32 align,
     LLVMValueRef maddr, result;
 
     if (!(maddr = aot_check_memory_overflow(comp_ctx, func_ctx, offset,
-                                            data_length, enable_segue)))
+                                            data_length, enable_segue, NULL)))
         return false;
 
     if (!(maddr = LLVMBuildBitCast(comp_ctx->builder, maddr, value_ptr_type,
diff --git a/tests/unit/compilation/aot_emit_memory_test.cc b/tests/unit/compilation/aot_emit_memory_test.cc
index 26a6a79ba..0e8e86f21 100644
--- a/tests/unit/compilation/aot_emit_memory_test.cc
+++ b/tests/unit/compilation/aot_emit_memory_test.cc
@@ -100,7 +100,8 @@ TEST_F(compilation_aot_emit_memory_test, aot_check_memory_overflow)
 
     for (uint32 i = 0; i < DEFAULT_CYCLE_TIMES; i++) {
         offset = (1 + (rand() % (DEFAULT_MAX_RAND_NUM - 1 + 1)));
-        aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes, false);
+        aot_check_memory_overflow(comp_ctx, func_ctx, offset, bytes, false,
+                                  NULL);
     }
 }
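
For readers who want to sanity-check the alignment deduction outside of the
compiler: the logic added to aot_check_memory_overflow boils down to taking
the largest power of two that divides the constant memory offset, capped at
8 bytes because the runtime only guarantees 8-byte alignment of the linear
memory base (see the new bh_assert calls in wasm_memory.c). A minimal
standalone sketch of that calculation, using a hypothetical helper name
deduce_known_align that is not part of this patch:

```c
#include <assert.h>
#include <strings.h> /* ffs() */

/*
 * Hypothetical standalone helper mirroring the alignment deduction in
 * aot_check_memory_overflow: the known alignment of
 * (memory_base + mem_offset) is the largest power of two dividing
 * mem_offset, capped at 8 because the runtime only guarantees 8-byte
 * alignment of the linear memory base.
 */
static unsigned int
deduce_known_align(unsigned int mem_offset)
{
    const unsigned int max_align = 8;
    /* ffs() returns the 1-based index of the lowest set bit, 0 if none */
    int shift = ffs((int)mem_offset);
    if (shift == 0)
        /* mem_offset == 0: limited only by the base alignment */
        return max_align;
    unsigned int align = 1u << (shift - 1);
    return align > max_align ? max_align : align;
}

int
main(void)
{
    assert(deduce_known_align(0x104) == 4); /* the i32.store example above */
    assert(deduce_known_align(0x106) == 2);
    assert(deduce_known_align(0x107) == 1);
    assert(deduce_known_align(0x108) == 8); /* capped at max_align */
    assert(deduce_known_align(0) == 8);
    return 0;
}
```

Note that the patch only performs this deduction on the bounds-check-free
fast path (mem_offset + bytes <= mem_data_size); otherwise *alignp falls
back to 1, preserving the previous conservative behavior.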