From 8d88471c468a768803c80f6316667c19827c6cb2 Mon Sep 17 00:00:00 2001 From: Wenyong Huang Date: Mon, 5 Jun 2023 09:17:39 +0800 Subject: [PATCH] Implement AOT static PGO (#2243) LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code for how it actually runs. This PR implements the AOT static PGO, and is tested on Linux x86-64 and x86-32. The basic steps are: 1. Use `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>` to generate an instrumented aot file. 2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run `iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>` to generate the raw profile file. 3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>` to merge the raw profile file into the profile file. 4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>` to generate the optimized aot file. 5. Run the optimized aot_file: `iwasm <aot_file>`. The test scripts are also added for each benchmark, run `test_pgo.sh` under each benchmark's folder to test the AOT static pgo. --- build-scripts/build_llvm.py | 2 +- build-scripts/config_common.cmake | 4 + core/config.h | 4 + core/iwasm/aot/aot_loader.c | 293 +++++++++- core/iwasm/aot/aot_reloc.h | 9 + core/iwasm/aot/aot_runtime.c | 517 ++++++++++++++++++ core/iwasm/aot/aot_runtime.h | 98 ++++ core/iwasm/aot/arch/aot_reloc_x86_32.c | 6 + core/iwasm/aot/arch/aot_reloc_x86_64.c | 23 +- core/iwasm/common/wasm_runtime_common.c | 27 + core/iwasm/compilation/aot_emit_aot_file.c | 318 +++++++++-- core/iwasm/compilation/aot_llvm.c | 26 + core/iwasm/compilation/aot_llvm.h | 15 + core/iwasm/compilation/aot_llvm_extra.cpp | 67 ++- core/iwasm/include/aot_export.h | 2 + core/iwasm/include/wasm_export.h | 24 + .../platform/include/platform_api_extension.h | 1 + product-mini/platforms/posix/main.c | 57 ++ tests/benchmarks/README.md | 62 +++ tests/benchmarks/coremark/README.md | 2 + tests/benchmarks/coremark/test_pgo.sh | 50 ++ tests/benchmarks/dhrystone/test_pgo.sh | 50 ++ tests/benchmarks/jetstream/README.md | 2 + tests/benchmarks/jetstream/test_pgo.sh | 87 +++ tests/benchmarks/libsodium/test_pgo.sh 
| 116 ++++ tests/benchmarks/polybench/test_pgo.sh | 90 +++ tests/benchmarks/sightglass/README.md | 2 + tests/benchmarks/sightglass/test_pgo.sh | 89 +++ wamr-compiler/main.c | 10 + 29 files changed, 2000 insertions(+), 53 deletions(-) create mode 100644 tests/benchmarks/README.md create mode 100755 tests/benchmarks/coremark/test_pgo.sh create mode 100755 tests/benchmarks/dhrystone/test_pgo.sh create mode 100755 tests/benchmarks/jetstream/test_pgo.sh create mode 100755 tests/benchmarks/libsodium/test_pgo.sh create mode 100755 tests/benchmarks/polybench/test_pgo.sh create mode 100755 tests/benchmarks/sightglass/test_pgo.sh diff --git a/build-scripts/build_llvm.py b/build-scripts/build_llvm.py index 3957f4b89..d70915c3b 100755 --- a/build-scripts/build_llvm.py +++ b/build-scripts/build_llvm.py @@ -61,7 +61,7 @@ def build_llvm(llvm_dir, platform, backends, projects, use_clang=False, extra_fl "-DLLVM_ENABLE_IDE:BOOL=OFF", "-DLLVM_ENABLE_LIBEDIT=OFF", "-DLLVM_ENABLE_TERMINFO:BOOL=OFF", - "-DLLVM_ENABLE_ZLIB:BOOL=OFF", + "-DLLVM_ENABLE_ZLIB:BOOL=ON", "-DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF", "-DLLVM_INCLUDE_DOCS:BOOL=OFF", "-DLLVM_INCLUDE_EXAMPLES:BOOL=OFF", diff --git a/build-scripts/config_common.cmake b/build-scripts/config_common.cmake index 16e8b7aa4..f5a49c879 100644 --- a/build-scripts/config_common.cmake +++ b/build-scripts/config_common.cmake @@ -388,3 +388,7 @@ if ("$ENV{COLLECT_CODE_COVERAGE}" STREQUAL "1" OR COLLECT_CODE_COVERAGE EQUAL 1) add_definitions (-DCOLLECT_CODE_COVERAGE) message (" Collect code coverage enabled") endif () +if (WAMR_BUILD_STATIC_PGO EQUAL 1) + add_definitions (-DWASM_ENABLE_STATIC_PGO=1) + message (" AOT static PGO enabled") +endif () diff --git a/core/config.h b/core/config.h index feedb13fa..82e181f6a 100644 --- a/core/config.h +++ b/core/config.h @@ -445,4 +445,8 @@ #define WASM_ENABLE_WASM_CACHE 0 #endif +#ifndef WASM_ENABLE_STATIC_PGO +#define WASM_ENABLE_STATIC_PGO 0 +#endif + #endif /* end of _CONFIG_H_ */ diff --git 
a/core/iwasm/aot/aot_loader.c b/core/iwasm/aot/aot_loader.c index db9eea425..480a00b91 100644 --- a/core/iwasm/aot/aot_loader.c +++ b/core/iwasm/aot/aot_loader.c @@ -1430,8 +1430,28 @@ destroy_object_data_sections(AOTObjectDataSection *data_sections, uint32 i; AOTObjectDataSection *data_section = data_sections; for (i = 0; i < data_section_count; i++, data_section++) - if (data_section->data) + if (data_section->data) { +#if WASM_ENABLE_STATIC_PGO != 0 + if (!strncmp(data_section->name, "__llvm_prf_data", 15)) { + LLVMProfileData *data = (LLVMProfileData *)data_section->data; + if (data->values) { + uint32 num_value_sites = + data->num_value_sites[0] + data->num_value_sites[1]; + uint32 j; + for (j = 0; j < num_value_sites; j++) { + ValueProfNode *node = data->values[j], *node_next; + while (node) { + node_next = node->next; + wasm_runtime_free(node); + node = node_next; + } + } + wasm_runtime_free(data->values); + } + } +#endif os_munmap(data_section->data, data_section->size); + } wasm_runtime_free(data_sections); } @@ -1900,6 +1920,8 @@ str2uint64(const char *buf, uint64 *p_res) return true; } +#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */ + static bool do_text_relocation(AOTModule *module, AOTRelocationGroup *group, char *error_buf, uint32 error_buf_size) @@ -1937,6 +1959,14 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group, bh_memcpy_s(symbol, symbol_len, relocation->symbol_name, symbol_len); symbol[symbol_len] = '\0'; +#if WASM_ENABLE_STATIC_PGO != 0 + if (!strcmp(symbol, "__llvm_profile_runtime") + || !strcmp(symbol, "__llvm_profile_register_function") + || !strcmp(symbol, "__llvm_profile_register_names_function")) { + continue; + } +#endif + if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) { p = symbol + strlen(AOT_FUNC_PREFIX); if (*p == '\0' @@ -1945,7 +1975,26 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group, "invalid import symbol %s", symbol); goto check_symbol_fail; } +#if 
(defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \ + && !defined(BH_PLATFORM_WINDOWS) + if (relocation->relocation_type == R_X86_64_GOTPCREL) { + GOTItem *got_item = module->got_item_list; + uint32 got_item_idx = 0; + + while (got_item) { + if (got_item->func_idx == func_index) + break; + got_item_idx++; + got_item = got_item->next; + } + /* Calculate `GOT + G` */ + symbol_addr = module->got_func_ptrs + got_item_idx; + } + else + symbol_addr = module->func_ptrs[func_index]; +#else symbol_addr = module->func_ptrs[func_index]; +#endif } else if (!strcmp(symbol, ".text")) { symbol_addr = module->code; @@ -1956,7 +2005,13 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group, /* ".rodata.cst4/8/16/.." */ || !strncmp(symbol, ".rodata.cst", strlen(".rodata.cst")) /* ".rodata.strn.m" */ - || !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))) { + || !strncmp(symbol, ".rodata.str", strlen(".rodata.str")) +#if WASM_ENABLE_STATIC_PGO != 0 + || !strncmp(symbol, "__llvm_prf_cnts", 15) + || !strncmp(symbol, "__llvm_prf_data", 15) + || !strncmp(symbol, "__llvm_prf_names", 16) +#endif + ) { symbol_addr = get_data_section_addr(module, symbol, NULL); if (!symbol_addr) { set_error_buf_v(error_buf, error_buf_size, @@ -2088,6 +2143,14 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group, else if (!strcmp(group->section_name, ".rdata")) { data_section_name = group->section_name; } +#if WASM_ENABLE_STATIC_PGO != 0 + else if (!strncmp(group->section_name, ".rel__llvm_prf_data", 19)) { + data_section_name = group->section_name + strlen(".rel"); + } + else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20)) { + data_section_name = group->section_name + strlen(".rela"); + } +#endif else { set_error_buf(error_buf, error_buf_size, "invalid data relocation section name"); @@ -2107,6 +2170,49 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group, if (!strcmp(symbol, ".text")) { symbol_addr = module->code; } +#if 
WASM_ENABLE_STATIC_PGO != 0 + else if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) { + char *p = symbol + strlen(AOT_FUNC_PREFIX); + uint32 func_index; + if (*p == '\0' + || (func_index = (uint32)atoi(p)) >= module->func_count) { + set_error_buf_v(error_buf, error_buf_size, + "invalid relocation symbol %s", symbol); + return false; + } + symbol_addr = module->func_ptrs[func_index]; /* func_index < func_count */ + } + else if (!strcmp(symbol, "__llvm_prf_cnts")) { + uint32 j; + for (j = 0; j < module->data_section_count; j++) { + if (!strncmp(module->data_sections[j].name, symbol, 15)) { + bh_assert(relocation->relocation_addend + sizeof(uint64) + <= module->data_sections[j].size); + symbol_addr = module->data_sections[j].data; + break; + } + } + if (j == module->data_section_count) { + set_error_buf_v(error_buf, error_buf_size, + "invalid relocation symbol %s", symbol); + return false; + } + } + else if (!strncmp(symbol, "__llvm_prf_cnts", 15)) { + uint32 j; + for (j = 0; j < module->data_section_count; j++) { + if (!strcmp(module->data_sections[j].name, symbol)) { + symbol_addr = module->data_sections[j].data; + break; + } + } + if (j == module->data_section_count) { + set_error_buf_v(error_buf, error_buf_size, + "invalid relocation symbol %s", symbol); + return false; + } + } +#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */ else { set_error_buf_v(error_buf, error_buf_size, "invalid relocation symbol %s", symbol); @@ -2154,7 +2260,7 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end, { AOTRelocationGroup *groups = NULL, *group; uint32 symbol_count = 0; - uint32 group_count = 0, i, j; + uint32 group_count = 0, i, j, got_item_count = 0; uint64 size; uint32 *symbol_offsets, total_string_len; uint8 *symbol_buf, *symbol_buf_end; @@ -2216,6 +2322,8 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end, for (j = 0; j < relocation_count; j++) { AOTRelocation relocation = { 0 }; + char group_name_buf[128] = { 0 }; + char symbol_name_buf[128] = { 0 }; uint32 
symbol_index, offset32; int32 addend32; uint16 symbol_name_len; @@ -2244,10 +2352,10 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end, symbol_name_len = *(uint16 *)symbol_name; symbol_name += sizeof(uint16); - char group_name_buf[128] = { 0 }; - char symbol_name_buf[128] = { 0 }; - memcpy(group_name_buf, group_name, group_name_len); - memcpy(symbol_name_buf, symbol_name, symbol_name_len); + bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf), + group_name, group_name_len); + bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf), + symbol_name, symbol_name_len); if ((group_name_len == strlen(".text") || (module->is_indirect_mode @@ -2309,6 +2417,139 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end, } #endif /* end of defined(BH_PLATFORM_WINDOWS) */ +#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \ + && !defined(BH_PLATFORM_WINDOWS) + buf = symbol_buf_end; + read_uint32(buf, buf_end, group_count); + + /* Resolve the relocations of type R_X86_64_GOTPCREL */ + for (i = 0; i < group_count; i++) { + uint32 name_index, relocation_count; + uint16 group_name_len; + uint8 *group_name; + + /* section name address is 4 bytes aligned. 
*/ + buf = (uint8 *)align_ptr(buf, sizeof(uint32)); + read_uint32(buf, buf_end, name_index); + + if (name_index >= symbol_count) { + set_error_buf(error_buf, error_buf_size, + "symbol index out of range"); + goto fail; + } + + group_name = symbol_buf + symbol_offsets[name_index]; + group_name_len = *(uint16 *)group_name; + group_name += sizeof(uint16); + + read_uint32(buf, buf_end, relocation_count); + + for (j = 0; j < relocation_count; j++) { + AOTRelocation relocation = { 0 }; + char group_name_buf[128] = { 0 }; + char symbol_name_buf[128] = { 0 }; + uint32 symbol_index; + uint16 symbol_name_len; + uint8 *symbol_name; + + /* relocation offset and addend */ + buf += sizeof(void *) * 2; + + read_uint32(buf, buf_end, relocation.relocation_type); + read_uint32(buf, buf_end, symbol_index); + + if (symbol_index >= symbol_count) { + set_error_buf(error_buf, error_buf_size, + "symbol index out of range"); + goto fail; + } + + symbol_name = symbol_buf + symbol_offsets[symbol_index]; + symbol_name_len = *(uint16 *)symbol_name; + symbol_name += sizeof(uint16); + + bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf), + group_name, group_name_len); + bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf), + symbol_name, symbol_name_len); + + if (relocation.relocation_type == R_X86_64_GOTPCREL + && !strncmp(symbol_name_buf, AOT_FUNC_PREFIX, + strlen(AOT_FUNC_PREFIX))) { + uint32 func_idx = + atoi(symbol_name_buf + strlen(AOT_FUNC_PREFIX)); + GOTItem *got_item = module->got_item_list; + + if (func_idx >= module->func_count) { + set_error_buf(error_buf, error_buf_size, + "func index out of range"); + goto fail; + } + + while (got_item) { + if (got_item->func_idx == func_idx) + break; + got_item = got_item->next; + } + + if (!got_item) { + /* Create the got item and append to the list */ + got_item = wasm_runtime_malloc(sizeof(GOTItem)); + if (!got_item) { + set_error_buf(error_buf, error_buf_size, + "allocate memory failed"); + goto fail; + } + + got_item->func_idx 
= func_idx; + got_item->next = NULL; + if (!module->got_item_list) { + module->got_item_list = module->got_item_list_end = + got_item; + } + else { + module->got_item_list_end->next = got_item; + module->got_item_list_end = got_item; + } + + got_item_count++; + } + } + } + } + + if (got_item_count) { + GOTItem *got_item = module->got_item_list; + uint32 got_item_idx = 0; + + map_prot = MMAP_PROT_READ | MMAP_PROT_WRITE; + /* aot code and data in x86_64 must be in range 0 to 2G due to + relocation for R_X86_64_32/32S/PC32 */ + map_flags = MMAP_MAP_32BIT; + + /* Create the GOT for func_ptrs, note that it is different from + the .got section of a dynamic object file */ + size = (uint64)sizeof(void *) * got_item_count; + if (size > UINT32_MAX + || !(module->got_func_ptrs = + os_mmap(NULL, (uint32)size, map_prot, map_flags))) { + set_error_buf(error_buf, error_buf_size, "mmap memory failed"); + goto fail; + } + + while (got_item) { + module->got_func_ptrs[got_item_idx++] = + module->func_ptrs[got_item->func_idx]; + got_item = got_item->next; + } + + module->got_item_count = got_item_count; + } +#else + (void)got_item_count; +#endif /* (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) && \ + !defined(BH_PLATFORM_WINDOWS) */ + buf = symbol_buf_end; read_uint32(buf, buf_end, group_count); @@ -2994,9 +3235,27 @@ aot_unload(AOTModule *module) } #endif +#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \ + && !defined(BH_PLATFORM_WINDOWS) + { + GOTItem *got_item = module->got_item_list, *got_item_next; + + if (module->got_func_ptrs) { + os_munmap(module->got_func_ptrs, + sizeof(void *) * module->got_item_count); + } + while (got_item) { + got_item_next = got_item->next; + wasm_runtime_free(got_item); + got_item = got_item_next; + } + } +#endif + if (module->data_sections) destroy_object_data_sections(module->data_sections, module->data_section_count); + #if WASM_ENABLE_DEBUG_AOT != 0 jit_code_entry_destroy(module->elf_hdr); #endif @@ -3043,3 
+3302,23 @@ aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len) return NULL; } #endif /* end of WASM_ENABLE_LOAD_CUSTOM_SECTION */ + +#if WASM_ENABLE_STATIC_PGO != 0 /* expose exchange_uintN to aot_runtime.c */ +void +aot_exchange_uint16(uint8 *p_data) +{ + exchange_uint16(p_data); +} + +void +aot_exchange_uint32(uint8 *p_data) +{ + exchange_uint32(p_data); +} + +void +aot_exchange_uint64(uint8 *p_data) +{ + exchange_uint64(p_data); +} +#endif diff --git a/core/iwasm/aot/aot_reloc.h b/core/iwasm/aot/aot_reloc.h index 9f5c2d57f..98df09cb4 100644 --- a/core/iwasm/aot/aot_reloc.h +++ b/core/iwasm/aot/aot_reloc.h @@ -121,6 +121,14 @@ typedef struct { REG_SYM(aot_intrinsic_i32_rem_s), \ REG_SYM(aot_intrinsic_i32_rem_u), \ +#if WASM_ENABLE_STATIC_PGO != 0 +#define REG_LLVM_PGO_SYM() \ + { "__llvm_profile_instrument_target", llvm_profile_instrument_target }, \ + { "__llvm_profile_instrument_memop", llvm_profile_instrument_memop }, +#else +#define REG_LLVM_PGO_SYM() +#endif + #define REG_COMMON_SYMBOLS \ REG_SYM(aot_set_exception_with_id), \ REG_SYM(aot_invoke_native), \ @@ -150,6 +158,7 @@ typedef struct { REG_REF_TYPES_SYM() \ REG_AOT_TRACE_SYM() \ REG_INTRINSIC_SYM() \ + REG_LLVM_PGO_SYM() \ #define CHECK_RELOC_OFFSET(data_size) do { \ if (!check_reloc_offset(target_section_size, \ diff --git a/core/iwasm/aot/aot_runtime.c b/core/iwasm/aot/aot_runtime.c index 2a3280ef9..ca0a4ff1d 100644 --- a/core/iwasm/aot/aot_runtime.c +++ b/core/iwasm/aot/aot_runtime.c @@ -2852,3 +2852,520 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst) } } #endif /* end of WASM_ENABLE_PERF_PROFILING */ + +#if WASM_ENABLE_STATIC_PGO != 0 + +/* indirect call target */ +#define IPVK_IndirectCallTarget 0 +/* memory intrinsic functions size */ +#define IPVK_MemOPSize 1 +#define IPVK_First IPVK_IndirectCallTarget +#define IPVK_Last IPVK_MemOPSize + +#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24 +#define INSTR_PROF_MAX_NUM_VAL_PER_SITE 255 + +static int hasNonDefaultValsPerSite = 0; 
+static uint32 VPMaxNumValsPerSite = INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE; + +static bool +cmpxchg_ptr(void **ptr, void *old_val, void *new_val) +{ +#if defined(os_atomic_cmpxchg) + return os_atomic_cmpxchg(ptr, &old_val, new_val); +#else + /* TODO: add lock when thread-manager is enabled */ + void *read = *ptr; + if (read == old_val) { + *ptr = new_val; + return true; + } + return false; +#endif +} + +static int +allocateValueProfileCounters(LLVMProfileData *Data) +{ + ValueProfNode **Mem; + uint64 NumVSites = 0, total_size; + uint32 VKI; + + /* When dynamic allocation is enabled, allow tracking the max number of + values allowed. */ + if (!hasNonDefaultValsPerSite) + VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE; + + for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI) + NumVSites += Data->num_value_sites[VKI]; + + /* If NumVSites = 0, calloc is allowed to return a non-null pointer. */ + bh_assert(NumVSites > 0 && "NumVSites can't be zero"); + + total_size = (uint64)sizeof(ValueProfNode *) * NumVSites; + if (total_size > UINT32_MAX + || !(Mem = (ValueProfNode **)wasm_runtime_malloc((uint32)total_size))) { + return 0; + } + memset(Mem, 0, (uint32)total_size); + + if (!cmpxchg_ptr((void **)&Data->values, NULL, Mem)) { + wasm_runtime_free(Mem); + return 0; + } + return 1; +} + +static ValueProfNode * +allocateOneNode(void) +{ + ValueProfNode *Node; + + Node = wasm_runtime_malloc((uint32)sizeof(ValueProfNode)); + if (Node) + memset(Node, 0, sizeof(ValueProfNode)); + return Node; +} + +static void +instrumentTargetValueImpl(uint64 TargetValue, void *Data, uint32 CounterIndex, + uint64 CountValue) +{ + ValueProfNode **ValueCounters; + ValueProfNode *PrevVNode = NULL, *MinCountVNode = NULL, *CurVNode; + LLVMProfileData *PData = (LLVMProfileData *)Data; + uint64 MinCount = UINT64_MAX; + uint8 VDataCount = 0; + bool success = false; + + if (!PData) + return; + if (!CountValue) + return; + if (!PData->values) { + if (!allocateValueProfileCounters(PData)) + return; + } + 
+ ValueCounters = (ValueProfNode **)PData->values; + CurVNode = ValueCounters[CounterIndex]; + + while (CurVNode) { + if (TargetValue == CurVNode->value) { + CurVNode->count += CountValue; + return; + } + if (CurVNode->count < MinCount) { + MinCount = CurVNode->count; + MinCountVNode = CurVNode; + } + PrevVNode = CurVNode; + CurVNode = CurVNode->next; + ++VDataCount; + } + + if (VDataCount >= VPMaxNumValsPerSite) { + if (MinCountVNode->count <= CountValue) { + CurVNode = MinCountVNode; + CurVNode->value = TargetValue; + CurVNode->count = CountValue; + } + else + MinCountVNode->count -= CountValue; + + return; + } + + CurVNode = allocateOneNode(); + if (!CurVNode) + return; + CurVNode->value = TargetValue; + CurVNode->count += CountValue; + + if (!ValueCounters[CounterIndex]) { + success = + cmpxchg_ptr((void **)&ValueCounters[CounterIndex], NULL, CurVNode); + } + else if (PrevVNode && !PrevVNode->next) { + success = cmpxchg_ptr((void **)&PrevVNode->next, 0, CurVNode); + } + + if (!success) { + wasm_runtime_free(CurVNode); + } +} + +void +llvm_profile_instrument_target(uint64 target_value, void *data, + uint32 counter_idx) +{ + instrumentTargetValueImpl(target_value, data, counter_idx, 1); +} + +static inline uint32 +popcount64(uint64 u) +{ + uint32 ret = 0; + while (u) { + u = (u & (u - 1)); + ret++; + } + return ret; +} + +static inline uint32 +clz64(uint64 type) +{ + uint32 num = 0; + if (type == 0) + return 64; + while (!(type & 0x8000000000000000LL)) { + num++; + type <<= 1; + } + return num; +} + +/* Map an (observed) memop size value to the representative value of its range. + For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */ +static uint64 +InstrProfGetRangeRepValue(uint64 Value) +{ + if (Value <= 8) + /* The first ranges are individually tracked. Use the value as is. */ + return Value; + else if (Value >= 513) + /* The last range is mapped to its lowest value. 
*/ + return 513; + else if (popcount64(Value) == 1) + /* If it's a power of two, use it as is. */ + return Value; + else + /* Otherwise, take to the previous power of two + 1. */ + return (((uint64)1) << (64 - clz64(Value) - 1)) + 1; +} + +void +llvm_profile_instrument_memop(uint64 target_value, void *data, + uint32 counter_idx) +{ + uint64 rep_value = InstrProfGetRangeRepValue(target_value); + instrumentTargetValueImpl(rep_value, data, counter_idx, 1); +} + +static uint32 +get_pgo_prof_data_size(AOTModuleInstance *module_inst, uint32 *p_num_prof_data, + uint32 *p_num_prof_counters, uint32 *p_padding_size, + uint32 *p_prof_counters_size, uint32 *p_prof_names_size, + uint32 *p_value_counters_size, uint8 **p_prof_names) +{ + AOTModule *module = (AOTModule *)module_inst->module; + LLVMProfileData *prof_data; + uint8 *prof_names = NULL; + uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i; + uint32 prof_counters_size = 0, prof_names_size = 0; + uint32 total_size, total_size_wo_value_counters; + + for (i = 0; i < module->data_section_count; i++) { + if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) { + bh_assert(module->data_sections[i].size == sizeof(LLVMProfileData)); + num_prof_data++; + prof_data = (LLVMProfileData *)module->data_sections[i].data; + num_prof_counters += prof_data->num_counters; + } + else if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts", + 15)) { + prof_counters_size += module->data_sections[i].size; + } + else if (!strncmp(module->data_sections[i].name, "__llvm_prf_names", + 16)) { + prof_names_size = module->data_sections[i].size; + prof_names = module->data_sections[i].data; + } + } + + if (prof_counters_size != num_prof_counters * sizeof(uint64)) + return 0; + + total_size = sizeof(LLVMProfileRawHeader) + + num_prof_data * sizeof(LLVMProfileData_64) + + prof_counters_size + prof_names_size; + padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64)); + if (padding_size != sizeof(uint64)) + 
total_size += padding_size; + + /* Total size excluding value counters */ + total_size_wo_value_counters = total_size; + + for (i = 0; i < module->data_section_count; i++) { + if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) { + uint32 j, k, num_value_sites, num_value_nodes; + ValueProfNode **values, *value_node; + + prof_data = (LLVMProfileData *)module->data_sections[i].data; + values = prof_data->values; + + if (prof_data->num_value_sites[0] > 0 + || prof_data->num_value_sites[1] > 0) { + /* TotalSize (uint32) and NumValueKinds (uint32) */ + total_size += 8; + for (j = 0; j < 2; j++) { + if ((num_value_sites = prof_data->num_value_sites[j]) > 0) { + /* ValueKind (uint32) and NumValueSites (uint32) */ + total_size += 8; + /* (Value + Counter) group counts of each value site, + each count is one byte */ + total_size += align_uint(num_value_sites, 8); + + if (values) { + for (k = 0; k < num_value_sites; k++) { + num_value_nodes = 0; + value_node = *values; + while (value_node) { + num_value_nodes++; + value_node = value_node->next; + } + if (num_value_nodes) { + /* (Value + Counter) groups */ + total_size += num_value_nodes * 8 * 2; + } + values++; + } + } + } + } + } + } + } + + if (p_num_prof_data) + *p_num_prof_data = num_prof_data; + if (p_num_prof_counters) + *p_num_prof_counters = num_prof_counters; + if (p_padding_size) + *p_padding_size = padding_size; + if (p_prof_counters_size) + *p_prof_counters_size = prof_counters_size; + if (p_prof_names_size) + *p_prof_names_size = prof_names_size; + if (p_value_counters_size) + *p_value_counters_size = total_size - total_size_wo_value_counters; + if (p_prof_names) + *p_prof_names = prof_names; + + return total_size; +} + +uint32 +aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst) +{ + return get_pgo_prof_data_size(module_inst, NULL, NULL, NULL, NULL, NULL, + NULL, NULL); +} + +static union { + int a; + char b; +} __ue = { .a = 1 }; + +#define is_little_endian() (__ue.b == 1) + +uint32 
+aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf, + uint32 len) +{ + AOTModule *module = (AOTModule *)module_inst->module; + LLVMProfileRawHeader prof_header = { 0 }; + LLVMProfileData *prof_data; + uint8 *prof_names = NULL; + uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i; + uint32 prof_counters_size = 0, prof_names_size = 0; + uint32 value_counters_size = 0, value_counters_size_backup = 0; + uint32 total_size, size; + int64 counters_delta, offset_counters; + + total_size = get_pgo_prof_data_size(module_inst, &num_prof_data, + &num_prof_counters, &padding_size, + &prof_counters_size, &prof_names_size, + &value_counters_size, &prof_names); + if (total_size == 0 || len < total_size) /* 0: size query failed */ + return 0; + + value_counters_size_backup = value_counters_size; + value_counters_size = 0; + + prof_header.counters_delta = counters_delta = + sizeof(LLVMProfileData_64) * num_prof_data; + offset_counters = 0; + for (i = 0; i < module->data_section_count; i++) { + if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) { + prof_data = (LLVMProfileData *)module->data_sections[i].data; + prof_data->offset_counters = counters_delta + offset_counters; + offset_counters += prof_data->num_counters * sizeof(uint64); + counters_delta -= sizeof(LLVMProfileData_64); + } + } + + prof_header.magic = 0xFF6C70726F667281LL; + /* Version 8 */ + prof_header.version = 0x0000000000000008LL; + /* with VARIANT_MASK_IR_PROF (IR Instrumentation) */ + prof_header.version |= 0x1ULL << 56; + /* with VARIANT_MASK_MEMPROF (Memory Profile) */ + prof_header.version |= 0x1ULL << 62; + prof_header.num_prof_data = num_prof_data; + prof_header.num_prof_counters = num_prof_counters; + prof_header.names_size = prof_names_size; + prof_header.value_kind_last = 1; + + if (!is_little_endian()) { + aot_exchange_uint64((uint8 *)&prof_header.magic); + aot_exchange_uint64((uint8 *)&prof_header.version); + aot_exchange_uint64((uint8 *)&prof_header.num_prof_data); + aot_exchange_uint64((uint8 
*)&prof_header.num_prof_counters); + aot_exchange_uint64((uint8 *)&prof_header.names_size); + aot_exchange_uint64((uint8 *)&prof_header.counters_delta); + aot_exchange_uint64((uint8 *)&prof_header.value_kind_last); + } + + size = sizeof(LLVMProfileRawHeader); + bh_memcpy_s(buf, size, &prof_header, size); + buf += size; + + for (i = 0; i < module->data_section_count; i++) { + if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) { + LLVMProfileData_64 *prof_data_64 = (LLVMProfileData_64 *)buf; + + /* Convert LLVMProfileData to LLVMProfileData_64, the pointer width + in the output file is always 8 bytes */ + prof_data = (LLVMProfileData *)module->data_sections[i].data; + prof_data_64->func_md5 = prof_data->func_md5; + prof_data_64->func_hash = prof_data->func_hash; + prof_data_64->offset_counters = prof_data->offset_counters; + prof_data_64->func_ptr = prof_data->func_ptr; + prof_data_64->values = (uint64)(uintptr_t)prof_data->values; + prof_data_64->num_counters = prof_data->num_counters; + prof_data_64->num_value_sites[0] = prof_data->num_value_sites[0]; + prof_data_64->num_value_sites[1] = prof_data->num_value_sites[1]; + + if (!is_little_endian()) { + aot_exchange_uint64((uint8 *)&prof_data_64->func_md5); + aot_exchange_uint64((uint8 *)&prof_data_64->func_hash); + aot_exchange_uint64((uint8 *)&prof_data_64->offset_counters); + aot_exchange_uint64((uint8 *)&prof_data_64->func_ptr); + aot_exchange_uint64((uint8 *)&prof_data_64->values); + aot_exchange_uint32((uint8 *)&prof_data_64->num_counters); + aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[0]); + aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[1]); + } + buf += sizeof(LLVMProfileData_64); + } + } + + for (i = 0; i < module->data_section_count; i++) { + if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts", 15)) { + size = module->data_sections[i].size; + bh_memcpy_s(buf, size, module->data_sections[i].data, size); + buf += size; + } + } + + if (prof_names 
&& prof_names_size > 0) { + size = prof_names_size; + bh_memcpy_s(buf, size, prof_names, size); + buf += size; + padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64)); + if (padding_size != sizeof(uint64)) { + char padding_buf[8] = { 0 }; + bh_memcpy_s(buf, padding_size, padding_buf, padding_size); + buf += padding_size; + } + } + + for (i = 0; i < module->data_section_count; i++) { + if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) { + uint32 j, k, num_value_sites, num_value_nodes; + ValueProfNode **values, **values_tmp, *value_node; + + prof_data = (LLVMProfileData *)module->data_sections[i].data; + values = values_tmp = prof_data->values; + + if (prof_data->num_value_sites[0] > 0 + || prof_data->num_value_sites[1] > 0) { + uint32 *buf_total_size = (uint32 *)buf; + + buf += 4; /* emit TotalSize later */ + *(uint32 *)buf = (prof_data->num_value_sites[0] > 0 + && prof_data->num_value_sites[1] > 0) + ? 2 + : 1; + if (!is_little_endian()) + aot_exchange_uint32((uint8 *)buf); + buf += 4; + + for (j = 0; j < 2; j++) { + if ((num_value_sites = prof_data->num_value_sites[j]) > 0) { + /* ValueKind */ + *(uint32 *)buf = j; + if (!is_little_endian()) + aot_exchange_uint32((uint8 *)buf); + buf += 4; + /* NumValueSites */ + *(uint32 *)buf = num_value_sites; + if (!is_little_endian()) + aot_exchange_uint32((uint8 *)buf); + buf += 4; + + for (k = 0; k < num_value_sites; k++) { + num_value_nodes = 0; + if (values_tmp) { + value_node = *values_tmp; + while (value_node) { + num_value_nodes++; + value_node = value_node->next; + } + values_tmp++; + } + bh_assert(num_value_nodes <= 255); /* emitted as one byte */ + *(uint8 *)buf++ = (uint8)num_value_nodes; + } + if (num_value_sites % 8) { + buf += 8 - (num_value_sites % 8); + } + + for (k = 0; k < num_value_sites; k++) { + if (values) { + value_node = *values; + while (value_node) { + *(uint64 *)buf = value_node->value; + if (!is_little_endian()) + aot_exchange_uint64((uint8 *)buf); + buf += 8; + *(uint64 *)buf = 
value_node->count; + if (!is_little_endian()) + aot_exchange_uint64((uint8 *)buf); + buf += 8; + value_node = value_node->next; + } + values++; + } + } + } + } + + /* TotalSize */ + *(uint32 *)buf_total_size = + (uint8 *)buf - (uint8 *)buf_total_size; + if (!is_little_endian()) + aot_exchange_uint32((uint8 *)buf_total_size); /* uint32 field */ + value_counters_size += (uint8 *)buf - (uint8 *)buf_total_size; + } + } + } + + bh_assert(value_counters_size == value_counters_size_backup); + (void)value_counters_size_backup; + + return total_size; +} +#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */ diff --git a/core/iwasm/aot/aot_runtime.h b/core/iwasm/aot/aot_runtime.h index bcd06534e..2493d7c2c 100644 --- a/core/iwasm/aot/aot_runtime.h +++ b/core/iwasm/aot/aot_runtime.h @@ -41,6 +41,10 @@ typedef struct AOTObjectDataSection { char *name; uint8 *data; uint32 size; +#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0 + bool is_name_allocated; + bool is_data_allocated; +#endif } AOTObjectDataSection; /* Relocation info */ @@ -51,6 +55,9 @@ typedef struct AOTRelocation { char *symbol_name; /* index in the symbol offset field */ uint32 symbol_index; +#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0 + bool is_symbol_name_allocated; +#endif } AOTRelocation; /* Relocation Group */ @@ -60,6 +67,9 @@ typedef struct AOTRelocationGroup { uint32 name_index; uint32 relocation_count; AOTRelocation *relocations; +#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0 + bool is_section_name_allocated; +#endif } AOTRelocationGroup; /* AOT function instance */ @@ -108,6 +118,13 @@ typedef struct AOTUnwindInfo { #define PLT_ITEM_SIZE 12 #endif +#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64) +typedef struct GOTItem { + uint32 func_idx; + struct GOTItem *next; +} GOTItem, *GOTItemList; +#endif + typedef struct AOTModule { uint32 module_type; @@ -204,6 +221,13 @@ typedef struct AOTModule { bool rtl_func_table_registered; #endif +#if defined(BUILD_TARGET_X86_64) || 
defined(BUILD_TARGET_AMD_64) + uint32 got_item_count; + GOTItemList got_item_list; + GOTItemList got_item_list_end; + void **got_func_ptrs; +#endif + /* data sections in AOT object file, including .data, .rodata and .rodata.cstN. */ AOTObjectDataSection *data_sections; @@ -294,6 +318,54 @@ typedef struct AOTFrame { #endif } AOTFrame; +#if WASM_ENABLE_STATIC_PGO != 0 +typedef struct LLVMProfileRawHeader { + uint64 magic; + uint64 version; + uint64 binary_ids_size; + uint64 num_prof_data; + uint64 padding_bytes_before_counters; + uint64 num_prof_counters; + uint64 padding_bytes_after_counters; + uint64 names_size; + uint64 counters_delta; + uint64 names_delta; + uint64 value_kind_last; +} LLVMProfileRawHeader; + +typedef struct ValueProfNode { + uint64 value; + uint64 count; + struct ValueProfNode *next; +} ValueProfNode; + +/* The profiling data of data sections created by aot compiler and + used when profiling, the width of pointer can be 8 bytes (64-bit) + or 4 bytes (32-bit) */ +typedef struct LLVMProfileData { + uint64 func_md5; + uint64 func_hash; + uint64 offset_counters; + uintptr_t func_ptr; + ValueProfNode **values; + uint32 num_counters; + uint16 num_value_sites[2]; +} LLVMProfileData; + +/* The profiling data for writting to the output file, the width of + pointer is 8 bytes suppose we always use wamrc and llvm-profdata + with 64-bit mode */ +typedef struct LLVMProfileData_64 { + uint64 func_md5; + uint64 func_hash; + uint64 offset_counters; + uint64 func_ptr; + uint64 values; + uint32 num_counters; + uint16 num_value_sites[2]; +} LLVMProfileData_64; +#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */ + /** * Load a AOT module from aot file buffer * @param buf the byte buffer which contains the AOT file data @@ -564,6 +636,32 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst); const uint8 * aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len); +#if WASM_ENABLE_STATIC_PGO != 0 +void 
+llvm_profile_instrument_target(uint64 target_value, void *data, + uint32 counter_idx); + +void +llvm_profile_instrument_memop(uint64 target_value, void *data, + uint32 counter_idx); + +uint32 +aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst); + +uint32 +aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf, + uint32 len); + +void +aot_exchange_uint16(uint8 *p_data); + +void +aot_exchange_uint32(uint8 *p_data); + +void +aot_exchange_uint64(uint8 *p_data); +#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */ + #ifdef __cplusplus } /* end of extern "C" */ #endif diff --git a/core/iwasm/aot/arch/aot_reloc_x86_32.c b/core/iwasm/aot/arch/aot_reloc_x86_32.c index af3e0bb8e..5a49c14af 100644 --- a/core/iwasm/aot/arch/aot_reloc_x86_32.c +++ b/core/iwasm/aot/arch/aot_reloc_x86_32.c @@ -8,6 +8,9 @@ #define R_386_32 1 /* Direct 32 bit */ #define R_386_PC32 2 /* PC relative 32 bit */ #define R_386_PLT32 4 /* 32-bit address ProcedureLinkageTable */ +#define R_386_TLS_GD_32 \ + 24 /* Direct 32 bit for general dynamic \ + thread local data */ #if !defined(_WIN32) && !defined(_WIN32_) /* clang-format off */ @@ -110,6 +113,9 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr, { switch (reloc_type) { case R_386_32: +#if WASM_ENABLE_STATIC_PGO != 0 + case R_386_TLS_GD_32: +#endif { intptr_t value; diff --git a/core/iwasm/aot/arch/aot_reloc_x86_64.c b/core/iwasm/aot/arch/aot_reloc_x86_64.c index f4d8eeabd..1221a6297 100644 --- a/core/iwasm/aot/arch/aot_reloc_x86_64.c +++ b/core/iwasm/aot/arch/aot_reloc_x86_64.c @@ -6,11 +6,13 @@ #include "aot_reloc.h" #if !defined(BH_PLATFORM_WINDOWS) -#define R_X86_64_64 1 /* Direct 64 bit */ -#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ -#define R_X86_64_PLT32 4 /* 32 bit PLT address */ -#define R_X86_64_32 10 /* Direct 32 bit zero extended */ -#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ +#define R_X86_64_64 1 /* Direct 64 bit */ +#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ 
+#define R_X86_64_PLT32 4 /* 32 bit PLT address */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */ +#define R_X86_64_32 10 /* Direct 32 bit zero extended */ +#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ +#define R_X86_64_PC64 24 /* PC relative 64 bit */ #else #ifndef IMAGE_REL_AMD64_ADDR64 #define IMAGE_REL_AMD64_ADDR64 1 /* The 64-bit VA of the relocation target */ @@ -164,6 +166,7 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr, #endif #if !defined(BH_PLATFORM_WINDOWS) case R_X86_64_PC32: + case R_X86_64_GOTPCREL: /* GOT + G has been calculated as symbol_addr */ { intptr_t target_addr = (intptr_t) /* S + A - P */ ((uintptr_t)symbol_addr + reloc_addend @@ -182,6 +185,16 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr, *(int32 *)(target_section_addr + reloc_offset) = (int32)target_addr; break; } + case R_X86_64_PC64: + { + intptr_t target_addr = (intptr_t) /* S + A - P */ + ((uintptr_t)symbol_addr + reloc_addend + - (uintptr_t)(target_section_addr + reloc_offset)); + + CHECK_RELOC_OFFSET(sizeof(int64)); + *(int64 *)(target_section_addr + reloc_offset) = (int64)target_addr; + break; + } case R_X86_64_32: case R_X86_64_32S: { diff --git a/core/iwasm/common/wasm_runtime_common.c b/core/iwasm/common/wasm_runtime_common.c index 35bb9bce4..b2923db33 100644 --- a/core/iwasm/common/wasm_runtime_common.c +++ b/core/iwasm/common/wasm_runtime_common.c @@ -5033,6 +5033,33 @@ wasm_runtime_dump_call_stack_to_buf(wasm_exec_env_t exec_env, char *buf, } #endif /* end of WASM_ENABLE_DUMP_CALL_STACK */ +#if WASM_ENABLE_STATIC_PGO != 0 +uint32 +wasm_runtime_get_pgo_prof_data_size(WASMModuleInstanceCommon *module_inst) +{ +#if WASM_ENABLE_AOT != 0 + if (module_inst->module_type == Wasm_Module_AoT) { + AOTModuleInstance *aot_inst = (AOTModuleInstance *)module_inst; + return aot_get_pgo_prof_data_size(aot_inst); + } +#endif + return 0; +} + +uint32 +wasm_runtime_dump_pgo_prof_data_to_buf(WASMModuleInstanceCommon 
*module_inst, + char *buf, uint32 len) +{ +#if WASM_ENABLE_AOT != 0 + if (module_inst->module_type == Wasm_Module_AoT) { + AOTModuleInstance *aot_inst = (AOTModuleInstance *)module_inst; + return aot_dump_pgo_prof_data_to_buf(aot_inst, buf, len); + } +#endif + return 0; +} +#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */ + bool wasm_runtime_get_table_elem_type(const WASMModuleCommon *module_comm, uint32 table_idx, uint8 *out_elem_type, diff --git a/core/iwasm/compilation/aot_emit_aot_file.c b/core/iwasm/compilation/aot_emit_aot_file.c index 62bb809da..893e39918 100644 --- a/core/iwasm/compilation/aot_emit_aot_file.c +++ b/core/iwasm/compilation/aot_emit_aot_file.c @@ -111,6 +111,8 @@ typedef struct AOTSymbolList { /* AOT object data */ typedef struct AOTObjectData { + AOTCompContext *comp_ctx; + LLVMMemoryBufferRef mem_buf; LLVMBinaryRef binary; @@ -119,6 +121,12 @@ typedef struct AOTObjectData { void *text; uint32 text_size; + void *text_unlikely; + uint32 text_unlikely_size; + + void *text_hot; + uint32 text_hot_size; + /* literal data and size */ void *literal; uint32 literal_size; @@ -558,8 +566,10 @@ get_init_data_section_size(AOTCompContext *comp_ctx, AOTCompData *comp_data, static uint32 get_text_section_size(AOTObjectData *obj_data) { - return (sizeof(uint32) + obj_data->literal_size + obj_data->text_size + 3) - & ~3; + return sizeof(uint32) + align_uint(obj_data->literal_size, 4) + + align_uint(obj_data->text_size, 4) + + align_uint(obj_data->text_unlikely_size, 4) + + align_uint(obj_data->text_hot_size, 4); } static uint32 @@ -1702,12 +1712,28 @@ aot_emit_text_section(uint8 *buf, uint8 *buf_end, uint32 *p_offset, EMIT_U32(AOT_SECTION_TYPE_TEXT); EMIT_U32(section_size); EMIT_U32(obj_data->literal_size); - if (obj_data->literal_size > 0) - EMIT_BUF(obj_data->literal, obj_data->literal_size); - EMIT_BUF(obj_data->text, obj_data->text_size); - while (offset & 3) - EMIT_BUF(&placeholder, 1); + if (obj_data->literal_size > 0) { + EMIT_BUF(obj_data->literal, 
obj_data->literal_size); + while (offset & 3) + EMIT_BUF(&placeholder, 1); + } + + if (obj_data->text_size > 0) { + EMIT_BUF(obj_data->text, obj_data->text_size); + while (offset & 3) + EMIT_BUF(&placeholder, 1); + } + if (obj_data->text_unlikely_size > 0) { + EMIT_BUF(obj_data->text_unlikely, obj_data->text_unlikely_size); + while (offset & 3) + EMIT_BUF(&placeholder, 1); + } + if (obj_data->text_hot_size > 0) { + EMIT_BUF(obj_data->text_hot, obj_data->text_hot_size); + while (offset & 3) + EMIT_BUF(&placeholder, 1); + } if (offset - *p_offset != section_size + sizeof(uint32) * 2) { aot_set_last_error("emit text section failed."); @@ -2211,11 +2237,23 @@ aot_resolve_text(AOTObjectData *obj_data) } while ( !LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) { - if ((name = (char *)LLVMGetSectionName(sec_itr)) - && !strcmp(name, ".text")) { - obj_data->text = (char *)LLVMGetSectionContents(sec_itr); - obj_data->text_size = (uint32)LLVMGetSectionSize(sec_itr); - break; + if ((name = (char *)LLVMGetSectionName(sec_itr))) { + if (!strcmp(name, ".text")) { + obj_data->text = (char *)LLVMGetSectionContents(sec_itr); + obj_data->text_size = (uint32)LLVMGetSectionSize(sec_itr); + } + else if (!strcmp(name, ".text.unlikely.")) { + obj_data->text_unlikely = + (char *)LLVMGetSectionContents(sec_itr); + obj_data->text_unlikely_size = + (uint32)LLVMGetSectionSize(sec_itr); + } + else if (!strcmp(name, ".text.hot.")) { + obj_data->text_hot = + (char *)LLVMGetSectionContents(sec_itr); + obj_data->text_hot_size = + (uint32)LLVMGetSectionSize(sec_itr); + } } LLVMMoveToNextSection(sec_itr); } @@ -2253,7 +2291,8 @@ static bool get_relocations_count(LLVMSectionIteratorRef sec_itr, uint32 *p_count); static bool -is_data_section(LLVMSectionIteratorRef sec_itr, char *section_name) +is_data_section(AOTObjectData *obj_data, LLVMSectionIteratorRef sec_itr, + char *section_name) { uint32 relocation_count = 0; @@ -2265,7 +2304,11 @@ is_data_section(LLVMSectionIteratorRef 
sec_itr, char *section_name) || !strncmp(section_name, ".rodata.str", strlen(".rodata.str")) || (!strcmp(section_name, ".rdata") && get_relocations_count(sec_itr, &relocation_count) - && relocation_count > 0)); + && relocation_count > 0) + || (obj_data->comp_ctx->enable_llvm_pgo + && (!strncmp(section_name, "__llvm_prf_cnts", 15) + || !strncmp(section_name, "__llvm_prf_data", 15) + || !strncmp(section_name, "__llvm_prf_names", 16)))); } static bool @@ -2281,7 +2324,7 @@ get_object_data_sections_count(AOTObjectData *obj_data, uint32 *p_count) } while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) { if ((name = (char *)LLVMGetSectionName(sec_itr)) - && (is_data_section(sec_itr, name))) { + && (is_data_section(obj_data, sec_itr, name))) { count++; } LLVMMoveToNextSection(sec_itr); @@ -2306,6 +2349,9 @@ aot_resolve_object_data_sections(AOTObjectData *obj_data) } if (sections_count > 0) { + uint32 llvm_prf_cnts_idx = 0, llvm_prf_data_idx = 0; + char buf[32]; + size = (uint32)sizeof(AOTObjectDataSection) * sections_count; if (!(data_section = obj_data->data_sections = wasm_runtime_malloc(size))) { @@ -2322,10 +2368,46 @@ aot_resolve_object_data_sections(AOTObjectData *obj_data) while ( !LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) { if ((name = (char *)LLVMGetSectionName(sec_itr)) - && (is_data_section(sec_itr, name))) { + && (is_data_section(obj_data, sec_itr, name))) { data_section->name = name; - data_section->data = (uint8 *)LLVMGetSectionContents(sec_itr); - data_section->size = (uint32)LLVMGetSectionSize(sec_itr); + if (obj_data->comp_ctx->enable_llvm_pgo + && !strcmp(name, "__llvm_prf_cnts")) { + snprintf(buf, sizeof(buf), "%s%u", name, + llvm_prf_cnts_idx++); + size = strlen(buf) + 1; + if (!(data_section->name = wasm_runtime_malloc(size))) { + aot_set_last_error( + "allocate memory for data section name failed."); + return false; + } + bh_memcpy_s(data_section->name, size, buf, size); + data_section->is_name_allocated = 
true; + } + else if (obj_data->comp_ctx->enable_llvm_pgo + && !strcmp(name, "__llvm_prf_data")) { + snprintf(buf, sizeof(buf), "%s%u", name, + llvm_prf_data_idx++); + size = strlen(buf) + 1; + if (!(data_section->name = wasm_runtime_malloc(size))) { + aot_set_last_error( + "allocate memory for data section name failed."); + return false; + } + bh_memcpy_s(data_section->name, size, buf, size); + data_section->is_name_allocated = true; + } + + if (obj_data->comp_ctx->enable_llvm_pgo + && !strcmp(name, "__llvm_prf_names")) { + data_section->data = (uint8 *)aot_compress_aot_func_names( + obj_data->comp_ctx, &data_section->size); + data_section->is_data_allocated = true; + } + else { + data_section->data = + (uint8 *)LLVMGetSectionContents(sec_itr); + data_section->size = (uint32)LLVMGetSectionSize(sec_itr); + } data_section++; } LLVMMoveToNextSection(sec_itr); @@ -2365,9 +2447,36 @@ aot_resolve_functions(AOTCompContext *comp_ctx, AOTObjectData *obj_data) && str_starts_with(name, prefix)) { func_index = (uint32)atoi(name + strlen(prefix)); if (func_index < obj_data->func_count) { + LLVMSectionIteratorRef contain_section; + char *contain_section_name; + func = obj_data->funcs + func_index; func->func_name = name; - func->text_offset = LLVMGetSymbolAddress(sym_itr); + + if (!(contain_section = LLVMObjectFileCopySectionIterator( + obj_data->binary))) { + aot_set_last_error("llvm get section iterator failed."); + LLVMDisposeSymbolIterator(sym_itr); + return false; + } + LLVMMoveToContainingSection(contain_section, sym_itr); + contain_section_name = + (char *)LLVMGetSectionName(contain_section); + LLVMDisposeSectionIterator(contain_section); + + if (!strcmp(contain_section_name, ".text.unlikely.")) { + func->text_offset = align_uint(obj_data->text_size, 4) + + LLVMGetSymbolAddress(sym_itr); + } + else if (!strcmp(contain_section_name, ".text.hot.")) { + func->text_offset = + align_uint(obj_data->text_size, 4) + + align_uint(obj_data->text_unlikely_size, 4) + + 
LLVMGetSymbolAddress(sym_itr); + } + else { + func->text_offset = LLVMGetSymbolAddress(sym_itr); + } } } LLVMMoveToNextSymbol(sym_itr); @@ -2478,9 +2587,86 @@ aot_resolve_object_relocation_group(AOTObjectData *obj_data, } /* set relocation fields */ - relocation->relocation_offset = offset; relocation->relocation_type = (uint32)type; relocation->symbol_name = (char *)LLVMGetSymbolName(rel_sym); + relocation->relocation_offset = offset; + if (!strcmp(group->section_name, ".rela.text.unlikely.") + || !strcmp(group->section_name, ".rel.text.unlikely.")) { + relocation->relocation_offset += align_uint(obj_data->text_size, 4); + } + else if (!strcmp(group->section_name, ".rela.text.hot.") + || !strcmp(group->section_name, ".rel.text.hot.")) { + relocation->relocation_offset += + align_uint(obj_data->text_size, 4) + + align_uint(obj_data->text_unlikely_size, 4); + } + if (!strcmp(relocation->symbol_name, ".text.unlikely.")) { + relocation->symbol_name = ".text"; + relocation->relocation_addend += align_uint(obj_data->text_size, 4); + } + if (!strcmp(relocation->symbol_name, ".text.hot.")) { + relocation->symbol_name = ".text"; + relocation->relocation_addend += + align_uint(obj_data->text_size, 4) + + align_uint(obj_data->text_unlikely_size, 4); + } + + if (obj_data->comp_ctx->enable_llvm_pgo + && (!strcmp(relocation->symbol_name, "__llvm_prf_cnts") + || !strcmp(relocation->symbol_name, "__llvm_prf_data"))) { + LLVMSectionIteratorRef sec_itr; + char buf[32], *section_name; + uint32 prof_section_idx = 0; + + if (!(sec_itr = + LLVMObjectFileCopySectionIterator(obj_data->binary))) { + aot_set_last_error("llvm get section iterator failed."); + LLVMDisposeSymbolIterator(rel_sym); + goto fail; + } + while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, + sec_itr)) { + section_name = (char *)LLVMGetSectionName(sec_itr); + if (section_name + && !strcmp(section_name, relocation->symbol_name)) { + if (LLVMGetSectionContainsSymbol(sec_itr, rel_sym)) + break; + 
prof_section_idx++; + } + LLVMMoveToNextSection(sec_itr); + } + LLVMDisposeSectionIterator(sec_itr); + + if (!strcmp(group->section_name, ".rela.text") + || !strcmp(group->section_name, ".rel.text")) { + snprintf(buf, sizeof(buf), "%s%u", relocation->symbol_name, + prof_section_idx); + size = strlen(buf) + 1; + if (!(relocation->symbol_name = wasm_runtime_malloc(size))) { + aot_set_last_error( + "allocate memory for relocation symbol name failed."); + LLVMDisposeSymbolIterator(rel_sym); + goto fail; + } + bh_memcpy_s(relocation->symbol_name, size, buf, size); + relocation->is_symbol_name_allocated = true; + } + else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20) + || !strncmp(group->section_name, ".rel__llvm_prf_data", + 19)) { + snprintf(buf, sizeof(buf), "%s%u", relocation->symbol_name, + prof_section_idx); + size = strlen(buf) + 1; + if (!(relocation->symbol_name = wasm_runtime_malloc(size))) { + aot_set_last_error( + "allocate memory for relocation symbol name failed."); + LLVMDisposeSymbolIterator(rel_sym); + goto fail; + } + bh_memcpy_s(relocation->symbol_name, size, buf, size); + relocation->is_symbol_name_allocated = true; + } + } /* for ".LCPIxxx", ".LJTIxxx", ".LBBxxx" and switch lookup table * relocation, transform the symbol name to real section name and set @@ -2525,10 +2711,14 @@ fail: } static bool -is_relocation_section_name(char *section_name) +is_relocation_section_name(AOTObjectData *obj_data, char *section_name) { return (!strcmp(section_name, ".rela.text") || !strcmp(section_name, ".rel.text") + || !strcmp(section_name, ".rela.text.unlikely.") + || !strcmp(section_name, ".rel.text.unlikely.") + || !strcmp(section_name, ".rela.text.hot.") + || !strcmp(section_name, ".rel.text.hot.") || !strcmp(section_name, ".rela.literal") || !strcmp(section_name, ".rela.data") || !strcmp(section_name, ".rel.data") @@ -2536,6 +2726,9 @@ is_relocation_section_name(char *section_name) || !strcmp(section_name, ".rel.sdata") || !strcmp(section_name, 
".rela.rodata") || !strcmp(section_name, ".rel.rodata") + || (obj_data->comp_ctx->enable_llvm_pgo + && (!strcmp(section_name, ".rela__llvm_prf_data") + || !strcmp(section_name, ".rel__llvm_prf_data"))) /* ".rela.rodata.cst4/8/16/.." */ || !strncmp(section_name, ".rela.rodata.cst", strlen(".rela.rodata.cst")) @@ -2545,14 +2738,15 @@ is_relocation_section_name(char *section_name) } static bool -is_relocation_section(LLVMSectionIteratorRef sec_itr) +is_relocation_section(AOTObjectData *obj_data, LLVMSectionIteratorRef sec_itr) { uint32 count = 0; char *name = (char *)LLVMGetSectionName(sec_itr); if (name) { - if (is_relocation_section_name(name)) + if (is_relocation_section_name(obj_data, name)) return true; - else if ((!strcmp(name, ".text") || !strcmp(name, ".rdata")) + else if ((!strcmp(name, ".text") || !strcmp(name, ".text.unlikely.") + || !strcmp(name, ".text.hot.") || !strcmp(name, ".rdata")) && get_relocations_count(sec_itr, &count) && count > 0) return true; } @@ -2570,7 +2764,7 @@ get_relocation_groups_count(AOTObjectData *obj_data, uint32 *p_count) return false; } while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) { - if (is_relocation_section(sec_itr)) { + if (is_relocation_section(obj_data, sec_itr)) { count++; } LLVMMoveToNextSection(sec_itr); @@ -2586,7 +2780,7 @@ aot_resolve_object_relocation_groups(AOTObjectData *obj_data) { LLVMSectionIteratorRef sec_itr; AOTRelocationGroup *relocation_group; - uint32 group_count; + uint32 group_count, llvm_prf_data_idx = 0; char *name; uint32 size; @@ -2612,14 +2806,50 @@ aot_resolve_object_relocation_groups(AOTObjectData *obj_data) return false; } while (!LLVMObjectFileIsSectionIteratorAtEnd(obj_data->binary, sec_itr)) { - if (is_relocation_section(sec_itr)) { + if (is_relocation_section(obj_data, sec_itr)) { name = (char *)LLVMGetSectionName(sec_itr); relocation_group->section_name = name; + + if (obj_data->comp_ctx->enable_llvm_pgo + && (!strcmp(name, ".rela__llvm_prf_data") + || 
!strcmp(name, ".rel__llvm_prf_data"))) { + char buf[32]; + snprintf(buf, sizeof(buf), "%s%u", name, llvm_prf_data_idx); + size = strlen(buf) + 1; + if (!(relocation_group->section_name = + wasm_runtime_malloc(size))) { + aot_set_last_error( + "allocate memory for section name failed."); + LLVMDisposeSectionIterator(sec_itr); + return false; + } + bh_memcpy_s(relocation_group->section_name, size, buf, size); + relocation_group->is_section_name_allocated = true; + } + if (!aot_resolve_object_relocation_group(obj_data, relocation_group, sec_itr)) { LLVMDisposeSectionIterator(sec_itr); return false; } + + if (obj_data->comp_ctx->enable_llvm_pgo + && (!strcmp(name, ".rela__llvm_prf_data") + || !strcmp(name, ".rel__llvm_prf_data"))) { + llvm_prf_data_idx++; + } + + if (!strcmp(relocation_group->section_name, ".rela.text.unlikely.") + || !strcmp(relocation_group->section_name, ".rela.text.hot.")) { + relocation_group->section_name = ".rela.text"; + } + else if (!strcmp(relocation_group->section_name, + ".rel.text.unlikely.") + || !strcmp(relocation_group->section_name, + ".rel.text.hot.")) { + relocation_group->section_name = ".rel.text"; + } + relocation_group++; } LLVMMoveToNextSection(sec_itr); @@ -2633,12 +2863,21 @@ static void destroy_relocation_groups(AOTRelocationGroup *relocation_groups, uint32 relocation_group_count) { - uint32 i; + uint32 i, j; AOTRelocationGroup *relocation_group = relocation_groups; - for (i = 0; i < relocation_group_count; i++, relocation_group++) - if (relocation_group->relocations) + for (i = 0; i < relocation_group_count; i++, relocation_group++) { + if (relocation_group->relocations) { + for (j = 0; j < relocation_group->relocation_count; j++) { + if (relocation_group->relocations[j].is_symbol_name_allocated) + wasm_runtime_free( + relocation_group->relocations[j].symbol_name); + } wasm_runtime_free(relocation_group->relocations); + } + if (relocation_group->is_section_name_allocated) + wasm_runtime_free(relocation_group->section_name); 
+ } wasm_runtime_free(relocation_groups); } @@ -2664,8 +2903,20 @@ aot_obj_data_destroy(AOTObjectData *obj_data) LLVMDisposeMemoryBuffer(obj_data->mem_buf); if (obj_data->funcs) wasm_runtime_free(obj_data->funcs); - if (obj_data->data_sections) + if (obj_data->data_sections) { + uint32 i; + for (i = 0; i < obj_data->data_sections_count; i++) { + if (obj_data->data_sections[i].name + && obj_data->data_sections[i].is_name_allocated) { + wasm_runtime_free(obj_data->data_sections[i].name); + } + if (obj_data->data_sections[i].data + && obj_data->data_sections[i].is_data_allocated) { + wasm_runtime_free(obj_data->data_sections[i].data); + } + } wasm_runtime_free(obj_data->data_sections); + } if (obj_data->relocation_groups) destroy_relocation_groups(obj_data->relocation_groups, obj_data->relocation_group_count); @@ -2688,6 +2939,7 @@ aot_obj_data_create(AOTCompContext *comp_ctx) return false; } memset(obj_data, 0, sizeof(AOTObjectData)); + obj_data->comp_ctx = comp_ctx; bh_print_time("Begin to emit object file"); if (comp_ctx->external_llc_compiler || comp_ctx->external_asm_compiler) { @@ -2821,8 +3073,8 @@ aot_obj_data_create(AOTCompContext *comp_ctx) if (!aot_resolve_target_info(comp_ctx, obj_data) || !aot_resolve_text(obj_data) || !aot_resolve_literal(obj_data) || !aot_resolve_object_data_sections(obj_data) - || !aot_resolve_object_relocation_groups(obj_data) - || !aot_resolve_functions(comp_ctx, obj_data)) + || !aot_resolve_functions(comp_ctx, obj_data) + || !aot_resolve_object_relocation_groups(obj_data)) goto fail; return obj_data; diff --git a/core/iwasm/compilation/aot_llvm.c b/core/iwasm/compilation/aot_llvm.c index e398affcb..c2b3be3ad 100644 --- a/core/iwasm/compilation/aot_llvm.c +++ b/core/iwasm/compilation/aot_llvm.c @@ -1670,6 +1670,12 @@ aot_create_comp_context(const AOTCompData *comp_data, aot_comp_option_t option) if (option->disable_llvm_lto) comp_ctx->disable_llvm_lto = true; + if (option->enable_llvm_pgo) + comp_ctx->enable_llvm_pgo = true; + + if 
(option->use_prof_file) + comp_ctx->use_prof_file = option->use_prof_file; + if (option->enable_stack_estimation) comp_ctx->enable_stack_estimation = true; @@ -2829,3 +2835,23 @@ aot_load_const_from_table(AOTCompContext *comp_ctx, LLVMValueRef base, (void)const_type; return const_value; } + +bool +aot_set_cond_br_weights(AOTCompContext *comp_ctx, LLVMValueRef cond_br, + int32 weights_true, int32 weights_false) +{ + LLVMMetadataRef md_nodes[3], meta_data; + LLVMValueRef meta_data_as_value; + + md_nodes[0] = LLVMMDStringInContext2(comp_ctx->context, "branch_weights", + strlen("branch_weights")); + md_nodes[1] = LLVMValueAsMetadata(I32_CONST(weights_true)); + md_nodes[2] = LLVMValueAsMetadata(I32_CONST(weights_false)); + + meta_data = LLVMMDNodeInContext2(comp_ctx->context, md_nodes, 3); + meta_data_as_value = LLVMMetadataAsValue(comp_ctx->context, meta_data); + + LLVMSetMetadata(cond_br, 2, meta_data_as_value); + + return true; +} diff --git a/core/iwasm/compilation/aot_llvm.h b/core/iwasm/compilation/aot_llvm.h index 8acaa80d5..907a18ac0 100644 --- a/core/iwasm/compilation/aot_llvm.h +++ b/core/iwasm/compilation/aot_llvm.h @@ -349,6 +349,12 @@ typedef struct AOTCompContext { /* Disable LLVM link time optimization */ bool disable_llvm_lto; + /* Enable LLVM PGO (Profile-Guided Optimization) */ + bool enable_llvm_pgo; + + /* Use profile file collected by LLVM PGO */ + char *use_prof_file; + /* Enable to use segument register as the base addr of linear memory for load/store operations */ bool enable_segue_i32_load; @@ -428,7 +434,9 @@ typedef struct AOTCompOption { bool enable_aux_stack_frame; bool disable_llvm_intrinsics; bool disable_llvm_lto; + bool enable_llvm_pgo; bool enable_stack_estimation; + char *use_prof_file; uint32 opt_level; uint32 size_level; uint32 output_format; @@ -541,6 +549,13 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module); void aot_handle_llvm_errmsg(const char *string, LLVMErrorRef err); +char * 
+aot_compress_aot_func_names(AOTCompContext *comp_ctx, uint32 *p_size); + +bool +aot_set_cond_br_weights(AOTCompContext *comp_ctx, LLVMValueRef cond_br, + int32 weights_true, int32 weights_false); + #ifdef __cplusplus } /* end of extern "C" */ #endif diff --git a/core/iwasm/compilation/aot_llvm_extra.cpp b/core/iwasm/compilation/aot_llvm_extra.cpp index 9b77f5e6a..492cf3dcf 100644 --- a/core/iwasm/compilation/aot_llvm_extra.cpp +++ b/core/iwasm/compilation/aot_llvm_extra.cpp @@ -44,6 +44,7 @@ #if LLVM_VERSION_MAJOR >= 12 #include #endif +#include #include #include "../aot/aot_runtime.h" @@ -232,14 +233,26 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module) PTO.SLPVectorization = true; PTO.LoopUnrolling = true; + Optional PGO = None; + if (comp_ctx->enable_llvm_pgo) { + /* Disable static counter allocation for value profiler, + it will be allocated by runtime */ + const char *argv[] = { "", "-vp-static-alloc=false" }; + cl::ParseCommandLineOptions(2, argv); + PGO = PGOOptions("", "", "", PGOOptions::IRInstr); + } + else if (comp_ctx->use_prof_file) { + PGO = PGOOptions(comp_ctx->use_prof_file, "", "", PGOOptions::IRUse); + } + #ifdef DEBUG_PASS PassInstrumentationCallbacks PIC; - PassBuilder PB(TM, PTO, None, &PIC); + PassBuilder PB(TM, PTO, PGO, &PIC); #else #if LLVM_VERSION_MAJOR == 12 - PassBuilder PB(false, TM, PTO); + PassBuilder PB(false, TM, PTO, PGO); #else - PassBuilder PB(TM, PTO); + PassBuilder PB(TM, PTO, PGO); #endif #endif @@ -334,8 +347,16 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module) FPM.addPass(SLPVectorizerPass()); FPM.addPass(LoadStoreVectorizerPass()); + if (comp_ctx->enable_llvm_pgo || comp_ctx->use_prof_file) { + LICMOptions licm_opt; + /* LICM pass: loop invariant code motion, attempting to remove + as much code from the body of a loop as possible. Experiments + show it is good to enable it when pgo is enabled. 
*/ + FPM.addPass( + createFunctionToLoopPassAdaptor(LICMPass(licm_opt), true)); + } + /* - FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass())); FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass())); FPM.addPass(createFunctionToLoopPassAdaptor(SimpleLoopUnswitchPass())); */ @@ -344,9 +365,10 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module) if (!disable_llvm_lto) { /* Apply LTO for AOT mode */ - if (comp_ctx->comp_data->func_count >= 10) - /* Adds the pre-link optimizations if the func count - is large enough */ + if (comp_ctx->comp_data->func_count >= 10 + || comp_ctx->enable_llvm_pgo || comp_ctx->use_prof_file) + /* Add the pre-link optimizations if the func count + is large enough or PGO is enabled */ MPM.addPass(PB.buildLTOPreLinkDefaultPipeline(OL)); else MPM.addPass(PB.buildLTODefaultPipeline(OL, NULL)); @@ -358,3 +380,34 @@ aot_apply_llvm_new_pass_manager(AOTCompContext *comp_ctx, LLVMModuleRef module) MPM.run(*M, MAM); } + +char * +aot_compress_aot_func_names(AOTCompContext *comp_ctx, uint32 *p_size) +{ + std::vector NameStrs; + std::string Result; + char buf[32], *compressed_str; + uint32 compressed_str_len, i; + + for (i = 0; i < comp_ctx->func_ctx_count; i++) { + snprintf(buf, sizeof(buf), "%s%d", AOT_FUNC_PREFIX, i); + std::string str(buf); + NameStrs.push_back(str); + } + + if (collectPGOFuncNameStrings(NameStrs, true, Result)) { + aot_set_last_error("collect pgo func name strings failed"); + return NULL; + } + + compressed_str_len = Result.size(); + if (!(compressed_str = (char *)wasm_runtime_malloc(compressed_str_len))) { + aot_set_last_error("allocate memory failed"); + return NULL; + } + + bh_memcpy_s(compressed_str, compressed_str_len, Result.c_str(), + compressed_str_len); + *p_size = compressed_str_len; + return compressed_str; +} diff --git a/core/iwasm/include/aot_export.h b/core/iwasm/include/aot_export.h index fef5356ca..dca26aa6e 100644 --- a/core/iwasm/include/aot_export.h +++ 
b/core/iwasm/include/aot_export.h @@ -55,7 +55,9 @@ typedef struct AOTCompOption { bool enable_aux_stack_frame; bool disable_llvm_intrinsics; bool disable_llvm_lto; + bool enable_llvm_pgo; bool enable_stack_estimation; + char *use_prof_file; uint32_t opt_level; uint32_t size_level; uint32_t output_format; diff --git a/core/iwasm/include/wasm_export.h b/core/iwasm/include/wasm_export.h index 6cdbd2ab2..28b952e8f 100644 --- a/core/iwasm/include/wasm_export.h +++ b/core/iwasm/include/wasm_export.h @@ -1331,6 +1331,30 @@ WASM_RUNTIME_API_EXTERN uint32_t wasm_runtime_dump_call_stack_to_buf(wasm_exec_env_t exec_env, char *buf, uint32_t len); +/** + * Get the size required to store the LLVM PGO profile data + * + * @param module_inst the WASM module instance + * + * @return size required to store the contents, 0 means error + */ +WASM_RUNTIME_API_EXTERN uint32_t +wasm_runtime_get_pgo_prof_data_size(wasm_module_inst_t module_inst); + +/** + * Dump the LLVM PGO profile data to buffer + * + * @param module_inst the WASM module instance + * @param buf buffer to store the dumped content + * @param len length of the buffer + * + * @return bytes dumped to the buffer, 0 means error and data in buf + * may be invalid + */ +WASM_RUNTIME_API_EXTERN uint32_t +wasm_runtime_dump_pgo_prof_data_to_buf(wasm_module_inst_t module_inst, + char *buf, uint32_t len); + /** * Get a custom section by name * diff --git a/core/shared/platform/include/platform_api_extension.h b/core/shared/platform/include/platform_api_extension.h index 94fe16ea3..7029bb8d7 100644 --- a/core/shared/platform/include/platform_api_extension.h +++ b/core/shared/platform/include/platform_api_extension.h @@ -130,6 +130,7 @@ os_thread_exit(void *retval); #define os_memory_order_release memory_order_release #define os_memory_order_seq_cst memory_order_seq_cst #define os_atomic_thread_fence atomic_thread_fence +#define os_atomic_cmpxchg atomic_compare_exchange_strong #endif #endif /* end of os_atomic_thread_fence */ diff 
--git a/product-mini/platforms/posix/main.c b/product-mini/platforms/posix/main.c index d8ea62220..752d235e6 100644 --- a/product-mini/platforms/posix/main.c +++ b/product-mini/platforms/posix/main.c @@ -97,6 +97,9 @@ print_help() #if WASM_ENABLE_DEBUG_INTERP != 0 printf(" -g=ip:port Set the debug sever address, default is debug disabled\n"); printf(" if port is 0, then a random port will be used\n"); +#endif +#if WASM_ENABLE_STATIC_PGO != 0 + printf(" --gen-prof-file= Generate LLVM PGO (Profile-Guided Optimization) profile file\n"); #endif printf(" --version Show version information\n"); return 1; @@ -413,6 +416,44 @@ moudle_destroyer(uint8 *buffer, uint32 size) static char global_heap_buf[WASM_GLOBAL_HEAP_SIZE] = { 0 }; #endif +#if WASM_ENABLE_STATIC_PGO != 0 +static void +dump_pgo_prof_data(wasm_module_inst_t module_inst, const char *path) +{ + char *buf; + uint32 len; + FILE *file; + + if (!(len = wasm_runtime_get_pgo_prof_data_size(module_inst))) { + printf("failed to get LLVM PGO profile data size\n"); + return; + } + + if (!(buf = wasm_runtime_malloc(len))) { + printf("allocate memory failed\n"); + return; + } + + if (len != wasm_runtime_dump_pgo_prof_data_to_buf(module_inst, buf, len)) { + printf("failed to dump LLVM PGO profile data\n"); + wasm_runtime_free(buf); + return; + } + + if (!(file = fopen(path, "wb"))) { + printf("failed to create file %s", path); + wasm_runtime_free(buf); + return; + } + fwrite(buf, len, 1, file); + fclose(file); + + wasm_runtime_free(buf); + + printf("LLVM raw profile file %s was generated.\n", path); +} +#endif + int main(int argc, char *argv[]) { @@ -460,6 +501,9 @@ main(int argc, char *argv[]) char *ip_addr = NULL; int instance_port = 0; #endif +#if WASM_ENABLE_STATIC_PGO != 0 + const char *gen_prof_file = NULL; +#endif /* Process options. 
*/ for (argc--, argv++; argc > 0 && argv[0][0] == '-'; argc--, argv++) { @@ -663,6 +707,13 @@ main(int argc, char *argv[]) return print_help(); ip_addr = argv[0] + 3; } +#endif +#if WASM_ENABLE_STATIC_PGO != 0 + else if (!strncmp(argv[0], "--gen-prof-file=", 16)) { + if (argv[0][16] == '\0') + return print_help(); + gen_prof_file = argv[0] + 16; + } #endif else if (!strncmp(argv[0], "--version", 9)) { uint32 major, minor, patch; @@ -826,6 +877,12 @@ main(int argc, char *argv[]) } #endif +#if WASM_ENABLE_STATIC_PGO != 0 && WASM_ENABLE_AOT != 0 + if (get_package_type(wasm_file_buf, wasm_file_size) == Wasm_Module_AoT + && gen_prof_file) + dump_pgo_prof_data(wasm_module_inst, gen_prof_file); +#endif + #if WASM_ENABLE_DEBUG_INTERP != 0 fail4: #endif diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md new file mode 100644 index 000000000..2112829e0 --- /dev/null +++ b/tests/benchmarks/README.md @@ -0,0 +1,62 @@ +# WAMR test benchmarks + +This folder contains test benchmarks for wamr. + +## Build and Run + +Refer to the `README.md` under each folder for how to build and run the benchmark. + +## Install `llvm-profdata` + +The tool `llvm-profdata` is used when running the `test_pgo.sh` script under the benchmark folder. There are two ways to install it: + +1. Refer to https://apt.llvm.org/, e.g. in Ubuntu 20.04, add lines below to /etc/apt/sources.list + +```bash +deb http://apt.llvm.org/focal/ llvm-toolchain-focal main +deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal main +# 15 +deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main +deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main +``` + +Then run `sudo apt update`, `sudo apt install llvm`. And after installing: + +```bash +cd /usr/bin +sudo ln -s llvm-profdata-15 llvm-profdata +``` + +2.
Build manually + +```bash +git clone --depth 1 --branch release/15.x https://github.com/llvm/llvm-project.git +cd llvm-project +mkdir build && cd build +cmake ../llvm \ + -DCMAKE_BUILD_TYPE:STRING="Release" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DLLVM_APPEND_VC_REV:BOOL=ON \ + -DLLVM_BUILD_EXAMPLES:BOOL=OFF \ + -DLLVM_BUILD_LLVM_DYLIB:BOOL=OFF \ + -DLLVM_BUILD_TESTS:BOOL=OFF \ + -DLLVM_CCACHE_BUILD:BOOL=ON \ + -DLLVM_ENABLE_BINDINGS:BOOL=OFF \ + -DLLVM_ENABLE_IDE:BOOL=OFF \ + -DLLVM_ENABLE_LIBEDIT=OFF \ + -DLLVM_ENABLE_TERMINFO:BOOL=OFF \ + -DLLVM_ENABLE_ZLIB:BOOL=ON \ + -DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF \ + -DLLVM_INCLUDE_DOCS:BOOL=OFF \ + -DLLVM_INCLUDE_EXAMPLES:BOOL=OFF \ + -DLLVM_INCLUDE_UTILS:BOOL=OFF \ + -DLLVM_INCLUDE_TESTS:BOOL=OFF \ + -DLLVM_BUILD_TESTS:BOOL=OFF \ + -DLLVM_OPTIMIZED_TABLEGEN:BOOL=ON \ + -DLLVM_ENABLE_LIBXML2:BOOL=OFF \ + -DLLVM_TARGETS_TO_BUILD:STRING="X86" \ + -DLLVM_INCLUDE_TOOLS:BOOL=ON \ + -G'Ninja' +ninja -j 8 +# tool `llvm-profdata` is generated under the `bin` sub-folder of this build folder. +``` diff --git a/tests/benchmarks/coremark/README.md b/tests/benchmarks/coremark/README.md index 1631cc5c0..a1e029db8 100644 --- a/tests/benchmarks/coremark/README.md +++ b/tests/benchmarks/coremark/README.md @@ -17,3 +17,5 @@ And then run `./build.sh` to build the source code, file `coremark.exe`, `corema # Running Run `./run.sh` to test the benchmark, the native mode, iwasm aot mode and iwasm interpreter mode will be tested respectively. + +Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled; please refer to [this section](../README.md#install-llvm-profdata) to install the tool `llvm-profdata`, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`. diff --git a/tests/benchmarks/coremark/test_pgo.sh b/tests/benchmarks/coremark/test_pgo.sh new file mode 100755 index 000000000..e49424697 --- /dev/null +++ b/tests/benchmarks/coremark/test_pgo.sh @@ -0,0 +1,50 @@ +#!/bin/sh + +# Copyright (C) 2019 Intel Corporation.
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +PLATFORM=$(uname -s | tr A-Z a-z) + +IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm" +WAMRC="../../../wamr-compiler/build/wamrc" + +if [ ! -e "coremark.wasm" ]; then + echo "coremark.wasm doesn't exist, please run build.sh first" + exit 1 +fi + +echo "" +echo "Compile coremark.wasm to coremark.aot .." +${WAMRC} -o coremark.aot coremark.wasm + +echo "" +echo "Compile coremark.wasm to coremark_pgo.aot .." +${WAMRC} --enable-llvm-pgo -o coremark_pgo.aot coremark.wasm + +echo "" +echo "Run coremark_pgo.aot to generate the raw profile data .." +${IWASM} --gen-prof-file=coremark.profraw coremark_pgo.aot + +echo "" +echo "Merge the raw profile data to coremark.profdata .." +rm -f coremark.profdata && llvm-profdata merge -output=coremark.profdata coremark.profraw + +echo "" +echo "Compile coremark.wasm to coremark_opt.aot with the profile data .." +${WAMRC} --use-prof-file=coremark.profdata -o coremark_opt.aot coremark.wasm + +echo "" +echo "Run the coremark native" +./coremark.exe + +echo "" +echo "Run the original aot file coremark.aot" +${IWASM} coremark.aot + +echo "" +echo "Run the PGO optimized aot file coremark_opt.aot" +${IWASM} coremark_opt.aot + +# Show the profile data: +# llvm-profdata show --all-functions --detailed-summary --binary-ids --counts \ +# --hot-func-list --memop-sizes --show-prof-sym-list coremark.profraw diff --git a/tests/benchmarks/dhrystone/test_pgo.sh b/tests/benchmarks/dhrystone/test_pgo.sh new file mode 100755 index 000000000..5bcb64d59 --- /dev/null +++ b/tests/benchmarks/dhrystone/test_pgo.sh @@ -0,0 +1,50 @@ +#!/bin/sh + +# Copyright (C) 2019 Intel Corporation. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +PLATFORM=$(uname -s | tr A-Z a-z) + +IWASM="../../../product-mini/platforms/${PLATFORM}/build/iwasm" +WAMRC="../../../wamr-compiler/build/wamrc" + +if [ !
-e "dhrystone.wasm" ]; then + echo "dhrystone.wasm doesn't exist, please run build.sh first" + exit 1 +fi + +echo "" +echo "Compile dhrystone.wasm to dhrystone.aot .." +${WAMRC} -o dhrystone.aot dhrystone.wasm + +echo "" +echo "Compile dhrystone.wasm to dhrystone_pgo.aot .." +${WAMRC} --enable-llvm-pgo -o dhrystone_pgo.aot dhrystone.wasm + +echo "" +echo "Run dhrystone_pgo.aot to generate the raw profile data .." +${IWASM} --gen-prof-file=dhrystone.profraw dhrystone_pgo.aot + +echo "" +echo "Merge the raw profile data to dhrystone.profdata .." +rm -f dhrystone.profdata && llvm-profdata merge -output=dhrystone.profdata dhrystone.profraw + +echo "" +echo "Compile dhrystone.wasm to dhrystone_opt.aot with the profile data .." +${WAMRC} --use-prof-file=dhrystone.profdata -o dhrystone_opt.aot dhrystone.wasm + +echo "" +echo "Run the dhrystone native" +./dhrystone_native + +echo "" +echo "Run the original aot file dhrystone.aot" +${IWASM} dhrystone.aot + +echo "" +echo "Run the PGO optimized aot file dhrystone_opt.aot" +${IWASM} dhrystone_opt.aot + +# Show the profile data: +# llvm-profdata show --all-functions --detailed-summary --binary-ids --counts \ +# --hot-func-list --memop-sizes --show-prof-sym-list dhrystone.profraw diff --git a/tests/benchmarks/jetstream/README.md b/tests/benchmarks/jetstream/README.md index f6c593d11..635e3867e 100644 --- a/tests/benchmarks/jetstream/README.md +++ b/tests/benchmarks/jetstream/README.md @@ -27,3 +27,5 @@ And then run `./build.sh` to build the source code, the folder `out` will be cre # Running Run `./run_aot.sh` to test the benchmark, the native mode and iwasm aot mode will be tested for each workload, and the file `report.txt` will be generated. + +Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled; please refer to [this section](../README.md#install-llvm-profdata) to install the tool `llvm-profdata`, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`.
diff --git a/tests/benchmarks/jetstream/test_pgo.sh b/tests/benchmarks/jetstream/test_pgo.sh new file mode 100755 index 000000000..b6fdd08ea --- /dev/null +++ b/tests/benchmarks/jetstream/test_pgo.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright (C) 2019 Intel Corporation. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +CUR_DIR=$PWD +OUT_DIR=$CUR_DIR/out +REPORT=$CUR_DIR/report.txt +TIME=/usr/bin/time + +PLATFORM=$(uname -s | tr A-Z a-z) +IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm +WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc + +BENCH_NAME_MAX_LEN=20 + +JETSTREAM_CASES="gcc-loops HashSet tsf float-mm quicksort" + +rm -f $REPORT +touch $REPORT + +function print_bench_name() +{ + name=$1 + echo -en "$name" >> $REPORT + name_len=${#name} + if [ $name_len -lt $BENCH_NAME_MAX_LEN ] + then + spaces=$(( $BENCH_NAME_MAX_LEN - $name_len )) + for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done + fi +} + +pushd $OUT_DIR > /dev/null 2>&1 +for t in $JETSTREAM_CASES +do + if [ ! -e "${t}.wasm" ]; then + echo "${t}.wasm doesn't exist, please run build.sh first" + exit 1 + fi + + echo "" + echo "Compile ${t}.wasm to ${t}.aot .." + ${WAMRC_CMD} -o ${t}.aot ${t}.wasm + + echo "" + echo "Compile ${t}.wasm to ${t}_pgo.aot .." + ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm + + echo "" + echo "Run ${t}_pgo.aot to generate the raw profile data .." + ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot + + echo "" + echo "Merge the raw profile data to ${t}.profdata .." + rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw + + echo "" + echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .."
+ ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm +done +popd > /dev/null 2>&1 + +echo "Start to run cases, the result is written to report.txt" + +#run benchmarks +cd $OUT_DIR +echo -en "\t\t\t\t\t native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT + +for t in $JETSTREAM_CASES +do + print_bench_name $t + + echo "run $t with native .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo "run $t with iwasm aot .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo "run $t with iwasm aot opt .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" $IWASM_CMD --dir=. ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo -en "\n" >> $REPORT +done diff --git a/tests/benchmarks/libsodium/test_pgo.sh b/tests/benchmarks/libsodium/test_pgo.sh new file mode 100755 index 000000000..7e211a7d0 --- /dev/null +++ b/tests/benchmarks/libsodium/test_pgo.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# Copyright (C) 2019 Intel Corporation. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +libsodium_CASES="aead_aes256gcm2 aead_aes256gcm aead_chacha20poly13052 aead_chacha20poly1305 \ + aead_xchacha20poly1305 auth2 auth3 auth5 auth6 auth7 auth box2 box7 box8 \ + box_easy2 box_easy box_seal box_seed box chacha20 codecs core1 core2 core3 \ + core4 core5 core6 core_ed25519 core_ristretto255 ed25519_convert generichash2 \ + generichash3 generichash hash3 hash kdf keygen kx metamorphic misuse \ + onetimeauth2 onetimeauth7 onetimeauth pwhash_argon2id pwhash_argon2i \ + pwhash_scrypt_ll pwhash_scrypt randombytes scalarmult2 scalarmult5 \ + scalarmult6 scalarmult7 scalarmult8 scalarmult_ed25519 scalarmult_ristretto255 \ + scalarmult secretbox2 secretbox7 secretbox8 secretbox_easy2 secretbox_easy \ + secretbox secretstream shorthash sign siphashx24 sodium_core sodium_utils2 \ + sodium_utils stream2 stream3 stream4 stream verify1 xchacha20" + +PLATFORM=$(uname -s | tr A-Z a-z) + +readonly OUT_DIR=$PWD/libsodium/zig-out/bin +readonly REPORT=$PWD/report.txt +readonly IWASM_CMD=$PWD/../../../product-mini/platforms/${PLATFORM}/build/iwasm +readonly WAMRC_CMD=$PWD/../../../wamr-compiler/build/wamrc +readonly TIME=/usr/bin/time + +BENCH_NAME_MAX_LEN=20 + +rm -f $REPORT +touch $REPORT + +function print_bench_name() +{ + name=$1 + echo -en "$name" >> $REPORT + name_len=${#name} + if [ $name_len -lt $BENCH_NAME_MAX_LEN ] + then + spaces=$(( $BENCH_NAME_MAX_LEN - $name_len )) + for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done + fi +} + +pushd $OUT_DIR > /dev/null 2>&1 +for t in $libsodium_CASES +do + if [ ! -e "${t}.wasm" ]; then + echo "${t}.wasm doesn't exist, please run build.sh first" + exit 1 + fi + + echo "" + echo "Compile ${t}.wasm to ${t}.aot .." + ${WAMRC_CMD} -o ${t}.aot ${t}.wasm + + echo "" + echo "Compile ${t}.wasm to ${t}_pgo.aot .." + ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm + + echo "" + echo "Run ${t}_pgo.aot to generate the raw profile data .."
+ ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot + + echo "" + echo "Merge the raw profile data to ${t}.profdata .." + rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw + + echo "" + echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .." + ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm +done + +# run benchmarks +cd $OUT_DIR + +echo -en "\t\t\t\t\t\tnative\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT + +for t in $libsodium_CASES +do + print_bench_name $t + + echo "run $t with native..." + echo -en "\t" >> $REPORT + if [[ $t != "sodium_utils2" ]]; then + ./${t} | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT + else + # sodium_utils2 doesn't print the result, + # use time command to get result instead + $TIME -f "real-%e-time" ./${t} 2>&1 | grep "real-.*-time" | + awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT + fi + + echo "run $t with iwasm aot..." + echo -en "\t \t" >> $REPORT + if [[ $t != "sodium_utils2" ]]; then + $IWASM_CMD ${t}.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT + else + # sodium_utils2 doesn't print the result, + # use time command to get result instead + $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | + awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT + fi + + echo "run $t with iwasm aot opt..." 
+ echo -en "\t \t" >> $REPORT + if [[ $t != "sodium_utils2" ]]; then + $IWASM_CMD ${t}_opt.aot | awk '{printf "%-10.2f", $0/1000000.0}' >> $REPORT + else + # sodium_utils2 doesn't print the result, + # use time command to get result instead + $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | + awk -F '-' '{printf "%-10.2f", $2}' >> $REPORT + fi + + echo -en "\n" >> $REPORT +done + diff --git a/tests/benchmarks/polybench/test_pgo.sh b/tests/benchmarks/polybench/test_pgo.sh new file mode 100755 index 000000000..6d435b971 --- /dev/null +++ b/tests/benchmarks/polybench/test_pgo.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Copyright (C) 2019 Intel Corporation. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +CUR_DIR=$PWD +OUT_DIR=$CUR_DIR/out +REPORT=$CUR_DIR/report.txt +TIME=/usr/bin/time + +PLATFORM=$(uname -s | tr A-Z a-z) +IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm +WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc + +BENCH_NAME_MAX_LEN=20 + +POLYBENCH_CASES="2mm 3mm adi atax bicg cholesky correlation covariance \ + deriche doitgen durbin fdtd-2d floyd-warshall gemm gemver \ + gesummv gramschmidt heat-3d jacobi-1d jacobi-2d ludcmp lu \ + mvt nussinov seidel-2d symm syr2k syrk trisolv trmm" + +rm -f $REPORT +touch $REPORT + +function print_bench_name() +{ + name=$1 + echo -en "$name" >> $REPORT + name_len=${#name} + if [ $name_len -lt $BENCH_NAME_MAX_LEN ] + then + spaces=$(( $BENCH_NAME_MAX_LEN - $name_len )) + for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done + fi +} + +pushd $OUT_DIR > /dev/null 2>&1 +for t in $POLYBENCH_CASES +do + if [ ! -e "${t}.wasm" ]; then + echo "${t}.wasm doesn't exist, please run build.sh first" + exit 1 + fi + + echo "" + echo "Compile ${t}.wasm to ${t}.aot .." + ${WAMRC_CMD} -o ${t}.aot ${t}.wasm + + echo "" + echo "Compile ${t}.wasm to ${t}_pgo.aot .."
+ ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm + + echo "" + echo "Run ${t}_pgo.aot to generate the raw profile data .." + ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot + + echo "" + echo "Merge the raw profile data to ${t}.profdata .." + rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw + + echo "" + echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .." + ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm +done +popd > /dev/null 2>&1 + +echo "Start to run cases, the result is written to report.txt" + +#run benchmarks +cd $OUT_DIR +echo -en "\t\t\t\t\t native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT + +for t in $POLYBENCH_CASES +do + print_bench_name $t + + echo "run $t with native .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo "run $t with iwasm aot .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo "run $t with iwasm aot opt .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo -en "\n" >> $REPORT +done diff --git a/tests/benchmarks/sightglass/README.md b/tests/benchmarks/sightglass/README.md index a446d80ea..4853fdb91 100644 --- a/tests/benchmarks/sightglass/README.md +++ b/tests/benchmarks/sightglass/README.md @@ -19,3 +19,5 @@ And then run `./build.sh` to build the source code, the folder `out` will be cre Run `./run_aot.sh` to test the benchmark, the native mode and iwasm aot mode will be tested for each workload, and the file `report.txt` will be generated. Run `./run_interp.sh` to test the benchmark, the native mode and iwasm interpreter mode will be tested for each workload, and the file `report.txt` will be generated. 
+ +Run `./test_pgo.sh` to test the benchmark with AOT static PGO (Profile-Guided Optimization) enabled; please refer to [this section](../README.md#install-llvm-profdata) to install the tool `llvm-profdata`, and build `iwasm` with `cmake -DWAMR_BUILD_STATIC_PGO=1`. diff --git a/tests/benchmarks/sightglass/test_pgo.sh b/tests/benchmarks/sightglass/test_pgo.sh new file mode 100755 index 000000000..8cb2eeced --- /dev/null +++ b/tests/benchmarks/sightglass/test_pgo.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Copyright (C) 2019 Intel Corporation. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +CUR_DIR=$PWD +OUT_DIR=$CUR_DIR/out +REPORT=$CUR_DIR/report.txt +TIME=/usr/bin/time + +PLATFORM=$(uname -s | tr A-Z a-z) +IWASM_CMD=$CUR_DIR/../../../product-mini/platforms/${PLATFORM}/build/iwasm +WAMRC_CMD=$CUR_DIR/../../../wamr-compiler/build/wamrc + +BENCH_NAME_MAX_LEN=20 + +SHOOTOUT_CASES="base64 fib2 gimli heapsort matrix memmove nestedloop \ + nestedloop2 nestedloop3 random seqhash sieve strchr \ + switch2" + +rm -f $REPORT +touch $REPORT + +function print_bench_name() +{ + name=$1 + echo -en "$name" >> $REPORT + name_len=${#name} + if [ $name_len -lt $BENCH_NAME_MAX_LEN ] + then + spaces=$(( $BENCH_NAME_MAX_LEN - $name_len )) + for i in $(eval echo "{1..$spaces}"); do echo -n " " >> $REPORT; done + fi +} + +pushd $OUT_DIR > /dev/null 2>&1 +for t in $SHOOTOUT_CASES +do + if [ ! -e "${t}.wasm" ]; then + echo "${t}.wasm doesn't exist, please run build.sh first" + exit 1 + fi + + echo "" + echo "Compile ${t}.wasm to ${t}.aot .." + ${WAMRC_CMD} -o ${t}.aot ${t}.wasm + + echo "" + echo "Compile ${t}.wasm to ${t}_pgo.aot .." + ${WAMRC_CMD} --enable-llvm-pgo -o ${t}_pgo.aot ${t}.wasm + + echo "" + echo "Run ${t}_pgo.aot to generate the raw profile data .." + ${IWASM_CMD} --gen-prof-file=${t}.profraw --dir=. ${t}_pgo.aot + + echo "" + echo "Merge the raw profile data to ${t}.profdata .."
+ rm -f ${t}.profdata && llvm-profdata merge -output=${t}.profdata ${t}.profraw + + echo "" + echo "Compile ${t}.wasm to ${t}_opt.aot with the profile data .." + ${WAMRC_CMD} --use-prof-file=${t}.profdata -o ${t}_opt.aot ${t}.wasm +done +popd > /dev/null 2>&1 + +echo "Start to run cases, the result is written to report.txt" + +#run benchmarks +cd $OUT_DIR +echo -en "\t\t\t\t\t native\tiwasm-aot\tiwasm-aot-pgo\n" >> $REPORT + +for t in $SHOOTOUT_CASES +do + print_bench_name $t + + echo "run $t with native .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" ./${t}_native 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo "run $t with iwasm aot .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" $IWASM_CMD ${t}.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo "run $t with iwasm aot opt .." + echo -en "\t" >> $REPORT + $TIME -f "real-%e-time" $IWASM_CMD ${t}_opt.aot 2>&1 | grep "real-.*-time" | awk -F '-' '{ORS=""; print $2}' >> $REPORT + + echo -en "\n" >> $REPORT +done diff --git a/wamr-compiler/main.c b/wamr-compiler/main.c index ccda363e7..be739fd75 100644 --- a/wamr-compiler/main.c +++ b/wamr-compiler/main.c @@ -65,6 +65,8 @@ print_help() printf(" --enable-indirect-mode Enalbe call function through symbol table but not direct call\n"); printf(" --disable-llvm-intrinsics Disable the LLVM built-in intrinsics\n"); printf(" --disable-llvm-lto Disable the LLVM link time optimization\n"); + printf(" --enable-llvm-pgo Enable LLVM PGO (Profile-Guided Optimization)\n"); + printf(" --use-prof-file= Use profile file collected by LLVM PGO (Profile-Guided Optimization)\n"); printf(" --enable-segue[=] Enable using segment register GS as the base address of linear memory,\n"); printf(" only available on linux/linux-sgx x86-64, which may improve performance,\n"); printf(" flags can be: i32.load, i64.load, f32.load, f64.load, v128.load,\n"); @@ -329,6 +331,14 @@ main(int argc, char *argv[]) else if 
(!strcmp(argv[0], "--disable-llvm-lto")) { option.disable_llvm_lto = true; } + else if (!strcmp(argv[0], "--enable-llvm-pgo")) { + option.enable_llvm_pgo = true; + } + else if (!strncmp(argv[0], "--use-prof-file=", 16)) { + if (argv[0][16] == '\0') + PRINT_HELP_AND_EXIT(); + option.use_prof_file = argv[0] + 16; + } else if (!strcmp(argv[0], "--enable-segue")) { /* all flags are enabled */ option.segue_flags = 0x1F1F;