Fix fast jit issues (#1208)

Move the jit spill cache to the end of the interp frame to reduce the memory footprint
Fix codegen float compare issue: the source registers should not be overwritten
Fix the integer overflow check in float to int conversion
Unify the float compare handling (see the NaN note below the change summary)
Fix get_global issue: account for imported globals when indexing module->globals
Wenyong Huang 2022-06-07 15:49:52 +08:00 committed by GitHub
parent 5e9f08fb68
commit ab2e959616
9 changed files with 114 additions and 120 deletions
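
Note: the float-compare changes in this commit all hinge on IEEE 754 comparison semantics: when either operand is NaN, every ordered comparison is false, so `lhs < rhs` may be rewritten as `rhs > lhs`, but not as `!(lhs >= rhs)`. A minimal standalone C++ sketch (not part of the patch) of the property the lowered code relies on:

    // Standalone sketch (not from the patch): the NaN behaviour the
    // float-compare fixes rely on. Every ordered comparison involving NaN is
    // false, so swapping operands is safe but negating the opposite predicate
    // is not.
    #include <cassert>
    #include <cmath>

    int main()
    {
        float lhs = NAN, rhs = 1.0f;
        assert((lhs < rhs) == false);  /* wasm f32.lt must return 0 for NaN */
        assert((rhs > lhs) == false);  /* swapped form: still 0, still correct */
        assert(!(lhs >= rhs) == true); /* negating ">=" would wrongly return 1 */
        return 0;
    }

This is why the frontend now emits SELECTGTS/SELECTGES with swapped CMP operands for FLOAT_LT/FLOAT_LE, and why lower_select/lower_branch no longer swap operands when the last compare was on floats.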


@@ -102,12 +102,6 @@
 #define WASM_ENABLE_FAST_JIT_DUMP 0
 #endif

-#ifndef FAST_JIT_SPILL_CACHE_SIZE
-/* The size of fast jit spill cache in cell num, one cell num
-   occpuies 4 bytes */
-#define FAST_JIT_SPILL_CACHE_SIZE 32
-#endif
-
 #ifndef WASM_ENABLE_WAMR_COMPILER
 #define WASM_ENABLE_WAMR_COMPILER 0
 #endif


@@ -82,6 +82,9 @@ x86::Gp regs_i64[] = {
     x86::r12, x86::r13, x86::r14, x86::r15,
 };

+#define REG_F32_FREE_IDX 15
+#define REG_F64_FREE_IDX 15
+
 x86::Xmm regs_float[] = {
     x86::xmm0,
     x86::xmm1,
@@ -349,6 +352,8 @@ cmp_r_and_jmp_label(JitCompContext *cc, x86::Assembler &a,
     bool fp_cmp = cc->last_cmp_on_fp;

+    bh_assert(!fp_cmp || (fp_cmp && (op == GES)));
+
     switch (op) {
         case EQ:
         {
@@ -362,61 +367,53 @@ cmp_r_and_jmp_label(JitCompContext *cc, x86::Assembler &a,
         }
         case GTS:
         {
-            if (fp_cmp) {
-                a.ja(imm);
-            }
-            else {
-                a.jg(imm);
-            }
+            a.jg(imm);
             break;
         }
         case LES:
         {
-            if (fp_cmp) {
-                a.jnb(imm);
-            }
-            else {
-                a.jng(imm);
-            }
+            a.jng(imm);
             break;
         }
         case GES:
         {
-            if (fp_cmp) {
-                a.jnb(imm);
-            }
-            else {
-                a.jnl(imm);
-            }
+            if (fp_cmp)
+                a.jae(imm);
+            else
+                a.jnl(imm);
             break;
         }
         case LTS:
         {
-            if (fp_cmp) {
-                a.ja(imm);
-            }
-            else {
-                a.jl(imm);
-            }
+            a.jl(imm);
             break;
         }
         case GTU:
+        {
             a.ja(imm);
             break;
+        }
         case LEU:
+        {
             a.jna(imm);
             break;
+        }
         case GEU:
+        {
             a.jnb(imm);
             break;
+        }
         case LTU:
+        {
             a.jb(imm);
             break;
+        }
         default:
+        {
             bh_assert(0);
             break;
+        }
     }

     if (r2) {
         int32 label_dst = jit_reg_no(r2);
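
Note: in the fp path above only the unsigned-style jumps are usable, because comiss/comisd report their result in ZF/PF/CF (unordered sets all three to 1) and always clear the SF/OF bits that the signed jg/jnl family tests. A plain C++ sketch (my reading of the flag mapping, not code from the patch):

    // Sketch of how comiss/comisd results map onto the jumps used above.
    // Unordered (NaN) sets ZF = PF = CF = 1, so ja/jae are never taken for
    // NaN, while jb/jbe would be -- which is why the lowering only keeps
    // GTS/GES on the fp side and swaps operands elsewhere to reach them.
    #include <cmath>

    struct ComisFlags {
        bool zf, cf;
    };

    static ComisFlags
    comiss_result(float a, float b)
    {
        if (std::isnan(a) || std::isnan(b))
            return { true, true };   /* unordered */
        if (a == b)
            return { true, false };  /* equal */
        if (a < b)
            return { false, true };  /* below */
        return { false, false };     /* above */
    }

    static bool ja_taken(ComisFlags f)  { return !f.cf && !f.zf; } /* a > b, ordered  */
    static bool jae_taken(ComisFlags f) { return !f.cf; }          /* a >= b, ordered */

With that mapping, jae is taken only for an ordered greater-or-equal result, which is exactly the GES case the new bh_assert restricts fp compares to.
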
@@ -761,11 +758,18 @@ static bool
 mov_imm_to_m(x86::Assembler &a, x86::Mem &m_dst, Imm imm_src, uint32 bytes_dst)
 {
     if (bytes_dst == 8) {
-        /* As there is no instruction `MOV m64, imm64`, we use
-           two instructions to implement it */
-        a.mov(regs_i64[REG_I64_FREE_IDX], imm_src);
-        a.mov(m_dst, regs_i64[REG_I64_FREE_IDX]);
+        int64 value = imm_src.value();
+        if (value >= INT32_MIN && value <= INT32_MAX) {
+            imm_src.setValue((int32)value);
+            a.mov(m_dst, imm_src);
+        }
+        else {
+            /* There is no instruction `MOV m64, imm64`, we use
+               two instructions to implement it */
+            a.mov(regs_i64[REG_I64_FREE_IDX], imm_src);
+            a.mov(m_dst, regs_i64[REG_I64_FREE_IDX]);
+        }
     }
     else
         a.mov(m_dst, imm_src);
     return true;
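
Note: the fast path added here exists because x86-64 has no `mov m64, imm64` encoding; a 64-bit memory destination can only take a sign-extended 32-bit immediate, so larger constants must be staged through a scratch GP register. A trivial sketch of the range test used to pick the single-instruction form:

    // Sketch of the check added above: a 64-bit store of a constant can use
    // the single sign-extending `mov r/m64, imm32` form only when the value
    // survives the int32 round-trip; otherwise it needs a scratch register.
    #include <cstdint>

    static bool
    fits_sign_extended_imm32(int64_t value)
    {
        return value >= INT32_MIN && value <= INT32_MAX;
    }
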
@@ -4220,17 +4224,8 @@ static bool
 cmp_imm_r_to_r_f32(x86::Assembler &a, int32 reg_no_dst, float data1_src,
                    int32 reg_no2_src)
 {
-    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
-    /* xmm -> m128 */
-    x86::Mem cache = x86::xmmword_ptr(regs_i64[hreg_info->exec_env_hreg_index],
-                                      offsetof(WASMExecEnv, jit_cache));
-    a.movups(cache, regs_float[reg_no2_src]);
-    /* imm -> gp -> xmm */
-    mov_imm_to_r_f32(a, reg_no2_src, data1_src);
-    /* comiss xmm m32 */
-    a.comiss(regs_float[reg_no2_src], cache);
+    mov_imm_to_r_f32(a, REG_F32_FREE_IDX, data1_src);
+    a.comiss(regs_float[REG_F32_FREE_IDX], regs_float[reg_no2_src]);
     return true;
 }

@@ -4249,15 +4244,8 @@ static bool
 cmp_r_imm_to_r_f32(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
                    float data2_src)
 {
-    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
-    /* imm -> m32 */
-    x86::Mem cache = x86::dword_ptr(regs_i64[hreg_info->exec_env_hreg_index],
-                                    offsetof(WASMExecEnv, jit_cache));
-    Imm imm(*(uint32 *)&data2_src);
-    mov_imm_to_m(a, cache, imm, 4);
-    /* comiss xmm m32 */
-    a.comiss(regs_float[reg_no1_src], cache);
+    mov_imm_to_r_f32(a, REG_F32_FREE_IDX, data2_src);
+    a.comiss(regs_float[reg_no1_src], regs_float[REG_F32_FREE_IDX]);
     return true;
 }

@@ -4315,17 +4303,8 @@ static bool
 cmp_imm_r_to_r_f64(x86::Assembler &a, int32 reg_no_dst, double data1_src,
                    int32 reg_no2_src)
 {
-    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
-    /* xmm -> m128 */
-    x86::Mem cache = x86::qword_ptr(regs_i64[hreg_info->exec_env_hreg_index],
-                                    offsetof(WASMExecEnv, jit_cache));
-    a.movupd(cache, regs_float[reg_no2_src]);
-    /* imm -> gp -> xmm */
-    mov_imm_to_r_f64(a, reg_no2_src, data1_src);
-    /* comiss xmm m64 */
-    a.comisd(regs_float[reg_no2_src], cache);
+    mov_imm_to_r_f64(a, REG_F64_FREE_IDX, data1_src);
+    a.comisd(regs_float[REG_F64_FREE_IDX], regs_float[reg_no2_src]);
     return true;
 }

@@ -4344,15 +4323,8 @@ static bool
 cmp_r_imm_to_r_f64(x86::Assembler &a, int32 reg_no_dst, int32 reg_no1_src,
                    double data2_src)
 {
-    const JitHardRegInfo *hreg_info = jit_codegen_get_hreg_info();
-    /* imm -> m64 */
-    x86::Mem cache = x86::qword_ptr(regs_i64[hreg_info->exec_env_hreg_index],
-                                    offsetof(WASMExecEnv, jit_cache));
-    Imm imm(*(uint64 *)&data2_src);
-    mov_imm_to_m(a, cache, imm, 8);
-    /* comisd xmm m64 */
-    a.comisd(regs_float[reg_no1_src], cache);
+    mov_imm_to_r_f64(a, REG_F64_FREE_IDX, data2_src);
+    a.comisd(regs_float[reg_no1_src], regs_float[REG_F64_FREE_IDX]);
     return true;
 }
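
Note: the old versions of these four helpers spilled the source xmm into `exec_env->jit_cache`, then reloaded the immediate into that same source register, so the register's original value was lost for any later use; that is the "should not overwrite the source registers" bug from the commit message. The new code materializes the constant in the reserved scratch register (REG_F32_FREE_IDX / REG_F64_FREE_IDX, i.e. xmm15) and compares register to register. A hedged asmjit-flavoured sketch of the pattern; the helper below is illustrative, eax is an arbitrary choice, and this is not WAMR's actual mov_imm_to_r_f32:

    // Illustrative only: load a float constant into the reserved scratch xmm
    // and compare against the untouched source register. Assumes asmjit and
    // that xmm15 is never handed out by the register allocator (see the hreg
    // tables further down).
    #include <asmjit/asmjit.h>
    #include <cstdint>
    #include <cstring>

    static void
    emit_cmp_imm_f32_sketch(asmjit::x86::Assembler &a, float imm,
                            const asmjit::x86::Xmm &src)
    {
        uint32_t bits;
        std::memcpy(&bits, &imm, sizeof(bits));
        a.mov(asmjit::x86::eax, bits);                /* imm bits -> gp      */
        a.movd(asmjit::x86::xmm15, asmjit::x86::eax); /* gp -> scratch xmm   */
        a.comiss(asmjit::x86::xmm15, src);            /* flags set, src kept */
    }
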
@@ -5071,13 +5043,19 @@ cmp_r_and_jmp_relative(JitCompContext *cc, x86::Assembler &a, int32 reg_no,
                    + a.code()->sectionById(0)->buffer().size();
     bool fp_cmp = cc->last_cmp_on_fp;

+    bh_assert(!fp_cmp || (fp_cmp && (op == GTS || op == GES)));
+
     switch (op) {
         case EQ:
+        {
             a.je(target);
             break;
+        }
         case NE:
+        {
             a.jne(target);
             break;
+        }
         case GTS:
         {
             if (fp_cmp) {
@@ -5090,18 +5068,13 @@ cmp_r_and_jmp_relative(JitCompContext *cc, x86::Assembler &a, int32 reg_no,
         }
         case LES:
         {
-            if (fp_cmp) {
-                a.jnb(target);
-            }
-            else {
-                a.jng(target);
-            }
+            a.jng(target);
             break;
         }
         case GES:
         {
             if (fp_cmp) {
-                a.jnb(target);
+                a.jae(target);
             }
             else {
                 a.jnl(target);
@@ -5110,30 +5083,35 @@ cmp_r_and_jmp_relative(JitCompContext *cc, x86::Assembler &a, int32 reg_no,
         }
         case LTS:
         {
-            if (fp_cmp) {
-                a.ja(target);
-            }
-            else {
-                a.jl(target);
-            }
+            a.jl(target);
             break;
         }
         case GTU:
+        {
             a.ja(target);
             break;
+        }
         case LEU:
+        {
             a.jna(target);
             break;
+        }
         case GEU:
+        {
             a.jae(target);
             break;
+        }
         case LTU:
+        {
             a.jb(target);
             break;
+        }
         default:
+        {
             bh_assert(0);
             break;
+        }
     }

     /* The offset written by asmjit is always 0, we patch it again */
     *(int32 *)(stream + 2) = offset;
@@ -5174,10 +5152,13 @@ lower_select(JitCompContext *cc, x86::Assembler &a, COND_OP op, JitReg r0,
     CHECK_NCONST(r1);
     CHECK_KIND(r1, JIT_REG_KIND_I32);

-    if (r0 == r3 && r0 != r2) {
+    if (r0 == r3 && r0 != r2 && !cc->last_cmp_on_fp) {
         JitReg r_tmp;

-        /* Exchange r2, r3*/
+        /* For i32/i64, exchange r2 and r3 to make r0 equal to r2,
+           so as to decrease possible execution instructions.
+           For f32/f64 comparison, should not change the order as
+           the result of comparison with NaN may be different. */
         r_tmp = r2;
         r2 = r3;
         r3 = r_tmp;
@@ -5258,7 +5239,8 @@ lower_branch(JitCompContext *cc, x86::Assembler &a, bh_list *jmp_info_list,
     label_dst = jit_reg_no(r1);
     if (label_dst < (int32)jit_cc_label_num(cc) - 1 && is_last_insn
-        && label_is_neighboring(cc, label_src, label_dst)) {
+        && label_is_neighboring(cc, label_src, label_dst)
+        && !cc->last_cmp_on_fp) {
         JitReg r_tmp;

         r_tmp = r1;
@@ -6555,20 +6537,20 @@ static uint8 hreg_info_F32[3][16] = {
     { 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1 },
     { 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1 }, /* caller_saved_native */
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_native */
     { 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1 }, /* caller_saved_jitted */
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_jitted */
 };

 /* System V AMD64 ABI Calling Conversion. [XYZ]MM0-7 */
 static uint8 hreg_info_F64[3][16] = {
     /* xmm0 ~ xmm15 */
     { 1, 1, 1, 1, 1, 1, 1, 1,
-      0, 0, 0, 0, 0, 0, 0, 0 },
+      0, 0, 0, 0, 0, 0, 0, 1 },
     { 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1 }, /* caller_saved_native */
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_native */
     { 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1 }, /* caller_saved_jitted */
+      1, 1, 1, 1, 1, 1, 1, 0 }, /* caller_saved_jitted */
 };

 static const JitHardRegInfo hreg_info = {


@@ -187,7 +187,7 @@ jit_compile_op_compare_float_point(JitCompContext *cc, FloatCond cond,
         case FLOAT_LT:
         {
             GEN_INSN(CMP, cc->cmp_reg, rhs, lhs);
-            GEN_INSN(SELECTLTS, res, cc->cmp_reg, const_one, const_zero);
+            GEN_INSN(SELECTGTS, res, cc->cmp_reg, const_one, const_zero);
             break;
         }
         case FLOAT_GT:

@@ -199,7 +199,7 @@ jit_compile_op_compare_float_point(JitCompContext *cc, FloatCond cond,
         case FLOAT_LE:
         {
             GEN_INSN(CMP, cc->cmp_reg, rhs, lhs);
-            GEN_INSN(SELECTLES, res, cc->cmp_reg, const_one, const_zero);
+            GEN_INSN(SELECTGES, res, cc->cmp_reg, const_one, const_zero);
             break;
         }
         case FLOAT_GE:


@@ -63,8 +63,8 @@ jit_compile_op_i32_trunc_f32(JitCompContext *cc, bool sign, bool saturating)
     }

     /* If value is out of integer range, throw exception */
-    GEN_INSN(CMP, cc->cmp_reg, value, min_valid_float);
-    if (!jit_emit_exception(cc, EXCE_INTEGER_OVERFLOW, JIT_OP_BLES,
+    GEN_INSN(CMP, cc->cmp_reg, min_valid_float, value);
+    if (!jit_emit_exception(cc, EXCE_INTEGER_OVERFLOW, JIT_OP_BGES,
                             cc->cmp_reg, NULL)) {
         goto fail;
     }

@@ -123,8 +123,8 @@ jit_compile_op_i32_trunc_f64(JitCompContext *cc, bool sign, bool saturating)
     }

     /* If value is out of integer range, throw exception */
-    GEN_INSN(CMP, cc->cmp_reg, value, min_valid_double);
-    if (!jit_emit_exception(cc, EXCE_INTEGER_OVERFLOW, JIT_OP_BLES,
+    GEN_INSN(CMP, cc->cmp_reg, min_valid_double, value);
+    if (!jit_emit_exception(cc, EXCE_INTEGER_OVERFLOW, JIT_OP_BGES,
                             cc->cmp_reg, NULL)) {
         goto fail;
     }
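Note: the operand swap here (CMP value,min + BLES becoming CMP min,value + BGES) is the frontend side of the float-compare unification: the x86-64 backend now only lowers fp conditional branches for GTS/GES (see the bh_assert in cmp_r_and_jmp_relative), while the signed BLES form would test SF/OF, which comiss/comisd always clear. Logically the guard is unchanged; a small sketch, assuming NaN has already been rejected by the earlier invalid-conversion check:

    // Sketch of the lower-bound guard as now emitted (f32 case; the f64 hunk
    // has the same shape). min_valid_float is the boundary constant the
    // frontend loads; a trap is raised when the operand is not above it.
    static bool
    i32_trunc_lower_bound_traps(float min_valid_float, float value)
    {
        /* emitted as CMP(min_valid_float, value) + branch-if-GES,
           which the backend lowers to comiss + jae */
        return min_valid_float >= value;
    }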


@@ -165,7 +165,16 @@ fail:
 static uint8
 get_global_type(const WASMModule *module, uint32 global_idx)
 {
-    return module->globals[global_idx].type;
+    if (global_idx < module->import_global_count) {
+        const WASMGlobalImport *import_global =
+            &((module->import_globals + global_idx)->u.global);
+        return import_global->type;
+    }
+    else {
+        const WASMGlobal *global =
+            module->globals + (global_idx - module->import_global_count);
+        return global->type;
+    }
 }

 static uint32

@@ -177,7 +186,8 @@ get_global_data_offset(const WASMModule *module, uint32 global_idx)
         return import_global->data_offset;
     }
     else {
-        const WASMGlobal *global = module->globals + global_idx;
+        const WASMGlobal *global =
+            module->globals + (global_idx - module->import_global_count);
         return global->data_offset;
     }
 }
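Note: the old get_global_type indexed module->globals with the raw wasm global index, which reads the wrong slot (or out of bounds) as soon as the module imports globals: the wasm index space lists imported globals first, then the module's own definitions, and module->globals only holds the latter. A self-contained sketch of the index split (the types here are illustrative, not WAMR's structs):

    // Illustrative sketch of the wasm global index space handled above:
    // indices [0, import_count) refer to imported globals, the rest map to
    // the module's own globals at (idx - import_count).
    #include <cstdint>
    #include <vector>

    struct GlobalSketch {
        uint8_t type;
        uint32_t data_offset;
    };

    static uint8_t
    global_type(const std::vector<GlobalSketch> &imports,
                const std::vector<GlobalSketch> &defined, uint32_t idx)
    {
        return idx < imports.size() ? imports[idx].type
                                    : defined[idx - imports.size()].type;
    }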


@@ -332,6 +332,12 @@ jit_pass_dump(JitCompContext *cc)
     const char *pass_name =
         pass_no > 0 ? jit_compiler_get_pass_name(passes[pass_no - 1]) : "NULL";

+#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
+    if (!strcmp(pass_name, "lower_cg"))
+        /* Ignore lower codegen pass as it does nothing in x86-64 */
+        return true;
+#endif
+
     os_printf("JIT.COMPILER.DUMP: PASS_NO=%d PREV_PASS=%s\n\n", pass_no,
               pass_name);
     jit_dump_cc(cc);


@@ -755,9 +755,11 @@ init_func_translation(JitCompContext *cc)
     cc->jit_frame = jit_frame;
     cc->cur_basic_block = jit_cc_entry_basic_block(cc);
-    cc->total_frame_size = wasm_interp_interp_frame_size(total_cell_num);
-    cc->spill_cache_offset = (uint32)offsetof(WASMInterpFrame, spill_cache);
-    cc->spill_cache_size = (uint32)sizeof(uint32) * FAST_JIT_SPILL_CACHE_SIZE;
+    cc->spill_cache_offset = wasm_interp_interp_frame_size(total_cell_num);
+    /* Set spill cache size according to max local cell num, max stack cell
+       num and virtual fixed register num */
+    cc->spill_cache_size = (max_locals + max_stacks) * 4 + sizeof(void *) * 4;
+    cc->total_frame_size = cc->spill_cache_offset + cc->spill_cache_size;
     cc->jitted_return_address_offset =
         offsetof(WASMInterpFrame, jitted_return_addr);
     cc->cur_basic_block = jit_cc_entry_basic_block(cc);
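
Note: with the fixed spill_cache[] array gone from WASMInterpFrame (see the interpreter header change below), the spill area now sits after the ordinary interpreter frame and is sized per function, so interpreter-only frames no longer carry a fixed 128-byte cache. A sketch mirroring the three assignments above (max_locals and max_stacks are the function's local and operand-stack cell counts):

    // Sketch of the new frame bookkeeping: the spill cache starts where the
    // ordinary interpreter frame ends and is sized from the function itself
    // (cells are 4 bytes; the extra 4 pointer-sized slots are for the virtual
    // fixed registers mentioned in the patch comment).
    #include <cstdint>

    struct FrameSizesSketch {
        uint32_t spill_cache_offset;
        uint32_t spill_cache_size;
        uint32_t total_frame_size;
    };

    static FrameSizesSketch
    compute_frame_sizes(uint32_t interp_frame_size, /* wasm_interp_interp_frame_size() */
                        uint32_t max_locals, uint32_t max_stacks)
    {
        FrameSizesSketch s;
        s.spill_cache_offset = interp_frame_size;
        s.spill_cache_size =
            (max_locals + max_stacks) * 4 + (uint32_t)sizeof(void *) * 4;
        s.total_frame_size = s.spill_cache_offset + s.spill_cache_size;
        return s;
    }
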
@@ -838,7 +840,7 @@ free_block_memory(JitBlock *block)
     jit_free(block);
 }

-static JitBlock *
+static JitBasicBlock *
 create_func_block(JitCompContext *cc)
 {
     JitBlock *jit_block;

@@ -883,7 +885,7 @@ create_func_block(JitCompContext *cc)
     jit_block_stack_push(&cc->block_stack, jit_block);
     cc->cur_basic_block = jit_block->basic_block_entry;

-    return jit_block;
+    return jit_block->basic_block_entry;

 fail:
     free_block_memory(jit_block);

@@ -2086,13 +2088,13 @@ JitBasicBlock *
 jit_frontend_translate_func(JitCompContext *cc)
 {
     JitFrame *jit_frame;
-    JitBlock *jit_block;
+    JitBasicBlock *basic_block_entry;

     if (!(jit_frame = init_func_translation(cc))) {
         return NULL;
     }

-    if (!(jit_block = create_func_block(cc))) {
+    if (!(basic_block_entry = create_func_block(cc))) {
         return NULL;
     }

@@ -2100,7 +2102,7 @@ jit_frontend_translate_func(JitCompContext *cc)
         return NULL;
     }

-    return jit_block->basic_block_entry;
+    return basic_block_entry;
 }

 #if 0


@@ -28,7 +28,6 @@ typedef struct WASMInterpFrame {
 #if WASM_ENABLE_FAST_JIT != 0
     uint8 *jitted_return_addr;
-    uint32 spill_cache[FAST_JIT_SPILL_CACHE_SIZE];
 #endif

 #if WASM_ENABLE_PERF_PROFILING != 0

@@ -52,11 +51,12 @@ typedef struct WASMInterpFrame {
     WASMBranchBlock *csp_boundary;
     WASMBranchBlock *csp;

-    /* Frame data, the layout is:
-       lp: param_cell_count + local_cell_count
-       sp_bottom to sp_boundary: stack of data
-       csp_bottom to csp_boundary: stack of block
-       ref to frame end: data types of local vairables and stack data
+    /**
+     * Frame data, the layout is:
+     *  lp: parameters and local variables
+     *  sp_bottom to sp_boundary: wasm operand stack
+     *  csp_bottom to csp_boundary: wasm label stack
+     *  jit spill cache: only available for fast jit
      */
     uint32 lp[1];
 #endif


@@ -1509,8 +1509,8 @@ wasm_instantiate(WASMModule *module, bool is_sub_inst, uint32 stack_size,
     if (stack_size == 0)
         stack_size = DEFAULT_WASM_STACK_SIZE;
 #if WASM_ENABLE_SPEC_TEST != 0
-    if (stack_size < 100 * 1024)
-        stack_size = 100 * 1024;
+    if (stack_size < 64 * 1024)
+        stack_size = 64 * 1024;
 #endif
     module_inst->default_wasm_stack_size = stack_size;