[llvm] ad66985 - [AMDGPU] V_LDEXP_F16 encoding fix and doc update.

Wed Oct 19 06:53:17 PDT 2022

Author: Joe Nash
Date: 2022-10-19T09:52:53-04:00
New Revision: ad6698562c3d43d0311922e7f6aebb8998021530

URL: https://github.com/llvm/llvm-project/commit/ad6698562c3d43d0311922e7f6aebb8998021530
DIFF: https://github.com/llvm/llvm-project/commit/ad6698562c3d43d0311922e7f6aebb8998021530.diff

LOG: [AMDGPU] V_LDEXP_F16 encoding fix and doc update.

The amdgcn.ldexp.* intrinsics take an i32 value as src1.
The V_LDEXP_F16 instruction considers src1 an f16 operand, and therefore
src1 is implicitly truncated to 16 bits when lowering to that instruction from the
intrinsic. This is unlikely to result in an error in practice
because values that large are not useful.

The operand class of src1 in the True16 version of the instruction has
been corrected to encode correctly on GFX11.

Reviewed By: foad, rampitec

Differential Revision: https://reviews.llvm.org/D136195

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/VOP2Instructions.td
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
    llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
    llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2c63d2e77e5e..8f05eb10920c 100644

--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -343,6 +343,7 @@ def int_amdgcn_rsq_legacy :  ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
 def int_amdgcn_rsq_clamp : Intrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
 
+// For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will used.
 def int_amdgcn_ldexp : Intrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable, IntrWillReturn]

diff  --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index a04eb98874b3..c3d66f458263 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -832,9 +832,20 @@ def :  divergent_i64_BinOp <xor, V_XOR_B32_e64>;
 // 16-Bit Operand Instructions
 //===----------------------------------------------------------------------===//
 
+def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_I32> {
+  // The ldexp.f16 intrinsic expects a i32 src1 operand, though the hardware
+  // encoding treats src1 as an f16
+  let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
+  let Src1DPP = VGPR_32_Lo128;
+  let Src1ModDPP = IntT16VRegInputMods;
+}
+
 let isReMaterializable = 1 in {
 let FPDPRounding = 1 in {
-defm V_LDEXP_F16 : VOP2Inst_t16 <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in
+    defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+  let SubtargetPredicate = HasTrue16BitInsts in
+    defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, AMDGPUldexp>;
 } // End FPDPRounding = 1
 // FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
 defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index 7068f4559055..0d172792cf20 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -1,13 +1,79 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
 
 declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
 
-; GCN-LABEL: {{^}}ldexp_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
-; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]]
-; GCN: buffer_store_short v[[R_F16]]
 define amdgpu_kernel void @ldexp_f16(
+; VI-LABEL: ldexp_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s14, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
+; VI-NEXT:    s_mov_b32 s15, s3
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ldexp_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s14, s2
+; GFX10-NEXT:    s_mov_b32 s15, s3
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s12, s6
+; GFX10-NEXT:    s_mov_b32 s13, s7
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ldexp_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT:    s_mov_b32 s10, -1
+; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s14, s10
+; GFX11-NEXT:    s_mov_b32 s15, s11
+; GFX11-NEXT:    s_mov_b32 s2, s10
+; GFX11-NEXT:    s_mov_b32 s3, s11
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s12, s6
+; GFX11-NEXT:    s_mov_b32 s13, s7
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s8, s4
+; GFX11-NEXT:    s_mov_b32 s9, s5
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     i32 addrspace(1)* %b) {
@@ -18,11 +84,61 @@ define amdgpu_kernel void @ldexp_f16(
   ret void
 }
 
-; GCN-LABEL: {{^}}ldexp_f16_imm_a
-; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
-; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
-; GCN: buffer_store_short v[[R_F16]]
 define amdgpu_kernel void @ldexp_f16_imm_a(
+; VI-LABEL: ldexp_f16_imm_a:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ldexp_f16_e32 v0, 2.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ldexp_f16_imm_a:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, 2.0, v0
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ldexp_f16_imm_a:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, 2.0, v0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
     half addrspace(1)* %r,
     i32 addrspace(1)* %b) {
   %b.val = load i32, i32 addrspace(1)* %b
@@ -31,11 +147,61 @@ define amdgpu_kernel void @ldexp_f16_imm_a(
   ret void
 }
 
-; GCN-LABEL: {{^}}ldexp_f16_imm_b
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}}
-; GCN: buffer_store_short v[[R_F16]]
 define amdgpu_kernel void @ldexp_f16_imm_b(
+; VI-LABEL: ldexp_f16_imm_b:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ldexp_f16_e64 v0, v0, 2
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ldexp_f16_imm_b:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_ldexp_f16_e64 v0, v0, 2
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ldexp_f16_imm_b:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e64 v0, v0, 2
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
   %a.val = load half, half addrspace(1)* %a

diff  --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
index c2e6974c688e..66cb4ca17faa 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
@@ -73,6 +73,9 @@ v_fmac_f16_e32 v5, v1, v255
 v_fmamk_f16_e32 v5, v1, 0xfe0b, v255
 // GFX11: error: operands are not valid for this GPU or mode
 
+v_ldexp_f16_e32 v5, v1, v255
+// GFX11: error: operands are not valid for this GPU or mode
+
 v_max_f16_e32 v5, v1, v255
 // GFX11: error: operands are not valid for this GPU or mode
 
@@ -142,6 +145,9 @@ v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
 v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: error: operands are not valid for this GPU or mode
 
+v_ldexp_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
+// GFX11: error: operands are not valid for this GPU or mode
+
 v_max_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: error: operands are not valid for this GPU or mode
 
@@ -211,6 +217,9 @@ v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: error: operands are not valid for this GPU or mode
 
+v_ldexp_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: error: operands are not valid for this GPU or mode
+
 v_max_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: error: operands are not valid for this GPU or mode
 

diff  --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
index c5f81d51daa6..876b7b88f50b 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
@@ -55,6 +55,9 @@ v_add_f16 v5, v1, v255
 v_fmac_f16 v5, v1, v255
 // GFX11: v_fmac_f16_e64
 
+v_ldexp_f16 v5, v1, v255
+// GFX11: v_ldexp_f16_e64
+
 v_max_f16 v5, v1, v255
 // GFX11: v_max_f16_e64
 
@@ -115,6 +118,9 @@ v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0]
 v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_add_f16_e64
 
+v_ldexp_f16 v5, v1, v255 quad_perm:[3,2,1,0]
+// GFX11: v_ldexp_f16_e64
+
 v_max_f16 v5, v1, v255 quad_perm:[3,2,1,0]
 // GFX11: v_max_f16_e64
 
@@ -175,6 +181,9 @@ v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0]
 v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_add_f16_e64
 
+v_ldexp_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_ldexp_f16_e64
+
 v_max_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_max_f16_e64