[llvm] 216b5e9 - [AMDGPU] Expose RTZ version of f16 interpolation for gfx11+ (#86614)

Sun Mar 31 18:48:40 PDT 2024

Author: Ruiling, Song
Date: 2024-04-01T09:48:37+08:00
New Revision: 216b5e96664f72fdb63b6bbd6c422185c67ef818

URL: https://github.com/llvm/llvm-project/commit/216b5e96664f72fdb63b6bbd6c422185c67ef818
DIFF: https://github.com/llvm/llvm-project/commit/216b5e96664f72fdb63b6bbd6c422185c67ef818.diff

LOG: [AMDGPU] Expose RTZ version of f16 interpolation for gfx11+ (#86614)

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/VINTERPInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bda3b066b77636..3de20bb44e0c1b 100644

--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2066,6 +2066,24 @@ def int_amdgcn_interp_inreg_p2_f16 :
             [IntrNoMem, IntrSpeculatable,
              ImmArg<ArgIndex<3>>]>;
 
+// llvm.amdgcn.interp.p10.rtz.f16 <p>, <i>, <p0>, <high>
+// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
+// high selects whether the high or low 16 bits of the p and p0 operands are used
+def int_amdgcn_interp_p10_rtz_f16:
+  DefaultAttrsIntrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable,
+             ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.interp.p2.rtz.f16 <p>, <j>, <tmp>, <high>
+// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
+// high selects whether the high or low 16 bits of the p operand are used
+def int_amdgcn_interp_p2_rtz_f16 :
+  DefaultAttrsIntrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable,
+             ImmArg<ArgIndex<3>>]>;
+
 // Deprecated: use llvm.amdgcn.live.mask instead.
 def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
   [llvm_i1_ty],

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a42e95f140ce99..aa4ec785bf02a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3135,6 +3135,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     case Intrinsic::amdgcn_interp_inreg_p2:
     case Intrinsic::amdgcn_interp_inreg_p10_f16:
     case Intrinsic::amdgcn_interp_inreg_p2_f16:
+    case Intrinsic::amdgcn_interp_p10_rtz_f16:
+    case Intrinsic::amdgcn_interp_p2_rtz_f16:
       applyDefaultMapping(OpdMapper);
       return;
     case Intrinsic::amdgcn_permlane16:
@@ -4778,7 +4780,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_interp_inreg_p10:
     case Intrinsic::amdgcn_interp_inreg_p2:
     case Intrinsic::amdgcn_interp_inreg_p10_f16:
-    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
+    case Intrinsic::amdgcn_interp_inreg_p2_f16:
+    case Intrinsic::amdgcn_interp_p10_rtz_f16:
+    case Intrinsic::amdgcn_interp_p2_rtz_f16: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

diff  --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 77063e2b70f66c..1f7bffb26a60f4 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -173,6 +173,12 @@ defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
 defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
                      V_INTERP_P2_F16_F32_inreg, f16,
                      [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
+defm : VInterpF16Pat<int_amdgcn_interp_p10_rtz_f16,
+                     V_INTERP_P10_RTZ_F16_F32_inreg, f32,
+                     [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>;
+defm : VInterpF16Pat<int_amdgcn_interp_p2_rtz_f16,
+                     V_INTERP_P2_RTZ_F16_F32_inreg, f16,
+                     [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
 
 //===----------------------------------------------------------------------===//
 // VINTERP Real Instructions

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index 623360f6b1d9c5..de46037e96e802 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -147,6 +147,34 @@ main_body:
   ret half %res
 }
 
+define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_rtz_f16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 s3, exec_lo
+; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GCN-NEXT:    s_mov_b32 exec_lo, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
+  %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
+  %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
+  %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
+  %res = fadd half %l_p1, %h_p1
+  ret half %res
+}
+
 define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
 ; GCN-LABEL: v_interp_f16_imm_params:
 ; GCN:       ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
 

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 429528e9091d13..e3dd036ecc3083 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -147,6 +147,34 @@ main_body:
   ret half %res
 }
 
+define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_rtz_f16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 s3, exec_lo
+; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GCN-NEXT:    s_mov_b32 exec_lo, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
+  %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
+  %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
+  %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
+  %res = fadd half %l_p1, %h_p1
+  ret half %res
+}
+
 define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
 ; GCN-LABEL: v_interp_f16_imm_params:
 ; GCN:       ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0