[llvm] 01b4b2a - [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (#143881)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 18 06:01:12 PDT 2025


Author: Fabian Ritter
Date: 2025-09-18T15:01:07+02:00
New Revision: 01b4b2a5b88c4b93d635a5049fa85e569b405982

URL: https://github.com/llvm/llvm-project/commit/01b4b2a5b88c4b93d635a5049fa85e569b405982
DIFF: https://github.com/llvm/llvm-project/commit/01b4b2a5b88c4b93d635a5049fa85e569b405982.diff

LOG: [AMDGPU][SDAG] Handle ISD::PTRADD in VOP3 patterns (#143881)

This patch mirrors similar patterns for ISD::ADD. The main difference is
that ISD::ADD is commutative, so that a pattern definition for, e.g.,
(add (mul x, y), z), automatically also handles (add z, (mul x, y)).
ISD::PTRADD is not commutative, so both operand orders have to be
handled explicitly. This patch does that with a ptradd_commutative
PatFrags that matches (ptradd z, (op x, y)) as well as
(ptradd (op x, y), z). The main target is the former, where the nested
operation (shift or multiply) is the offset of the ptradd (i.e., the
right operand), since base pointers that are the result of a shift or
multiply seem less likely.

For SWDEV-516125.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/VOP3Instructions.td
    llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
    llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
    llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-global.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 19eabb46752bf..582a353632436 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -9,6 +9,10 @@
 def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
 def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
 
+// Matches PTRADD as a commutative operation.
+def ptradd_commutative : PatFrags<(ops node:$src0, node:$src1),
+  [(ptradd node:$src0, node:$src1), (ptradd node:$src1, node:$src0)]>;
+
 // Special case for v_div_fmas_{f32|f64}, since it seems to be the
 // only VOP instruction that implicitly reads VCC.
 let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -938,12 +942,18 @@ def : GCNPat<
  (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
  (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
 
-let SubtargetPredicate = HasLshlAddU64Inst in
+let SubtargetPredicate = HasLshlAddU64Inst in {
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
 >;
 
+def : GCNPat <
+  // (ptradd z, (shl x, y)) or (ptradd (shl x, y), z) -> ((x << y) + z)
+  (ThreeOpFrag<shl_0_to_4, ptradd_commutative> i64:$src0, i32:$src1, i64:$src2),
+  (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)>;
+} // End SubtargetPredicate = HasLshlAddU64Inst
+
 let SubtargetPredicate = HasAddMinMaxInsts in {
 def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
 def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
@@ -1019,19 +1029,24 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
 
 // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
 // We need to separate this because otherwise OtherPredicates would be overriden.
-class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
-    (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
+class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp> : GCNPat <
+    (i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
     (inst $src0, $src1, $src2, 0 /* clamp */)
     >;
 
+multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
+  def : IMAD32_Mul24_Pats_Impl<inst, add>;
+  def : IMAD32_Mul24_Pats_Impl<inst, ptradd_commutative>;
+}
+
 // exclude pre-GFX9 where it was slow
 let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
   defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
-  def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
+  defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
 }
 let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
   defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
-  def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
+  defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_gfx11_e64>;
 }
 
 def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
index 6bb68e1e26a14..9c49aade6099f 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
@@ -119,10 +119,7 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
 ; GFX942-GISEL:       ; %bb.0:
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
 ; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 12, v0
 ; GFX942-GISEL-NEXT:    s_nop 1
 ; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
index f571030077870..3019c1d897d98 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
@@ -270,29 +270,15 @@ entry:
 }
 
 define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) {
-; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
-; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT:    s_mov_b32 m0, s2
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
-; GFX1250-SDAG-NEXT:    cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
-; GFX1250-SDAG-NEXT:    s_endpgm
-;
-; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
-; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX1250-GISEL-NEXT:    s_mov_b32 m0, s2
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-GISEL-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
-; GFX1250-GISEL-NEXT:    s_endpgm
+; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b32 m0, s2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-NEXT:    cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
+; GFX1250-NEXT:    s_endpgm
 entry:
   %idxprom = sext i32 %idx to i64
   %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
index dd679101047ea..d5fae1e4a9657 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
@@ -160,27 +160,14 @@ entry:
 }
 
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
-; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
-; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
-; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
-; GFX1250-SDAG-NEXT:    s_endpgm
-;
-; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
-; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-GISEL-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
-; GFX1250-GISEL-NEXT:    s_endpgm
+; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
+; GFX1250-NEXT:    s_endpgm
 entry:
   %idxprom = sext i32 %idx to i64
   %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
index fd35313802558..22563f8e5ff46 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
@@ -160,27 +160,14 @@ entry:
 }
 
 define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
-; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
-; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
-; GFX1250-SDAG-NEXT:    global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
-; GFX1250-SDAG-NEXT:    s_endpgm
-;
-; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
-; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-GISEL-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
-; GFX1250-GISEL-NEXT:    s_endpgm
+; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-NEXT:    global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
+; GFX1250-NEXT:    s_endpgm
 entry:
   %idxprom = sext i32 %idx to i64
   %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
index 3377290ecb1e0..350d468344f65 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
@@ -169,29 +169,15 @@ entry:
 }
 
 define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
-; GFX1250-SDAG-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
-; GFX1250-SDAG:       ; %bb.0: ; %entry
-; GFX1250-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
-; GFX1250-SDAG-NEXT:    global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX1250-SDAG-NEXT:    s_endpgm
-;
-; GFX1250-GISEL-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
-; GFX1250-GISEL:       ; %bb.0: ; %entry
-; GFX1250-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
-; GFX1250-GISEL-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
-; GFX1250-GISEL-NEXT:    global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX1250-GISEL-NEXT:    s_endpgm
+; GFX1250-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-NEXT:    global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %idxprom = sext i32 %idx to i64
   %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
@@ -199,3 +185,6 @@ entry:
   store <2 x i32> %val, ptr addrspace(1) %use
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 4db232cbfa8c7..0fe4d337a5bd7 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -265,18 +265,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
 
 ; Use non-zero shift amounts in v_lshl_add_u64.
 define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: select_v_lshl_add_u64:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: select_v_lshl_add_u64:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: select_v_lshl_add_u64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i64, ptr %base, i64 %voffset
   ret ptr %gep
 }
@@ -284,23 +277,13 @@ define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
 ; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the
 ; mul into a mul24.
 define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
-; GFX942_PTRADD-LABEL: fold_mul24_into_mad:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v2, 0xfffff, v2
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v4, 0xfffff, v4
-; GFX942_PTRADD-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v4
-; GFX942_PTRADD-NEXT:    v_mul_u32_u24_e32 v2, v2, v4
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: fold_mul24_into_mad:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v2, 0xfffff, v2
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v3, 0xfffff, v4
-; GFX942_LEGACY-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: fold_mul24_into_mad:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_and_b32_e32 v2, 0xfffff, v2
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xfffff, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a_masked = and i64 %a, u0xfffff
   %b_masked = and i64 %b, u0xfffff
   %mul = mul i64 %a_masked, %b_masked

diff  --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
index 32f2395c7b2ad..9dd25025d4381 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
@@ -25,20 +25,12 @@ define ptr @gep_as0(ptr %p, i64 %offset) {
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942_PTRADD-LABEL: gep_as0:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: gep_as0:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: gep_as0:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: gep_as0:
 ; GFX10:       ; %bb.0: ; %entry
@@ -188,20 +180,12 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) {
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942_PTRADD-LABEL: multi_gep_as0:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: multi_gep_as0:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: multi_gep_as0:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: multi_gep_as0:
 ; GFX10:       ; %bb.0: ; %entry
@@ -537,3 +521,5 @@ entry:
 ; GFX12_PTRADD: {{.*}}
 ; GFX8_LEGACY: {{.*}}
 ; GFX8_PTRADD: {{.*}}
+; GFX942_LEGACY: {{.*}}
+; GFX942_PTRADD: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 307ff046d48c2..335d58c43c936 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -28,27 +28,14 @@ entry:
 }
 
 define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) {
-; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride:
-; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
-; SDAG-NEXT:    flat_load_b32 v0, v[0:1]
-; SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride:
-; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 3, v[0:1]
-; GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GISEL-NEXT:    flat_load_b32 v0, v[0:1]
-; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GISEL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: flat_load_b32_idxprom_wrong_stride:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
+; GCN-NEXT:    flat_load_b32 v0, v[0:1]
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
@@ -380,16 +367,12 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ;
 ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v6, src_flat_scratch_base_hi
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 3, v[2:3]
+; GISEL-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v0, src_flat_scratch_base_hi
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v0
-; GISEL-NEXT:    v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo
+; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GISEL-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 3, s[0:1]
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT:    v_xor_b32_e32 v0, v5, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v5, v0
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
 ; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GISEL-NEXT:    s_and_saveexec_b32 s2, vcc_lo
@@ -405,7 +388,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ; GISEL-NEXT:    s_branch .LBB21_5
 ; GISEL-NEXT:  .LBB21_3: ; %atomicrmw.global
 ; GISEL-NEXT:    v_mov_b64_e32 v[0:1], 1
-; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GISEL-NEXT:    flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GISEL-NEXT:    s_wait_xcnt 0x0
 ; GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
index faea84e34d7eb..a0fca0e2bdc72 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
@@ -28,27 +28,14 @@ entry:
 }
 
 define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
-; SDAG-LABEL: global_load_b32_idxprom_wrong_stride:
-; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
-; SDAG-NEXT:    global_load_b32 v0, v[0:1], off
-; SDAG-NEXT:    s_wait_loadcnt 0x0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: global_load_b32_idxprom_wrong_stride:
-; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-NEXT:    v_lshlrev_b64_e32 v[0:1], 3, v[0:1]
-; GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
-; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GISEL-NEXT:    global_load_b32 v0, v[0:1], off
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_load_b32_idxprom_wrong_stride:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
+; GCN-NEXT:    global_load_b32 v0, v[0:1], off
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
@@ -349,3 +336,6 @@ entry:
 }
 
 !0 = !{i32 0, i32 1024}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}


        


More information about the llvm-commits mailing list