[llvm] 23db8e4 - [AMDGPU] Use v_mad_u64_u32 for IMAD32

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 9 11:39:56 PDT 2022


Author: Stanislav Mekhanoshin
Date: 2022-06-09T11:39:49-07:00
New Revision: 23db8e4b43225fd5e4df8cab52e0b8517ba77d49

URL: https://github.com/llvm/llvm-project/commit/23db8e4b43225fd5e4df8cab52e0b8517ba77d49
DIFF: https://github.com/llvm/llvm-project/commit/23db8e4b43225fd5e4df8cab52e0b8517ba77d49.diff

LOG: [AMDGPU] Use v_mad_u64_u32 for IMAD32

Nic Curtis ran the experiments showing that it is faster than a
separate mul and add.

Fixes: SWDEV-332806

Differential Revision: https://reviews.llvm.org/D127253

Added: 
    llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll

Modified: 
    llvm/lib/Target/AMDGPU/VOP3Instructions.td
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
    llvm/test/CodeGen/AMDGPU/mad_64_32.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c2e7db281359..71e71e2d1cdb 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -410,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
 
 } // End Predicates = [Has16BitInsts, isGFX10Plus]
 
-class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
   (ops node:$x, node:$y, node:$z),
   // When the inner operation is used multiple times, selecting 3-op
   // instructions may still be beneficial -- if the other users can be
@@ -440,7 +440,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
     return true;
   }]> {
   let PredicateCodeUsesOperands = 1;
+}
 
+class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
   // The divergence predicate is irrelevant in GlobalISel, as we have
   // proper register bank checks. We just need to verify the constant
   // bus restriction when all the sources are considered.
@@ -568,6 +570,33 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
 def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
 } // End SubtargetPredicate = isGFX9Plus
 
+// FIXME: GlobalISel in general does not handle instructions with 2 results,
+// so it cannot use these patterns.
+multiclass IMAD32_Pats <VOP3_Pseudo inst> {
+  def : GCNPat <
+        (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
+        (EXTRACT_SUBREG (inst $src0, $src1,
+                              (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
+                                            $src2, sub0,
+                                            (i32 (IMPLICIT_DEF)), sub1),
+                                            0 /* clamp */),
+                        sub0)
+        >;
+  // Immediate src2 in the pattern above will not fold because it would be partially
+  // undef. Hence define a specialized pattern for this case.
+  // FIXME: The GlobalISel pattern exporter fails to export a pattern like this and
+  // asserts, so make it SDAG only.
+  def : GCNPat <
+        (ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
+        (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
+        >;
+}
+
+let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow
+defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+let SubtargetPredicate = isGFX11Only in
+defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+
 def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
   let Src0RC64 = VRegSrc_32;
   let Src1RC64 = SCSrc_b32;

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index be0d5040718f..0508b97d8500 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -268,12 +268,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W64-NEXT:  .LBB1_2:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W64-NEXT:    s_endpgm
 ;
@@ -298,12 +297,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8e5c4f353f90..a85e17d019c2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -288,12 +288,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1064-NEXT:  .LBB1_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -326,12 +325,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1032-NEXT:  .LBB1_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -878,11 +876,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -923,11 +920,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -967,11 +963,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2]
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -2000,16 +1995,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:  .LBB10_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2048,14 +2043,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v2
+; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -2094,14 +2089,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1032-NEXT:    v_mad_u64_u32 v[2:3], s0, s2, v2, 0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s0, s2, v2, 0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v2
+; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5]
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index e478e2c0b62b..c0e2f7b477b0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -293,12 +293,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1064-NEXT:  .LBB1_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -327,12 +326,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1032-NEXT:  .LBB1_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -1012,13 +1010,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1053,11 +1050,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5]
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
+; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v1
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
@@ -1091,11 +1087,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
+; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
+; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v1
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 entry:
@@ -2176,18 +2171,18 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX9-NEXT:  .LBB12_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v3, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2220,13 +2215,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v2
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v3
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2260,13 +2255,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v4, s3, v2
-; GFX1032-NEXT:    v_mad_u64_u32 v[2:3], s2, s2, v2, 0
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s2, s2, v2, 0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v3, v4
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v2
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index ac7a403a1aac..007f7e6ef7c7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -267,12 +267,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W64-NEXT:  .LBB1_2:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W64-NEXT:    s_endpgm
 ;
@@ -297,12 +296,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 31076cd7e69c..9cfd9df76444 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -276,12 +276,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W64-NEXT:  .LBB1_2:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W64-NEXT:    s_endpgm
 ;
@@ -307,12 +306,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX10W32-NEXT:  .LBB1_2:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX10W32-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 8ee4759dd7af..09224867808c 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -296,10 +296,12 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ext0 = sext i32 %arg0 to i64
   %ext1 = zext i32 %arg1 to i64
@@ -363,8 +365,9 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %trunc.lhs = and i64 %arg0, 8589934591
   %trunc.rhs = and i64 %arg1, 4294967295
@@ -400,10 +403,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v2, v6, v3
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %trunc.lhs = and i64 %arg0, 4294967295
   %trunc.rhs = and i64 %arg1, 8589934591

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
new file mode 100644
index 000000000000..2a06faaa5878
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+
+define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GFX9-LABEL: mad_i32_vvv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v1, v[2:3]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vvv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, v1, v[2:3]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vvv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v4, v3, v[2:3]
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: mad_i32_sss:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s1
+; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) {
+; GFX9-LABEL: mad_i32_vvc:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v1, 42
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vvc:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, v1, 42
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vvc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v3, v2, 42
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, 42
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) {
+; GFX9-LABEL: mad_i32_vvi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x12d687
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v1, v[2:3]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vvi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, v1, 0x12d687
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vvi:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v3, v2, 0x12d687
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, 1234567
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) {
+; GFX9-LABEL: mad_i32_vcv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 42, v[1:2]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vcv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, 42, v[1:2]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vcv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], s0, v0, 42, v[1:2]
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, 42
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vcc(i32 %a) {
+; GFX9-LABEL: mad_i32_vcc:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 42, 43
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vcc:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, 42, 43
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v2, 42, 43
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, 42
+  %add = add i32 %mul, 43
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) {
+; GFX9-LABEL: mad_i32_vvs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v1, s[0:1]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vvs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, v1, s[0:1]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vvs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v3, v2, s[0:1]
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) {
+; GFX9-LABEL: mad_i32_vsv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, s0, v[1:2]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vsv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, s0, v[1:2]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vsv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], s0, v0, s0, v[1:2]
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) {
+; GFX9-LABEL: mad_i32_svv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s0, v0, v[1:2]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_svv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, v[1:2]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_svv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], s0, s0, v0, v[1:2]
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) {
+; GFX9-LABEL: mad_i32_vss:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT:    v_add_u32_e32 v0, s1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vss:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s2, s1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, s0, s[2:3]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vss:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_mov_b32 s2, s1
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v2, s0, s[2:3]
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) {
+; GFX9-LABEL: mad_i32_svs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_svs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s2, s1
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_svs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_mov_b32 s2, s1
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v2, s[2:3]
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
+; GFX9-LABEL: mad_i32_ssv:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_i32 s0, s0, s1
+; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_ssv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, s1, v[0:1]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_ssv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, s0, s1, v[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}
+
+define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) {
+; GFX9-LABEL: mad_i32_vvv_multiuse:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, v1, v2
+; GFX9-NEXT:    flat_store_dword v[0:1], v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: mad_i32_vvv_multiuse:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v1, v2
+; GFX10-NEXT:    flat_store_dword v[0:1], v1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: mad_i32_vvv_multiuse:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mul_lo_u32 v1, v0, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v2
+; GFX11-NEXT:    flat_store_b32 v[0:1], v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ; return to shader part epilog
+  %mul = mul i32 %a, %b
+  %add = add i32 %mul, %c
+  store i32 %mul, i32* undef
+  %cast = bitcast i32 %add to float
+  ret float %cast
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 7ae67bad68ae..b662256829b5 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -74,22 +74,21 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v0
 ; GFX9-NEXT:    v_add_u32_e32 v12, v17, v0
-; GFX9-NEXT:    v_add_u32_e32 v19, v9, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_madak_f32 v3, v3, v7, 0x3727c5ac
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v18, v3, v5
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v16
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v3, v13
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v15
+; GFX9-NEXT:    v_add_u32_e32 v19, v3, v16
+; GFX9-NEXT:    v_add_u32_e32 v3, v9, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v18
 ; GFX9-NEXT:    v_sub_u32_e32 v12, v12, v18
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v19, v13
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4]
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[6:7], v12, v14
-; GFX9-NEXT:    v_sub_u32_e32 v18, v19, v18
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    v_add_u32_e32 v3, v18, v3
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, v18, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[18:19], 2, v[3:4]
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add_co_u32_e64 v18, s[6:7], v10, v18
 ; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7]
 ; GFX9-NEXT:    global_load_dword v3, v[18:19], off

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 63af19beaa0d..d5d8817783fb 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -33,13 +33,13 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; MUBUF-NEXT:    s_cbranch_execz .LBB0_2
 ; MUBUF-NEXT:  ; %bb.1: ; %if.then4.i
 ; MUBUF-NEXT:    v_add_nc_u32_e64 v0, 4, 0x4000
+; MUBUF-NEXT:    s_mov_b32 s0, 0x41c64e6d
 ; MUBUF-NEXT:    s_clause 0x1
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[36:39], 0 offen
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[36:39], 0 offen offset:4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_add_nc_u32_e32 v0, v2, v1
-; MUBUF-NEXT:    v_mul_lo_u32 v0, 0x41c64e6d, v0
-; MUBUF-NEXT:    v_add_nc_u32_e32 v0, 0x3039, v0
+; MUBUF-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, s0, 0x3039
 ; MUBUF-NEXT:    buffer_store_dword v0, v0, s[36:39], 0 offen
 ; MUBUF-NEXT:  .LBB0_2: ; %shader_eval_surface.exit
 ; MUBUF-NEXT:    s_endpgm
@@ -67,11 +67,11 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 ; FLATSCR-NEXT:    s_cbranch_execz .LBB0_2
 ; FLATSCR-NEXT:  ; %bb.1: ; %if.then4.i
 ; FLATSCR-NEXT:    s_movk_i32 vcc_lo, 0x4000
+; FLATSCR-NEXT:    s_mov_b32 s0, 0x41c64e6d
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, vcc_lo offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_nc_u32_e32 v0, v1, v0
-; FLATSCR-NEXT:    v_mul_lo_u32 v0, 0x41c64e6d, v0
-; FLATSCR-NEXT:    v_add_nc_u32_e32 v0, 0x3039, v0
+; FLATSCR-NEXT:    v_mad_u64_u32 v[0:1], s0, v0, s0, 0x3039
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
 ; FLATSCR-NEXT:  .LBB0_2: ; %shader_eval_surface.exit
 ; FLATSCR-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 609146dd4842..e4fdafe26551 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2796,33 +2796,32 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
-; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, 0x186a0, v4, 0
-; GFX1030-NEXT:    v_mul_lo_u32 v6, 0x186a0, v5
+; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v6
+; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], s4, 0x186a0, v5, 0
+; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], s4, 0x186a0, v6, v[3:4]
 ; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v6
 ; GFX1030-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX1030-NEXT:    v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0
 ; GFX1030-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2
 ; GFX1030-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
 ; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT:    v_add_co_u32 v6, vcc_lo, v4, 2
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, 2
+; GFX1030-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
 ; GFX1030-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0
 ; GFX1030-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1030-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s4
 ; GFX1030-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc_lo
-; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, 1
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v5, vcc_lo
+; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v5, 1
+; GFX1030-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
 ; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
 ; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc_lo
 ; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
 ; GFX1030-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_test_udiv64_mulhi_fold:


        


More information about the llvm-commits mailing list