[llvm] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (PR #71035)

via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 2 01:07:16 PDT 2023


https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/71035

>From d4e215cae69fb78800065b433355e922901f4a41 Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <Stefan.Stipanovic at amd.com>
Date: Thu, 2 Nov 2023 09:06:03 +0100
Subject: [PATCH] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by
 constant.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  11 +
 .../AMDGPU/atomic_optimizations_buffer.ll     |  60 +++--
 .../atomic_optimizations_global_pointer.ll    | 232 +++++++++++-------
 .../atomic_optimizations_local_pointer.ll     | 137 ++++++-----
 .../atomic_optimizations_pixelshader.ll       | 127 +++++++---
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |  60 +++--
 .../atomic_optimizations_struct_buffer.ll     |  60 +++--
 llvm/test/CodeGen/AMDGPU/mul.ll               |  89 +++++++
 .../AMDGPU/reassoc-mul-add-1-to-mad.ll        | 146 +++++++----
 .../AMDGPU/srem-seteq-illegal-types.ll        |  15 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll    |  14 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |   4 +-
 13 files changed, 635 insertions(+), 322 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 607d59db7bcf709..bd3d59203c41122 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4080,6 +4080,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // FIXME: Probably should only be done for gfx9 and onward.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+    APInt CVal = C->getAPIntValue();
+    if (isPowerOf2_32(CVal.getZExtValue() - 1)) {
+      unsigned int Sqr = Log2_32(CVal.getZExtValue() - 1);
+      SDValue Pow2Constant = DAG.getConstant(Sqr, DL, N1.getValueType());
+      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, Pow2Constant);
+      return DAG.getNode(ISD::ADD, DL, VT, Shl, N0);
+    }
+  }
+
   // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
   // matching.
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5aa..dda40a77aebda48 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -64,7 +67,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -92,8 +97,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -120,10 +126,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_constant:
@@ -147,10 +154,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_constant:
@@ -176,11 +184,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W64-NEXT:    s_nop 0
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
@@ -207,11 +216,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W32-NEXT:    s_nop 0
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
@@ -1109,7 +1118,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1136,7 +1147,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1165,7 +1177,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1194,7 +1206,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1222,7 +1234,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1252,7 +1264,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -1284,7 +1296,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f83..78395020e99fa34 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -41,42 +41,81 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i32_constant:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT:    s_mov_b64 s[6:7], exec
-; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT:    ; implicit-def: $vgpr1
-; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB0_2
-; GFX89-NEXT:  ; %bb.1:
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT:    s_mul_i32 s2, s2, 5
-; GFX89-NEXT:    s_mov_b32 s11, 0xf000
-; GFX89-NEXT:    s_mov_b32 s10, -1
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    v_mov_b32_e32 v1, s2
-; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_wbinvl1_vol
-; GFX89-NEXT:  .LBB0_2:
-; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB0_2
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT:    s_mul_i32 s2, s2, 5
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
+; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:  .LBB0_2:
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_2
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:  .LBB0_2:
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
@@ -108,8 +147,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -143,8 +183,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -179,9 +220,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_nop 0
@@ -217,9 +259,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_nop 0
@@ -858,13 +901,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
 ; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1510,7 +1554,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1541,7 +1587,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:  .LBB6_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
@@ -1575,7 +1622,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  .LBB6_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -1613,7 +1660,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
@@ -1649,7 +1696,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
@@ -1686,7 +1733,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1164-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -1725,7 +1772,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1132-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -2371,13 +2418,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -2408,12 +2456,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  .LBB9_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s5
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s6, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
@@ -2447,12 +2496,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB9_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -2489,14 +2539,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:  .LBB9_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1064-NEXT:    v_add_co_ci_u32_e64 v1, s[2:3], 0, 0, s[2:3]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s4, v0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -2529,11 +2580,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:  .LBB9_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1032-NEXT:    v_add_co_u32 v0, s4, v3, v2
+; GFX1032-NEXT:    v_add_co_ci_u32_e64 v1, s4, 0, 0, s4
 ; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
@@ -2570,15 +2622,19 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    buffer_gl1_inv
 ; GFX1164-NEXT:  .LBB9_2:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-NEXT:    v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s4, v0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1164-NEXT:    s_nop 0
@@ -2612,12 +2668,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    buffer_gl1_inv
 ; GFX1132-NEXT:  .LBB9_2:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_co_u32 v0, s4, v3, v2
+; GFX1132-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s4
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
 ; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index abd9a4159f8ccd9..fdc8ee679b5b8b1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -42,7 +42,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -69,9 +71,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -97,9 +101,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -127,8 +132,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -157,8 +163,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -189,9 +196,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
@@ -222,9 +230,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
@@ -1014,12 +1023,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
 ; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1625,7 +1635,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1653,7 +1664,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
@@ -1682,7 +1694,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
@@ -1713,7 +1725,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
@@ -1744,7 +1756,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
@@ -1777,7 +1789,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1164-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -1811,7 +1823,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1132-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -2608,12 +2620,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -2640,12 +2653,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s5
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s6, v0
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -2673,12 +2687,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -2708,13 +2723,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT:    v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1064-NEXT:    v_add_co_ci_u32_e64 v1, s[2:3], 0, 0, s[2:3]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s4, v0
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2742,10 +2758,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1032-NEXT:    v_add_co_u32 v0, s4, v3, v2
+; GFX1032-NEXT:    v_add_co_ci_u32_e64 v1, s4, 0, 0, s4
 ; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
@@ -2778,14 +2795,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:  .LBB11_2:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-NEXT:    v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s[2:3]
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s4, v0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -2816,11 +2837,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:  .LBB11_2:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
-; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_co_u32 v0, s4, v3, v2
+; GFX1132-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s4
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
 ; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 1ebd864e7e03aa9..8e6a5502b4967dc 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -37,7 +37,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX7-NEXT:    s_waitcnt expcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7-NEXT:  .LBB0_4: ; %Flow
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
@@ -49,40 +52,78 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX7-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
 ; GFX7-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i32_constant:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_mov_b64 s[10:11], exec
-; GFX89-NEXT:    ; implicit-def: $vgpr0
-; GFX89-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
-; GFX89-NEXT:    s_cbranch_execz .LBB0_4
-; GFX89-NEXT:  ; %bb.1:
-; GFX89-NEXT:    s_mov_b64 s[12:13], exec
-; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT:    ; implicit-def: $vgpr1
-; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB0_3
-; GFX89-NEXT:  ; %bb.2:
-; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
-; GFX89-NEXT:    s_mul_i32 s12, s12, 5
-; GFX89-NEXT:    v_mov_b32_e32 v1, s12
-; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
-; GFX89-NEXT:  .LBB0_3:
-; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT:  .LBB0_4: ; %Flow
-; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
-; GFX89-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX89-NEXT:    s_cbranch_vccnz .LBB0_6
-; GFX89-NEXT:  ; %bb.5: ; %if
-; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_mov_b64 s[10:11], exec
+; GFX8-NEXT:    ; implicit-def: $vgpr0
+; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
+; GFX8-NEXT:    s_cbranch_execz .LBB0_4
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_mov_b64 s[12:13], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB0_3
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
+; GFX8-NEXT:    s_mul_i32 s12, s12, 5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX8-NEXT:  .LBB0_3:
+; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:  .LBB0_4: ; %Flow
+; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
+; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX8-NEXT:  ; %bb.5: ; %if
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b64 s[10:11], exec
+; GFX9-NEXT:    ; implicit-def: $vgpr0
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
+; GFX9-NEXT:    s_cbranch_execz .LBB0_4
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-NEXT:  ; %bb.2:
+; GFX9-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
+; GFX9-NEXT:    s_mul_i32 s12, s12, 5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s12
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX9-NEXT:  .LBB0_3:
+; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s4
+; GFX9-NEXT:  .LBB0_4: ; %Flow
+; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX9-NEXT:  ; %bb.5: ; %if
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
@@ -108,7 +149,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX1064-NEXT:    v_add3_u32 v0, v1, v0, s4
 ; GFX1064-NEXT:  .LBB0_4: ; %Flow
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
@@ -143,7 +185,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX1032-NEXT:    v_add3_u32 v0, v1, v0, s4
 ; GFX1032-NEXT:  .LBB0_4: ; %Flow
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT:    s_wqm_b32 s4, -1
@@ -181,8 +224,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX1164-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-NEXT:    v_add3_u32 v0, v1, v0, s4
 ; GFX1164-NEXT:  .LBB0_4: ; %Flow
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
@@ -222,8 +266,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX1132-NEXT:    v_add3_u32 v0, v1, v0, s4
 ; GFX1132-NEXT:  .LBB0_4: ; %Flow
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1132-NEXT:    s_wqm_b32 s4, -1
@@ -650,3 +695,5 @@ if:
 else:
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX89: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index baa0c72dbf63e2d..aa17e5893d05742 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -63,7 +66,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -91,8 +96,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -119,10 +125,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_constant:
@@ -146,10 +153,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_constant:
@@ -175,11 +183,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W64-NEXT:    s_nop 0
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
@@ -206,11 +215,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W32-NEXT:    s_nop 0
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
@@ -811,7 +820,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -838,7 +849,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -867,7 +879,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -896,7 +908,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -924,7 +936,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -954,7 +966,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -986,7 +998,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 37d421b01797945..c61bd19b1fc55df 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -65,7 +68,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -94,8 +99,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -123,10 +129,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_constant:
@@ -151,10 +158,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_constant:
@@ -181,11 +189,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W64-NEXT:    s_nop 0
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
@@ -213,11 +222,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W32-NEXT:    s_nop 0
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
@@ -937,7 +946,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -965,7 +976,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -995,7 +1007,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1025,7 +1037,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1054,7 +1066,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1085,7 +1097,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -1118,7 +1130,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b4e9376d8277737..d797f9b6d0a92bd 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2395,6 +2395,95 @@ entry:
   ret void
 }
 
+define i32 @mul_pow2_plus_1(i32 %val) {
+; SI-LABEL: mul_pow2_plus_1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mul_pow2_plus_1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_pow2_plus_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_pow2_plus_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_pow2_plus_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mul_pow2_plus_1:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+  %mul = mul i32 %val, 9
+  ret i32 %mul
+}
+
+define i32 @mul_pow2_plus_1_add(i32 %val) {
+; SI-LABEL: mul_pow2_plus_1_add:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mul_pow2_plus_1_add:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_pow2_plus_1_add:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, 1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_pow2_plus_1_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-NEXT:    v_add3_u32 v0, v1, v0, 1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_pow2_plus_1_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v0, v1, v0, 1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mul_pow2_plus_1_add:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+  %mul = mul i32 %val, 9
+  %add = add i32 %mul, 1
+  ret i32 %add
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index ca79772dbed74c0..d183b3f312c4533 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -2610,27 +2610,31 @@ define i32 @v_mul_9_add_52_i32(i32 %arg) {
 ; GFX67-LABEL: v_mul_9_add_52_i32:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX67-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 52, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 52, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_9_add_52_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, 52
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 9, 52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-NEXT:    v_add3_u32 v0, v1, v0, 52
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %arg, 9
   %add = add i32 %mul, 52
@@ -2642,25 +2646,33 @@ define i16 @v_mul_9_add_52_i16(i16 %arg) {
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-NEXT:    v_mad_u32_u24 v0, v0, 9, 52
+; GFX67-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 52, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u16 v0, v0, 9, 52
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 3, v0
+; GFX8-NEXT:    v_add_u16_e32 v0, v1, v0
+; GFX8-NEXT:    v_add_u16_e32 v0, 52, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_9_add_52_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, 9, 52
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 3, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, 52, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mad_u16 v0, v0, 9, 52
+; GFX10-NEXT:    v_lshlrev_b16 v1, 3, v0
+; GFX10-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX10-NEXT:    v_add_nc_u16 v0, v0, 52
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg, 9
   %add = add i16 %mul, 52
@@ -2671,10 +2683,13 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
 ; GFX67-LABEL: v_mul_9_add_52_v2i16:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX67-NEXT:    v_mad_u32_u24 v1, v1, 9, 52
-; GFX67-NEXT:    v_mad_u32_u24 v0, v0, 9, 52
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX67-NEXT:    v_lshlrev_b32_e32 v2, 3, v1
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 52, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 52, v0
 ; GFX67-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2685,9 +2700,13 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_mad_u16 v1, v1, 9, 52
-; GFX8-NEXT:    v_mad_u16 v0, v0, 9, 52
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 3, v1
+; GFX8-NEXT:    v_add_u16_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 3, v0
+; GFX8-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 52
+; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_add_u16_e32 v0, 52, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2713,10 +2732,9 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
 ; GFX6-LABEL: v_mul_9_add_52_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 9
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, 9
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 9
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[0:1], 3
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 52, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2724,16 +2742,20 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
 ; GFX7-LABEL: v_mul_9_add_52_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_lo_u32 v2, v1, 9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v2, v1, 9
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2742,17 +2764,17 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
-; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, 9, v[1:2]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
+; GFX900-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_9_add_52_i64:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
-; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
-; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, 9, v[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_i64:
@@ -2760,7 +2782,8 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 9, 52
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v2, 9, v[1:2]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v2
+; GFX10-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 9
   %add = add i64 %mul, 52
@@ -2771,27 +2794,31 @@ define i32 @v_mul_5_add_1_i32(i32 %arg) {
 ; GFX67-LABEL: v_mul_5_add_1_i32:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_mul_lo_u32 v0, v0, 5
+; GFX67-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_5_add_1_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, 1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 5, 1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10-NEXT:    v_add3_u32 v0, v1, v0, 1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %arg, 5
   %add = add i32 %mul, 1
@@ -2839,25 +2866,33 @@ define i16 @v_mul_5_add_1_i16(i16 %arg) {
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-NEXT:    v_mad_u32_u24 v0, v0, 5, 1
+; GFX67-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u16 v0, v0, 5, 1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u16_e32 v0, v1, v0
+; GFX8-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_5_add_1_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, 5, 1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 2, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mad_u16 v0, v0, 5, 1
+; GFX10-NEXT:    v_lshlrev_b16 v1, 2, v0
+; GFX10-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX10-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg, 5
   %add = add i16 %mul, 1
@@ -2905,10 +2940,13 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
 ; GFX67-LABEL: v_mul_5_add_1_v2i16:
 ; GFX67:       ; %bb.0:
 ; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX67-NEXT:    v_mad_u32_u24 v1, v1, 5, 1
-; GFX67-NEXT:    v_mad_u32_u24 v0, v0, 5, 1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX67-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v2
@@ -2919,9 +2957,13 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_mad_u16 v1, v1, 5, 1
-; GFX8-NEXT:    v_mad_u16 v0, v0, 5, 1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 2, v1
+; GFX8-NEXT:    v_add_u16_e32 v1, v2, v1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 2, v0
+; GFX8-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2995,10 +3037,9 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
 ; GFX6-LABEL: v_mul_5_add_1_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 5
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, 5
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[0:1], 2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -3006,16 +3047,20 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
 ; GFX7-LABEL: v_mul_5_add_1_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_lo_u32 v2, v1, 5
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v2, v1, 5
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3024,17 +3069,17 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
-; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, 5, v[1:2]
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX900-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_5_add_1_i64:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
-; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
-; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, 5, v[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_i64:
@@ -3042,7 +3087,8 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 5, 1
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v2, 5, v[1:2]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
+; GFX10-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 5
   %add = add i64 %mul, 1
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59f1..ef5d819da79ffde 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -23,7 +23,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfe_i32 v1, v0, 0, 4
-; CHECK-NEXT:    v_mul_i32_i24_e32 v1, 3, v1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
 ; CHECK-NEXT:    v_bfe_u32 v1, v1, 7, 1
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -71,20 +72,22 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
 ; CHECK-NEXT:    v_mul_hi_i32 v4, v4, s4
 ; CHECK-NEXT:    v_mul_hi_i32 v3, v3, s5
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
+; CHECK-NEXT:    v_ashrrev_i32_e32 v5, 1, v5
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 31, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 1, v4
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v5, v5, 9
-; CHECK-NEXT:    v_mul_lo_u32 v4, v4, 9
+; CHECK-NEXT:    v_lshlrev_b32_e32 v6, 3, v5
+; CHECK-NEXT:    v_lshlrev_b32_e32 v7, 3, v4
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v3, -9
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 930ba80ad69638d..86101e583458e8c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec
-  ; SI-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
+  ; SI-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5.if.end:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index c71dc06c68d8d68..9183f043f052cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -92,20 +92,20 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-NEXT:  .LBB2_1: ; %if.end
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; SI-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; SI-NEXT:    v_add_nc_u32_e32 v2, 1, v0
+; SI-NEXT:    v_add_nc_u32_e32 v2, 1, v3
 ; SI-NEXT:    s_add_i32 s1, s1, 1
 ; SI-NEXT:    s_cmp_lt_i32 s1, s0
 ; SI-NEXT:    s_cbranch_scc0 .LBB2_6
 ; SI-NEXT:  .LBB2_2: ; %for.body
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; SI-NEXT:    s_xor_b32 s2, exec_lo, s2
 ; SI-NEXT:  ; %bb.3: ; %else
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT:    v_mul_lo_u32 v0, v2, 3
-; SI-NEXT:    v_mul_f32_e32 v3, v1, v2
+; SI-NEXT:    v_mul_f32_e32 v0, v1, v2
+; SI-NEXT:    v_lshl_add_u32 v3, v2, 1, v2
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  ; %bb.4: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
@@ -113,11 +113,11 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-NEXT:    s_cbranch_execz .LBB2_1
 ; SI-NEXT:  ; %bb.5: ; %if
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT:    v_mul_f32_e32 v3, s1, v1
-; SI-NEXT:    v_add_nc_u32_e32 v0, 1, v2
+; SI-NEXT:    v_mul_f32_e32 v0, s1, v1
+; SI-NEXT:    v_add_nc_u32_e32 v3, 1, v2
 ; SI-NEXT:    s_branch .LBB2_1
 ; SI-NEXT:  .LBB2_6: ; %for.end
-; SI-NEXT:    v_add_f32_e32 v0, v0, v3
+; SI-NEXT:    v_add_f32_e32 v0, v3, v0
 ; SI-NEXT:    ; return to shader part epilog
 entry:
 ;  %break = icmp sgt i32 %bound, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95f947cbca14f05..6bb066f06dd9a24 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
 ; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
 ; GFX9-W64-NEXT:  ; %bb.3: ; %IF
-; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
+; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
 ; GFX9-W64-NEXT:  ; %bb.4: ; %END
 ; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
 ; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
 ; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
 ; GFX10-W32-NEXT:  ; %bb.3: ; %IF
-; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
+; GFX10-W32-NEXT:    v_lshl_add_u32 v0, v5, 1, v5
 ; GFX10-W32-NEXT:  ; %bb.4: ; %END
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12



More information about the llvm-commits mailing list