[llvm] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (PR #71035)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 01:07:16 PDT 2023
https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/71035
>From d4e215cae69fb78800065b433355e922901f4a41 Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <Stefan.Stipanovic at amd.com>
Date: Thu, 2 Nov 2023 09:06:03 +0100
Subject: [PATCH] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by
constant.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 11 +
.../AMDGPU/atomic_optimizations_buffer.ll | 60 +++--
.../atomic_optimizations_global_pointer.ll | 232 +++++++++++-------
.../atomic_optimizations_local_pointer.ll | 137 ++++++-----
.../atomic_optimizations_pixelshader.ll | 127 +++++++---
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 60 +++--
.../atomic_optimizations_struct_buffer.ll | 60 +++--
llvm/test/CodeGen/AMDGPU/mul.ll | 89 +++++++
.../AMDGPU/reassoc-mul-add-1-to-mad.ll | 146 +++++++----
.../AMDGPU/srem-seteq-illegal-types.ll | 15 +-
llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 2 +-
llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll | 14 +-
llvm/test/CodeGen/AMDGPU/wqm.ll | 4 +-
13 files changed, 635 insertions(+), 322 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 607d59db7bcf709..bd3d59203c41122 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4080,6 +4080,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ // FIXME: Should probably be limited to gfx9+, which has v_lshl_add_u32.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ APInt CVal = C->getAPIntValue();
+ if (isPowerOf2_32(CVal.getZExtValue() - 1)) {
+ unsigned int Sqr = Log2_32(CVal.getZExtValue() - 1);
+ SDValue Pow2Constant = DAG.getConstant(Sqr, DL, N1.getValueType());
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, Pow2Constant);
+ return DAG.getNode(ISD::ADD, DL, VT, Shl, N0);
+ }
+ }
+
// Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
// matching.
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5aa..dda40a77aebda48 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -64,7 +67,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -92,8 +97,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -120,10 +126,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -147,10 +154,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -176,11 +184,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT: s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -207,11 +216,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1109,7 +1118,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1136,7 +1147,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1165,7 +1177,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1194,7 +1206,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -1222,7 +1234,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
@@ -1252,7 +1264,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -1284,7 +1296,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f83..78395020e99fa34 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -41,42 +41,81 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i32_constant:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT: s_cbranch_execz .LBB0_2
-; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT: s_mul_i32 s2, s2, 5
-; GFX89-NEXT: s_mov_b32 s11, 0xf000
-; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, s2
-; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: .LBB0_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB0_2
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_mul_i32 s2, s2, 5
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB0_2:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_mul_i32 s2, s2, 5
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB0_2:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
@@ -108,8 +147,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
@@ -143,8 +183,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
@@ -179,9 +220,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
@@ -217,9 +259,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
@@ -858,13 +901,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s6, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1510,7 +1554,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1541,7 +1587,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -1575,7 +1622,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -1613,7 +1660,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
@@ -1649,7 +1696,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
@@ -1686,7 +1733,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -1725,7 +1772,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -2371,13 +2418,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -2408,12 +2456,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -2447,12 +2496,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s6, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -2489,14 +2539,15 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT: v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1064-NEXT: v_add_co_ci_u32_e64 v1, s[2:3], 0, 0, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s4, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
@@ -2529,11 +2580,12 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 2, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1032-NEXT: v_add_co_u32 v0, s4, v3, v2
+; GFX1032-NEXT: v_add_co_ci_u32_e64 v1, s4, 0, 0, s4
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@@ -2570,15 +2622,19 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: .LBB9_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffe
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s4, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
@@ -2612,12 +2668,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: .LBB9_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: v_lshlrev_b32_e32 v3, 2, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_add_co_u32 v0, s4, v3, v2
+; GFX1132-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index abd9a4159f8ccd9..fdc8ee679b5b8b1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -42,7 +42,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -69,9 +71,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -97,9 +101,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -127,8 +132,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -157,8 +163,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -189,9 +196,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -222,9 +230,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -1014,12 +1023,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v1
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s6, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1625,7 +1635,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1653,7 +1664,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
@@ -1682,7 +1694,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
@@ -1713,7 +1725,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
@@ -1744,7 +1756,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
@@ -1777,7 +1789,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -1811,7 +1823,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -2608,12 +2620,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v1
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -2640,12 +2653,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -2673,12 +2687,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s6, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -2708,13 +2723,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT: v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1064-NEXT: v_add_co_ci_u32_e64 v1, s[2:3], 0, 0, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s4, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2742,10 +2758,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 2, v2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1032-NEXT: v_add_co_u32 v0, s4, v3, v2
+; GFX1032-NEXT: v_add_co_ci_u32_e64 v1, s4, 0, 0, s4
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@@ -2778,14 +2795,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: .LBB11_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_add_co_u32 v0, s[2:3], v3, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffe
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s4, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -2816,11 +2837,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: .LBB11_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_lshlrev_b32_e32 v3, 2, v2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_add_co_u32 v0, s4, v3, v2
+; GFX1132-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s4
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 1ebd864e7e03aa9..8e6a5502b4967dc 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -37,7 +37,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7-NEXT: s_waitcnt expcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7-NEXT: .LBB0_4: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_wqm_b64 s[4:5], -1
@@ -49,40 +52,78 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX7-NEXT: .LBB0_6: ; %UnifiedReturnBlock
; GFX7-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i32_constant:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_mov_b64 s[10:11], exec
-; GFX89-NEXT: ; implicit-def: $vgpr0
-; GFX89-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
-; GFX89-NEXT: s_cbranch_execz .LBB0_4
-; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_mov_b64 s[12:13], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[10:11], vcc
-; GFX89-NEXT: s_cbranch_execz .LBB0_3
-; GFX89-NEXT: ; %bb.2:
-; GFX89-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
-; GFX89-NEXT: s_mul_i32 s12, s12, 5
-; GFX89-NEXT: v_mov_b32_e32 v1, s12
-; GFX89-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
-; GFX89-NEXT: .LBB0_3:
-; GFX89-NEXT: s_or_b64 exec, exec, s[10:11]
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: .LBB0_4: ; %Flow
-; GFX89-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX89-NEXT: s_wqm_b64 s[4:5], -1
-; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX89-NEXT: s_cbranch_vccnz .LBB0_6
-; GFX89-NEXT: ; %bb.5: ; %if
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT: .LBB0_6: ; %UnifiedReturnBlock
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b64 s[10:11], exec
+; GFX8-NEXT: ; implicit-def: $vgpr0
+; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
+; GFX8-NEXT: s_cbranch_execz .LBB0_4
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB0_3
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
+; GFX8-NEXT: s_mul_i32 s12, s12, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s12
+; GFX8-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX8-NEXT: .LBB0_3:
+; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: .LBB0_4: ; %Flow
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_wqm_b64 s[4:5], -1
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
+; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX8-NEXT: ; %bb.5: ; %if
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: .LBB0_6: ; %UnifiedReturnBlock
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b64 s[10:11], exec
+; GFX9-NEXT: ; implicit-def: $vgpr0
+; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
+; GFX9-NEXT: s_cbranch_execz .LBB0_4
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_mov_b64 s[12:13], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr1
+; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
+; GFX9-NEXT: s_mul_i32 s12, s12, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX9-NEXT: .LBB0_3:
+; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s4
+; GFX9-NEXT: .LBB0_4: ; %Flow
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_wqm_b64 s[4:5], -1
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX9-NEXT: ; %bb.5: ; %if
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: .LBB0_6: ; %UnifiedReturnBlock
+; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
@@ -108,7 +149,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX1064-NEXT: v_add3_u32 v0, v1, v0, s4
; GFX1064-NEXT: .LBB0_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
@@ -143,7 +185,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX1032-NEXT: v_add3_u32 v0, v1, v0, s4
; GFX1032-NEXT: .LBB0_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
@@ -181,8 +224,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX1164-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_add3_u32 v0, v1, v0, s4
; GFX1164-NEXT: .LBB0_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
@@ -222,8 +266,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX1132-NEXT: v_add3_u32 v0, v1, v0, s4
; GFX1132-NEXT: .LBB0_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
@@ -650,3 +695,5 @@ if:
else:
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX89: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index baa0c72dbf63e2d..aa17e5893d05742 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -63,7 +66,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -91,8 +96,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -119,10 +125,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -146,10 +153,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -175,11 +183,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT: s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -206,11 +215,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -811,7 +820,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -838,7 +849,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -867,7 +879,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -896,7 +908,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -924,7 +936,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
@@ -954,7 +966,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -986,7 +998,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 37d421b01797945..c61bd19b1fc55df 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -65,7 +68,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -94,8 +99,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -123,10 +129,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -151,10 +158,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -181,11 +189,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT: s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -213,11 +222,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -937,7 +946,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -965,7 +976,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -995,7 +1007,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1025,7 +1037,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -1054,7 +1066,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
@@ -1085,7 +1097,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -1118,7 +1130,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index b4e9376d8277737..d797f9b6d0a92bd 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2395,6 +2395,95 @@ entry:
ret void
}
+define i32 @mul_pow2_plus_1(i32 %val) {
+; SI-LABEL: mul_pow2_plus_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mul_pow2_plus_1:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_pow2_plus_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_pow2_plus_1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_pow2_plus_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mul_pow2_plus_1:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+ %mul = mul i32 %val, 9 ; 9 = 2^3 + 1; the SI and VI check lines expect a shift left by 3 plus an add, the GFX9, GFX10 and GFX11 check lines a single v_lshl_add_u32
+ ret i32 %mul
+}
+
+define i32 @mul_pow2_plus_1_add(i32 %val) {
+; SI-LABEL: mul_pow2_plus_1_add:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mul_pow2_plus_1_add:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_pow2_plus_1_add:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, 1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_pow2_plus_1_add:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-NEXT: v_add3_u32 v0, v1, v0, 1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_pow2_plus_1_add:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v0, v1, v0, 1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mul_pow2_plus_1_add:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+ %mul = mul i32 %val, 9 ; 9 = 2^3 + 1
+ %add = add i32 %mul, 1 ; trailing add: kept as a separate add in the SI and VI check lines, merged with the shifted value into v_add3_u32 in the GFX9, GFX10 and GFX11 check lines
+ ret i32 %add
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index ca79772dbed74c0..d183b3f312c4533 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -2610,27 +2610,31 @@ define i32 @v_mul_9_add_52_i32(i32 %arg) {
; GFX67-LABEL: v_mul_9_add_52_i32:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mul_lo_u32 v0, v0, 9
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX67-NEXT: v_add_i32_e32 v0, vcc, 52, v0
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_9_add_52_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, v0, 9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 52, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_9_add_52_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, 52
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_9_add_52_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, 9, 52
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX10-NEXT: v_add3_u32 v0, v1, v0, 52
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i32 %arg, 9
%add = add i32 %mul, 52
@@ -2642,25 +2646,33 @@ define i16 @v_mul_9_add_52_i16(i16 %arg) {
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, 9, 52
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, 52, v0
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_9_add_52_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u16 v0, v0, 9, 52
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 3, v0
+; GFX8-NEXT: v_add_u16_e32 v0, v1, v0
+; GFX8-NEXT: v_add_u16_e32 v0, 52, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_9_add_52_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_legacy_u16 v0, v0, 9, 52
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 3, v0
+; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 52, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_9_add_52_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u16 v0, v0, 9, 52
+; GFX10-NEXT: v_lshlrev_b16 v1, 3, v0
+; GFX10-NEXT: v_add_nc_u16 v0, v1, v0
+; GFX10-NEXT: v_add_nc_u16 v0, v0, 52
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i16 %arg, 9
%add = add i16 %mul, 52
@@ -2671,10 +2683,13 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
; GFX67-LABEL: v_mul_9_add_52_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, 9, 52
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, 9, 52
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 3, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 52, v1
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, 52, v0
; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_or_b32_e32 v0, v0, v2
@@ -2685,9 +2700,13 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_mad_u16 v1, v1, 9, 52
-; GFX8-NEXT: v_mad_u16 v0, v0, 9, 52
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 3, v1
+; GFX8-NEXT: v_add_u16_e32 v1, v2, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 3, v0
+; GFX8-NEXT: v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 52
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_u16_e32 v0, 52, v0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -2713,10 +2732,9 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
; GFX6-LABEL: v_mul_9_add_52_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, 9
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, 9
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 3
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 52, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2724,16 +2742,20 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
; GFX7-LABEL: v_mul_9_add_52_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_lo_u32 v2, v1, 9
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_9_add_52_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v2, v1, 9
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -2742,17 +2764,17 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
-; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 9, v[1:2]
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v2
+; GFX900-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_mul_9_add_52_i64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v2
; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 9, v[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_9_add_52_i64:
@@ -2760,7 +2782,8 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, 9, 52
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, v2, 9, v[1:2]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v2
+; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i64 %arg, 9
%add = add i64 %mul, 52
@@ -2771,27 +2794,31 @@ define i32 @v_mul_5_add_1_i32(i32 %arg) {
; GFX67-LABEL: v_mul_5_add_1_i32:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mul_lo_u32 v0, v0, 5
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_5_add_1_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, v0, 5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_5_add_1_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, 1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_5_add_1_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, 5, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10-NEXT: v_add3_u32 v0, v1, v0, 1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i32 %arg, 5
%add = add i32 %mul, 1
@@ -2839,25 +2866,33 @@ define i16 @v_mul_5_add_1_i16(i16 %arg) {
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, 5, 1
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX67-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_5_add_1_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u16 v0, v0, 5, 1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u16_e32 v0, v1, v0
+; GFX8-NEXT: v_add_u16_e32 v0, 1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_5_add_1_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_legacy_u16 v0, v0, 5, 1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 2, v0
+; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_5_add_1_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u16 v0, v0, 5, 1
+; GFX10-NEXT: v_lshlrev_b16 v1, 2, v0
+; GFX10-NEXT: v_add_nc_u16 v0, v1, v0
+; GFX10-NEXT: v_add_nc_u16 v0, v0, 1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i16 %arg, 5
%add = add i16 %mul, 1
@@ -2905,10 +2940,13 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
; GFX67-LABEL: v_mul_5_add_1_v2i16:
; GFX67: ; %bb.0:
; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, 5, 1
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, 5, 1
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 2, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-NEXT: v_or_b32_e32 v0, v0, v2
@@ -2919,9 +2957,13 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_mad_u16 v1, v1, 5, 1
-; GFX8-NEXT: v_mad_u16 v0, v0, 5, 1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 2, v1
+; GFX8-NEXT: v_add_u16_e32 v1, v2, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 2, v0
+; GFX8-NEXT: v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 1
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_u16_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -2995,10 +3037,9 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
; GFX6-LABEL: v_mul_5_add_1_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, 5
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, 5
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, 5
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 2
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3006,16 +3047,20 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
; GFX7-LABEL: v_mul_5_add_1_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_lo_u32 v2, v1, 5
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_5_add_1_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v2, v1, 5
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -3024,17 +3069,17 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v2, v1
; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
-; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 5, v[1:2]
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX900-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_mul_5_add_1_i64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 2, v2
; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 5, v[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_5_add_1_i64:
@@ -3042,7 +3087,8 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, 5, 1
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, v2, 5, v[1:2]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v2
+; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i64 %arg, 5
%add = add i64 %mul, 1
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59f1..ef5d819da79ffde 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -23,7 +23,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_bfe_i32 v1, v0, 0, 4
-; CHECK-NEXT: v_mul_i32_i24_e32 v1, 3, v1
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 1, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v2, 4, v1
; CHECK-NEXT: v_bfe_u32 v1, v1, 7, 1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
@@ -71,20 +72,22 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
; CHECK-NEXT: v_mul_hi_i32 v4, v4, s4
; CHECK-NEXT: v_mul_hi_i32 v3, v3, s5
; CHECK-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; CHECK-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; CHECK-NEXT: v_ashrrev_i32_e32 v5, 1, v5
; CHECK-NEXT: v_lshrrev_b32_e32 v7, 31, v4
-; CHECK-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; CHECK-NEXT: v_ashrrev_i32_e32 v4, 1, v4
; CHECK-NEXT: v_lshrrev_b32_e32 v8, 31, v3
; CHECK-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT: v_mul_lo_u32 v5, v5, 9
-; CHECK-NEXT: v_mul_lo_u32 v4, v4, 9
+; CHECK-NEXT: v_lshlrev_b32_e32 v6, 3, v5
+; CHECK-NEXT: v_lshlrev_b32_e32 v7, 3, v4
; CHECK-NEXT: v_mul_lo_u32 v3, v3, -9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 930ba80ad69638d..86101e583458e8c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: successors: %bb.2(0x80000000)
; SI-NEXT: {{ $}}
; SI-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec
- ; SI-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
+ ; SI-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec
; SI-NEXT: S_BRANCH %bb.2
; SI-NEXT: {{ $}}
; SI-NEXT: bb.5.if.end:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index c71dc06c68d8d68..9183f043f052cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -92,20 +92,20 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: .LBB2_1: ; %if.end
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; SI-NEXT: v_add_nc_u32_e32 v2, 1, v0
+; SI-NEXT: v_add_nc_u32_e32 v2, 1, v3
; SI-NEXT: s_add_i32 s1, s1, 1
; SI-NEXT: s_cmp_lt_i32 s1, s0
; SI-NEXT: s_cbranch_scc0 .LBB2_6
; SI-NEXT: .LBB2_2: ; %for.body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo
; SI-NEXT: s_xor_b32 s2, exec_lo, s2
; SI-NEXT: ; %bb.3: ; %else
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT: v_mul_lo_u32 v0, v2, 3
-; SI-NEXT: v_mul_f32_e32 v3, v1, v2
+; SI-NEXT: v_mul_f32_e32 v0, v1, v2
+; SI-NEXT: v_lshl_add_u32 v3, v2, 1, v2
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; %bb.4: ; %Flow
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
@@ -113,11 +113,11 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-NEXT: s_cbranch_execz .LBB2_1
; SI-NEXT: ; %bb.5: ; %if
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
-; SI-NEXT: v_mul_f32_e32 v3, s1, v1
-; SI-NEXT: v_add_nc_u32_e32 v0, 1, v2
+; SI-NEXT: v_mul_f32_e32 v0, s1, v1
+; SI-NEXT: v_add_nc_u32_e32 v3, 1, v2
; SI-NEXT: s_branch .LBB2_1
; SI-NEXT: .LBB2_6: ; %for.end
-; SI-NEXT: v_add_f32_e32 v0, v0, v3
+; SI-NEXT: v_add_f32_e32 v0, v3, v0
; SI-NEXT: ; return to shader part epilog
entry:
; %break = icmp sgt i32 %bound, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95f947cbca14f05..6bb066f06dd9a24 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: ; %bb.3: ; %IF
-; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
+; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5
; GFX9-W64-NEXT: ; %bb.4: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
@@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: ; %bb.3: ; %IF
-; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
+; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5
; GFX10-W32-NEXT: ; %bb.4: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
More information about the llvm-commits
mailing list