[llvm] [AMDGPU] Form V_MAD_U64_U32 from mul24 (PR #72393)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 10 23:39:32 PST 2023
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/72393
>From db859b6cabe7a085a8f710f9a7c8980802ed6eb8 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 15 Nov 2023 15:20:32 +0100
Subject: [PATCH 1/4] [AMDGPU] Form V_MAD_U64_U32 from mul24
See SWDEV-421067
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 2 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 21 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 214 +++++++++++++++---
3 files changed, 201 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 799e102d56174..ea0cf9bcf62a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2011,6 +2011,8 @@ def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
+def HasFullRate64Ops : Predicate<"Subtarget->hasFullRate64Ops()">;
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 114d33b077866..3734a226c2bfc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -678,9 +678,26 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
>;
}
+// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
+// We need to separate this because otherwise OtherPredicates would be overridden.
+multiclass IMAD32_Mul24_Pats <VOP3_Pseudo inst> {
+ def : GCNPat <
+ (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
+ (inst $src0, $src1, $src2, 0 /* clamp */)
+ >;
+ def : GCNPat <
+ (i64 (add (i64 (zext (i32 (AMDGPUmul_u24 i32:$src0, i32:$src1)))), i64:$src2)),
+ (inst $src0, $src1, $src2, 0 /* clamp */)
+ >;
+}
+
// exclude pre-GFX9 where it was slow
-let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in
- defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+let SubtargetPredicate = isGFX9Plus in {
+ let OtherPredicates = [HasNotMADIntraFwdBug] in
+ defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+ let OtherPredicates = [HasNotMADIntraFwdBug, HasFullRate64Ops] in
+ defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
+}
let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 61017e809c863..f57ace4a75e81 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -8,8 +8,8 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG,GFX900-SDAG,GFX900 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL,GFX900-GISEL,GFX900 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A,GFX9-SDAG,GFX90A-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A,GFX9-GISEL,GFX90A-GISEL %s
@@ -4577,23 +4577,41 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_multi_use_mul_chain_add_other_use_all:
-; GFX900: ; %bb.0: ; %bb
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_add_u32_e32 v0, 1, v0
-; GFX900-NEXT: v_mul_lo_u32 v2, v0, v1
-; GFX900-NEXT: v_add_u32_e32 v0, v2, v0
-; GFX900-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX900-NEXT: v_add_u32_e32 v1, 1, v2
-; GFX900-NEXT: v_mul_lo_u32 v5, v0, v1
-; GFX900-NEXT: global_store_dword v[3:4], v2, off
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: global_store_dword v[3:4], v0, off
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: global_store_dword v[3:4], v5, off
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_u32_e32 v0, v5, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
+; GFX900-SDAG: ; %bb.0: ; %bb
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX900-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1
+; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX900-SDAG-NEXT: v_add_u32_e32 v1, 1, v2
+; GFX900-SDAG-NEXT: v_mul_lo_u32 v5, v0, v1
+; GFX900-SDAG-NEXT: global_store_dword v[3:4], v2, off
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX900-SDAG-NEXT: global_store_dword v[3:4], v0, off
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX900-SDAG-NEXT: global_store_dword v[3:4], v5, off
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v5, v1
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_all:
+; GFX900-GISEL: ; %bb.0: ; %bb
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX900-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1
+; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v2, v0
+; GFX900-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_add_u32_e32 v1, 1, v2
+; GFX900-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1
+; GFX900-GISEL-NEXT: global_store_dword v[3:4], v2, off
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v[3:4], v0, off
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v[3:4], v5, off
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v5, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
; GFX90A-SDAG: ; %bb.0: ; %bb
@@ -4761,21 +4779,37 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_multi_use_mul_chain_add_other_use_some:
-; GFX900: ; %bb.0: ; %bb
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_add_u32_e32 v0, 1, v0
-; GFX900-NEXT: v_mul_lo_u32 v2, v0, v1
-; GFX900-NEXT: v_add_u32_e32 v0, v2, v0
-; GFX900-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX900-NEXT: v_add_u32_e32 v1, 1, v2
-; GFX900-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX900-NEXT: global_store_dword v[3:4], v2, off
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: global_store_dword v[3:4], v0, off
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
+; GFX900-SDAG: ; %bb.0: ; %bb
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX900-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1
+; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX900-SDAG-NEXT: v_add_u32_e32 v1, 1, v2
+; GFX900-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX900-SDAG-NEXT: global_store_dword v[3:4], v2, off
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX900-SDAG-NEXT: global_store_dword v[3:4], v0, off
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX900-SDAG-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_some:
+; GFX900-GISEL: ; %bb.0: ; %bb
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX900-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1
+; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v2, v0
+; GFX900-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_add_u32_e32 v1, 1, v2
+; GFX900-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX900-GISEL-NEXT: global_store_dword v[3:4], v2, off
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v[3:4], v0, off
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX900-GISEL-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
; GFX90A-SDAG: ; %bb.0: ; %bb
@@ -6928,7 +6962,119 @@ entry:
ret <2 x i16> %add0
}
+define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
+; GFX67-LABEL: mul_u24_add64:
+; GFX67: ; %bb.0:
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX67-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_u24_add64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-SDAG-LABEL: mul_u24_add64:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
+; GFX900-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX900-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX900-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v3, vcc
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: mul_u24_add64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
+; GFX9-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v3, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-SDAG-LABEL: mul_u24_add64:
+; GFX90A-SDAG: ; %bb.0:
+; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_u24_add64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_u32_u24_e32 v4, v0, v1
+; GFX10-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y)
+ %add = add nuw nsw i64 %mul, %z
+ ret i64 %add
+}
+
+define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
+; GFX67-LABEL: mul_u24_zext_add64:
+; GFX67: ; %bb.0:
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX67-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_u24_zext_add64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-SDAG-LABEL: mul_u24_zext_add64:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX900-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX900-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: mul_u24_zext_add64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-SDAG-LABEL: mul_u24_zext_add64:
+; GFX90A-SDAG: ; %bb.0:
+; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_u24_zext_add64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
+ %mul.zext = zext i32 %mul to i64
+ %add = add nuw nsw i64 %mul.zext, %z
+ ret i64 %add
+}
+
+declare i64 @llvm.amdgcn.mul.u24.i64(i32, i32)
+declare i32 @llvm.amdgcn.mul.u24(i32, i32)
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX6: {{.*}}
; GFX7: {{.*}}
+; GFX900: {{.*}}
; GFX90A: {{.*}}
>From 0a3b269314c707455f74c91c5d529884fe53cbdc Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 1 Dec 2023 10:35:31 +0100
Subject: [PATCH 2/4] Drop add flags
---
llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index f57ace4a75e81..d9afe78d0b681 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -7014,7 +7014,7 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y)
- %add = add nuw nsw i64 %mul, %z
+ %add = add i64 %mul, %z
ret i64 %add
}
@@ -7066,7 +7066,7 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
%mul.zext = zext i32 %mul to i64
- %add = add nuw nsw i64 %mul.zext, %z
+ %add = add i64 %mul.zext, %z
ret i64 %add
}
>From c5ff15bc466c59c665f5f3c8add42f13dac99f39 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 8 Dec 2023 14:23:23 +0100
Subject: [PATCH 3/4] Remove unneeded pattern
---
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 24 ++++----
.../AMDGPU/atomic_optimizations_buffer.ll | 21 +++----
.../atomic_optimizations_global_pointer.ll | 44 +++++++--------
.../atomic_optimizations_local_pointer.ll | 54 +++++++++---------
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 21 +++----
.../atomic_optimizations_struct_buffer.ll | 21 +++----
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 28 +++-------
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 31 +++++------
llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll | 55 ++++++++-----------
...tack-pointer-offset-relative-frameindex.ll | 12 ++--
10 files changed, 145 insertions(+), 166 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 3734a226c2bfc..fa91ae8619716 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -680,26 +680,24 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
// We need to separate this because otherwise OtherPredicates would be overridden.
-multiclass IMAD32_Mul24_Pats <VOP3_Pseudo inst> {
- def : GCNPat <
- (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
- (inst $src0, $src1, $src2, 0 /* clamp */)
- >;
- def : GCNPat <
- (i64 (add (i64 (zext (i32 (AMDGPUmul_u24 i32:$src0, i32:$src1)))), i64:$src2)),
- (inst $src0, $src1, $src2, 0 /* clamp */)
- >;
-}
+class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
+ (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
+ (inst $src0, $src1, $src2, 0 /* clamp */)
+ >;
// exclude pre-GFX9 where it was slow
let SubtargetPredicate = isGFX9Plus in {
let OtherPredicates = [HasNotMADIntraFwdBug] in
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
let OtherPredicates = [HasNotMADIntraFwdBug, HasFullRate64Ops] in
- defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
+ def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
+}
+let SubtargetPredicate = isGFX11Only in {
+ let OtherPredicates = [HasNotMADIntraFwdBug] in
+ defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+ let OtherPredicates = [HasNotMADIntraFwdBug, HasFullRate64Ops] in
+ def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
}
-let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
- defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5..d58f84e2486b0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -390,13 +390,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -422,13 +423,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
+; GFX11W32-NEXT: s_waitcnt vmcnt(0)
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f..c2a2fcb611198 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -442,13 +442,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -482,13 +483,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1321,13 +1323,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1369,13 +1370,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2893,16 +2893,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: .LBB10_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
-; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-NEXT: v_add_nc_u32_e32 v1, v4, v5
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mov_b32_e32 v1, v5
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
@@ -2943,16 +2943,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: .LBB10_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_add_nc_u32_e32 v1, v4, v5
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mov_b32_e32 v1, v5
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 60c3134445218..5da94ccb0490a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -426,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -461,13 +462,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1460,13 +1462,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mov_b32_e32 v1, v3
+; GFX1164-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1502,13 +1503,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mov_b32_e32 v1, v3
+; GFX1132-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3073,16 +3073,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: .LBB12_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mul_lo_u32 v5, s3, v2
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mov_b32_e32 v1, v5
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
@@ -3117,16 +3117,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: .LBB12_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_mul_lo_u32 v5, s3, v2
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
+; GFX1132-NEXT: v_add_nc_u32_e32 v1, v4, v5
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mov_b32_e32 v1, v5
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index baa0c72dbf63e..79c47f34bae97 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -389,13 +389,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -421,13 +422,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
+; GFX11W32-NEXT: s_waitcnt vmcnt(0)
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 37d421b017979..c421d25be3036 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -402,13 +402,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -435,13 +436,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
+; GFX11W32-NEXT: s_waitcnt vmcnt(0)
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index d9afe78d0b681..ae5ca52145785 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -7035,27 +7035,13 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: mul_u24_zext_add64:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1
-; GFX900-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX900-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: mul_u24_zext_add64:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-SDAG-LABEL: mul_u24_zext_add64:
-; GFX90A-SDAG: ; %bb.0:
-; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
-; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: mul_u24_zext_add64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: mul_u24_zext_add64:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 3a337ec7a8733..deee87e661fac 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -391,13 +391,13 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-LABEL: mad_i64_i32_extops_i32_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
-; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 31, v4
+; GFX11-NEXT: v_mul_lo_u32 v6, v0, v5
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
+; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext0 = sext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
@@ -477,13 +477,12 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-LABEL: mad_u64_u32_bitops_lhs_mask_small:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
+; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_and_b32 v6, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
+; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
+; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 8589934591
%trunc.rhs = and i64 %arg1, 4294967295
@@ -529,12 +528,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-LABEL: mad_u64_u32_bitops_rhs_mask_small:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
+; GFX11-NEXT: v_mul_lo_u32 v2, v6, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
+; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 8589934591
diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
index 81de580a78395..9f704dc43d48f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -18,9 +18,8 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_vvv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3]
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -54,9 +53,8 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) {
;
; GFX11-LABEL: mad_i32_vvc:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 42, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 42
@@ -79,9 +77,8 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) {
;
; GFX11-LABEL: mad_i32_vvi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x12d687, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 1234567
@@ -104,9 +101,8 @@ define amdgpu_ps float @mad_i32_vvi_neg(i32 %a, i32 %b) {
;
; GFX11-LABEL: mad_i32_vvi_neg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0xffffffffffed2979
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xffed2979, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, -1234567
@@ -127,8 +123,8 @@ define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) {
;
; GFX11-LABEL: mad_i32_vcv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, 42, v[1:2]
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, 42
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, 42
%add = add i32 %mul, %c
@@ -149,8 +145,8 @@ define amdgpu_ps float @mad_i32_vcc(i32 %a) {
;
; GFX11-LABEL: mad_i32_vcc:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, 42, 43
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, 42
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 43, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, 42
%add = add i32 %mul, 43
@@ -171,9 +167,8 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) {
;
; GFX11-LABEL: mad_i32_vvs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1]
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -194,8 +189,8 @@ define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_vsv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, s0, v[1:2]
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -216,8 +211,8 @@ define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_svv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, s0, v0, v[1:2]
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: v_mul_lo_u32 v0, s0, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -240,9 +235,8 @@ define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) {
;
; GFX11-LABEL: mad_i32_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: s_mov_b32 s2, s1
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, s[2:3]
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s1, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -265,9 +259,8 @@ define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) {
;
; GFX11-LABEL: mad_i32_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: s_mov_b32 s2, s1
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
+; GFX11-NEXT: v_mul_lo_u32 v0, s0, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s1, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -289,8 +282,8 @@ define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, s0, s1, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index d52655a079161..bb3236a394877 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -98,10 +98,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF11-NEXT: ; %bb.1: ; %if.then4.i
; MUBUF11-NEXT: s_movk_i32 s0, 0x4000
; MUBUF11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4
-; MUBUF11-NEXT: s_mov_b32 s0, 0x41c64e6d
; MUBUF11-NEXT: s_waitcnt vmcnt(0)
-; MUBUF11-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; MUBUF11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039
+; MUBUF11-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; MUBUF11-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
+; MUBUF11-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
; MUBUF11-NEXT: scratch_store_b32 off, v0, s0
; MUBUF11-NEXT: .LBB0_2: ; %shader_eval_surface.exit
; MUBUF11-NEXT: s_endpgm
@@ -125,10 +125,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; FLATSCR11-NEXT: ; %bb.1: ; %if.then4.i
; FLATSCR11-NEXT: s_movk_i32 s0, 0x4000
; FLATSCR11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4
-; FLATSCR11-NEXT: s_mov_b32 s0, 0x41c64e6d
; FLATSCR11-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR11-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; FLATSCR11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039
+; FLATSCR11-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; FLATSCR11-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
+; FLATSCR11-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
; FLATSCR11-NEXT: scratch_store_b32 off, v0, s0
; FLATSCR11-NEXT: .LBB0_2: ; %shader_eval_surface.exit
; FLATSCR11-NEXT: s_endpgm
>From 14f3dc8d9d1d4d088d97552ec113535edc4b476a Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 11 Dec 2023 08:39:19 +0100
Subject: [PATCH 4/4] Remove speedmodel
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 2 -
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 16 ++----
.../AMDGPU/atomic_optimizations_buffer.ll | 21 ++++---
.../atomic_optimizations_global_pointer.ll | 44 +++++++--------
.../atomic_optimizations_local_pointer.ll | 54 +++++++++---------
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 21 ++++---
.../atomic_optimizations_struct_buffer.ll | 21 ++++---
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 39 ++++++-------
...ne-sink-temporal-divergence-swdev407790.ll | 39 ++++++-------
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 31 ++++++-----
llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll | 55 +++++++++++--------
...tack-pointer-offset-relative-frameindex.ll | 12 ++--
12 files changed, 174 insertions(+), 181 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ea0cf9bcf62a8..799e102d56174 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2011,8 +2011,6 @@ def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
-def HasFullRate64Ops : Predicate<"Subtarget->hasFullRate64Ops()">;
-
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index fa91ae8619716..a73042f2e411a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -686,17 +686,13 @@ class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
>;
// exclude pre-GFX9 where it was slow
-let SubtargetPredicate = isGFX9Plus in {
- let OtherPredicates = [HasNotMADIntraFwdBug] in
- defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
- let OtherPredicates = [HasNotMADIntraFwdBug, HasFullRate64Ops] in
- def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
+let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
+ defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
+ def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
}
-let SubtargetPredicate = isGFX11Only in {
- let OtherPredicates = [HasNotMADIntraFwdBug] in
- defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
- let OtherPredicates = [HasNotMADIntraFwdBug, HasFullRate64Ops] in
- def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
+let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
+ defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+ def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
}
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index d58f84e2486b0..f8f50c7cb23a5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -390,14 +390,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -423,13 +422,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index c2a2fcb611198..81fd166e3779f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -442,14 +442,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -483,14 +482,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1323,12 +1321,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1370,12 +1369,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2893,16 +2893,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: .LBB10_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
@@ -2943,16 +2943,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: .LBB10_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 5da94ccb0490a..60c3134445218 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -426,14 +426,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -462,14 +461,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1462,12 +1460,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v3, s3, v2
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1503,12 +1502,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v3, s3, v2
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3073,16 +3073,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: .LBB12_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v5, s3, v2
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
@@ -3117,16 +3117,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: .LBB12_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v5, s3, v2
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 79c47f34bae97..baa0c72dbf63e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -389,14 +389,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -422,13 +421,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index c421d25be3036..37d421b017979 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -402,14 +402,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -436,13 +435,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index ae5ca52145785..10b30ce6ce51a 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -6981,14 +6981,11 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: mul_u24_add64:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_hi_u32_u24_e32 v4, v0, v1
-; GFX900-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1
-; GFX900-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX900-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v3, vcc
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: mul_u24_add64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: mul_u24_add64:
; GFX9-GISEL: ; %bb.0:
@@ -6999,20 +6996,20 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v3, vcc
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-SDAG-LABEL: mul_u24_add64:
-; GFX90A-SDAG: ; %bb.0:
-; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
-; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: mul_u24_add64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: mul_u24_add64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_u32_u24_e32 v4, v0, v1
-; GFX10-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-GISEL-LABEL: mul_u24_add64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1
+; GFX10-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
%mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y)
%add = add i64 %mul, %z
ret i64 %add
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index d9c6fbb319019..cf58860101600 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -444,31 +444,28 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_31
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
-; CHECK-NEXT: v_xor_b32_e32 v5, v60, v58
-; CHECK-NEXT: v_lshrrev_b64 v[3:4], 16, v[56:57]
-; CHECK-NEXT: v_mul_u32_u24_e32 v11, 0x180, v73
-; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; CHECK-NEXT: v_lshrrev_b64 v[1:2], 16, v[45:46]
-; CHECK-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58
+; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57]
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47]
+; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
-; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, s46, v11
-; CHECK-NEXT: v_lshlrev_b32_e32 v10, 12, v63
-; CHECK-NEXT: v_or_b32_e32 v4, v7, v4
-; CHECK-NEXT: v_mul_hi_u32_u24_e32 v7, 0x180, v73
-; CHECK-NEXT: v_xor_b32_e32 v6, v61, v59
-; CHECK-NEXT: v_lshlrev_b32_e32 v9, 16, v56
-; CHECK-NEXT: v_or3_b32 v10, v8, v10, v62
+; CHECK-NEXT: v_lshlrev_b32_e32 v9, 12, v63
+; CHECK-NEXT: v_xor_b32_e32 v5, v61, v59
+; CHECK-NEXT: v_lshlrev_b32_e32 v11, 16, v56
+; CHECK-NEXT: v_or_b32_e32 v3, v1, v3
+; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46]
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, v10
+; CHECK-NEXT: v_or3_b32 v8, v8, v9, v62
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
+; CHECK-NEXT: v_or_b32_e32 v1, v11, v1
; CHECK-NEXT: ; implicit-def: $vgpr42
; CHECK-NEXT: ; implicit-def: $vgpr43
; CHECK-NEXT: ; implicit-def: $vgpr44
-; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s47, v7, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v11, v0
-; CHECK-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6]
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
-; CHECK-NEXT: v_or_b32_e32 v2, v9, v2
-; CHECK-NEXT: global_store_dword v[7:8], v10, off offset:4
-; CHECK-NEXT: global_store_dwordx4 v[7:8], v[1:4], off offset:8
-; CHECK-NEXT: global_store_dwordx2 v[7:8], v[5:6], off offset:24
+; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8
+; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24
; CHECK-NEXT: .LBB0_31: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index deee87e661fac..3a337ec7a8733 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -391,13 +391,13 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-LABEL: mad_i64_i32_extops_i32_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ashrrev_i32_e32 v0, 31, v4
-; GFX11-NEXT: v_mul_lo_u32 v6, v0, v5
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, v[2:3]
+; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v1
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext0 = sext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
@@ -477,12 +477,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-LABEL: mad_u64_u32_bitops_lhs_mask_small:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_and_b32 v6, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
-; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
+; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 8589934591
%trunc.rhs = and i64 %arg1, 4294967295
@@ -528,12 +529,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-LABEL: mad_u64_u32_bitops_rhs_mask_small:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v3, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
-; GFX11-NEXT: v_mul_lo_u32 v2, v6, v3
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 8589934591
diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
index 9f704dc43d48f..81de580a78395 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -18,8 +18,9 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_vvv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3]
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -53,8 +54,9 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) {
;
; GFX11-LABEL: mad_i32_vvc:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 42, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 42
@@ -77,8 +79,9 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) {
;
; GFX11-LABEL: mad_i32_vvi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x12d687, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, 1234567
@@ -101,8 +104,9 @@ define amdgpu_ps float @mad_i32_vvi_neg(i32 %a, i32 %b) {
;
; GFX11-LABEL: mad_i32_vvi_neg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xffed2979, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0xffffffffffed2979
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, -1234567
@@ -123,8 +127,8 @@ define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) {
;
; GFX11-LABEL: mad_i32_vcv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, 42
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, 42, v[1:2]
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, 42
%add = add i32 %mul, %c
@@ -145,8 +149,8 @@ define amdgpu_ps float @mad_i32_vcc(i32 %a) {
;
; GFX11-LABEL: mad_i32_vcc:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, 42
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 43, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, 42, 43
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, 42
%add = add i32 %mul, 43
@@ -167,8 +171,9 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) {
;
; GFX11-LABEL: mad_i32_vvs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1]
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -189,8 +194,8 @@ define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_vsv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, s0, v[1:2]
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -211,8 +216,8 @@ define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_svv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, s0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, s0, v0, v[1:2]
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -235,8 +240,9 @@ define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) {
;
; GFX11-LABEL: mad_i32_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_mov_b32 s2, s1
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, s[2:3]
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -259,8 +265,9 @@ define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) {
;
; GFX11-LABEL: mad_i32_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mul_lo_u32 v0, s0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_mov_b32 s2, s1
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
@@ -282,8 +289,8 @@ define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
;
; GFX11-LABEL: mad_i32_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_mul_i32 s0, s0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, s0, s1, v[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index bb3236a394877..d52655a079161 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -98,10 +98,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF11-NEXT: ; %bb.1: ; %if.then4.i
; MUBUF11-NEXT: s_movk_i32 s0, 0x4000
; MUBUF11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4
+; MUBUF11-NEXT: s_mov_b32 s0, 0x41c64e6d
; MUBUF11-NEXT: s_waitcnt vmcnt(0)
-; MUBUF11-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; MUBUF11-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
-; MUBUF11-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
+; MUBUF11-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; MUBUF11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039
; MUBUF11-NEXT: scratch_store_b32 off, v0, s0
; MUBUF11-NEXT: .LBB0_2: ; %shader_eval_surface.exit
; MUBUF11-NEXT: s_endpgm
@@ -125,10 +125,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; FLATSCR11-NEXT: ; %bb.1: ; %if.then4.i
; FLATSCR11-NEXT: s_movk_i32 s0, 0x4000
; FLATSCR11-NEXT: scratch_load_b64 v[0:1], off, s0 offset:4
+; FLATSCR11-NEXT: s_mov_b32 s0, 0x41c64e6d
; FLATSCR11-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR11-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; FLATSCR11-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0
-; FLATSCR11-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0
+; FLATSCR11-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; FLATSCR11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039
; FLATSCR11-NEXT: scratch_store_b32 off, v0, s0
; FLATSCR11-NEXT: .LBB0_2: ; %shader_eval_surface.exit
; FLATSCR11-NEXT: s_endpgm
More information about the llvm-commits mailing list