[llvm] [LLVM][CodeGen] Add lowering for scalable vector bfloat operations. (PR #109803)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 25 08:55:38 PDT 2024
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/109803
>From 26b71a375529553128bc96bebdd0641850b4326e Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 30 Aug 2024 15:59:48 +0100
Subject: [PATCH] [LLVM][CodeGen] Add lowering for scalable vector bfloat
operations.
Specifically:
fabs, fadd, fceil, fdiv, ffloor, fma, fmax, fmaxnm, fmin, fminnm,
fmul, fnearbyint, fneg, frint, fround, froundeven, fsub, fsqrt &
ftrunc
---
.../SelectionDAG/LegalizeVectorOps.cpp | 93 +++
.../Target/AArch64/AArch64ISelLowering.cpp | 34 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +
llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +
llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 104 +--
llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 104 +--
llvm/test/CodeGen/AArch64/sve-bf16-arith.ll | 752 ++++++++++++++++++
.../test/CodeGen/AArch64/sve-bf16-rounding.ll | 355 +++++++++
.../buffer-fat-pointer-atomicrmw-fadd.ll | 316 ++++----
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 428 +++++-----
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 490 ++++++------
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 132 ++-
12 files changed, 1970 insertions(+), 851 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 5d433204d5da08..3d60b10edb22ff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -141,6 +141,7 @@ class VectorLegalizer {
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
+ SDValue ExpandBF16Arith(SDNode *Node);
SDValue ExpandFNEG(SDNode *Node);
SDValue ExpandFABS(SDNode *Node);
SDValue ExpandFCOPYSIGN(SDNode *Node);
@@ -1070,6 +1071,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ if (SDValue Expanded = ExpandBF16Arith(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG)) {
Results.push_back(Expanded);
return;
@@ -1077,6 +1082,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ if (SDValue Expanded = ExpandBF16Arith(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
return;
case ISD::SMIN:
@@ -1197,6 +1206,24 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::UCMP:
Results.push_back(TLI.expandCMP(Node, DAG));
return;
+
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FMA:
+ case ISD::FDIV:
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FTRUNC:
+ case ISD::FSQRT:
+ if (SDValue Expanded = ExpandBF16Arith(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
}
SDValue Unrolled = DAG.UnrollVectorOp(Node);
@@ -1874,6 +1901,11 @@ void VectorLegalizer::ExpandFSUB(SDNode *Node,
TLI.isOperationLegalOrCustom(ISD::FADD, VT))
return; // Defer to LegalizeDAG
+ if (SDValue Expanded = ExpandBF16Arith(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
+
SDValue Tmp = DAG.UnrollVectorOp(Node);
Results.push_back(Tmp);
}
@@ -2134,6 +2166,67 @@ bool VectorLegalizer::tryExpandVecMathCall(
return tryExpandVecMathCall(Node, LC, Results);
}
+// Try to lower BFloat arithmetic by performing the same operation on operands
+// that have been promoted to Float32, the result of which is then truncated.
+// If promotion requires non-legal types the operation is split with the
+// promotion occurring during a subsequent call to this function.
+SDValue VectorLegalizer::ExpandBF16Arith(SDNode *Node) {
+ EVT VT = Node->getValueType(0);
+ if (VT.getVectorElementType() != MVT::bf16)
+ return SDValue();
+
+ SDLoc DL(Node);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned Opcode = Node->getOpcode();
+
+ // Can we promote to float and try again?
+
+ EVT PromoteVT = VT.changeVectorElementType(MVT::f32);
+ if (TLI.isTypeLegal(PromoteVT)) {
+ // Don't expand if the result is likely to be unrolled anyway.
+ if (!TLI.isOperationLegalOrCustom(Opcode, PromoteVT))
+ return SDValue();
+
+ SmallVector<SDValue, 4> Ops;
+ for (const SDValue &V : Node->op_values())
+ Ops.push_back(DAG.getNode(ISD::FP_EXTEND, DL, PromoteVT, V));
+
+ SDValue PromotedOp = DAG.getNode(Opcode, DL, PromoteVT, Ops);
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, PromotedOp,
+ DAG.getIntPtrConstant(0, DL, true));
+ }
+
+ // Can we split the vector and try again?
+
+ if (VT.getVectorMinNumElements() == 1)
+ return SDValue();
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ // Restrict expansion to cases where both parts can be concatenated.
+ if (LoVT != HiVT || !TLI.isTypeLegal(LoVT))
+ return SDValue();
+
+ // Don't expand if the result is likely to be unrolled anyway.
+ if (!TLI.isOperationLegalOrCustom(Opcode, LoVT) &&
+ !TLI.isOperationLegalOrCustom(Opcode,
+ LoVT.changeVectorElementType(MVT::f32)))
+ return SDValue();
+
+ SmallVector<SDValue, 4> LoOps, HiOps;
+ for (const SDValue &V : Node->op_values()) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL, LoVT, HiVT);
+ LoOps.push_back(Lo);
+ HiOps.push_back(Hi);
+ }
+
+ SDValue SplitOpLo = DAG.getNode(Opcode, DL, LoVT, LoOps);
+ SDValue SplitOpHi = DAG.getNode(Opcode, DL, HiVT, HiOps);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SplitOpLo, SplitOpHi);
+}
+
void VectorLegalizer::UnrollStrictFPOp(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4166d9bd22bc01..979820e87bfe84 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1663,12 +1663,44 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Legal);
setOperationAction(ISD::FP_EXTEND, VT, Custom);
setOperationAction(ISD::FP_ROUND, VT, Custom);
- setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FROUND, VT, Expand);
+ setOperationAction(ISD::FROUNDEVEN, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+
+ if (!Subtarget->hasSVEB16B16()) {
+ setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::FMAXIMUM, VT, Expand);
+ setOperationAction(ISD::FMAXNUM, VT, Expand);
+ setOperationAction(ISD::FMINIMUM, VT, Expand);
+ setOperationAction(ISD::FMINNUM, VT, Expand);
+ setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::FSUB, VT, Expand);
+
+ } else {
+ setOperationAction(ISD::FADD, VT, Legal);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FMAXIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMINNUM, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Legal);
+ setOperationAction(ISD::FSUB, VT, Legal);
+ }
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 7240f6a22a87bd..078f4f2e14cabf 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -663,6 +663,13 @@ let Predicates = [HasSVEorSME] in {
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
+ foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
+ def : Pat<(VT (fabs VT:$op)),
+ (AND_ZI $op, (i64 (logical_imm64_XFORM(i64 0x7fff7fff7fff7fff))))>;
+ def : Pat<(VT (fneg VT:$op)),
+ (EOR_ZI $op, (i64 (logical_imm64_XFORM(i64 0x8000800080008000))))>;
+ }
+
// zext(cmpeq(x, splat(0))) -> cnot(x)
def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))),
(CNOT_ZPmZ_B $Op2, $Pg, $Op2)>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 0bfac6465a1f30..c7059b8e4e8d4a 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2299,6 +2299,8 @@ multiclass sve_fp_3op_u_zd_bfloat<bits<3> opc, string asm, SDPatternOperator op>
def NAME : sve_fp_3op_u_zd<0b00, opc, asm, ZPR16>;
def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv4bf16, op, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv2bf16, op, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME)>;
}
multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
@@ -9078,6 +9080,8 @@ multiclass sve_fp_bin_pred_bfloat<SDPatternOperator op> {
def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+ def : SVE_3_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+ def : SVE_3_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, !cast<Pseudo>(NAME # _UNDEF)>;
}
// Predicated pseudo floating point three operand instructions.
@@ -9099,6 +9103,8 @@ multiclass sve_fp_3op_pred_bfloat<SDPatternOperator op> {
def _UNDEF : PredThreeOpPseudo<NAME, ZPR16, FalseLanesUndef>;
def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _UNDEF)>;
+ def : SVE_4_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME # _UNDEF)>;
+ def : SVE_4_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME # _UNDEF)>;
}
// Predicated pseudo integer two operand instructions.
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index 888b795876f7df..421dfe1b39cefe 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -652,90 +652,46 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT: mov h1, v0.h[1]
-; NOLSE-NEXT: fmov w10, s0
-; NOLSE-NEXT: mov w8, #32767 // =0x7fff
-; NOLSE-NEXT: lsl w10, w10, #16
-; NOLSE-NEXT: fmov w9, s1
-; NOLSE-NEXT: fmov s1, w10
-; NOLSE-NEXT: lsl w9, w9, #16
-; NOLSE-NEXT: fmov s0, w9
+; NOLSE-NEXT: movi v1.4s, #1
+; NOLSE-NEXT: movi v2.4s, #127, msl #8
+; NOLSE-NEXT: shll v0.4s, v0.4h, #16
; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start
; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
-; NOLSE-NEXT: ldaxr w9, [x0]
-; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: mov h3, v2.h[1]
-; NOLSE-NEXT: fmov w11, s2
-; NOLSE-NEXT: lsl w11, w11, #16
-; NOLSE-NEXT: fmov w10, s3
-; NOLSE-NEXT: fmov s3, w11
-; NOLSE-NEXT: lsl w10, w10, #16
-; NOLSE-NEXT: fmaxnm s3, s3, s1
-; NOLSE-NEXT: fmov s2, w10
-; NOLSE-NEXT: fmaxnm s2, s2, s0
-; NOLSE-NEXT: fmov w11, s3
-; NOLSE-NEXT: ubfx w13, w11, #16, #1
-; NOLSE-NEXT: add w11, w11, w8
-; NOLSE-NEXT: fmov w10, s2
-; NOLSE-NEXT: add w11, w13, w11
-; NOLSE-NEXT: lsr w11, w11, #16
-; NOLSE-NEXT: ubfx w12, w10, #16, #1
-; NOLSE-NEXT: add w10, w10, w8
-; NOLSE-NEXT: fmov s3, w11
-; NOLSE-NEXT: add w10, w12, w10
-; NOLSE-NEXT: lsr w10, w10, #16
-; NOLSE-NEXT: fmov s2, w10
-; NOLSE-NEXT: mov v3.h[1], v2.h[0]
-; NOLSE-NEXT: fmov w10, s3
-; NOLSE-NEXT: stlxr w11, w10, [x0]
-; NOLSE-NEXT: cbnz w11, .LBB7_1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s3, w8
+; NOLSE-NEXT: shll v3.4s, v3.4h, #16
+; NOLSE-NEXT: fmaxnm v3.4s, v3.4s, v0.4s
+; NOLSE-NEXT: ushr v4.4s, v3.4s, #16
+; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b
+; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s
+; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s
+; NOLSE-NEXT: fmov w9, s3
+; NOLSE-NEXT: stlxr w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB7_1
; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT: fmov d0, x9
+; NOLSE-NEXT: fmov d0, x8
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
; LSE: // %bb.0:
-; LSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT: mov h1, v0.h[1]
-; LSE-NEXT: fmov w10, s0
-; LSE-NEXT: mov w8, #32767 // =0x7fff
+; LSE-NEXT: movi v1.4s, #1
+; LSE-NEXT: movi v2.4s, #127, msl #8
+; LSE-NEXT: shll v3.4s, v0.4h, #16
; LSE-NEXT: ldr s0, [x0]
-; LSE-NEXT: lsl w10, w10, #16
-; LSE-NEXT: fmov w9, s1
-; LSE-NEXT: fmov s2, w10
-; LSE-NEXT: lsl w9, w9, #16
-; LSE-NEXT: fmov s1, w9
; LSE-NEXT: .LBB7_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: mov h3, v0.h[1]
-; LSE-NEXT: fmov w10, s0
-; LSE-NEXT: lsl w10, w10, #16
-; LSE-NEXT: fmov w9, s3
-; LSE-NEXT: fmov s4, w10
-; LSE-NEXT: lsl w9, w9, #16
-; LSE-NEXT: fmaxnm s4, s4, s2
-; LSE-NEXT: fmov s3, w9
-; LSE-NEXT: fmaxnm s3, s3, s1
-; LSE-NEXT: fmov w10, s4
-; LSE-NEXT: ubfx w12, w10, #16, #1
-; LSE-NEXT: add w10, w10, w8
-; LSE-NEXT: fmov w9, s3
-; LSE-NEXT: add w10, w12, w10
-; LSE-NEXT: lsr w10, w10, #16
-; LSE-NEXT: ubfx w11, w9, #16, #1
-; LSE-NEXT: add w9, w9, w8
-; LSE-NEXT: fmov s4, w10
-; LSE-NEXT: add w9, w11, w9
-; LSE-NEXT: lsr w9, w9, #16
-; LSE-NEXT: fmov s3, w9
-; LSE-NEXT: fmov w9, s0
-; LSE-NEXT: mov v4.h[1], v3.h[0]
-; LSE-NEXT: mov w11, w9
-; LSE-NEXT: fmov w10, s4
-; LSE-NEXT: casal w11, w10, [x0]
-; LSE-NEXT: fmov s0, w11
-; LSE-NEXT: cmp w11, w9
+; LSE-NEXT: shll v4.4s, v0.4h, #16
+; LSE-NEXT: fmov w8, s0
+; LSE-NEXT: fmaxnm v4.4s, v4.4s, v3.4s
+; LSE-NEXT: mov w10, w8
+; LSE-NEXT: ushr v5.4s, v4.4s, #16
+; LSE-NEXT: and v5.16b, v5.16b, v1.16b
+; LSE-NEXT: add v4.4s, v5.4s, v4.4s
+; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s
+; LSE-NEXT: fmov w9, s4
+; LSE-NEXT: casal w10, w9, [x0]
+; LSE-NEXT: fmov s0, w10
+; LSE-NEXT: cmp w10, w8
; LSE-NEXT: b.ne .LBB7_1
; LSE-NEXT: // %bb.2: // %atomicrmw.end
; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index a3665c6e428608..468136f67c302c 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -652,90 +652,46 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half>
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bfloat> %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT: mov h1, v0.h[1]
-; NOLSE-NEXT: fmov w10, s0
-; NOLSE-NEXT: mov w8, #32767 // =0x7fff
-; NOLSE-NEXT: lsl w10, w10, #16
-; NOLSE-NEXT: fmov w9, s1
-; NOLSE-NEXT: fmov s1, w10
-; NOLSE-NEXT: lsl w9, w9, #16
-; NOLSE-NEXT: fmov s0, w9
+; NOLSE-NEXT: movi v1.4s, #1
+; NOLSE-NEXT: movi v2.4s, #127, msl #8
+; NOLSE-NEXT: shll v0.4s, v0.4h, #16
; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start
; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
-; NOLSE-NEXT: ldaxr w9, [x0]
-; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: mov h3, v2.h[1]
-; NOLSE-NEXT: fmov w11, s2
-; NOLSE-NEXT: lsl w11, w11, #16
-; NOLSE-NEXT: fmov w10, s3
-; NOLSE-NEXT: fmov s3, w11
-; NOLSE-NEXT: lsl w10, w10, #16
-; NOLSE-NEXT: fminnm s3, s3, s1
-; NOLSE-NEXT: fmov s2, w10
-; NOLSE-NEXT: fminnm s2, s2, s0
-; NOLSE-NEXT: fmov w11, s3
-; NOLSE-NEXT: ubfx w13, w11, #16, #1
-; NOLSE-NEXT: add w11, w11, w8
-; NOLSE-NEXT: fmov w10, s2
-; NOLSE-NEXT: add w11, w13, w11
-; NOLSE-NEXT: lsr w11, w11, #16
-; NOLSE-NEXT: ubfx w12, w10, #16, #1
-; NOLSE-NEXT: add w10, w10, w8
-; NOLSE-NEXT: fmov s3, w11
-; NOLSE-NEXT: add w10, w12, w10
-; NOLSE-NEXT: lsr w10, w10, #16
-; NOLSE-NEXT: fmov s2, w10
-; NOLSE-NEXT: mov v3.h[1], v2.h[0]
-; NOLSE-NEXT: fmov w10, s3
-; NOLSE-NEXT: stlxr w11, w10, [x0]
-; NOLSE-NEXT: cbnz w11, .LBB7_1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s3, w8
+; NOLSE-NEXT: shll v3.4s, v3.4h, #16
+; NOLSE-NEXT: fminnm v3.4s, v3.4s, v0.4s
+; NOLSE-NEXT: ushr v4.4s, v3.4s, #16
+; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b
+; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s
+; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s
+; NOLSE-NEXT: fmov w9, s3
+; NOLSE-NEXT: stlxr w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB7_1
; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT: fmov d0, x9
+; NOLSE-NEXT: fmov d0, x8
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
; LSE: // %bb.0:
-; LSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT: mov h1, v0.h[1]
-; LSE-NEXT: fmov w10, s0
-; LSE-NEXT: mov w8, #32767 // =0x7fff
+; LSE-NEXT: movi v1.4s, #1
+; LSE-NEXT: movi v2.4s, #127, msl #8
+; LSE-NEXT: shll v3.4s, v0.4h, #16
; LSE-NEXT: ldr s0, [x0]
-; LSE-NEXT: lsl w10, w10, #16
-; LSE-NEXT: fmov w9, s1
-; LSE-NEXT: fmov s2, w10
-; LSE-NEXT: lsl w9, w9, #16
-; LSE-NEXT: fmov s1, w9
; LSE-NEXT: .LBB7_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: mov h3, v0.h[1]
-; LSE-NEXT: fmov w10, s0
-; LSE-NEXT: lsl w10, w10, #16
-; LSE-NEXT: fmov w9, s3
-; LSE-NEXT: fmov s4, w10
-; LSE-NEXT: lsl w9, w9, #16
-; LSE-NEXT: fminnm s4, s4, s2
-; LSE-NEXT: fmov s3, w9
-; LSE-NEXT: fminnm s3, s3, s1
-; LSE-NEXT: fmov w10, s4
-; LSE-NEXT: ubfx w12, w10, #16, #1
-; LSE-NEXT: add w10, w10, w8
-; LSE-NEXT: fmov w9, s3
-; LSE-NEXT: add w10, w12, w10
-; LSE-NEXT: lsr w10, w10, #16
-; LSE-NEXT: ubfx w11, w9, #16, #1
-; LSE-NEXT: add w9, w9, w8
-; LSE-NEXT: fmov s4, w10
-; LSE-NEXT: add w9, w11, w9
-; LSE-NEXT: lsr w9, w9, #16
-; LSE-NEXT: fmov s3, w9
-; LSE-NEXT: fmov w9, s0
-; LSE-NEXT: mov v4.h[1], v3.h[0]
-; LSE-NEXT: mov w11, w9
-; LSE-NEXT: fmov w10, s4
-; LSE-NEXT: casal w11, w10, [x0]
-; LSE-NEXT: fmov s0, w11
-; LSE-NEXT: cmp w11, w9
+; LSE-NEXT: shll v4.4s, v0.4h, #16
+; LSE-NEXT: fmov w8, s0
+; LSE-NEXT: fminnm v4.4s, v4.4s, v3.4s
+; LSE-NEXT: mov w10, w8
+; LSE-NEXT: ushr v5.4s, v4.4s, #16
+; LSE-NEXT: and v5.16b, v5.16b, v1.16b
+; LSE-NEXT: add v4.4s, v5.4s, v4.4s
+; LSE-NEXT: addhn v4.4h, v4.4s, v2.4s
+; LSE-NEXT: fmov w9, s4
+; LSE-NEXT: casal w10, w9, [x0]
+; LSE-NEXT: fmov s0, w10
+; LSE-NEXT: cmp w10, w8
; LSE-NEXT: b.ne .LBB7_1
; LSE-NEXT: // %bb.2: // %atomicrmw.end
; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
new file mode 100644
index 00000000000000..e8468ddfeed181
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
@@ -0,0 +1,752 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FABS
+;
+
+define <vscale x 2 x bfloat> @fabs_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fabs_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0x7fff
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fabs_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fabs_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0x7fff
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fabs_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fabs_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.h, z0.h, #0x7fff
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FADD
+;
+
+define <vscale x 2 x bfloat> @fadd_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fadd <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fadd_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fadd <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fadd_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fadd z2.s, z3.s, z2.s
+; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fadd <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FDIV
+;
+
+define <vscale x 2 x bfloat> @fdiv_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; CHECK-LABEL: fdiv_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fdiv <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fdiv_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; CHECK-LABEL: fdiv_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = fdiv <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fdiv_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: fdiv_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z2.s, z2.s, #16
+; CHECK-NEXT: lsl z3.s, z3.s, #16
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z2.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = fdiv <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMAX
+;
+
+define <vscale x 2 x bfloat> @fmax_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmax_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmax_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmax z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMAXNM
+;
+
+define <vscale x 2 x bfloat> @fmaxnm_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmaxnm_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmaxnm_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMIN
+;
+
+define <vscale x 2 x bfloat> @fmin_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmin_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmin_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmin z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMINNM
+;
+
+define <vscale x 2 x bfloat> @fminnm_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fminnm_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fminnm_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
+; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMLA
+;
+
+define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c) {
+; NOB16B16-LABEL: fmla_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmla_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.d
+; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
+; NOB16B16-LABEL: fmla_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmla_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.s
+; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
+; NOB16B16-LABEL: fmla_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z3.s, z1.h
+; NOB16B16-NEXT: uunpkhi z4.s, z0.h
+; NOB16B16-NEXT: uunpkhi z5.s, z2.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: uunpklo z2.s, z2.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z4.s, z4.s, #16
+; NOB16B16-NEXT: lsl z5.s, z5.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
+; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmla_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h
+; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; B16B16-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FMUL
+;
+
+define <vscale x 2 x bfloat> @fmul_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fmul_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmul_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fmul <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fmul_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fmul_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmul_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fmul <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fmul_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fmul z2.s, z3.s, z2.s
+; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmul_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fmul <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FNEG
+;
+
+define <vscale x 2 x bfloat> @fneg_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fneg_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor z0.h, z0.h, #0x8000
+; CHECK-NEXT: ret
+ %res = fneg <vscale x 2 x bfloat> %a
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fneg_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fneg_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor z0.h, z0.h, #0x8000
+; CHECK-NEXT: ret
+ %res = fneg <vscale x 4 x bfloat> %a
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fneg_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fneg_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor z0.h, z0.h, #0x8000
+; CHECK-NEXT: ret
+ %res = fneg <vscale x 8 x bfloat> %a
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FSQRT
+;
+
+define <vscale x 2 x bfloat> @fsqrt_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fsqrt_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fsqrt_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fsqrt_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fsqrt_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fsqrt_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s
+; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FSUB
+;
+
+define <vscale x 2 x bfloat> @fsub_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; NOB16B16-LABEL: fsub_nxv2bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.d
+; NOB16B16-NEXT: fsub z0.s, p0/m, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fsub_nxv2bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fsub <vscale x 2 x bfloat> %a, %b
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fsub_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; NOB16B16-LABEL: fsub_nxv4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fsub_nxv4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fsub <vscale x 4 x bfloat> %a, %b
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fsub_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; NOB16B16-LABEL: fsub_nxv8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: uunpkhi z2.s, z1.h
+; NOB16B16-NEXT: uunpkhi z3.s, z0.h
+; NOB16B16-NEXT: uunpklo z1.s, z1.h
+; NOB16B16-NEXT: uunpklo z0.s, z0.h
+; NOB16B16-NEXT: ptrue p0.s
+; NOB16B16-NEXT: lsl z2.s, z2.s, #16
+; NOB16B16-NEXT: lsl z3.s, z3.s, #16
+; NOB16B16-NEXT: lsl z1.s, z1.s, #16
+; NOB16B16-NEXT: lsl z0.s, z0.s, #16
+; NOB16B16-NEXT: fsub z2.s, z3.s, z2.s
+; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fsub_nxv8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
+; B16B16-NEXT: ret
+ %res = fsub <vscale x 8 x bfloat> %a, %b
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll b/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
new file mode 100644
index 00000000000000..65d273d1209827
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FCEIL
+;
+
+define <vscale x 2 x bfloat> @frintp_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintp_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintp_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintp_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintp_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintp_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintp z1.s, p0/m, z1.s
+; CHECK-NEXT: frintp z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FFLOOR
+;
+
+define <vscale x 2 x bfloat> @frintm_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintm_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.floor.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintm_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintm_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.floor.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintm_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintm_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintm z1.s, p0/m, z1.s
+; CHECK-NEXT: frintm z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.floor.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FNEARBYINT
+;
+
+define <vscale x 2 x bfloat> @frinti_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frinti_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frinti_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frinti_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frinti_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frinti_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frinti z1.s, p0/m, z1.s
+; CHECK-NEXT: frinti z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FRINT
+;
+
+define <vscale x 2 x bfloat> @frintx_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintx_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.rint.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintx_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintx_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.rint.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintx_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintx_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintx z1.s, p0/m, z1.s
+; CHECK-NEXT: frintx z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.rint.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; ROUND
+;
+
+define <vscale x 2 x bfloat> @frinta_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frinta_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.round.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frinta_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frinta_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.round.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frinta_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frinta_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frinta z1.s, p0/m, z1.s
+; CHECK-NEXT: frinta z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.round.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; ROUNDEVEN
+;
+
+define <vscale x 2 x bfloat> @frintn_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintn_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintn_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintn_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintn_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintn_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintn z1.s, p0/m, z1.s
+; CHECK-NEXT: frintn z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FTRUNC
+;
+
+define <vscale x 2 x bfloat> @frintz_nxv2bf16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: frintz_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16(<vscale x 2 x bfloat> %a)
+ ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @frintz_nxv4bf16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: frintz_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16(<vscale x 4 x bfloat> %a)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @frintz_nxv8bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: frintz_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: lsl z1.s, z1.s, #16
+; CHECK-NEXT: lsl z0.s, z0.s, #16
+; CHECK-NEXT: frintz z1.s, p0/m, z1.s
+; CHECK-NEXT: frintz z0.s, p0/m, z0.s
+; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16(<vscale x 8 x bfloat> %a)
+ ret <vscale x 8 x bfloat> %res
+}
+
+declare <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.floor.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.floor.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.floor.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.rint.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.rint.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.rint.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.round.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.round.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.round.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16( <vscale x 8 x bfloat>)
+
+declare <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16( <vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16( <vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16( <vscale x 8 x bfloat>)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index e195026c13d27a..0571960539aedc 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -8840,19 +8840,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
@@ -8861,7 +8861,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
@@ -8991,19 +8991,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
@@ -9247,29 +9246,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX940-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX940-NEXT: v_add3_u32 v0, v0, v6, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v7, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
@@ -9392,27 +9391,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v0, v0, v6, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
@@ -9665,7 +9663,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
+; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
@@ -9684,25 +9682,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX940-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX940-NEXT: s_movk_i32 s10, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX940-NEXT: s_mov_b32 s11, 0x7060302
; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Loop Header: Depth=1
; GFX940-NEXT: ; Child Loop BB28_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX940-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[8:9]
; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10
+; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc
; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
@@ -9723,7 +9721,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v10, s[4:7], 0 offen sc0
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB28_4
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
@@ -9907,7 +9905,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
+; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9926,23 +9924,22 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX90A-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[8:9]
+; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc
; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
@@ -9962,7 +9959,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v10, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
@@ -10309,19 +10306,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
@@ -10330,7 +10327,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
@@ -10460,19 +10457,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
@@ -10716,29 +10712,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX940-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX940-NEXT: v_add3_u32 v0, v0, v6, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v7, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
@@ -10861,27 +10857,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v0, v0, v6, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
@@ -11116,19 +11111,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
@@ -11137,7 +11132,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
@@ -11267,19 +11262,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
@@ -11523,29 +11517,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX940-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX940-NEXT: v_add3_u32 v0, v0, v6, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v7, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
@@ -11668,27 +11662,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v0, v0, v6, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
@@ -11922,29 +11915,29 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s4, s6, 0x400
; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX940-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX940-NEXT: v_add3_u32 v0, v0, v6, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v7, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
@@ -12067,27 +12060,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v0, v0, v6, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 1ae1204e3cde18..b639697f16453d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -15317,29 +15317,28 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -15594,29 +15593,28 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -15885,33 +15883,32 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v0, v0, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
@@ -16160,36 +16157,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB71_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16430,36 +16426,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB72_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16716,36 +16711,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dword v1, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[0:1]
+; GFX90A-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v2, v2, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB73_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -16999,29 +16993,28 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -17276,38 +17269,37 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB75_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17556,29 +17548,28 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -17827,36 +17818,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB77_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18100,29 +18090,28 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -18371,36 +18360,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB79_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 361cc1e9e6c1db..b880b70064929f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -18660,29 +18660,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -18989,29 +18988,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -19320,29 +19318,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -19650,36 +19647,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB81_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19969,36 +19965,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB82_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20291,36 +20286,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB83_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -20626,29 +20620,28 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -20954,38 +20947,37 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -21283,29 +21275,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -21607,36 +21598,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -21931,29 +21921,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -22255,36 +22244,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB89_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -22579,29 +22567,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v2, s9
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -22903,36 +22890,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v4, v4, v6, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB91_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 295ae94902da73..bcfcf4406b6457 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -5930,32 +5930,31 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
@@ -6238,32 +6237,31 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
@@ -6542,35 +6540,34 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v3, v0
+; GFX90A-NEXT: ds_read_b32 v4, v0
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v1, v1, v6, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6840,35 +6837,34 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532
+; GFX90A-NEXT: ds_read_b32 v4, v0 offset:65532
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8
-; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; GFX90A-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX90A-NEXT: v_add3_u32 v1, v1, v6, s8
+; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v6, v6
+; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9
-; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
+; GFX90A-NEXT: v_perm_b32 v1, v5, v1, s9
+; GFX90A-NEXT: ds_cmpst_rtn_b32 v1, v0, v4, v1 offset:65532
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
More information about the llvm-commits
mailing list