[llvm] Nvptx port LowerBITCAST to SelectionDAG (PR #120903)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 25 07:48:10 PST 2025
https://github.com/GrumpyPigSkin updated https://github.com/llvm/llvm-project/pull/120903
>From e1b6fce91b52484f6cf72690acfe38c20a5de5ef Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Sun, 22 Dec 2024 15:08:25 +0000
Subject: [PATCH 01/15] Ported LowerBITCAST from NVPTXISelLowering.cpp to
SelectionDAG/LegalizeTypes.cpp.
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 9 +++--
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 33 +++++++++++++++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 26 +--------------
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 2 --
5 files changed, 42 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index be7521f3416850..8a6bfc0c66cd82 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2174,8 +2174,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
- // This should only occur in unusual situations like bitcasting to an
- // x86_fp80, so just turn it into a store+load
+
+ // Use the custom lowering.
+ if (const auto Res = LowerBitcast(N)) {
+ return Res;
+ }
+
+ // If it fails fall back to the default method
return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b6abad830c371e..8df3e5ec163e8f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -910,6 +910,39 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
+static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT,
+ SDValue Value) {
+ if (Value->getValueType(0) == VT)
+ return Value;
+ return DAG.getNode(ISD::BITCAST, DL, VT, Value);
+}
+
+SDValue DAGTypeLegalizer::LowerBitcast(SDNode *Node) const {
+ assert(Node->getOpcode() == ISD::BITCAST ||
+ Node->getOpcode() == ISD::FP_ROUND && "Unexpected opcode!");
+ // Handle bitcasting from v2i8 without hitting the default promotion
+ // strategy which goes through stack memory.
+ EVT FromVT = Node->getOperand(0)->getValueType(0);
+ if (FromVT != MVT::v2i8) {
+ return SDValue();
+ }
+
+ // Pack vector elements into i16 and bitcast to final type
+ SDLoc DL(Node);
+ SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
+ Node->getOperand(0), DAG.getIntPtrConstant(0, DL));
+ SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
+ Node->getOperand(0), DAG.getIntPtrConstant(1, DL));
+ SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
+ SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
+ SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
+ SDValue AsInt = DAG.getNode(
+ ISD::OR, DL, MVT::i16,
+ {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
+ EVT ToVT = Node->getValueType(0);
+ return MaybeBitcast(DAG, DL, ToVT, AsInt);
+}
+
/// Replace the node's results with custom code provided by the target and
/// return "true", or do nothing and return "false".
/// The last parameter is FALSE if we are dealing with a node with legal
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 571a710cc92a34..30951112069ed5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -216,6 +216,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue BitConvertToInteger(SDValue Op);
SDValue BitConvertVectorToIntegerVector(SDValue Op);
SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
+ SDValue LowerBitcast(SDNode *N) const;
bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
bool CustomWidenLowerNode(SDNode *N, EVT VT);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c1f717694a4c7..2eaeb624004730 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2086,30 +2086,6 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
-SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
- // Handle bitcasting from v2i8 without hitting the default promotion
- // strategy which goes through stack memory.
- EVT FromVT = Op->getOperand(0)->getValueType(0);
- if (FromVT != MVT::v2i8) {
- return Op;
- }
-
- // Pack vector elements into i16 and bitcast to final type
- SDLoc DL(Op);
- SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
- SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
- SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
- SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
- SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
- SDValue AsInt = DAG.getNode(
- ISD::OR, DL, MVT::i16,
- {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
- EVT ToVT = Op->getValueType(0);
- return MaybeBitcast(DAG, DL, ToVT, AsInt);
-}
-
// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
// would get lowered as two constant loads and vector-packing move.
// Instead we want just a constant move:
@@ -2619,7 +2595,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::BITCAST:
- return LowerBITCAST(Op, DAG);
+ return SDValue();
case ISD::EXTRACT_SUBVECTOR:
return Op;
case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 4a98fe21b81dc6..446ff1536d36cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -265,8 +265,6 @@ class NVPTXTargetLowering : public TargetLowering {
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
- SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
-
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
>From 3320d5585b2b37df05f4dcd54cf9ae11aba42e00 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Sun, 22 Dec 2024 15:12:47 +0000
Subject: [PATCH 02/15] Removed redundant assert check
---
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 1 -
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 3 +--
2 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8a6bfc0c66cd82..bcb59e3c2aef3e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2174,7 +2174,6 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
-
// Use the custom lowering.
if (const auto Res = LowerBitcast(N)) {
return Res;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 8df3e5ec163e8f..4aecf667b2cee1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -918,8 +918,7 @@ static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT,
}
SDValue DAGTypeLegalizer::LowerBitcast(SDNode *Node) const {
- assert(Node->getOpcode() == ISD::BITCAST ||
- Node->getOpcode() == ISD::FP_ROUND && "Unexpected opcode!");
+ assert(Node->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
EVT FromVT = Node->getOperand(0)->getValueType(0);
>From d7cb1339321d41b6f489450f16ea529eac194889 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Sun, 22 Dec 2024 16:54:58 +0000
Subject: [PATCH 03/15] Addressed Most Code Review Comments
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 3 +--
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 22 ++++++++-----------
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index bcb59e3c2aef3e..05cbcf3297ac3d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2175,9 +2175,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
// Use the custom lowering.
- if (const auto Res = LowerBitcast(N)) {
+ if (SDValue Res = LowerBitcast(N))
return Res;
- }
// If it fails fall back to the default method
return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 4aecf667b2cee1..b91530d6f0bb69 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -910,21 +910,13 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
-static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT,
- SDValue Value) {
- if (Value->getValueType(0) == VT)
- return Value;
- return DAG.getNode(ISD::BITCAST, DL, VT, Value);
-}
-
SDValue DAGTypeLegalizer::LowerBitcast(SDNode *Node) const {
assert(Node->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
// Handle bitcasting from v2i8 without hitting the default promotion
// strategy which goes through stack memory.
EVT FromVT = Node->getOperand(0)->getValueType(0);
- if (FromVT != MVT::v2i8) {
+ if (FromVT != MVT::v2i8)
return SDValue();
- }
// Pack vector elements into i16 and bitcast to final type
SDLoc DL(Node);
@@ -932,14 +924,18 @@ SDValue DAGTypeLegalizer::LowerBitcast(SDNode *Node) const {
Node->getOperand(0), DAG.getIntPtrConstant(0, DL));
SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
Node->getOperand(0), DAG.getIntPtrConstant(1, DL));
+
SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
- SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
+
+ EVT ShiftAmtTy = TLI.getShiftAmountTy(Extend1.getValueType(), DAG.getDataLayout());
+ SDValue ShiftConst = DAG.getShiftAmountConstant(8, ShiftAmtTy, DL);
SDValue AsInt = DAG.getNode(
- ISD::OR, DL, MVT::i16,
- {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
+ ISD::OR, DL, MVT::i16, Extend0,
+ DAG.getNode(ISD::SHL, DL, Extend1.getValueType(), Extend1, ShiftConst));
EVT ToVT = Node->getValueType(0);
- return MaybeBitcast(DAG, DL, ToVT, AsInt);
+
+ return DAG.getBitcast( ToVT, AsInt);
}
/// Replace the node's results with custom code provided by the target and
>From b67448702e3fde5b94ceaeed6d3a78b75cf248da Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Sun, 22 Dec 2024 17:00:01 +0000
Subject: [PATCH 04/15] Applied code formatting
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b91530d6f0bb69..8f42877bcb8b66 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -924,18 +924,19 @@ SDValue DAGTypeLegalizer::LowerBitcast(SDNode *Node) const {
Node->getOperand(0), DAG.getIntPtrConstant(0, DL));
SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
Node->getOperand(0), DAG.getIntPtrConstant(1, DL));
-
+
SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
-
- EVT ShiftAmtTy = TLI.getShiftAmountTy(Extend1.getValueType(), DAG.getDataLayout());
+
+ EVT ShiftAmtTy =
+ TLI.getShiftAmountTy(Extend1.getValueType(), DAG.getDataLayout());
SDValue ShiftConst = DAG.getShiftAmountConstant(8, ShiftAmtTy, DL);
SDValue AsInt = DAG.getNode(
ISD::OR, DL, MVT::i16, Extend0,
DAG.getNode(ISD::SHL, DL, Extend1.getValueType(), Extend1, ShiftConst));
EVT ToVT = Node->getValueType(0);
-
- return DAG.getBitcast( ToVT, AsInt);
+
+ return DAG.getBitcast(ToVT, AsInt);
}
/// Replace the node's results with custom code provided by the target and
>From f8dadde2fa135391b5bd60f8629546f3a639a228 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Mon, 23 Dec 2024 17:23:33 +0000
Subject: [PATCH 05/15] Generalised bit packing and unpacking
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 14 ++-
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 112 ++++++++++++++----
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 37 ------
4 files changed, 100 insertions(+), 67 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 05cbcf3297ac3d..0eaf2a5dc44f1c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -474,7 +474,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
- break;
+ // Try and use in-register bitcast
+ if (SDValue Res = LowerBitcastInRegister(N))
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+ Res);
+ // Fallback to stack load store
+ break;
+
case TargetLowering::TypePromoteInteger:
if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector())
// The input promotes to the same size. Convert the promoted value.
@@ -2174,11 +2180,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
- // Use the custom lowering.
- if (SDValue Res = LowerBitcast(N))
+ // Try and use in register bitcast
+ if (SDValue Res = LowerBitcastInRegister(N))
return Res;
- // If it fails fall back to the default method
+ // Fallback
return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 8f42877bcb8b66..8220b9a9ffc9fd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -910,33 +910,95 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
-SDValue DAGTypeLegalizer::LowerBitcast(SDNode *Node) const {
- assert(Node->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
- // Handle bitcasting from v2i8 without hitting the default promotion
- // strategy which goes through stack memory.
- EVT FromVT = Node->getOperand(0)->getValueType(0);
- if (FromVT != MVT::v2i8)
+SDValue DAGTypeLegalizer::PackBitcastInRegister(SDNode *N) const {
+ assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
+
+ EVT FromVT = N->getOperand(0)->getValueType(0);
+ EVT ToVT = N->getValueType(0);
+
+ if (!FromVT.isVector() || !ToVT.isInteger())
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // Get the number of elements we need to pack into the integer
+ unsigned NumElems = FromVT.getVectorNumElements();
+ EVT ElemVT = FromVT.getVectorElementType();
+ unsigned ElemBits = ElemVT.getSizeInBits();
+
+ EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), ElemBits * NumElems);
+ SDValue Packed = DAG.getConstant(0, DL, PackVT);
+
+ // Determine endianness
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
+ SDValue Elem =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, N->getOperand(0),
+ DAG.getIntPtrConstant(ElementIndex, DL));
+ SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, PackVT, Elem);
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
+ SDValue ShiftedElem =
+ DAG.getNode(ISD::SHL, DL, PackVT, ExtElem, ShiftAmount);
+
+ Packed = DAG.getNode(ISD::OR, DL, PackVT, Packed, ShiftedElem);
+ }
+
+ return DAG.getBitcast(ToVT, Packed);
+}
+
+
+SDValue DAGTypeLegalizer::UnpackBitcastInRegister(SDNode *N) const {
+ assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
+ EVT FromVT = N->getOperand(0)->getValueType(0);
+ EVT ToVT = N->getValueType(0);
+
+ if (!FromVT.isInteger() || !ToVT.isVector())
return SDValue();
- // Pack vector elements into i16 and bitcast to final type
- SDLoc DL(Node);
- SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Node->getOperand(0), DAG.getIntPtrConstant(0, DL));
- SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
- Node->getOperand(0), DAG.getIntPtrConstant(1, DL));
-
- SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
- SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
-
- EVT ShiftAmtTy =
- TLI.getShiftAmountTy(Extend1.getValueType(), DAG.getDataLayout());
- SDValue ShiftConst = DAG.getShiftAmountConstant(8, ShiftAmtTy, DL);
- SDValue AsInt = DAG.getNode(
- ISD::OR, DL, MVT::i16, Extend0,
- DAG.getNode(ISD::SHL, DL, Extend1.getValueType(), Extend1, ShiftConst));
- EVT ToVT = Node->getValueType(0);
-
- return DAG.getBitcast(ToVT, AsInt);
+ SDLoc DL(N);
+
+ unsigned NumElems = ToVT.getVectorNumElements();
+ EVT ElemVT = ToVT.getVectorElementType();
+ unsigned ElemBits = ElemVT.getSizeInBits();
+
+ // Ensure the integer has enough bits
+ unsigned PackedBits = FromVT.getSizeInBits();
+ assert(PackedBits >= ElemBits * NumElems &&
+ "Packed type does not have enough bits to represent the vector!");
+
+ // Determine endianness
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // Hold all the vector elements
+ SmallVector<SDValue, 8> Elements;
+ Elements.reserve(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
+ unsigned ShiftAmountVal = ElemBits * ElementIndex;
+
+ SDValue ShiftAmount =
+ DAG.getShiftAmountConstant(ShiftAmountVal, FromVT, DL);
+ SDValue Shifted =
+ DAG.getNode(ISD::SRL, DL, FromVT, N->getOperand(0), ShiftAmount);
+ SDValue Element = DAG.getNode(ISD::TRUNCATE, DL, ElemVT, Shifted);
+ Elements.push_back(Element);
+ }
+
+ return DAG.getBuildVector(ToVT, DL, Elements);
+}
+
+
+SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
+ // Try the pack, if we aren't going from vector -> scalar it will backout immediately.
+ if (SDValue Res = PackBitcastInRegister(N)) {
+ return Res;
+ }
+
+ // If we get here then try and unpack the bitcast
+ return UnpackBitcastInRegister(N);
}
/// Replace the node's results with custom code provided by the target and
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 30951112069ed5..dd45b1e2f10896 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -216,7 +216,9 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue BitConvertToInteger(SDValue Op);
SDValue BitConvertVectorToIntegerVector(SDValue Op);
SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
- SDValue LowerBitcast(SDNode *N) const;
+ SDValue PackBitcastInRegister(SDNode *N) const;
+ SDValue UnpackBitcastInRegister(SDNode *N) const;
+ SDValue LowerBitcastInRegister(SDNode *N) const;
bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
bool CustomWidenLowerNode(SDNode *N, EVT VT);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2eaeb624004730..7d06139120d712 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -472,13 +472,6 @@ VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
return VectorInfo;
}
-static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT,
- SDValue Value) {
- if (Value->getValueType(0) == VT)
- return Value;
- return DAG.getNode(ISD::BITCAST, DL, VT, Value);
-}
-
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
const NVPTXSubtarget &STI)
@@ -622,9 +615,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
- // Custom conversions to/from v2i8.
- setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
-
// Only logical ops can be done on v4i8 directly, others must be done
// elementwise.
setOperationAction(
@@ -2594,8 +2584,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return Op;
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
- case ISD::BITCAST:
- return SDValue();
case ISD::EXTRACT_SUBVECTOR:
return Op;
case ISD::EXTRACT_VECTOR_ELT:
@@ -5178,28 +5166,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
-static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- // Handle bitcasting to v2i8 without hitting the default promotion
- // strategy which goes through stack memory.
- SDValue Op(Node, 0);
- EVT ToVT = Op->getValueType(0);
- if (ToVT != MVT::v2i8) {
- return;
- }
-
- // Bitcast to i16 and unpack elements into a vector
- SDLoc DL(Node);
- SDValue AsInt = MaybeBitcast(DAG, DL, MVT::i16, Op->getOperand(0));
- SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
- SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
- SDValue Vec1 =
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
- DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
- Results.push_back(
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
-}
-
/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
@@ -5435,9 +5401,6 @@ void NVPTXTargetLowering::ReplaceNodeResults(
switch (N->getOpcode()) {
default:
report_fatal_error("Unhandled custom legalization");
- case ISD::BITCAST:
- ReplaceBITCAST(N, DAG, Results);
- return;
case ISD::LOAD:
ReplaceLoadVector(N, DAG, Results);
return;
>From a6ca08a5a571dd29c3465f2e2b8ece82853ab07b Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Mon, 23 Dec 2024 17:25:29 +0000
Subject: [PATCH 06/15] Formatting
---
.../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 11 +++++------
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 5 ++---
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 0eaf2a5dc44f1c..15aca9e5a9d48e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -475,12 +475,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
// Try and use in-register bitcast
- if (SDValue Res = LowerBitcastInRegister(N))
- return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
- Res);
- // Fallback to stack load store
- break;
-
+ if (SDValue Res = LowerBitcastInRegister(N))
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Res);
+ // Fallback to stack load store
+ break;
+
case TargetLowering::TypePromoteInteger:
if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector())
// The input promotes to the same size. Convert the promoted value.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 8220b9a9ffc9fd..27393907fd36a0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -948,7 +948,6 @@ SDValue DAGTypeLegalizer::PackBitcastInRegister(SDNode *N) const {
return DAG.getBitcast(ToVT, Packed);
}
-
SDValue DAGTypeLegalizer::UnpackBitcastInRegister(SDNode *N) const {
assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
EVT FromVT = N->getOperand(0)->getValueType(0);
@@ -990,9 +989,9 @@ SDValue DAGTypeLegalizer::UnpackBitcastInRegister(SDNode *N) const {
return DAG.getBuildVector(ToVT, DL, Elements);
}
-
SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
- // Try the pack, if we aren't going from vector -> scalar it will backout immediately.
+ // Try the pack, if we aren't going from vector -> scalar it will backout
+ // immediately.
if (SDValue Res = PackBitcastInRegister(N)) {
return Res;
}
>From 75a533c088097f2fe1743baf3f6a8c7fb558946d Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Tue, 24 Dec 2024 10:25:23 +0000
Subject: [PATCH 07/15] Check for only scalar integer types
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 27393907fd36a0..8375339f378c6f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -916,7 +916,7 @@ SDValue DAGTypeLegalizer::PackBitcastInRegister(SDNode *N) const {
EVT FromVT = N->getOperand(0)->getValueType(0);
EVT ToVT = N->getValueType(0);
- if (!FromVT.isVector() || !ToVT.isInteger())
+ if (!FromVT.isVector() || !ToVT.isScalarInteger())
return SDValue();
SDLoc DL(N);
@@ -953,7 +953,7 @@ SDValue DAGTypeLegalizer::UnpackBitcastInRegister(SDNode *N) const {
EVT FromVT = N->getOperand(0)->getValueType(0);
EVT ToVT = N->getValueType(0);
- if (!FromVT.isInteger() || !ToVT.isVector())
+ if (!FromVT.isScalarInteger() || !ToVT.isVector())
return SDValue();
SDLoc DL(N);
>From 686f69810e96eb12eb6c86c3b08c559a3aad167d Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Thu, 16 Jan 2025 18:50:07 +0000
Subject: [PATCH 08/15] Merged function
---
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 122 ++++++++----------
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 -
2 files changed, 53 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 8375339f378c6f..8d5549f9c9f8e3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -910,94 +910,78 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
-SDValue DAGTypeLegalizer::PackBitcastInRegister(SDNode *N) const {
+SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
+ // Try the pack, if we aren't going from vector -> scalar it will backout
+ // immediately.
assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
EVT FromVT = N->getOperand(0)->getValueType(0);
EVT ToVT = N->getValueType(0);
- if (!FromVT.isVector() || !ToVT.isScalarInteger())
- return SDValue();
-
SDLoc DL(N);
- // Get the number of elements we need to pack into the integer
- unsigned NumElems = FromVT.getVectorNumElements();
- EVT ElemVT = FromVT.getVectorElementType();
- unsigned ElemBits = ElemVT.getSizeInBits();
-
- EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), ElemBits * NumElems);
- SDValue Packed = DAG.getConstant(0, DL, PackVT);
-
- // Determine endianness
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
- for (unsigned I = 0; I < NumElems; ++I) {
- unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
- SDValue Elem =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, N->getOperand(0),
- DAG.getIntPtrConstant(ElementIndex, DL));
- SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, PackVT, Elem);
- SDValue ShiftAmount = DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
- SDValue ShiftedElem =
- DAG.getNode(ISD::SHL, DL, PackVT, ExtElem, ShiftAmount);
-
- Packed = DAG.getNode(ISD::OR, DL, PackVT, Packed, ShiftedElem);
- }
-
- return DAG.getBitcast(ToVT, Packed);
-}
-
-SDValue DAGTypeLegalizer::UnpackBitcastInRegister(SDNode *N) const {
- assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
- EVT FromVT = N->getOperand(0)->getValueType(0);
- EVT ToVT = N->getValueType(0);
-
- if (!FromVT.isScalarInteger() || !ToVT.isVector())
- return SDValue();
-
- SDLoc DL(N);
+ // Pack the values in register
+ if (FromVT.isVector() && ToVT.isScalarInteger()) {
+
+ EVT ElemVT = FromVT.getVectorElementType();
+ unsigned NumElems = FromVT.getVectorNumElements();
+ unsigned ElemBits = ElemVT.getSizeInBits();
+
+ unsigned PackedBits = ToVT.getSizeInBits();
+ assert(PackedBits >= ElemBits * NumElems &&
+ "Scalar type does not have enough bits to pack vector values.");
+
+ EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), ElemBits * NumElems);
+ SDValue Packed = DAG.getConstant(0, DL, PackVT);
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
+ SDValue Elem =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, N->getOperand(0),
+ DAG.getIntPtrConstant(ElementIndex, DL));
+ SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, PackVT, Elem);
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
+ SDValue ShiftedElem =
+ DAG.getNode(ISD::SHL, DL, PackVT, ExtElem, ShiftAmount);
+
+ Packed = DAG.getNode(ISD::OR, DL, PackVT, Packed, ShiftedElem);
+ }
- unsigned NumElems = ToVT.getVectorNumElements();
- EVT ElemVT = ToVT.getVectorElementType();
- unsigned ElemBits = ElemVT.getSizeInBits();
+ return DAG.getBitcast(ToVT, Packed);
- // Ensure the integer has enough bits
- unsigned PackedBits = FromVT.getSizeInBits();
- assert(PackedBits >= ElemBits * NumElems &&
- "Packed type does not have enough bits to represent the vector!");
+ } else if (FromVT.isScalarInteger() && ToVT.isVector()) {
- // Determine endianness
- bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+ EVT ElemVT = ToVT.getVectorElementType();
+ unsigned NumElems = ToVT.getVectorNumElements();
+ unsigned ElemBits = ElemVT.getSizeInBits();
- // Hold all the vector elements
- SmallVector<SDValue, 8> Elements;
- Elements.reserve(NumElems);
+ // Ensure the integer has enough bits
+ unsigned PackedBits = FromVT.getSizeInBits();
+ assert(PackedBits >= ElemBits * NumElems &&
+ "Vector does not have enough bits to unpack scalar type.");
- for (unsigned I = 0; I < NumElems; ++I) {
- unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
- unsigned ShiftAmountVal = ElemBits * ElementIndex;
+ // Hold all the vector elements
+ SmallVector<SDValue, 8> Elements;
+ Elements.reserve(NumElems);
- SDValue ShiftAmount =
- DAG.getShiftAmountConstant(ShiftAmountVal, FromVT, DL);
- SDValue Shifted =
- DAG.getNode(ISD::SRL, DL, FromVT, N->getOperand(0), ShiftAmount);
- SDValue Element = DAG.getNode(ISD::TRUNCATE, DL, ElemVT, Shifted);
- Elements.push_back(Element);
- }
+ for (unsigned I = 0; I < NumElems; ++I) {
+ unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
+ unsigned ShiftAmountVal = ElemBits * ElementIndex;
- return DAG.getBuildVector(ToVT, DL, Elements);
-}
+ SDValue ShiftAmount =
+ DAG.getShiftAmountConstant(ShiftAmountVal, FromVT, DL);
+ SDValue Shifted =
+ DAG.getNode(ISD::SRL, DL, FromVT, N->getOperand(0), ShiftAmount);
+ SDValue Element = DAG.getNode(ISD::TRUNCATE, DL, ElemVT, Shifted);
+ Elements.push_back(Element);
+ }
-SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
- // Try the pack, if we aren't going from vector -> scalar it will backout
- // immediately.
- if (SDValue Res = PackBitcastInRegister(N)) {
- return Res;
+ return DAG.getBuildVector(ToVT, DL, Elements);
}
- // If we get here then try and unpack the bitcast
- return UnpackBitcastInRegister(N);
+ return {};
}
/// Replace the node's results with custom code provided by the target and
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index dd45b1e2f10896..9d0c970e350f99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -216,8 +216,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue BitConvertToInteger(SDValue Op);
SDValue BitConvertVectorToIntegerVector(SDValue Op);
SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
- SDValue PackBitcastInRegister(SDNode *N) const;
- SDValue UnpackBitcastInRegister(SDNode *N) const;
SDValue LowerBitcastInRegister(SDNode *N) const;
bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
bool CustomWidenLowerNode(SDNode *N, EVT VT);
>From d8a82eb256213db9a8a333680de064c8a02a9891 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Thu, 16 Jan 2025 18:50:32 +0000
Subject: [PATCH 09/15] Formatting
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 8d5549f9c9f8e3..dfe389c5632436 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -924,7 +924,7 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
// Pack the values in register
if (FromVT.isVector() && ToVT.isScalarInteger()) {
-
+
EVT ElemVT = FromVT.getVectorElementType();
unsigned NumElems = FromVT.getVectorNumElements();
unsigned ElemBits = ElemVT.getSizeInBits();
@@ -942,7 +942,8 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, N->getOperand(0),
DAG.getIntPtrConstant(ElementIndex, DL));
SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, PackVT, Elem);
- SDValue ShiftAmount = DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
+ SDValue ShiftAmount =
+ DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
SDValue ShiftedElem =
DAG.getNode(ISD::SHL, DL, PackVT, ExtElem, ShiftAmount);
>From e5f7304437c26fa7e87539af6bb17076a73f1cf5 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Thu, 16 Jan 2025 18:55:07 +0000
Subject: [PATCH 10/15] Updated comments
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index dfe389c5632436..3ffb4427d77d58 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -911,8 +911,7 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
}
SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
- // Try the pack, if we aren't going from vector -> scalar it will backout
- // immediately.
+ // Lower a bitcast into in-register shift operations
assert(N->getOpcode() == ISD::BITCAST && "Unexpected opcode!");
EVT FromVT = N->getOperand(0)->getValueType(0);
@@ -922,7 +921,6 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
- // Pack the values in register
if (FromVT.isVector() && ToVT.isScalarInteger()) {
EVT ElemVT = FromVT.getVectorElementType();
@@ -958,12 +956,10 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
unsigned NumElems = ToVT.getVectorNumElements();
unsigned ElemBits = ElemVT.getSizeInBits();
- // Ensure the integer has enough bits
unsigned PackedBits = FromVT.getSizeInBits();
assert(PackedBits >= ElemBits * NumElems &&
"Vector does not have enough bits to unpack scalar type.");
- // Hold all the vector elements
SmallVector<SDValue, 8> Elements;
Elements.reserve(NumElems);
>From 8d01cbe78c524000d4841518c0ec3c3a505f2c47 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Mon, 20 Jan 2025 23:18:20 +0000
Subject: [PATCH 11/15] Updated failing tests
---
llvm/test/CodeGen/AArch64/bitcast.ll | 28 +-
llvm/test/CodeGen/AArch64/shufflevector.ll | 60 +-
llvm/test/CodeGen/AMDGPU/build_vector-r600.ll | 14 +-
llvm/test/CodeGen/AMDGPU/ctpop16.ll | 22 +-
llvm/test/CodeGen/AMDGPU/idot8s.ll | 1548 ++--
llvm/test/CodeGen/AMDGPU/idot8u.ll | 2626 ++++---
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 895 +--
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 61 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 3777 +++++-----
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 121 +-
llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 6203 +++++++++++++++--
llvm/test/CodeGen/AMDGPU/min.ll | 136 +-
llvm/test/CodeGen/AMDGPU/r600.bitcast.ll | 39 +-
llvm/test/CodeGen/AMDGPU/shl.ll | 12 +-
llvm/test/CodeGen/AMDGPU/sra.ll | 34 +-
llvm/test/CodeGen/Mips/cconv/vector.ll | 856 ++-
.../Thumb2/LowOverheadLoops/fast-fp-loops.ll | 46 +-
llvm/test/CodeGen/Thumb2/active_lane_mask.ll | 43 +-
llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll | 536 +-
llvm/test/CodeGen/Thumb2/mve-masked-load.ll | 506 +-
llvm/test/CodeGen/Thumb2/mve-masked-store.ll | 726 +-
llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll | 572 +-
.../CodeGen/X86/avxneconvert-intrinsics.ll | 178 +-
llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 156 +-
llvm/test/CodeGen/X86/pr64655.ll | 46 +-
25 files changed, 11550 insertions(+), 7691 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 39f2572d9fd354..d75edc3a7bb43d 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -49,12 +49,15 @@ define <4 x i16> @foo2(<2 x i32> %a) {
define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-LABEL: bitcast_v4i8_i32:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: umov w8, v0.h[0]
+; CHECK-SD-NEXT: umov w9, v0.h[1]
+; CHECK-SD-NEXT: umov w10, v0.h[2]
+; CHECK-SD-NEXT: and w8, w8, #0xff
+; CHECK-SD-NEXT: bfi w8, w9, #8, #8
+; CHECK-SD-NEXT: umov w9, v0.h[3]
+; CHECK-SD-NEXT: bfi w8, w10, #16, #8
+; CHECK-SD-NEXT: orr w0, w8, w9, lsl #24
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_v4i8_i32:
@@ -99,15 +102,10 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){
define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-LABEL: bitcast_v2i16_i32:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
-; CHECK-SD-NEXT: ldr w0, [sp, #12]
-; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: bfi w0, w8, #16, #16
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_v2i16_i32:
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 0f5b240e387ed0..0221ffcb19063d 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -229,15 +229,17 @@ define <2 x i1> @shufflevector_v2i1(<2 x i1> %a, <2 x i1> %b){
define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-LABEL: shufflevector_v4i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: ext v0.8b, v1.8b, v0.8b, #6
-; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v0.4h
-; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: add sp, sp, #16
-; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: umov w8, v0.h[1]
+; CHECK-SD-NEXT: umov w9, v0.h[2]
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: umov w10, v1.h[0]
+; CHECK-SD-NEXT: and w8, w8, #0xff
+; CHECK-SD-NEXT: bfi w8, w9, #8, #8
+; CHECK-SD-NEXT: umov w9, v1.h[3]
+; CHECK-SD-NEXT: bfi w8, w10, #16, #8
+; CHECK-SD-NEXT: orr w0, w8, w9, lsl #24
+; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v4i8:
; CHECK-GI: // %bb.0:
@@ -285,15 +287,11 @@ define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){
define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-LABEL: shufflevector_v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
-; CHECK-SD-NEXT: ldr w0, [sp, #12]
-; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov w0, v0.s[1]
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: fmov w8, s1
+; CHECK-SD-NEXT: bfi w0, w8, #16, #16
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v2i16:
@@ -462,14 +460,13 @@ define <2 x i1> @shufflevector_v2i1_zeroes(<2 x i1> %a, <2 x i1> %b){
define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-LABEL: shufflevector_v4i8_zeroes:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
-; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: add sp, sp, #16
-; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: umov w8, v0.h[0]
+; CHECK-SD-NEXT: and w9, w8, #0xff
+; CHECK-SD-NEXT: orr w9, w9, w9, lsl #8
+; CHECK-SD-NEXT: bfi w9, w8, #16, #8
+; CHECK-SD-NEXT: orr w0, w9, w8, lsl #24
+; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI: // %bb.0:
@@ -495,16 +492,9 @@ define <32 x i8> @shufflevector_v32i8_zeroes(<32 x i8> %a, <32 x i8> %b){
define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-LABEL: shufflevector_v2i16_zeroes:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: dup v1.2s, v0.s[0]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strh w9, [sp, #12]
-; CHECK-SD-NEXT: mov w8, v1.s[1]
-; CHECK-SD-NEXT: strh w8, [sp, #14]
-; CHECK-SD-NEXT: ldr w0, [sp, #12]
-; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: bfi w0, w0, #16, #16
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll b/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll
index 2abcbbcdd1bc6e..1e061841bd2c50 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector-r600.ll
@@ -45,12 +45,12 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
; R600-LABEL: build_vector_v2i16:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: MOV T4.X, literal.x,
-; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45)
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
@@ -61,14 +61,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
; R600-LABEL: build_vector_v2i16_trunc:
; R600: ; %bb.0:
; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; R600-NEXT: OR_INT T4.X, PV.W, literal.x,
-; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: OR_INT T0.X, PV.W, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45)
%srl = lshr i32 %a, 16
%trunc = trunc i32 %srl to i16
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 17ab8fc780fb41..71d950fc47facb 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -368,26 +368,26 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PS,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: BCNT_INT T1.W, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT T0.X, PV.W, PS,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index add62a5c39cb14..96bf7b8bd96d90 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -11,63 +11,55 @@
define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc32:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
-; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
-; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
-; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
-; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, s4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
+; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
+; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
+; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -78,10 +70,6 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -119,17 +107,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -169,17 +151,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -191,12 +167,6 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
@@ -214,13 +184,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
@@ -306,71 +270,65 @@ entry:
define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc16:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
-; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
-; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
-; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
+; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc16:
; GFX8: ; %bb.0: ; %entry
@@ -378,7 +336,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -391,11 +348,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -454,11 +406,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
;
; GFX9-LABEL: idot8_acc16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -468,7 +415,6 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ushort v3, v0, s[0:1]
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -527,89 +473,77 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
;
; GFX9-DL-LABEL: idot8_acc16:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
-; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-DL-NEXT: s_endpgm
+; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v16, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v15, v3
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v9, v14, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
+; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc16:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -678,17 +612,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -828,71 +756,65 @@ entry:
define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
-; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
-; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
-; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
-; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
-; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
-; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
-; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
-; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
+; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
+; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
+; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
+; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
+; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
+; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc8:
; GFX8: ; %bb.0: ; %entry
@@ -900,7 +822,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -913,11 +834,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -976,11 +892,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX9-LABEL: idot8_acc8:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -990,7 +901,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1]
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -1049,11 +959,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX9-DL-LABEL: idot8_acc8:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -1063,7 +968,6 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1122,16 +1026,10 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX10-DL-XNACK-LABEL: idot8_acc8:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -1200,17 +1098,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -1351,65 +1243,57 @@ entry:
define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_multiuses_mul1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
-; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
-; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
-; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
-; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
-; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
-; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
-; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
-; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v16, v1, v9, s4
+; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v1, v9, v16
+; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4
+; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v3, v10, v1
+; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4
+; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4
+; GFX7-NEXT: v_mad_i32_i24 v1, v7, v14, v1
+; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
+; GFX7-NEXT: v_mad_i32_i24 v1, v8, v15, v1
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1420,10 +1304,6 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -1463,17 +1343,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1514,17 +1388,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1566,12 +1434,6 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
@@ -1619,12 +1481,6 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
@@ -1742,63 +1598,55 @@ entry:
define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc32_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2
-; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4
-; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0
-; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
-; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0
-; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0
-; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: s_endpgm
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2
+; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
+; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
+; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4
+; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v0
+; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
+; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
+; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4
+; GFX7-NEXT: v_bfe_i32 v15, v0, 4, 4
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4
+; GFX7-NEXT: v_mad_i32_i24 v0, v8, v15, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, v7, v14, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, v6, v13, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, v5, v12, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, v4, v11, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, v1, v9, v0
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1809,10 +1657,6 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4
@@ -1850,17 +1694,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1
; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4
@@ -1900,17 +1738,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -1922,12 +1754,6 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1]
@@ -1945,13 +1771,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1]
@@ -2001,11 +1821,6 @@ entry:
define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc16_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -2015,12 +1830,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
@@ -2073,7 +1887,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2086,251 +1899,198 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2
-; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v18
-; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11
-; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17
-; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
+; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
+; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
+; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
-; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16
-; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
-; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2
-; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
-; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
+; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
+; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
+; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
+; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
+; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
+; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17
+; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2
; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
+; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
+; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10
+; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15
-; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2
-; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
+; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18
+; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2
+; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15
-; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14
-; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
-; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
-; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
+; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
+; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: idot8_acc16_vecMul:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, 12
+; GFX9-NEXT: s_mov_b32 s2, 0x5040100
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ushort v3, v0, s[0:1]
-; GFX9-NEXT: s_mov_b32 s2, 0x5040100
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 4, v2
+; GFX9-NEXT: v_perm_b32 v9, v9, v1, s2
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2
+; GFX9-NEXT: v_perm_b32 v5, v15, v2, s2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1
-; GFX9-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2
-; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2
-; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14
-; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15
-; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 20, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2
+; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2
-; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2
-; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2
-; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16
-; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
-; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15
-; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17
-; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
-; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8
-; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2
-; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2
-; GFX9-NEXT: v_perm_b32 v4, v17, v16, s2
-; GFX9-NEXT: v_perm_b32 v9, v10, v9, s2
-; GFX9-NEXT: v_perm_b32 v10, v15, v14, s2
+; GFX9-NEXT: v_alignbit_b32 v1, v6, v1, 16
+; GFX9-NEXT: v_perm_b32 v6, v14, v13, s2
+; GFX9-NEXT: v_alignbit_b32 v2, v12, v2, 16
+; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_lo_u16 v5, v9, v5
+; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v3, v5, v3
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v4
-; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v10
+; GFX9-NEXT: v_pk_mul_lo_u16 v2, v7, v6
; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u16_e32 v3, v3, v4
-; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v8, v11, v10, s2
; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
+; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
+; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u16_e32 v1, v1, v4
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12
+; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 4, v2
+; GFX9-DL-NEXT: v_perm_b32 v9, v9, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v15, v2, s2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v1
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2
-; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v1
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2
-; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2
-; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2
-; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s2
-; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s2
-; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s2
+; GFX9-DL-NEXT: v_alignbit_b32 v1, v6, v1, 16
+; GFX9-DL-NEXT: v_perm_b32 v6, v14, v13, s2
+; GFX9-DL-NEXT: v_alignbit_b32 v2, v12, v2, 16
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v9, v5
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v4
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v10
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v7, v6
; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4
-; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_perm_b32 v8, v11, v10, s2
; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4
+; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -2340,92 +2100,68 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v1
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v2
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v2
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v8
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v13
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v14
-; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 12, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v5, v1, 0x5040100
+; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v6, v2, 0x5040100
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1
+; GFX10-DL-XNACK-NEXT: v_perm_b32 v9, v10, v9, 0x5040100
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_perm_b32 v10, v12, v11, 0x5040100
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 24, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 28, v1
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX10-DL-XNACK-NEXT: v_alignbit_b32 v1, v8, v1, 16
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v9 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v10 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v6
+; GFX10-DL-XNACK-NEXT: v_alignbit_b32 v6, v11, v2, 16
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 24, v2
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v5
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v2
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v16
-; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v17
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v8, v9
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v10
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v7, v4, 0x5040100
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v13, 0x5040100
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v12
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
-; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v5
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v6, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4
-; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v3, v1
+; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v4, v2
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -2434,73 +2170,55 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v8
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v13
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v14
-; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 4, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 12, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 8, v0
+; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v5, v1, 0x5040100
+; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v6, v0, 0x5040100
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 12, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1
+; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v9, v10, v9, 0x5040100
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v10, v12, v11, 0x5040100
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 24, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 28, v1
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0
+; GFX10-DL-NOXNACK-NEXT: v_alignbit_b32 v1, v8, v1, 16
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v9 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v10 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v6
+; GFX10-DL-NOXNACK-NEXT: v_alignbit_b32 v6, v11, v0, 16
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v5
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v16
-; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v17
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v8, v9
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v10
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v7, v4, 0x5040100
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v13, 0x5040100
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v12
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
-; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v5
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4
-; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v1
+; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v4, v0
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
@@ -2546,11 +2264,6 @@ entry:
define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: idot8_acc8_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -2565,7 +2278,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
@@ -2614,111 +2326,100 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
-; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
-; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2
-; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10
-; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16
-; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
-; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
-; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15
-; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18
-; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
-; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
-; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11
-; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
-; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
-; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
-; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
-; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14
-; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18
-; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19
-; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11
-; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15
-; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
-; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v3, v8, v4
-; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
-; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
-; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
-; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2
-; GFX8-NEXT: v_add_u16_e32 v2, v2, v6
-; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
-; GFX8-NEXT: v_add_u16_e32 v2, v2, v10
-; GFX8-NEXT: flat_store_byte v[0:1], v2
-; GFX8-NEXT: s_endpgm
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, 12
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10
+; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16
+; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7
+; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15
+; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18
+; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19
+; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12
+; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11
+; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9
+; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8
+; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14
+; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13
+; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2
+; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
+; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
+; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14
+; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13
+; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18
+; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19
+; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11
+; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15
+; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v3, v8, v4
+; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
+; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
+; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
+; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2
+; GFX8-NEXT: v_add_u16_e32 v2, v2, v6
+; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
+; GFX8-NEXT: v_add_u16_e32 v2, v2, v10
+; GFX8-NEXT: flat_store_byte v[0:1], v2
+; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: idot8_acc8_vecMul:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -2728,7 +2429,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1]
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1
@@ -2806,11 +2506,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX9-DL-LABEL: idot8_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -2820,7 +2515,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
@@ -2898,17 +2592,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9]
@@ -2999,17 +2687,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 069bebdf3c469d..d9ddef9b43cd40 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -9,11 +9,6 @@
define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc32:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -28,33 +23,32 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
-; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX7-NEXT: v_bfe_u32 v4, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
-; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v10, 15, v0
+; GFX7-NEXT: v_bfe_u32 v11, v0, 4, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v3, v3, v10, s4
+; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v3, v4, v11, v3
+; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v3, v5, v12, v3
+; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v3, v6, v13, v3
+; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v3, v7, v14, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v3, v8, v15, v3
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3
; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -64,8 +58,6 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -76,37 +68,33 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
-; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4
+; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4
+; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4
+; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4
+; GFX8-NEXT: v_bfe_u32 v14, v3, 20, 4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0
-; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4
-; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4
-; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4
-; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4
-; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4
-; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v0
+; GFX8-NEXT: v_bfe_u32 v7, v0, 4, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
-; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
-; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0
+; GFX8-NEXT: v_bfe_u32 v9, v0, 8, 4
+; GFX8-NEXT: v_mad_u32_u24 v4, v6, v7, v4
+; GFX8-NEXT: v_bfe_u32 v11, v0, 12, 4
+; GFX8-NEXT: v_mad_u32_u24 v4, v8, v9, v4
+; GFX8-NEXT: v_bfe_u32 v13, v0, 16, 4
+; GFX8-NEXT: v_mad_u32_u24 v4, v10, v11, v4
+; GFX8-NEXT: v_bfe_u32 v15, v0, 20, 4
+; GFX8-NEXT: v_mad_u32_u24 v4, v12, v13, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0
+; GFX8-NEXT: v_bfe_u32 v3, v3, 24, 4
+; GFX8-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX8-NEXT: v_mad_u32_u24 v4, v14, v15, v4
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v4
+; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -117,48 +105,42 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
-; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
-; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4
-; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v2
+; GFX9-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX9-NEXT: v_bfe_u32 v8, v2, 4, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
+; GFX9-NEXT: v_bfe_u32 v10, v2, 8, 4
+; GFX9-NEXT: v_bfe_u32 v11, v1, 12, 4
+; GFX9-NEXT: v_bfe_u32 v12, v2, 12, 4
+; GFX9-NEXT: v_bfe_u32 v13, v1, 16, 4
+; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
+; GFX9-NEXT: v_bfe_u32 v15, v1, 20, 4
+; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4
+; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v6
+; GFX9-NEXT: v_mul_u32_u24_e32 v6, v7, v8
+; GFX9-NEXT: v_mul_u32_u24_e32 v7, v9, v10
+; GFX9-NEXT: v_mul_u32_u24_e32 v8, v11, v12
; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
-; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16
-; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, v3, v4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
-; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10
-; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
-; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-NEXT: v_add3_u32 v3, v5, s0, v6
+; GFX9-NEXT: v_mul_u32_u24_e32 v9, v13, v14
+; GFX9-NEXT: v_mul_u32_u24_e32 v10, v15, v16
+; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8
+; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10
+; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -167,17 +149,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -189,12 +165,6 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
@@ -282,11 +252,6 @@ entry:
define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc16:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -301,33 +266,32 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v11, 15, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -337,8 +301,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: v_mov_b32_e32 v5, 15
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -351,37 +314,41 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
+; GFX8-NEXT: v_and_b32_e32 v16, 15, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
-; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
+; GFX8-NEXT: v_and_b32_e32 v17, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 15, v15
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_u16 v4, v16, v17, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX8-NEXT: v_and_b32_e32 v14, 15, v14
+; GFX8-NEXT: v_mad_u16 v4, v5, v10, v4
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4
+; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT: v_mad_u16 v4, v18, v19, v4
+; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
-; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
-; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
-; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
+; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -390,44 +357,47 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: v_mov_b32_e32 v4, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ushort v3, v0, s[6:7]
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-NEXT: v_and_b32_e32 v16, 15, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v14
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_legacy_u16 v3, v15, v16, v3
+; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v9, v3
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
+; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT: v_mad_legacy_u16 v3, v17, v18, v3
+; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
+; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -436,44 +406,47 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v15, v16, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v17, v18, v3
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
+; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -483,46 +456,49 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
+; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6
+; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1
+; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2
+; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8
+; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_mad_u16 v3, v9, v10, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
-; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7]
+; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT: v_mad_u16 v3, v7, v5, v3
+; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -600,11 +576,6 @@ entry:
define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc8:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -619,33 +590,32 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v11, 15, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -655,8 +625,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: v_mov_b32_e32 v5, 15
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -669,37 +638,41 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
+; GFX8-NEXT: v_and_b32_e32 v16, 15, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
-; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
+; GFX8-NEXT: v_and_b32_e32 v17, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 15, v15
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_u16 v4, v16, v17, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX8-NEXT: v_and_b32_e32 v14, 15, v14
+; GFX8-NEXT: v_mad_u16 v4, v5, v10, v4
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4
+; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT: v_mad_u16 v4, v18, v19, v4
+; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
-; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
-; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
-; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
+; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -708,44 +681,47 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: v_mov_b32_e32 v4, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7]
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-NEXT: v_and_b32_e32 v16, 15, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v14
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_legacy_u16 v3, v15, v16, v3
+; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-NEXT: v_mad_legacy_u16 v3, v4, v9, v3
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
+; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT: v_mad_legacy_u16 v3, v17, v18, v3
+; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
+; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -754,44 +730,47 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v15, v16, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v17, v18, v3
+; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
+; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1
; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -801,46 +780,49 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
+; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v6
+; GFX10-DL-NEXT: v_mov_b32_e32 v5, 15
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v7
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1
+; GFX10-DL-NEXT: v_mad_u16 v3, v8, v9, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2
+; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v7
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8
+; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_mad_u16 v3, v9, v10, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
-; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
+; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT: v_mad_u16 v3, v7, v5, v3
+; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -918,11 +900,6 @@ entry:
define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc4:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -937,33 +914,32 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v11, 15, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -974,8 +950,6 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -988,28 +962,21 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
@@ -1017,6 +984,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
@@ -1028,42 +996,34 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7]
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
@@ -1075,42 +1035,34 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
@@ -1124,43 +1076,35 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 4, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 12, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 20, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
@@ -1225,11 +1169,6 @@ entry:
define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_CommutationInsideMAD:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -1244,33 +1183,32 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v11, 15, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -1281,8 +1219,6 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1295,28 +1231,21 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
@@ -1324,6 +1253,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
@@ -1335,42 +1265,34 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7]
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
@@ -1382,42 +1304,34 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1
; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
@@ -1431,43 +1345,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 4, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_mad_u16 v4, v2, v3, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 12, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 20, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7]
@@ -1530,11 +1436,6 @@ entry:
define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_multiuses_mul1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -1549,36 +1450,35 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
-; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX7-NEXT: v_bfe_u32 v4, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
-; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v10, 15, v0
+; GFX7-NEXT: v_bfe_u32 v11, v0, 4, 4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v16, v2, v0, s4
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v16
-; GFX7-NEXT: v_mad_u32_u24 v2, v8, v15, v16
-; GFX7-NEXT: v_mad_u32_u24 v2, v7, v14, v2
-; GFX7-NEXT: v_mad_u32_u24 v2, v6, v13, v2
-; GFX7-NEXT: v_mad_u32_u24 v2, v5, v12, v2
-; GFX7-NEXT: v_mad_u32_u24 v2, v4, v11, v2
-; GFX7-NEXT: v_mad_u32_u24 v2, v3, v10, v2
-; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v2
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v16, v3, v10, s4
+; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v4, v4, v11, v16
+; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v4, v5, v12, v4
+; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v4, v6, v13, v4
+; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v4, v7, v14, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v4, v8, v15, v4
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v4
+; GFX7-NEXT: v_mad_u32_u24 v3, v3, v10, v16
+; GFX7-NEXT: v_mad_u32_u24 v0, v1, v9, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -1587,8 +1487,6 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1599,39 +1497,35 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
-; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4
+; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4
+; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4
+; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4
+; GFX8-NEXT: v_bfe_u32 v14, v3, 20, 4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0
-; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4
-; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4
-; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4
-; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4
-; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4
-; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v0
+; GFX8-NEXT: v_bfe_u32 v7, v0, 4, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16
-; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16
-; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3
-; GFX8-NEXT: v_mad_u32_u24 v3, v6, v13, v3
-; GFX8-NEXT: v_mad_u32_u24 v3, v5, v12, v3
-; GFX8-NEXT: v_mad_u32_u24 v3, v4, v11, v3
-; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
+; GFX8-NEXT: v_mad_u32_u24 v16, v4, v5, s0
+; GFX8-NEXT: v_bfe_u32 v9, v0, 8, 4
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v16
+; GFX8-NEXT: v_mad_u32_u24 v5, v6, v7, v16
+; GFX8-NEXT: v_bfe_u32 v11, v0, 12, 4
+; GFX8-NEXT: v_mad_u32_u24 v5, v8, v9, v5
+; GFX8-NEXT: v_bfe_u32 v13, v0, 16, 4
+; GFX8-NEXT: v_mad_u32_u24 v5, v10, v11, v5
+; GFX8-NEXT: v_bfe_u32 v15, v0, 20, 4
+; GFX8-NEXT: v_mad_u32_u24 v5, v12, v13, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0
+; GFX8-NEXT: v_bfe_u32 v3, v3, 24, 4
+; GFX8-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX8-NEXT: v_mad_u32_u24 v5, v14, v15, v5
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v5
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1642,49 +1536,43 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v2
+; GFX9-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX9-NEXT: v_bfe_u32 v8, v2, 4, 4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
+; GFX9-NEXT: v_bfe_u32 v10, v2, 8, 4
+; GFX9-NEXT: v_bfe_u32 v11, v1, 12, 4
+; GFX9-NEXT: v_bfe_u32 v12, v2, 12, 4
+; GFX9-NEXT: v_bfe_u32 v13, v1, 16, 4
; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2
+; GFX9-NEXT: v_bfe_u32 v15, v1, 20, 4
+; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4
+; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, v3, v4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0
-; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16
-; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
-; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-NEXT: v_add3_u32 v2, v2, v9, v8
-; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6
-; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4
-; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2
+; GFX9-NEXT: v_mad_u32_u24 v3, v5, v6, s0
+; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v10
+; GFX9-NEXT: v_mul_u32_u24_e32 v10, v11, v12
+; GFX9-NEXT: v_mad_u32_u24 v4, v7, v8, v3
+; GFX9-NEXT: v_mul_u32_u24_e32 v11, v13, v14
+; GFX9-NEXT: v_mul_u32_u24_e32 v12, v15, v16
+; GFX9-NEXT: v_add3_u32 v4, v4, v9, v10
+; GFX9-NEXT: v_add3_u32 v4, v4, v11, v12
+; GFX9-NEXT: v_mul_u32_u24_e32 v17, v5, v6
+; GFX9-NEXT: v_add3_u32 v1, v4, v1, v2
+; GFX9-NEXT: v_add3_u32 v1, v17, v3, v1
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -1693,49 +1581,43 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v2
+; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 4, 4
+; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 4, 4
+; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
+; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 8, 4
+; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 12, 4
+; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
+; GFX9-DL-NEXT: v_bfe_u32 v13, v1, 16, 4
; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2
+; GFX9-DL-NEXT: v_bfe_u32 v15, v1, 20, 4
+; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4
+; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-DL-NEXT: v_add3_u32 v2, v2, v9, v8
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6
-; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4
-; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v3, v5, v6, s0
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v10
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v10, v11, v12
+; GFX9-DL-NEXT: v_mad_u32_u24 v4, v7, v8, v3
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v11, v13, v14
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v12, v15, v16
+; GFX9-DL-NEXT: v_add3_u32 v4, v4, v9, v10
+; GFX9-DL-NEXT: v_add3_u32 v4, v4, v11, v12
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v5, v6
+; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v2
+; GFX9-DL-NEXT: v_add3_u32 v1, v17, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
@@ -1745,12 +1627,6 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
@@ -1758,38 +1634,38 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2
-; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1
-; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4
-; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
+; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
+; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
+; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4
+; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
+; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 12, 4
+; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 12, 4
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v8
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0
+; GFX10-DL-NEXT: v_mad_u32_u24 v8, v4, v5, s0
+; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
+; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 16, 4
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v10, v10, v11
+; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v6, v6, v9, v8
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v9, v12, v13
+; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14
-; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9
-; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5
-; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v11, v11, v14
+; GFX10-DL-NEXT: v_add3_u32 v6, v6, v7, v10
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v4, v5
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_add3_u32 v2, v6, v9, v11
+; GFX10-DL-NEXT: v_add3_u32 v0, v2, v1, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
+; GFX10-DL-NEXT: v_add3_u32 v0, v3, v8, v0
; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
@@ -1868,11 +1744,6 @@ entry:
define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc32_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -1887,7 +1758,6 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2
; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4
@@ -1923,8 +1793,6 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1935,21 +1803,17 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
-; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 24, 4
+; GFX8-NEXT: v_bfe_u32 v5, v3, 20, 4
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 4
+; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4
+; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0
+; GFX8-NEXT: v_bfe_u32 v9, v3, 4, 4
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4
; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4
; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4
@@ -1959,13 +1823,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
-; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
-; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v9, v15, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v8, v14, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v7, v13, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v6, v12, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v5, v11, v0
+; GFX8-NEXT: v_mad_u32_u24 v0, v4, v10, v0
+; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1976,28 +1840,22 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1
-; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4
+; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
+; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
+; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4
; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4
; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4
@@ -2006,18 +1864,18 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2
-; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16
-; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, v10, v16
+; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v15
+; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v14
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2
-; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10
-; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5
-; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3
+; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v13
+; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v12
+; GFX9-NEXT: v_add3_u32 v1, v1, v9, v8
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v11
+; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v4
+; GFX9-NEXT: v_add3_u32 v1, v1, v7, v6
+; GFX9-NEXT: v_add3_u32 v1, v1, v5, v3
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -2026,17 +1884,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
@@ -2048,12 +1900,6 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
@@ -2106,11 +1952,6 @@ entry:
define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc16_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -2125,34 +1966,37 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_and_b32_e32 v8, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v15, 15, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 20, v2
+; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 20, v0
+; GFX7-NEXT: v_bfe_u32 v11, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
+; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 28, v2
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_bfe_u32 v3, v0, 24, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 12, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 28, v0
+; GFX7-NEXT: v_alignbit_b32 v0, v10, v0, 16
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v11, v1
+; GFX7-NEXT: v_bfe_u32 v10, v2, 16, 4
; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
+; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v10, v8, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2161,8 +2005,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: v_mov_b32_e32 v5, 15
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2175,156 +2018,166 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 20, v3
+; GFX8-NEXT: v_and_b32_sdwa v16, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4
-; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT: v_and_b32_sdwa v17, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_and_b32_sdwa v18, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_and_b32_e32 v12, 15, v12
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
-; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
-; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
-; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2
+; GFX8-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX8-NEXT: v_and_b32_e32 v14, 15, v14
+; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2
+; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2
+; GFX8-NEXT: v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT: v_and_b32_e32 v15, 15, v15
+; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2
+; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2
+; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2
+; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udot8_acc16_vecMul:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: v_mov_b32_e32 v4, 15
+; GFX9-NEXT: s_mov_b32 s2, 0x5040100
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7]
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v4, 15, v1
-; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v11, 15, v2
-; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4
-; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0
-; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0
-; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0
-; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7
-; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0
-; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-NEXT: v_and_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v17, 15, v2
+; GFX9-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v14
+; GFX9-NEXT: v_perm_b32 v7, v7, v8, s2
+; GFX9-NEXT: v_perm_b32 v8, v9, v17, s2
+; GFX9-NEXT: v_perm_b32 v4, v4, v15, s2
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2
+; GFX9-NEXT: v_perm_b32 v1, v5, v1, s2
+; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
+; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-NEXT: v_perm_b32 v2, v11, v18, s2
+; GFX9-NEXT: v_perm_b32 v5, v6, v16, s2
+; GFX9-NEXT: v_perm_b32 v6, v12, v13, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9
+; GFX9-NEXT: v_pk_mul_lo_u16 v2, v5, v2
+; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v6
; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0
-; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0
-; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0
; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
-; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10
; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: global_store_short v0, v1, s[6:7]
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15
+; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1
-; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2
-; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4
-; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0
-; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0
-; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
-; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0
-; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-DL-NEXT: v_and_b32_sdwa v16, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2
+; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14
+; GFX9-DL-NEXT: v_perm_b32 v7, v7, v8, s2
+; GFX9-DL-NEXT: v_perm_b32 v8, v9, v17, s2
+; GFX9-DL-NEXT: v_perm_b32 v4, v4, v15, s2
+; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT: v_perm_b32 v2, v10, v2, s2
+; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s2
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-DL-NEXT: v_perm_b32 v2, v11, v18, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v6, v16, s2
+; GFX9-DL-NEXT: v_perm_b32 v6, v12, v13, s2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v2
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v6
; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0
-; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0
; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10
; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16_vecMul:
@@ -2333,12 +2186,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
@@ -2346,41 +2194,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 4, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
-; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100
-; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100
-; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
-; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
-; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX10-DL-NEXT: v_perm_b32 v7, v7, v9, 0x5040100
+; GFX10-DL-NEXT: v_perm_b32 v6, v6, v10, 0x5040100
+; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v11
+; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v12
+; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v13
+; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v2
+; GFX10-DL-NEXT: v_perm_b32 v10, v11, v10, 0x5040100
+; GFX10-DL-NEXT: v_perm_b32 v8, v8, v9, 0x5040100
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v6
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7
-; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
-; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3
+; GFX10-DL-NEXT: v_and_b32_sdwa v12, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 28, v1
+; GFX10-DL-NEXT: v_perm_b32 v6, v7, v6, 0x5040100
+; GFX10-DL-NEXT: v_perm_b32 v5, v5, v12, 0x5040100
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v8
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v2
+; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v6
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
+; GFX10-DL-NEXT: v_perm_b32 v2, v8, v2, 0x5040100
+; GFX10-DL-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
@@ -2430,11 +2286,6 @@ entry:
define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc8_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -2449,34 +2300,39 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v7, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v13, 15, v0
+; GFX7-NEXT: v_bfe_u32 v11, v0, 4, 4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1
+; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v11, v1
+; GFX7-NEXT: v_bfe_u32 v6, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_bfe_u32 v9, v2, 16, 4
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v12, v1
+; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 24
+; GFX7-NEXT: v_bfe_u32 v3, v0, 20, 4
+; GFX7-NEXT: v_alignbit_b32 v0, v10, v0, 24
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xf0f, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v7, 0xf0f, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT: v_bfe_u32 v10, v16, 8, 8
+; GFX7-NEXT: v_bfe_u32 v5, v7, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v10, v5, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2485,8 +2341,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: v_mov_b32_e32 v5, 15
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2499,67 +2354,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3
-; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2
-; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4
-; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17
-; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v6, 15, v3
-; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v13, 15, v2
-; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19
-; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15
-; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v9, v18, v9
-; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v3, v2, v11
-; GFX8-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3
+; GFX8-NEXT: v_and_b32_sdwa v17, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v18, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 12, v2
+; GFX8-NEXT: v_and_b32_sdwa v19, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2
+; GFX8-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX8-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_and_b32_e32 v14, 15, v14
+; GFX8-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX8-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_lo_u16_e32 v11, v18, v5
+; GFX8-NEXT: v_and_b32_e32 v16, 15, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 15, v15
+; GFX8-NEXT: v_mul_lo_u16_e32 v15, v17, v19
+; GFX8-NEXT: v_or_b32_e32 v6, v11, v6
+; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v13
+; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_mul_lo_u16_e32 v3, v16, v3
+; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX8-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX8-NEXT: v_or_b32_e32 v9, v3, v10
+; GFX8-NEXT: v_or_b32_sdwa v3, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v3, v6, v4
-; GFX8-NEXT: v_add_u16_e32 v3, v3, v5
-; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
+; GFX8-NEXT: v_add_u16_e32 v4, v9, v4
+; GFX8-NEXT: v_add_u16_e32 v3, v4, v3
+; GFX8-NEXT: v_add_u16_e32 v3, v3, v8
; GFX8-NEXT: v_add_u16_e32 v2, v3, v2
-; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2
-; GFX8-NEXT: v_add_u16_e32 v2, v2, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT: v_add_u16_e32 v2, v2, v9
+; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2
+; GFX8-NEXT: v_add_u16_e32 v2, v2, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6
+; GFX8-NEXT: v_mad_u16 v2, v18, v5, v2
+; GFX8-NEXT: v_add_u16_e32 v2, v2, v6
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udot8_acc8_vecMul:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -2568,63 +2422,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1]
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
-; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v12, 15, v2
-; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4
-; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2
-; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16
-; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14
-; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v0, v18, v10
-; GFX9-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v2
+; GFX9-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT: v_and_b32_e32 v0, 15, v9
+; GFX9-NEXT: v_and_b32_e32 v9, 15, v14
+; GFX9-NEXT: v_mul_lo_u16_e32 v14, v17, v2
+; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v10, v16, v18
+; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v12
+; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v1, v15, v1
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v14, v6
+; GFX9-NEXT: v_or_b32_e32 v6, v7, v8
+; GFX9-NEXT: v_or_b32_e32 v7, v1, v9
+; GFX9-NEXT: v_or_b32_sdwa v1, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT: v_or_b32_e32 v5, v5, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-NEXT: v_or_b32_e32 v10, v12, v0
+; GFX9-NEXT: v_or_b32_e32 v9, v9, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v9
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u16_e32 v4, v5, v4
+; GFX9-NEXT: v_add_u16_e32 v4, v7, v4
; GFX9-NEXT: v_add_u16_e32 v1, v4, v1
; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
-; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
+; GFX9-NEXT: v_mad_legacy_u16 v0, v17, v2, v0
; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
-; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v5
; GFX9-NEXT: global_store_byte v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
@@ -2633,125 +2490,136 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1]
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v10
-; GFX9-DL-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX9-DL-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v2
+; GFX9-DL-NEXT: v_and_b32_sdwa v18, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v9
+; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v14
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v14, v17, v2
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v16, v18
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v12
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, v15, v1
+; GFX9-DL-NEXT: v_or_b32_sdwa v10, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v0, v14, v6
+; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v8
+; GFX9-DL-NEXT: v_or_b32_e32 v7, v1, v9
+; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v8
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-DL-NEXT: v_or_b32_e32 v10, v12, v0
+; GFX9-DL-NEXT: v_or_b32_e32 v9, v9, v0
; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v10
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v9
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u16_e32 v4, v5, v4
+; GFX9-DL-NEXT: v_add_u16_e32 v4, v7, v4
; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
-; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
+; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v17, v2, v0
; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
-; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
-; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
+; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5
; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11]
+; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 15
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
-; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4
-; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4
-; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9
-; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
-; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4
-; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8
+; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v13
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX10-DL-NEXT: v_and_b32_e32 v12, 15, v12
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1
+; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT: v_and_b32_e32 v14, 15, v14
+; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v12
+; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v15, 15, v1
+; GFX10-DL-NEXT: v_and_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v2
+; GFX10-DL-NEXT: v_and_b32_sdwa v13, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_sdwa v12, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v6
+; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v11
+; GFX10-DL-NEXT: v_mul_lo_u16 v6, v9, v14
+; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10
+; GFX10-DL-NEXT: v_mul_lo_u16 v1, v15, v1
+; GFX10-DL-NEXT: v_mul_lo_u16 v2, v0, v2
; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2
-; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9
-; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
-; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6
-; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15
-; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14
-; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0
-; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7
-; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX10-DL-NEXT: v_mul_lo_u16 v8, v17, v12
+; GFX10-DL-NEXT: v_mul_lo_u16 v9, v16, v13
+; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5
; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2
-; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8
-; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9
-; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13
+; GFX10-DL-NEXT: v_or_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX10-DL-NEXT: v_or_b32_sdwa v6, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v10
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
-; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3
+; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_add_nc_u16 v6, v3, v8
; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6
+; GFX10-DL-NEXT: v_add_nc_u16 v0, v6, v7
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NEXT: v_mad_u16 v0, v12, v7, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v17, v12, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8
-; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v5
+; GFX10-DL-NEXT: v_mad_u16 v0, v16, v13, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: global_store_byte v4, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2793,11 +2661,6 @@ entry:
define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot8_acc4_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX7-NEXT: s_mov_b32 s14, -1
-; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
-; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -2812,33 +2675,32 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
-; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX7-NEXT: v_bfe_u32 v5, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_and_b32_e32 v11, 15, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_bfe_u32 v8, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v9, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 20, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 24, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -2849,8 +2711,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2863,28 +2723,21 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s15, 0xe80000
-; GFX8-NEXT: s_add_u32 s12, s12, s11
-; GFX8-NEXT: s_addc_u32 s13, s13, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2
-; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4
-; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2
@@ -2892,6 +2745,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2
; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2
; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2
; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
@@ -2903,57 +2757,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7]
; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v4, 15, v1
-; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4
-; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 4, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 28, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v11, 15, v2
-; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4
-; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0
-; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0
-; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0
-; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 4, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 20, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 12, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2
; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0
-; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u16_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9
-; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0
-; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0
-; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0
-; GFX9-NEXT: v_add_u16_e32 v3, v3, v5
+; GFX9-NEXT: v_alignbit_b32 v9, v13, v2, 16
+; GFX9-NEXT: v_alignbit_b32 v7, v7, v1, 16
+; GFX9-NEXT: v_perm_b32 v2, v10, v2, s0
+; GFX9-NEXT: v_perm_b32 v1, v4, v1, s0
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10
-; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u16_e32 v3, v3, v2
-; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u16_e32 v2, v2, v1
+; GFX9-NEXT: v_perm_b32 v11, v12, v11, s0
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v2, v1, v3
+; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v11
; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u16_e32 v1, v1, v5
+; GFX9-NEXT: v_pk_mul_lo_u16 v6, v7, v9
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v14, v15, v14, s0
+; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
+; GFX9-NEXT: v_pk_mul_lo_u16 v4, v8, v14
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u16_e32 v1, v1, v4
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
@@ -2963,57 +2807,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-DL-NEXT: s_mov_b32 s14, -1
-; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000
-; GFX9-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1
-; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
-; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 28, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2
-; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4
-; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0
-; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0
-; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 12, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 24, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2
; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0
-; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9
-; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0
-; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0
-; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5
+; GFX9-DL-NEXT: v_alignbit_b32 v9, v13, v2, 16
+; GFX9-DL-NEXT: v_alignbit_b32 v7, v7, v1, 16
+; GFX9-DL-NEXT: v_perm_b32 v2, v10, v2, s0
+; GFX9-DL-NEXT: v_perm_b32 v1, v4, v1, s0
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10
-; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2
-; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1
+; GFX9-DL-NEXT: v_perm_b32 v11, v12, v11, s0
+; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v11
; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v9
+; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_perm_b32 v14, v15, v14, s0
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v8, v14
+; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4
+; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
@@ -3024,57 +2858,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s14, -1
-; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s12, s12, s11
-; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4
-; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
-; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100
-; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100
-; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1
+; GFX10-DL-NEXT: v_perm_b32 v4, v4, v1, 0x5040100
+; GFX10-DL-NEXT: v_perm_b32 v5, v5, v2, 0x5040100
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v2
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
+; GFX10-DL-NEXT: v_perm_b32 v5, v8, v7, 0x5040100
; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
-; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
-; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 20, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
-; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7
-; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
-; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 20, v1
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v5
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
+; GFX10-DL-NEXT: v_alignbit_b32 v4, v4, v1, 16
+; GFX10-DL-NEXT: v_alignbit_b32 v7, v8, v2, 16
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
-; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100
-; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v4
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100
+; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
+; GFX10-DL-NEXT: v_perm_b32 v1, v1, v5, 0x5040100
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 8704f4e780448b..c746089733df08 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1572,96 +1572,85 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i
;
; EG-LABEL: v5i8_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @16, KC0[], KC1[]
+; EG-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 4 @6
-; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
+; EG-NEXT: ALU 19, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T5.XW, T2.X
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
-; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
-; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3
+; EG-NEXT: VTX_READ_8 T2.X, T0.X, 46, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_8 T4.X, T0.X, 45, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 48, #3
; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 17:
+; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
-; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: LSHL * T1.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T5.X, T0.X, PV.W,
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T5.Z, 0.0,
-; EG-NEXT: AND_INT T1.W, T9.X, literal.x,
-; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T2.W, T7.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: OR_INT T1.W, PS, PV.W,
-; EG-NEXT: LSHL * T2.W, T0.Z, literal.x,
+; EG-NEXT: LSHL * T1.W, T4.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T1.W, T3.X, PV.W,
+; EG-NEXT: LSHL * T2.W, T2.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T1.W, PV.W, PS,
-; EG-NEXT: AND_INT * T2.W, T6.X, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHL * T2.W, T1.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T2.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v5i8_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @16, KC0[], KC1[]
+; CM-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 4 @6
-; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
-; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X
+; CM-NEXT: ALU 18, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
+; CM-NEXT: MEM_RAT MSKOR T5.XW, T0.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3
-; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3
-; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3
-; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3
-; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3
+; CM-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3
+; CM-NEXT: VTX_READ_8 T2.X, T0.X, 46, #3
+; CM-NEXT: VTX_READ_8 T3.X, T0.X, 44, #3
+; CM-NEXT: VTX_READ_8 T4.X, T0.X, 45, #3
+; CM-NEXT: VTX_READ_8 T0.X, T0.X, 48, #3
; CM-NEXT: ALU clause starting at 16:
-; CM-NEXT: MOV * T5.X, 0.0,
-; CM-NEXT: ALU clause starting at 17:
+; CM-NEXT: MOV T0.X, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
-; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
-; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
-; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
+; CM-NEXT: ALU clause starting at 21:
+; CM-NEXT: LSHL * T1.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T5.X, T0.X, PV.W,
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV T5.Z, 0.0,
-; CM-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Y, T8.X, literal.x,
-; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
-; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
-; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
-; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
+; CM-NEXT: LSHL * T1.W, T4.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T7.X, T0.W, literal.x,
+; CM-NEXT: OR_INT T0.Z, T3.X, PV.W,
+; CM-NEXT: LSHL * T1.W, T2.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.X, T0.W, literal.x,
; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
-; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHL * T0.W, T1.X, literal.y,
+; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; CM-NEXT: OR_INT * T1.X, PV.Z, PV.W,
+; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <5 x i8> %in, ptr addrspace(1) %out, align 4
@@ -2418,214 +2407,88 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
;
; EG-LABEL: v8i8_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @36, KC0[], KC1[]
-; EG-NEXT: TEX 0 @20
-; EG-NEXT: ALU 5, @38, KC0[], KC1[]
-; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 5, @44, KC0[], KC1[]
-; EG-NEXT: TEX 0 @24
-; EG-NEXT: ALU 7, @50, KC0[], KC1[]
-; EG-NEXT: TEX 0 @26
-; EG-NEXT: ALU 7, @58, KC0[], KC1[]
-; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 7, @66, KC0[], KC1[]
-; EG-NEXT: TEX 0 @30
-; EG-NEXT: ALU 7, @74, KC0[], KC1[]
-; EG-NEXT: TEX 0 @32
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
-; EG-NEXT: TEX 0 @34
-; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
+; EG-NEXT: ALU 0, @24, KC0[], KC1[]
+; EG-NEXT: TEX 2 @8
+; EG-NEXT: ALU 2, @25, KC0[], KC1[]
+; EG-NEXT: TEX 4 @14
+; EG-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
-; EG-NEXT: Fetch clause starting at 26:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
-; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
-; EG-NEXT: Fetch clause starting at 30:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
-; EG-NEXT: Fetch clause starting at 32:
-; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
-; EG-NEXT: Fetch clause starting at 34:
-; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
-; EG-NEXT: ALU clause starting at 36:
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: LSHL T0.W, T6.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 44:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T6.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 50:
-; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 58:
-; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 66:
-; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65281(nan)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 74:
-; EG-NEXT: AND_INT T0.W, T6.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65281(nan)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 49, #3
+; EG-NEXT: VTX_READ_8 T2.X, T0.X, 50, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 48, #3
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 51, #3
+; EG-NEXT: VTX_READ_8 T4.X, T0.X, 46, #3
+; EG-NEXT: VTX_READ_8 T5.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 45, #3
+; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 25:
+; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 82:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.y,
-; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T5.Y, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
-; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT T5.X, PV.W, PS,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, T3.X, PV.W,
+; EG-NEXT: ALU clause starting at 28:
+; EG-NEXT: LSHL T1.W, T2.X, literal.x,
+; EG-NEXT: LSHL * T2.W, T0.X, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: OR_INT T0.Y, T5.X, PS,
+; EG-NEXT: LSHL T0.Z, T4.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: OR_INT T0.W, T0.W, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.X, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: OR_INT T1.Y, PV.W, PS,
+; EG-NEXT: OR_INT T0.W, PV.Y, PV.Z,
+; EG-NEXT: LSHL * T1.W, T1.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T1.X, PV.W, PS,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v8i8_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @36, KC0[], KC1[]
-; CM-NEXT: TEX 0 @20
-; CM-NEXT: ALU 5, @38, KC0[], KC1[]
-; CM-NEXT: TEX 0 @22
-; CM-NEXT: ALU 5, @44, KC0[], KC1[]
-; CM-NEXT: TEX 0 @24
-; CM-NEXT: ALU 7, @50, KC0[], KC1[]
-; CM-NEXT: TEX 0 @26
-; CM-NEXT: ALU 7, @58, KC0[], KC1[]
-; CM-NEXT: TEX 0 @28
-; CM-NEXT: ALU 7, @66, KC0[], KC1[]
-; CM-NEXT: TEX 0 @30
-; CM-NEXT: ALU 7, @74, KC0[], KC1[]
-; CM-NEXT: TEX 0 @32
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
-; CM-NEXT: TEX 0 @34
-; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
+; CM-NEXT: ALU 0, @24, KC0[], KC1[]
+; CM-NEXT: TEX 2 @8
+; CM-NEXT: ALU 2, @25, KC0[], KC1[]
+; CM-NEXT: TEX 4 @14
+; CM-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 20:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3
-; CM-NEXT: Fetch clause starting at 22:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3
-; CM-NEXT: Fetch clause starting at 24:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3
-; CM-NEXT: Fetch clause starting at 26:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3
-; CM-NEXT: Fetch clause starting at 28:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3
-; CM-NEXT: Fetch clause starting at 30:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3
-; CM-NEXT: Fetch clause starting at 32:
-; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3
-; CM-NEXT: Fetch clause starting at 34:
-; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3
-; CM-NEXT: ALU clause starting at 36:
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: MOV * T5.X, 0.0,
-; CM-NEXT: ALU clause starting at 38:
-; CM-NEXT: LSHL T0.Z, T6.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 44:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T6.X, literal.y,
-; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 50:
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 58:
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 66:
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -65281(nan), 8(1.121039e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 74:
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -65281(nan), 8(1.121039e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 82:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T6.X, literal.y,
-; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.Y,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 88:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T5.X, literal.y,
-; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; CM-NEXT: Fetch clause starting at 8:
+; CM-NEXT: VTX_READ_8 T1.X, T0.X, 49, #3
+; CM-NEXT: VTX_READ_8 T2.X, T0.X, 50, #3
+; CM-NEXT: VTX_READ_8 T3.X, T0.X, 48, #3
+; CM-NEXT: Fetch clause starting at 14:
+; CM-NEXT: VTX_READ_8 T1.X, T0.X, 47, #3
+; CM-NEXT: VTX_READ_8 T3.X, T0.X, 51, #3
+; CM-NEXT: VTX_READ_8 T4.X, T0.X, 46, #3
+; CM-NEXT: VTX_READ_8 T5.X, T0.X, 44, #3
+; CM-NEXT: VTX_READ_8 T0.X, T0.X, 45, #3
+; CM-NEXT: ALU clause starting at 24:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 25:
+; CM-NEXT: LSHL * T0.W, T1.X, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT * T0.W, T3.X, PV.W,
+; CM-NEXT: ALU clause starting at 28:
+; CM-NEXT: LSHL T0.Z, T2.X, literal.x,
+; CM-NEXT: LSHL * T1.W, T0.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; CM-NEXT: OR_INT T0.X, T5.X, PV.W,
+; CM-NEXT: LSHL T0.Y, T4.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: OR_INT T0.Z, T0.W, PV.Z,
+; CM-NEXT: LSHL * T0.W, T3.X, literal.y, BS:VEC_201
+; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; CM-NEXT: OR_INT T1.Y, PV.Z, PV.W,
+; CM-NEXT: OR_INT T0.Z, PV.X, PV.Y,
+; CM-NEXT: LSHL * T0.W, T1.X, literal.x,
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT * T1.X, PV.Z, PV.W,
+; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <8 x i8> %in, ptr addrspace(1) %out
@@ -3147,406 +3010,144 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
;
; EG-LABEL: v16i8_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @68, KC0[], KC1[]
-; EG-NEXT: TEX 0 @36
-; EG-NEXT: ALU 5, @70, KC0[], KC1[]
-; EG-NEXT: TEX 0 @38
-; EG-NEXT: ALU 5, @76, KC0[], KC1[]
-; EG-NEXT: TEX 0 @40
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
-; EG-NEXT: TEX 0 @42
-; EG-NEXT: ALU 5, @88, KC0[], KC1[]
-; EG-NEXT: TEX 0 @44
-; EG-NEXT: ALU 7, @94, KC0[], KC1[]
-; EG-NEXT: TEX 0 @46
-; EG-NEXT: ALU 7, @102, KC0[], KC1[]
-; EG-NEXT: TEX 0 @48
-; EG-NEXT: ALU 7, @110, KC0[], KC1[]
-; EG-NEXT: TEX 0 @50
-; EG-NEXT: ALU 7, @118, KC0[], KC1[]
-; EG-NEXT: TEX 0 @52
-; EG-NEXT: ALU 7, @126, KC0[], KC1[]
-; EG-NEXT: TEX 0 @54
-; EG-NEXT: ALU 7, @134, KC0[], KC1[]
-; EG-NEXT: TEX 0 @56
-; EG-NEXT: ALU 7, @142, KC0[], KC1[]
-; EG-NEXT: TEX 0 @58
-; EG-NEXT: ALU 7, @150, KC0[], KC1[]
-; EG-NEXT: TEX 0 @60
-; EG-NEXT: ALU 5, @158, KC0[], KC1[]
-; EG-NEXT: TEX 0 @62
-; EG-NEXT: ALU 5, @164, KC0[], KC1[]
-; EG-NEXT: TEX 0 @64
-; EG-NEXT: ALU 5, @170, KC0[], KC1[]
-; EG-NEXT: TEX 0 @66
-; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
+; EG-NEXT: ALU 0, @42, KC0[], KC1[]
+; EG-NEXT: TEX 3 @10
+; EG-NEXT: ALU 3, @43, KC0[], KC1[]
+; EG-NEXT: TEX 4 @18
+; EG-NEXT: ALU 7, @47, KC0[], KC1[]
+; EG-NEXT: TEX 6 @28
+; EG-NEXT: ALU 21, @55, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 36:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
-; EG-NEXT: Fetch clause starting at 38:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
-; EG-NEXT: Fetch clause starting at 40:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
-; EG-NEXT: Fetch clause starting at 42:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
-; EG-NEXT: Fetch clause starting at 44:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
-; EG-NEXT: Fetch clause starting at 46:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
-; EG-NEXT: Fetch clause starting at 48:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
-; EG-NEXT: Fetch clause starting at 50:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
-; EG-NEXT: Fetch clause starting at 52:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
-; EG-NEXT: Fetch clause starting at 54:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
-; EG-NEXT: Fetch clause starting at 56:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
-; EG-NEXT: Fetch clause starting at 58:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
-; EG-NEXT: Fetch clause starting at 60:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
-; EG-NEXT: Fetch clause starting at 62:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
-; EG-NEXT: Fetch clause starting at 64:
-; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
-; EG-NEXT: Fetch clause starting at 66:
-; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
-; EG-NEXT: ALU clause starting at 68:
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: MOV * T7.X, 0.0,
-; EG-NEXT: ALU clause starting at 70:
-; EG-NEXT: LSHL T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 76:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 82:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 94:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 102:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 110:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 118:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 126:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65281(nan)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 134:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65281(nan)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 142:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65281(nan)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 65, #3
+; EG-NEXT: VTX_READ_8 T2.X, T0.X, 66, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 64, #3
+; EG-NEXT: VTX_READ_8 T4.X, T0.X, 61, #3
+; EG-NEXT: Fetch clause starting at 18:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 62, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 60, #3
+; EG-NEXT: VTX_READ_8 T4.X, T0.X, 67, #3
+; EG-NEXT: VTX_READ_8 T5.X, T0.X, 56, #3
+; EG-NEXT: VTX_READ_8 T6.X, T0.X, 57, #3
+; EG-NEXT: Fetch clause starting at 28:
+; EG-NEXT: VTX_READ_8 T2.X, T0.X, 55, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 59, #3
+; EG-NEXT: VTX_READ_8 T4.X, T0.X, 54, #3
+; EG-NEXT: VTX_READ_8 T5.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_8 T6.X, T0.X, 53, #3
+; EG-NEXT: VTX_READ_8 T7.X, T0.X, 63, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 58, #3
+; EG-NEXT: ALU clause starting at 42:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 43:
+; EG-NEXT: LSHL T0.W, T1.X, literal.x,
+; EG-NEXT: LSHL * T1.W, T4.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 150:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65281(nan)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: OR_INT * T0.W, T3.X, PV.W,
+; EG-NEXT: ALU clause starting at 47:
+; EG-NEXT: LSHL T2.W, T2.X, literal.x,
+; EG-NEXT: LSHL * T3.W, T6.X, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: OR_INT T0.Y, T5.X, PS,
+; EG-NEXT: OR_INT T0.Z, T0.W, PV.W,
+; EG-NEXT: LSHL T0.W, T4.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: OR_INT * T1.W, T3.X, T1.W,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 55:
+; EG-NEXT: LSHL * T2.W, T1.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, T0.X, literal.x,
+; EG-NEXT: OR_INT T1.Y, T1.W, PV.W,
+; EG-NEXT: LSHL T1.Z, T7.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL T1.W, T6.X, literal.z, BS:VEC_201
+; EG-NEXT: OR_INT * T0.W, T0.Z, T0.W,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 158:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T7.W, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 164:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 170:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T7.Y, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 176:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T7.X, literal.y,
-; EG-NEXT: -256(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT T7.X, PV.W, PS,
-; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T1.X, T5.X, PV.W,
+; EG-NEXT: LSHL T2.Y, T4.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: OR_INT T0.Z, PV.Y, PV.Z,
+; EG-NEXT: OR_INT T1.W, T0.Y, PV.X,
+; EG-NEXT: LSHL * T2.W, T3.X, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: OR_INT T0.Y, PV.W, PS,
+; EG-NEXT: OR_INT T1.W, PV.X, PV.Y,
+; EG-NEXT: LSHL * T2.W, T2.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v16i8_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @68, KC0[], KC1[]
-; CM-NEXT: TEX 0 @36
-; CM-NEXT: ALU 5, @70, KC0[], KC1[]
-; CM-NEXT: TEX 0 @38
-; CM-NEXT: ALU 5, @76, KC0[], KC1[]
-; CM-NEXT: TEX 0 @40
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
-; CM-NEXT: TEX 0 @42
-; CM-NEXT: ALU 5, @88, KC0[], KC1[]
-; CM-NEXT: TEX 0 @44
-; CM-NEXT: ALU 7, @94, KC0[], KC1[]
-; CM-NEXT: TEX 0 @46
-; CM-NEXT: ALU 7, @102, KC0[], KC1[]
-; CM-NEXT: TEX 0 @48
-; CM-NEXT: ALU 7, @110, KC0[], KC1[]
-; CM-NEXT: TEX 0 @50
-; CM-NEXT: ALU 7, @118, KC0[], KC1[]
-; CM-NEXT: TEX 0 @52
-; CM-NEXT: ALU 7, @126, KC0[], KC1[]
-; CM-NEXT: TEX 0 @54
-; CM-NEXT: ALU 7, @134, KC0[], KC1[]
-; CM-NEXT: TEX 0 @56
-; CM-NEXT: ALU 7, @142, KC0[], KC1[]
-; CM-NEXT: TEX 0 @58
-; CM-NEXT: ALU 7, @150, KC0[], KC1[]
-; CM-NEXT: TEX 0 @60
-; CM-NEXT: ALU 5, @158, KC0[], KC1[]
-; CM-NEXT: TEX 0 @62
-; CM-NEXT: ALU 5, @164, KC0[], KC1[]
-; CM-NEXT: TEX 0 @64
-; CM-NEXT: ALU 5, @170, KC0[], KC1[]
-; CM-NEXT: TEX 0 @66
-; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
+; CM-NEXT: ALU 0, @42, KC0[], KC1[]
+; CM-NEXT: TEX 3 @10
+; CM-NEXT: ALU 4, @43, KC0[], KC1[]
+; CM-NEXT: TEX 4 @18
+; CM-NEXT: ALU 8, @48, KC0[], KC1[]
+; CM-NEXT: TEX 6 @28
+; CM-NEXT: ALU 19, @57, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 36:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3
-; CM-NEXT: Fetch clause starting at 38:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3
-; CM-NEXT: Fetch clause starting at 40:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3
-; CM-NEXT: Fetch clause starting at 42:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3
-; CM-NEXT: Fetch clause starting at 44:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3
-; CM-NEXT: Fetch clause starting at 46:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3
-; CM-NEXT: Fetch clause starting at 48:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3
-; CM-NEXT: Fetch clause starting at 50:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3
-; CM-NEXT: Fetch clause starting at 52:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3
-; CM-NEXT: Fetch clause starting at 54:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3
-; CM-NEXT: Fetch clause starting at 56:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3
-; CM-NEXT: Fetch clause starting at 58:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3
-; CM-NEXT: Fetch clause starting at 60:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3
-; CM-NEXT: Fetch clause starting at 62:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3
-; CM-NEXT: Fetch clause starting at 64:
-; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3
-; CM-NEXT: Fetch clause starting at 66:
-; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3
-; CM-NEXT: ALU clause starting at 68:
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: MOV * T7.X, 0.0,
-; CM-NEXT: ALU clause starting at 70:
-; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 76:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 82:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 88:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 94:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 102:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 110:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 118:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 126:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -65281(nan), 8(1.121039e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 134:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -65281(nan), 8(1.121039e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 142:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -65281(nan), 8(1.121039e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 150:
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
-; CM-NEXT: -65281(nan), 8(1.121039e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 158:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 164:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 170:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.Y,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 176:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T7.X, literal.y,
-; CM-NEXT: -256(nan), 255(3.573311e-43)
-; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_8 T1.X, T0.X, 65, #3
+; CM-NEXT: VTX_READ_8 T2.X, T0.X, 66, #3
+; CM-NEXT: VTX_READ_8 T3.X, T0.X, 61, #3
+; CM-NEXT: VTX_READ_8 T4.X, T0.X, 64, #3
+; CM-NEXT: Fetch clause starting at 18:
+; CM-NEXT: VTX_READ_8 T1.X, T0.X, 67, #3
+; CM-NEXT: VTX_READ_8 T3.X, T0.X, 52, #3
+; CM-NEXT: VTX_READ_8 T4.X, T0.X, 53, #3
+; CM-NEXT: VTX_READ_8 T5.X, T0.X, 62, #3
+; CM-NEXT: VTX_READ_8 T6.X, T0.X, 60, #3
+; CM-NEXT: Fetch clause starting at 28:
+; CM-NEXT: VTX_READ_8 T2.X, T0.X, 55, #3
+; CM-NEXT: VTX_READ_8 T3.X, T0.X, 59, #3
+; CM-NEXT: VTX_READ_8 T4.X, T0.X, 54, #3
+; CM-NEXT: VTX_READ_8 T5.X, T0.X, 58, #3
+; CM-NEXT: VTX_READ_8 T6.X, T0.X, 56, #3
+; CM-NEXT: VTX_READ_8 T7.X, T0.X, 63, #3
+; CM-NEXT: VTX_READ_8 T0.X, T0.X, 57, #3
+; CM-NEXT: ALU clause starting at 42:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 43:
+; CM-NEXT: LSHL * T0.W, T1.X, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, T4.X, PV.W,
+; CM-NEXT: LSHL * T0.W, T3.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 48:
+; CM-NEXT: LSHL T2.X, T2.X, literal.x,
+; CM-NEXT: OR_INT T0.Y, T6.X, T0.W, BS:VEC_120/SCL_212
+; CM-NEXT: LSHL * T1.Z, T5.X, literal.x, BS:VEC_201
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, T4.X, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T1.Y, T3.X, PV.W,
+; CM-NEXT: OR_INT T1.Z, T0.Y, T1.Z,
+; CM-NEXT: OR_INT * T0.W, T0.Z, T2.X,
+; CM-NEXT: ALU clause starting at 57:
+; CM-NEXT: LSHL T0.Z, T1.X, literal.x,
+; CM-NEXT: LSHL * T1.W, T0.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; CM-NEXT: LSHL T0.X, T7.X, literal.x,
+; CM-NEXT: OR_INT T0.Y, T6.X, PV.W, BS:VEC_120/SCL_212
+; CM-NEXT: LSHL T2.Z, T5.X, literal.y, BS:VEC_201
+; CM-NEXT: OR_INT * T0.W, T0.W, PV.Z,
+; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44)
+; CM-NEXT: LSHL T1.X, T4.X, literal.x,
+; CM-NEXT: OR_INT T0.Y, PV.Y, PV.Z,
+; CM-NEXT: OR_INT T0.Z, T1.Z, PV.X,
+; CM-NEXT: LSHL * T1.W, T3.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; CM-NEXT: OR_INT T0.Y, PV.Y, PV.W,
+; CM-NEXT: OR_INT T1.Z, T1.Y, PV.X,
+; CM-NEXT: LSHL * T1.W, T2.X, literal.x,
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT * T0.X, PV.Z, PV.W,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <16 x i8> %in, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index bb98af4e7a5c7f..48c8ab60d2829c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1122,18 +1122,18 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Y, T4.X, literal.x,
+; EG-NEXT: LSHR * T0.Y, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i32:
@@ -1206,21 +1206,20 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
+; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.W, T4.X, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR * T0.Y, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
@@ -5716,20 +5715,20 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Z, T4.X, literal.x,
+; EG-NEXT: LSHR * T0.Z, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
-; EG-NEXT: MOV T4.Y, 0.0,
-; EG-NEXT: MOV T4.W, 0.0,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T0.W, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i64:
@@ -5812,22 +5811,22 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: ASHR * T4.W, T4.X, literal.x,
+; EG-NEXT: ASHR * T0.W, T0.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR * T4.Z, T4.X, literal.x,
+; EG-NEXT: ASHR * T0.Z, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T4.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: ASHR * T4.Y, PV.X, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index ff55ab8859c833..a990982c7e3134 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1272,23 +1272,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, PV.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T4.Y, T4.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T4.W, T4.X, literal.y,
+; EG-NEXT: MOV T0.W, literal.x,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.y,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_UINT * T1.Z, T0.X, literal.x, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT * T1.Y, T0.X, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i32:
@@ -1375,26 +1375,25 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T4.X, 1
+; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
+; EG-NEXT: ASHR * T1.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.X, literal.y,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T5.Z, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T4.X, literal.x,
+; EG-NEXT: BFE_INT T1.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT * T1.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
@@ -1518,35 +1517,34 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
+; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, PV.W,
+; EG-NEXT: MOV T0.W, literal.x,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_UINT * T1.Z, T0.X, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T7.Z, T5.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T6.W, T5.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T6.X, T5.X, literal.x,
-; EG-NEXT: BFE_UINT T7.Y, T5.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T1.Y, T0.X, literal.x, T0.W,
+; EG-NEXT: LSHR * T2.W, T0.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T1.X, T0.X, literal.x,
+; EG-NEXT: BFE_UINT T2.Z, T0.Y, literal.y, T0.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR * T7.W, T5.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T7.X, T5.Y, literal.x,
+; EG-NEXT: BFE_UINT * T2.Y, T0.Y, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.X, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
-; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
@@ -1677,38 +1675,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 23, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
+; EG-NEXT: ALU 21, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T5.Y, literal.y,
-; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.Y, T5.Y, literal.x,
-; EG-NEXT: BFE_INT T6.Z, PS, 0.0, literal.y,
-; EG-NEXT: BFE_INT T7.W, PV.Z, 0.0, literal.y,
-; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: BFE_INT T6.Y, PS, 0.0, literal.y,
-; EG-NEXT: BFE_INT T7.Z, PV.Y, 0.0, literal.y,
-; EG-NEXT: LSHR T0.W, T5.Y, literal.y,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T1.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T0.X, literal.y,
+; EG-NEXT: ASHR * T2.W, T0.Y, literal.z,
+; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T2.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T1.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.y,
+; EG-NEXT: LSHR * T3.W, T0.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T1.Y, PS, 0.0, literal.y,
+; EG-NEXT: BFE_INT T2.Z, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.y,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T8.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T3.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T2.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
@@ -1911,56 +1907,54 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
+; EG-NEXT: ALU 37, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T8.Z, T7.X, literal.x, PV.W,
+; EG-NEXT: MOV T1.W, literal.x,
+; EG-NEXT: LSHR * T2.W, T0.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_UINT * T2.Z, T0.X, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T9.Z, T7.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T8.W, T7.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
-; EG-NEXT: BFE_UINT T9.Y, T7.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T2.Y, T0.X, literal.x, T1.W,
+; EG-NEXT: LSHR * T3.W, T0.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T2.X, T0.X, literal.x,
+; EG-NEXT: BFE_UINT T3.Z, T0.Y, literal.y, T1.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T10.Z, T7.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T9.W, T7.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T9.X, T7.Y, literal.x,
-; EG-NEXT: BFE_UINT T10.Y, T7.Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T12.Z, T7.W, literal.y, T0.W,
-; EG-NEXT: LSHR T10.W, T7.Z, literal.z,
-; EG-NEXT: AND_INT * T10.X, T7.Z, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T3.Y, T0.Y, literal.x, T1.W,
+; EG-NEXT: LSHR * T4.W, T0.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T3.X, T0.Y, literal.x,
+; EG-NEXT: BFE_UINT T4.Z, T0.Z, literal.y, T1.W,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T4.Y, T0.Z, literal.y, T1.W,
+; EG-NEXT: LSHR T5.W, T0.W, literal.z,
+; EG-NEXT: AND_INT * T4.X, T0.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T12.Y, T7.W, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T13.X, PV.W, literal.x,
-; EG-NEXT: LSHR T12.W, T7.W, literal.y,
-; EG-NEXT: AND_INT * T12.X, T7.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT: BFE_UINT T5.Z, T0.W, literal.x, T1.W,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T6.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T5.Y, T0.W, literal.y, T1.W, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T5.X, T0.W, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
@@ -2177,64 +2171,58 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 47, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T8.X, 1
+; EG-NEXT: ALU 41, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: LSHR T0.W, T7.W, literal.y,
-; EG-NEXT: LSHR * T1.W, T7.Z, literal.z,
+; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Y, T7.W, literal.y,
-; EG-NEXT: LSHR T0.Z, T7.Z, literal.z,
-; EG-NEXT: LSHR T2.W, T7.Y, literal.x,
-; EG-NEXT: LSHR * T3.W, T7.X, literal.y,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T10.X, T7.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T1.Y, T7.Z, literal.y,
-; EG-NEXT: LSHR T1.Z, T7.Y, literal.y,
-; EG-NEXT: BFE_INT T9.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T7.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T11.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T2.Y, T7.Y, literal.y,
-; EG-NEXT: BFE_INT T9.Z, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T10.W, PV.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T7.X, literal.x,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T3.W, T0.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T3.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Y, T0.W, literal.y,
+; EG-NEXT: LSHR T1.Z, T0.Z, literal.y,
+; EG-NEXT: LSHR T1.W, T0.X, literal.y,
+; EG-NEXT: ASHR * T4.W, T0.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T12.X, T7.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T9.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T10.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T11.W, T1.Y, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T4.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T2.Y, T0.Y, literal.y,
+; EG-NEXT: BFE_INT T3.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T0.X, literal.x,
+; EG-NEXT: ASHR * T5.W, T0.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T7.X, PS, literal.x,
-; EG-NEXT: BFE_INT T10.Y, T2.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T11.Z, T0.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T12.W, T0.Y, 0.0, literal.y,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T13.X, PS, literal.x,
-; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T12.Z, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR T0.W, T7.W, literal.y, BS:VEC_201
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T3.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T4.Z, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T0.Y, literal.x,
+; EG-NEXT: ASHR * T6.W, T0.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T6.X, T0.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T4.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T5.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T0.X, PS, literal.x,
+; EG-NEXT: BFE_INT T5.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T6.Z, T1.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T0.W, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T14.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T12.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T7.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T6.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
@@ -2595,97 +2583,95 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 75, @17, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
+; EG-NEXT: ALU 73, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T1.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T9.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 17:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
+; EG-NEXT: MOV T2.W, literal.x,
+; EG-NEXT: LSHR * T3.W, T0.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_UINT * T3.Z, T0.X, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T14.Z, T11.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T13.W, T11.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T11.X, literal.x,
-; EG-NEXT: BFE_UINT T14.Y, T11.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T3.Y, T0.X, literal.x, T2.W,
+; EG-NEXT: LSHR * T4.W, T0.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T3.X, T0.X, literal.x,
+; EG-NEXT: BFE_UINT T4.Z, T0.Y, literal.y, T2.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T15.Z, T11.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T14.W, T11.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T14.X, T11.Y, literal.x,
-; EG-NEXT: BFE_UINT T15.Y, T11.Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T17.Z, T11.W, literal.y, T0.W,
-; EG-NEXT: LSHR T15.W, T11.Z, literal.z,
-; EG-NEXT: AND_INT * T15.X, T11.Z, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T4.Y, T0.Y, literal.x, T2.W,
+; EG-NEXT: LSHR * T5.W, T0.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T4.X, T0.Y, literal.x,
+; EG-NEXT: BFE_UINT T5.Z, T0.Z, literal.y, T2.W,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T5.Y, T0.Z, literal.y, T2.W,
+; EG-NEXT: LSHR T6.W, T0.W, literal.z,
+; EG-NEXT: AND_INT * T5.X, T0.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T17.Y, T11.W, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T17.W, T11.W, literal.z,
-; EG-NEXT: AND_INT * T17.X, T11.W, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T6.Z, T0.W, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T7.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T6.Y, T0.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T8.W, T1.X, literal.z,
+; EG-NEXT: AND_INT * T6.X, T0.W, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T19.Y, T12.X, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T21.Z, T12.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T19.W, T12.X, literal.z,
-; EG-NEXT: AND_INT * T19.X, T12.X, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T8.Z, T1.X, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT: LSHR T9.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T8.Y, T1.X, literal.y, T2.W,
+; EG-NEXT: LSHR T10.W, T1.Y, literal.z,
+; EG-NEXT: AND_INT * T8.X, T1.X, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T21.Y, T12.Y, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT: LSHR T12.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T22.Z, T12.Z, literal.y, T0.W,
-; EG-NEXT: LSHR T21.W, T12.Y, literal.z,
-; EG-NEXT: AND_INT * T21.X, T12.Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T10.Z, T1.Y, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T10.Y, T1.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T11.W, T1.Z, literal.z,
+; EG-NEXT: AND_INT * T10.X, T1.Y, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T22.Y, T12.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT: LSHR T23.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T24.Z, T12.W, literal.y, T0.W,
-; EG-NEXT: LSHR T22.W, T12.Z, literal.z,
-; EG-NEXT: AND_INT * T22.X, T12.Z, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T11.Z, T1.Z, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T11.Y, T1.Z, literal.y, T2.W,
+; EG-NEXT: LSHR T13.W, T1.W, literal.z,
+; EG-NEXT: AND_INT * T11.X, T1.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T24.Y, T12.W, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T13.Z, T1.W, literal.x, T2.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT: LSHR T25.X, PV.W, literal.x,
-; EG-NEXT: LSHR T24.W, T12.W, literal.y,
-; EG-NEXT: AND_INT * T24.X, T12.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43)
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T13.Y, T1.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T13.X, T1.W, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T15.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
@@ -3075,122 +3061,108 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v32i8_to_v32i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @14
-; EG-NEXT: ALU 18, @19, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @16
-; EG-NEXT: ALU 75, @38, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T13.X, 1
+; EG-NEXT: ALU 8, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @12
+; EG-NEXT: ALU 76, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T6.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T1.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 14:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
-; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
-; EG-NEXT: ALU clause starting at 18:
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 19:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
+; EG-NEXT: Fetch clause starting at 12:
+; EG-NEXT: VTX_READ_128 T4.XYZW, T3.X, 0, #1
+; EG-NEXT: VTX_READ_128 T3.XYZW, T3.X, 16, #1
+; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T15.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.Z, T12.W, literal.y,
-; EG-NEXT: LSHR T0.W, T12.Z, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: MOV * T3.X, KC0[2].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 25:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T5.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
+; EG-NEXT: LSHR T6.X, PV.W, literal.x,
+; EG-NEXT: LSHR T0.W, T3.W, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T16.X, PS, literal.x,
-; EG-NEXT: LSHR T0.Y, T12.W, literal.y,
-; EG-NEXT: LSHR T1.Z, T12.Z, literal.z,
-; EG-NEXT: LSHR T1.W, T12.Y, literal.w,
-; EG-NEXT: LSHR * T2.W, T12.Z, literal.y,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
-; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: LSHR T1.Y, T12.Y, literal.y,
-; EG-NEXT: LSHR T2.Z, T12.Y, literal.z,
-; EG-NEXT: LSHR T3.W, T12.X, literal.y,
-; EG-NEXT: LSHR * T4.W, T12.X, literal.z,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T7.X, PS, literal.x,
+; EG-NEXT: LSHR T0.Y, T3.Z, literal.y,
+; EG-NEXT: LSHR T0.Z, T3.Y, literal.y,
+; EG-NEXT: LSHR T1.W, T3.X, literal.y,
+; EG-NEXT: ASHR * T8.W, T4.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T18.X, T11.X, 0.0, literal.x,
-; EG-NEXT: LSHR T2.Y, T11.W, literal.y,
-; EG-NEXT: LSHR T3.Z, T11.W, literal.z,
-; EG-NEXT: LSHR T5.W, T11.Z, literal.y,
-; EG-NEXT: LSHR * T6.W, T11.X, literal.z,
+; EG-NEXT: BFE_INT T8.X, T4.X, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Y, T4.W, literal.y,
+; EG-NEXT: LSHR T1.Z, T4.Z, literal.y,
+; EG-NEXT: LSHR T2.W, T4.X, literal.y,
+; EG-NEXT: ASHR * T9.W, T4.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T19.X, T11.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T3.Y, T11.Z, literal.y,
-; EG-NEXT: LSHR T4.Z, T11.Y, literal.y,
-; EG-NEXT: BFE_INT T18.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T11.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T20.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T4.Y, T11.Y, literal.y,
-; EG-NEXT: BFE_INT T18.Z, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T19.W, PV.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T11.X, literal.x,
+; EG-NEXT: BFE_INT T9.X, T4.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T2.Y, T4.Y, literal.y,
+; EG-NEXT: BFE_INT T8.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T4.X, literal.x,
+; EG-NEXT: ASHR * T10.W, T4.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T21.X, T11.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T18.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T19.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.W, T3.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T11.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T22.X, T12.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T19.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.Z, T5.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.W, T3.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T11.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT T22.W, T4.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T11.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T23.X, T12.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T11.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T3.W, T12.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T24.X, T12.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T22.Y, PS, 0.0, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T10.X, T4.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T8.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Z, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T4.Y, literal.x,
+; EG-NEXT: ASHR * T11.W, T4.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T11.X, T4.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T10.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T4.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T12.W, T3.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T12.X, T3.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T10.Y, PV.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T11.Z, T1.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T23.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T2.W, T4.W, literal.x,
+; EG-NEXT: ASHR * T4.W, T3.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T4.X, T3.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T11.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T12.Z, T1.W, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T3.X, literal.x,
+; EG-NEXT: ASHR * T13.W, T3.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T13.X, T3.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T12.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T4.Z, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T1.W, T3.Y, literal.x,
+; EG-NEXT: ASHR * T14.W, T3.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T14.X, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T4.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T13.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T3.Z, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
-; EG-NEXT: LSHR T12.X, PS, literal.x,
-; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T23.Z, T1.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T24.W, T0.Y, 0.0, literal.y,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T25.X, PS, literal.x,
-; EG-NEXT: BFE_INT T23.Y, T0.W, 0.0, literal.y,
-; EG-NEXT: BFE_INT T24.Z, T0.Z, 0.0, literal.y,
-; EG-NEXT: LSHR T0.W, T12.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
+; EG-NEXT: LSHR T3.X, PS, literal.x,
+; EG-NEXT: BFE_INT T13.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T14.Z, T0.W, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T3.W, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T26.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T24.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T15.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T14.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
@@ -3872,184 +3844,182 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @22
-; EG-NEXT: ALU 59, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 57, @31, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @26
-; EG-NEXT: ALU 88, @91, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1
+; EG-NEXT: ALU 88, @89, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T6.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1
-; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1
+; EG-NEXT: VTX_READ_128 T3.XYZW, T2.X, 16, #1
+; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 0, #1
; EG-NEXT: Fetch clause starting at 26:
-; EG-NEXT: VTX_READ_128 T32.XYZW, T21.X, 48, #1
-; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1
+; EG-NEXT: VTX_READ_128 T14.XYZW, T2.X, 48, #1
+; EG-NEXT: VTX_READ_128 T15.XYZW, T2.X, 32, #1
; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: MOV * T21.X, KC0[2].Z,
+; EG-NEXT: MOV * T2.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 31:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T19.Z, T23.X, literal.x, PV.W,
+; EG-NEXT: MOV T2.W, literal.x,
+; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_UINT * T0.Z, T4.X, literal.x, PV.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.Y, T23.X, literal.x, T0.W,
-; EG-NEXT: BFE_UINT T20.Z, T23.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T19.W, T23.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T19.X, T23.X, literal.x,
-; EG-NEXT: BFE_UINT T20.Y, T23.Y, literal.y, T0.W,
-; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T0.Y, T4.X, literal.x, T2.W,
+; EG-NEXT: LSHR * T1.W, T4.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T0.X, T4.X, literal.x,
+; EG-NEXT: BFE_UINT T1.Z, T4.Y, literal.y, T2.W,
+; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T24.Z, T23.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T20.W, T23.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T20.X, T23.Y, literal.x,
-; EG-NEXT: BFE_UINT T24.Y, T23.Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T25.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T26.Z, T23.W, literal.y, T0.W,
-; EG-NEXT: LSHR T24.W, T23.Z, literal.z,
-; EG-NEXT: AND_INT * T24.X, T23.Z, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T1.Y, T4.Y, literal.x, T2.W,
+; EG-NEXT: LSHR * T5.W, T4.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T1.X, T4.Y, literal.x,
+; EG-NEXT: BFE_UINT T5.Z, T4.Z, literal.y, T2.W,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: LSHR T6.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T5.Y, T4.Z, literal.y, T2.W,
+; EG-NEXT: LSHR T7.W, T4.W, literal.z,
+; EG-NEXT: AND_INT * T5.X, T4.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T26.Y, T23.W, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T26.W, T23.W, literal.z,
-; EG-NEXT: AND_INT * T26.X, T23.W, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T7.Z, T4.W, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T8.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T7.Y, T4.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T9.W, T3.X, literal.z,
+; EG-NEXT: AND_INT * T7.X, T4.W, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T28.Y, T22.X, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T30.Z, T22.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T28.W, T22.X, literal.z,
-; EG-NEXT: AND_INT * T28.X, T22.X, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T9.Z, T3.X, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT: LSHR T10.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T9.Y, T3.X, literal.y, T2.W,
+; EG-NEXT: LSHR T11.W, T3.Y, literal.z,
+; EG-NEXT: AND_INT * T9.X, T3.X, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T30.Y, T22.Y, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44)
-; EG-NEXT: LSHR T22.X, PV.W, literal.x,
-; EG-NEXT: LSHR T30.W, T22.Y, literal.y,
-; EG-NEXT: AND_INT * T30.X, T22.Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T21.Z, T22.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
-; EG-NEXT: LSHR T31.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT * T21.Y, T22.Z, literal.y, T0.W,
+; EG-NEXT: BFE_UINT T11.Z, T3.Y, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T11.Y, T3.Y, literal.y, T2.W,
+; EG-NEXT: AND_INT * T11.X, T3.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT: ALU clause starting at 91:
-; EG-NEXT: BFE_UINT T34.Z, T22.W, literal.x, T0.W,
-; EG-NEXT: LSHR * T21.W, T22.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: AND_INT T21.X, T22.Z, literal.x,
-; EG-NEXT: BFE_UINT T34.Y, T22.W, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
-; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
-; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T35.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T34.W, T22.W, literal.z,
-; EG-NEXT: AND_INT * T34.X, T22.W, literal.w,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T4.W, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR * T12.W, T3.Z, literal.y,
+; EG-NEXT: 80(1.121039e-43), 24(3.363116e-44)
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT * T12.Z, T3.Z, literal.y, T2.W,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: ALU clause starting at 89:
+; EG-NEXT: BFE_UINT T12.Y, T3.Z, literal.x, T2.W,
+; EG-NEXT: LSHR * T16.W, T3.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T12.X, T3.Z, literal.x,
+; EG-NEXT: BFE_UINT T16.Z, T3.W, literal.y, T2.W,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T16.Y, T3.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T17.W, T15.X, literal.z,
+; EG-NEXT: AND_INT * T16.X, T3.W, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T36.Y, T33.X, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 112(1.569454e-43)
-; EG-NEXT: LSHR T37.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T38.Z, T33.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T36.W, T33.X, literal.z,
-; EG-NEXT: AND_INT * T36.X, T33.X, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T17.Z, T15.X, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT: LSHR T18.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T17.Y, T15.X, literal.y, T2.W,
+; EG-NEXT: LSHR T19.W, T15.Y, literal.z,
+; EG-NEXT: AND_INT * T17.X, T15.X, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T38.Y, T33.Y, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 128(1.793662e-43)
-; EG-NEXT: LSHR T33.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T39.Z, T33.Z, literal.y, T0.W,
-; EG-NEXT: LSHR T38.W, T33.Y, literal.z,
-; EG-NEXT: AND_INT * T38.X, T33.Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T19.Z, T15.Y, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43)
+; EG-NEXT: LSHR T15.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T19.Y, T15.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T20.W, T15.Z, literal.z,
+; EG-NEXT: AND_INT * T19.X, T15.Y, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T39.Y, T33.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 144(2.017870e-43)
-; EG-NEXT: LSHR T40.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T41.Z, T33.W, literal.y, T0.W,
-; EG-NEXT: LSHR T39.W, T33.Z, literal.z,
-; EG-NEXT: AND_INT * T39.X, T33.Z, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T20.Z, T15.Z, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43)
+; EG-NEXT: LSHR T21.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T20.Y, T15.Z, literal.y, T2.W,
+; EG-NEXT: LSHR T22.W, T15.W, literal.z,
+; EG-NEXT: AND_INT * T20.X, T15.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T41.Y, T33.W, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 160(2.242078e-43)
-; EG-NEXT: LSHR T42.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T43.Z, T32.X, literal.y, T0.W, BS:VEC_021/SCL_122
-; EG-NEXT: LSHR T41.W, T33.W, literal.z,
-; EG-NEXT: AND_INT * T41.X, T33.W, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T22.Z, T15.W, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 160(2.242078e-43)
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T22.Y, T15.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: LSHR T24.W, T14.X, literal.z,
+; EG-NEXT: AND_INT * T22.X, T15.W, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T43.Y, T32.X, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 176(2.466285e-43)
-; EG-NEXT: LSHR T44.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T45.Z, T32.Y, literal.y, T0.W,
-; EG-NEXT: LSHR T43.W, T32.X, literal.z,
-; EG-NEXT: AND_INT * T43.X, T32.X, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T24.Z, T14.X, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43)
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T24.Y, T14.X, literal.y, T2.W,
+; EG-NEXT: LSHR T26.W, T14.Y, literal.z,
+; EG-NEXT: AND_INT * T24.X, T14.X, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T45.Y, T32.Y, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 192(2.690493e-43)
-; EG-NEXT: LSHR T32.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T46.Z, T32.Z, literal.y, T0.W,
-; EG-NEXT: LSHR T45.W, T32.Y, literal.z,
-; EG-NEXT: AND_INT * T45.X, T32.Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T26.Z, T14.Y, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43)
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T26.Y, T14.Y, literal.y, T2.W,
+; EG-NEXT: LSHR T27.W, T14.Z, literal.z,
+; EG-NEXT: AND_INT * T26.X, T14.Y, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T46.Y, T32.Z, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43)
-; EG-NEXT: LSHR T47.X, PV.W, literal.x,
-; EG-NEXT: BFE_UINT T48.Z, T32.W, literal.y, T0.W,
-; EG-NEXT: LSHR T46.W, T32.Z, literal.z,
-; EG-NEXT: AND_INT * T46.X, T32.Z, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: BFE_UINT T27.Z, T14.Z, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43)
+; EG-NEXT: LSHR T28.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T27.Y, T14.Z, literal.y, T2.W,
+; EG-NEXT: LSHR T29.W, T14.W, literal.z,
+; EG-NEXT: AND_INT * T27.X, T14.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
-; EG-NEXT: BFE_UINT T48.Y, T32.W, literal.x, T0.W,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43)
-; EG-NEXT: LSHR T49.X, PV.W, literal.x,
-; EG-NEXT: LSHR T48.W, T32.W, literal.y,
-; EG-NEXT: AND_INT * T48.X, T32.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
+; EG-NEXT: BFE_UINT T29.Z, T14.W, literal.x, T2.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43)
+; EG-NEXT: LSHR T30.X, PV.W, literal.x,
+; EG-NEXT: BFE_UINT T29.Y, T14.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T29.X, T14.W, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v64i8_to_v64i32:
@@ -4783,231 +4753,204 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v64i8_to_v64i32:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @32, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 40, @33, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @28
-; EG-NEXT: ALU 76, @74, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 72, @151, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T19.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T35.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T27.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T26.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T22.X, 1
+; EG-NEXT: ALU 17, @30, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 3 @22
+; EG-NEXT: ALU 78, @48, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 71, @127, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T6.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T1.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T21.X, 32, #1
-; EG-NEXT: VTX_READ_128 T19.XYZW, T21.X, 48, #1
-; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_128 T31.XYZW, T21.X, 0, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T21.X, 16, #1
-; EG-NEXT: ALU clause starting at 32:
-; EG-NEXT: MOV * T21.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 33:
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
+; EG-NEXT: Fetch clause starting at 22:
+; EG-NEXT: VTX_READ_128 T7.XYZW, T6.X, 0, #1
+; EG-NEXT: VTX_READ_128 T8.XYZW, T6.X, 16, #1
+; EG-NEXT: VTX_READ_128 T9.XYZW, T6.X, 32, #1
+; EG-NEXT: VTX_READ_128 T6.XYZW, T6.X, 48, #1
+; EG-NEXT: ALU clause starting at 30:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T24.X, PV.W, literal.x,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T26.X, PV.W, literal.x,
+; EG-NEXT: LSHR T4.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: LSHR T5.X, PV.W, literal.x,
+; EG-NEXT: MOV * T6.X, KC0[2].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 48:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T10.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
-; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: LSHR T28.X, PV.W, literal.x,
-; EG-NEXT: LSHR T0.Y, T19.W, literal.y,
-; EG-NEXT: LSHR T0.Z, T19.Z, literal.z,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.w,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: LSHR T1.Y, T19.Z, literal.y,
-; EG-NEXT: LSHR T1.Z, T19.Y, literal.z,
-; EG-NEXT: LSHR * T1.W, T19.Z, literal.w,
+; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
+; EG-NEXT: LSHR T11.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: LSHR T0.Y, T6.W, literal.y,
+; EG-NEXT: LSHR T0.Z, T6.Z, literal.y,
+; EG-NEXT: LSHR T0.W, T6.Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
-; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: LSHR T2.Y, T19.Y, literal.y,
-; EG-NEXT: LSHR T2.Z, T19.Y, literal.z,
-; EG-NEXT: LSHR T2.W, T19.X, literal.y,
-; EG-NEXT: LSHR * T3.W, T19.X, literal.z,
+; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T15.X, PS, literal.x,
+; EG-NEXT: LSHR T1.Y, T6.X, literal.y,
+; EG-NEXT: LSHR T1.Z, T9.W, literal.y,
+; EG-NEXT: LSHR T1.W, T9.Z, literal.y,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 74:
-; EG-NEXT: LSHR T3.Y, T20.W, literal.x,
-; EG-NEXT: LSHR T3.Z, T20.W, literal.y,
-; EG-NEXT: LSHR T4.W, T20.Z, literal.x,
-; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.z,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T32.X, PS, literal.x,
-; EG-NEXT: LSHR T4.Y, T20.Z, literal.y,
-; EG-NEXT: LSHR T4.Z, T20.Y, literal.z,
-; EG-NEXT: LSHR T5.W, T20.Y, literal.y,
-; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 160(2.242078e-43)
-; EG-NEXT: LSHR T33.X, PS, literal.x,
-; EG-NEXT: LSHR T5.Y, T20.X, literal.y,
-; EG-NEXT: LSHR T5.Z, T20.X, literal.z,
-; EG-NEXT: LSHR T6.W, T21.W, literal.y,
-; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.w,
+; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T16.X, PS, literal.x,
+; EG-NEXT: LSHR T2.Y, T9.Y, literal.y,
+; EG-NEXT: LSHR T2.Z, T9.X, literal.y,
+; EG-NEXT: LSHR T2.W, T8.W, literal.y,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: 24(3.363116e-44), 176(2.466285e-43)
-; EG-NEXT: LSHR T34.X, PS, literal.x,
-; EG-NEXT: LSHR T6.Y, T21.W, literal.y,
-; EG-NEXT: LSHR T6.Z, T21.Z, literal.z,
-; EG-NEXT: LSHR T7.W, T21.Z, literal.y,
-; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43)
-; EG-NEXT: LSHR T35.X, PS, literal.x,
-; EG-NEXT: LSHR T7.Y, T21.Y, literal.y,
-; EG-NEXT: LSHR T7.Z, T21.Y, literal.z,
-; EG-NEXT: LSHR T8.W, T21.X, literal.y,
-; EG-NEXT: LSHR * T9.W, T21.X, literal.z,
+; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T17.X, PS, literal.x,
+; EG-NEXT: LSHR T3.Y, T8.Z, literal.y,
+; EG-NEXT: LSHR T3.Z, T8.Y, literal.y,
+; EG-NEXT: LSHR T3.W, T8.X, literal.y,
+; EG-NEXT: ASHR * T18.W, T7.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T36.X, T31.X, 0.0, literal.x,
-; EG-NEXT: LSHR T8.Y, T31.W, literal.y,
-; EG-NEXT: LSHR T8.Z, T31.W, literal.z,
-; EG-NEXT: LSHR T10.W, T31.Z, literal.y,
-; EG-NEXT: LSHR * T11.W, T31.X, literal.z,
+; EG-NEXT: BFE_INT T18.X, T7.X, 0.0, literal.x,
+; EG-NEXT: LSHR T4.Y, T7.W, literal.y,
+; EG-NEXT: LSHR T4.Z, T7.Z, literal.y,
+; EG-NEXT: LSHR T4.W, T7.X, literal.y,
+; EG-NEXT: ASHR * T19.W, T7.Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T37.X, T31.Y, 0.0, literal.x,
-; EG-NEXT: LSHR T9.Y, T31.Z, literal.y,
-; EG-NEXT: LSHR T9.Z, T31.Y, literal.y,
-; EG-NEXT: BFE_INT T36.W, PS, 0.0, literal.x,
-; EG-NEXT: LSHR * T11.W, T31.X, literal.z,
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T38.X, T31.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T10.Y, T31.Y, literal.y,
-; EG-NEXT: BFE_INT T36.Z, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T37.W, PV.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T11.W, T31.X, literal.x,
+; EG-NEXT: BFE_INT T19.X, T7.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T5.Y, T7.Y, literal.y,
+; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T4.W, T7.X, literal.x,
+; EG-NEXT: ASHR * T20.W, T7.Z, literal.z,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T39.X, T31.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T36.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T37.Z, PV.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T38.W, T9.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T11.W, T31.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T40.X, T21.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T37.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T38.Z, T10.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T39.W, T8.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T10.W, T31.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T31.X, T21.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T38.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T39.Z, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT T40.W, T9.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T9.W, T31.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T41.X, T21.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T39.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T40.Z, T8.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T31.W, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 151:
-; EG-NEXT: LSHR * T8.W, T21.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T42.X, T21.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T40.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.Z, T7.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T41.W, T7.W, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T7.W, T21.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T43.X, T20.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T41.Z, T6.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.W, T6.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T7.W, T21.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T41.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Z, T6.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.W, T5.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T6.W, T21.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T44.X, T20.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Z, T5.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.W, T5.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T20.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T45.X, T20.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Z, T4.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.W, T20.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T46.X, T19.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Z, T4.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.W, T3.Z, 0.0, literal.x,
-; EG-NEXT: LSHR * T4.W, T20.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Z, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT T46.W, T3.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T3.W, T20.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T47.X, T19.Z, 0.0, literal.x,
-; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Z, T2.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T2.W, T19.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T20.X, T7.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T18.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T19.Z, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T4.W, T7.Y, literal.x,
+; EG-NEXT: ASHR * T21.W, T7.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T21.X, T7.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T19.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T20.Z, T4.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T4.W, T7.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T22.W, T8.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T22.X, T8.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T20.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T21.Z, T4.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T4.W, T7.W, literal.x,
+; EG-NEXT: ASHR * T7.W, T8.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T7.X, T8.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T21.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T3.W, T8.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T48.X, T19.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T20.Z, T2.Y, 0.0, literal.x,
-; EG-NEXT: BFE_INT T47.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ALU clause starting at 127:
+; EG-NEXT: ASHR * T23.W, T8.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T23.X, T8.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T22.Y, T3.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T7.Z, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T3.W, T8.Y, literal.x,
+; EG-NEXT: ASHR * T24.W, T8.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T24.X, T8.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T7.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T23.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T3.W, T8.Z, literal.x,
+; EG-NEXT: ASHR * T25.W, T9.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T25.X, T9.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T23.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T24.Z, T2.W, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T8.W, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T8.W, T9.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T8.X, T9.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T24.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T9.X, literal.x,
+; EG-NEXT: ASHR * T26.W, T9.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T26.X, T9.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T25.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T8.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T9.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T27.W, T9.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T27.X, T9.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T8.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T26.Z, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T1.W, T9.Z, literal.x,
+; EG-NEXT: ASHR * T28.W, T6.X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T28.X, T6.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T26.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T27.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T9.W, literal.x,
+; EG-NEXT: ASHR * T9.W, T6.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T9.X, T6.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T27.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T28.Z, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T1.W, T6.X, literal.x,
+; EG-NEXT: ASHR * T29.W, T6.Z, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T29.X, T6.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T28.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Z, T0.W, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T6.Y, literal.x,
+; EG-NEXT: ASHR * T30.W, T6.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T30.X, T6.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T29.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T6.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43)
-; EG-NEXT: LSHR T19.X, PS, literal.x,
-; EG-NEXT: BFE_INT T20.Y, T1.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T47.Z, T1.Y, 0.0, literal.y,
-; EG-NEXT: BFE_INT T48.W, T0.W, 0.0, literal.y,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
-; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T49.X, PS, literal.x,
-; EG-NEXT: BFE_INT T47.Y, T0.Z, 0.0, literal.y,
-; EG-NEXT: BFE_INT T48.Z, T0.Y, 0.0, literal.y,
-; EG-NEXT: LSHR T0.W, T19.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43)
+; EG-NEXT: LSHR T6.X, PS, literal.x,
+; EG-NEXT: BFE_INT T29.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: BFE_INT T30.Z, T0.Y, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T6.W, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T50.X, PS, literal.x,
-; EG-NEXT: BFE_INT * T48.Y, PV.W, 0.0, literal.y,
+; EG-NEXT: LSHR T31.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T30.Y, PV.W, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v64i8_to_v64i32:
@@ -5771,32 +5714,31 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 1
+; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T5.X, T4.X, literal.x, PV.W,
-; EG-NEXT: LSHR * T5.Z, T4.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
-; EG-NEXT: MOV T4.Y, 0.0,
-; EG-NEXT: MOV T5.W, 0.0,
-; EG-NEXT: MOV * T4.W, 0.0,
+; EG-NEXT: LSHR T1.Z, T0.X, literal.x,
+; EG-NEXT: MOV * T0.W, literal.y,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T1.X, T0.X, literal.x, PV.W,
+; EG-NEXT: MOV T1.Y, 0.0,
+; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, PV.W,
+; EG-NEXT: AND_INT * T0.X, T0.X, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T1.W, 0.0,
+; EG-NEXT: MOV * T0.W, 0.0,
+; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
@@ -5920,32 +5862,32 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: ASHR T4.W, T4.X, literal.y,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.z,
+; EG-NEXT: BFE_INT T1.X, T0.X, 0.0, literal.x,
+; EG-NEXT: ASHR T0.W, T0.X, literal.y,
+; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ASHR T5.Y, PV.X, literal.x,
-; EG-NEXT: ASHR T4.Z, T4.X, literal.y,
-; EG-NEXT: LSHR T0.W, T4.X, literal.z,
-; EG-NEXT: LSHR * T1.W, T4.X, literal.w,
+; EG-NEXT: ASHR T1.Y, PV.X, literal.x,
+; EG-NEXT: ASHR T0.Z, T0.X, literal.y,
+; EG-NEXT: LSHR T1.W, T0.X, literal.z,
+; EG-NEXT: LSHR * T2.W, T0.X, literal.w,
; EG-NEXT: 31(4.344025e-44), 24(3.363116e-44)
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T4.X, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T5.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T1.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T7.X, PV.W, literal.x,
-; EG-NEXT: ASHR T4.Y, PV.X, literal.y,
-; EG-NEXT: ASHR * T5.W, PV.Z, literal.y,
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: ASHR T0.Y, PV.X, literal.y,
+; EG-NEXT: ASHR * T1.W, PV.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
@@ -6104,51 +6046,50 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 34, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T9.X, 1
+; EG-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T6.X, T5.Y, literal.x, PV.W,
-; EG-NEXT: LSHR * T6.Z, T5.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T6.Y, 0.0,
-; EG-NEXT: BFE_UINT * T7.Z, T5.Y, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T7.X, T5.Y, literal.x,
-; EG-NEXT: MOV * T7.Y, 0.0,
+; EG-NEXT: LSHR T1.Z, T0.Y, literal.x,
+; EG-NEXT: MOV * T0.W, literal.y,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T1.X, T0.Y, literal.x, PV.W,
+; EG-NEXT: MOV T1.Y, 0.0,
+; EG-NEXT: BFE_UINT T2.Z, T0.Y, literal.y, PV.W,
+; EG-NEXT: AND_INT * T2.X, T0.Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.X, T5.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T8.Z, T5.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: BFE_UINT * T5.Z, T5.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV T6.W, 0.0,
-; EG-NEXT: MOV * T7.W, 0.0,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: LSHR * T3.Z, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T3.X, T0.X, literal.x, T0.W,
+; EG-NEXT: MOV T3.Y, 0.0,
+; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, T0.W,
+; EG-NEXT: AND_INT * T0.X, T0.X, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T8.W, 0.0,
-; EG-NEXT: MOV * T5.W, 0.0,
-; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T1.W, 0.0,
+; EG-NEXT: MOV * T2.W, 0.0,
+; EG-NEXT: MOV T3.W, 0.0,
+; EG-NEXT: MOV * T0.W, 0.0,
+; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T10.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T5.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T6.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
@@ -6351,55 +6292,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T6.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T2.X, T0.Y, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T8.X, PV.W, literal.x,
-; EG-NEXT: ASHR T7.Y, PV.X, literal.y,
-; EG-NEXT: LSHR T0.W, T5.Y, literal.z,
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: ASHR T2.Y, PV.X, literal.y,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.z,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T9.X, PS, literal.x,
-; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.y,
-; EG-NEXT: ASHR * T10.W, T5.X, literal.z,
+; EG-NEXT: LSHR T4.X, PS, literal.x,
+; EG-NEXT: BFE_INT T2.Z, PV.W, 0.0, literal.y,
+; EG-NEXT: ASHR * T5.W, T0.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T11.X, T5.X, 0.0, literal.x,
-; EG-NEXT: ASHR T10.Z, T5.X, literal.y,
-; EG-NEXT: LSHR T0.W, T5.X, literal.z,
-; EG-NEXT: ASHR * T5.W, T5.Y, literal.w,
+; EG-NEXT: BFE_INT T6.X, T0.X, 0.0, literal.x,
+; EG-NEXT: ASHR T5.Z, T0.X, literal.y,
+; EG-NEXT: LSHR T0.W, T0.X, literal.z,
+; EG-NEXT: ASHR * T7.W, T0.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T10.X, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T5.Z, T5.Y, literal.z,
-; EG-NEXT: LSHR T0.W, T5.X, literal.x,
-; EG-NEXT: LSHR * T1.W, T5.Y, literal.w,
+; EG-NEXT: BFE_INT T5.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T6.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T7.Z, T0.Y, literal.z,
+; EG-NEXT: LSHR T0.W, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.W, T0.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T5.X, PS, 0.0, literal.x,
-; EG-NEXT: ASHR T10.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T11.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T7.X, PS, 0.0, literal.x,
+; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T6.Z, PV.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, PV.W, literal.x,
-; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T11.W, PV.Z, literal.y,
-; EG-NEXT: ASHR * T7.W, T7.Z, literal.y,
+; EG-NEXT: LSHR T0.X, PV.W, literal.x,
+; EG-NEXT: ASHR T7.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T6.W, PV.Z, literal.y,
+; EG-NEXT: ASHR * T2.W, T2.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
@@ -6667,89 +6608,88 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @12
-; EG-NEXT: ALU 68, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T15.X, 1
+; EG-NEXT: ALU 67, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T8.X, T7.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T8.Z, T7.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T8.Y, 0.0,
-; EG-NEXT: BFE_UINT * T9.Z, T7.W, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T9.X, T7.W, literal.x,
-; EG-NEXT: MOV * T9.Y, 0.0,
+; EG-NEXT: LSHR T1.Z, T0.W, literal.x,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T1.X, T0.W, literal.x, PV.W,
+; EG-NEXT: MOV T1.Y, 0.0,
+; EG-NEXT: BFE_UINT T2.Z, T0.W, literal.y, PV.W,
+; EG-NEXT: AND_INT * T2.X, T0.W, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T10.X, T7.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T10.Z, T7.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T10.Y, 0.0,
-; EG-NEXT: BFE_UINT * T11.Z, T7.Z, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T11.X, T7.Z, literal.x,
-; EG-NEXT: MOV * T11.Y, 0.0,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: LSHR * T3.Z, T0.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T3.X, T0.Z, literal.x, T1.W,
+; EG-NEXT: MOV T3.Y, 0.0,
+; EG-NEXT: BFE_UINT T4.Z, T0.Z, literal.y, T1.W,
+; EG-NEXT: AND_INT * T4.X, T0.Z, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T12.X, T7.Y, literal.x, T0.W,
-; EG-NEXT: LSHR * T12.Z, T7.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T12.Y, 0.0,
-; EG-NEXT: BFE_UINT * T13.Z, T7.Y, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T13.X, T7.Y, literal.x,
-; EG-NEXT: MOV * T13.Y, 0.0,
+; EG-NEXT: MOV T4.Y, 0.0,
+; EG-NEXT: LSHR * T5.Z, T0.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T5.X, T0.Y, literal.x, T1.W,
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: BFE_UINT T6.Z, T0.Y, literal.y, T1.W,
+; EG-NEXT: AND_INT * T6.X, T0.Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T14.X, T7.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T14.Z, T7.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T14.Y, 0.0,
-; EG-NEXT: BFE_UINT * T7.Z, T7.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T7.X, T7.X, literal.x,
+; EG-NEXT: MOV T6.Y, 0.0,
+; EG-NEXT: LSHR * T7.Z, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T7.X, T0.X, literal.x, T1.W,
; EG-NEXT: MOV T7.Y, 0.0,
-; EG-NEXT: MOV T8.W, 0.0,
-; EG-NEXT: MOV * T9.W, 0.0,
+; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, T1.W,
+; EG-NEXT: AND_INT * T0.X, T0.X, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T10.W, 0.0,
-; EG-NEXT: MOV * T11.W, 0.0,
-; EG-NEXT: MOV T12.W, 0.0,
-; EG-NEXT: MOV * T13.W, 0.0,
-; EG-NEXT: MOV T14.W, 0.0,
-; EG-NEXT: MOV * T7.W, 0.0,
-; EG-NEXT: LSHR T15.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T1.W, 0.0,
+; EG-NEXT: MOV * T2.W, 0.0,
+; EG-NEXT: MOV T3.W, 0.0,
+; EG-NEXT: MOV * T4.W, 0.0,
+; EG-NEXT: MOV T5.W, 0.0,
+; EG-NEXT: MOV * T6.W, 0.0,
+; EG-NEXT: MOV T7.W, 0.0,
+; EG-NEXT: MOV * T0.W, 0.0,
+; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T9.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T10.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T11.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T19.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT: LSHR * T22.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T15.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
@@ -7114,98 +7054,98 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @12
; EG-NEXT: ALU 78, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T11.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T10.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T9.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T8.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T15.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T9.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T9.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T10.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T11.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T4.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T5.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_INT * T13.X, T7.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T6.X, T0.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T14.X, T7.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T13.Y, PV.X, literal.y,
-; EG-NEXT: LSHR T0.W, T7.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T6.Y, PV.X, literal.y,
+; EG-NEXT: LSHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T15.X, PS, literal.x,
-; EG-NEXT: ASHR T14.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T13.Z, PV.W, 0.0, literal.z,
-; EG-NEXT: LSHR T0.W, T7.Y, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T8.X, PS, literal.x,
+; EG-NEXT: ASHR T7.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T6.Z, PV.W, 0.0, literal.z,
+; EG-NEXT: LSHR T1.W, T0.Y, literal.z,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
-; EG-NEXT: LSHR T16.X, PS, literal.x,
-; EG-NEXT: BFE_INT T14.Z, PV.W, 0.0, literal.y,
-; EG-NEXT: ASHR * T17.W, T7.X, literal.z,
+; EG-NEXT: LSHR T9.X, PS, literal.x,
+; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.y,
+; EG-NEXT: ASHR * T10.W, T0.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T18.X, T7.X, 0.0, literal.x,
-; EG-NEXT: ASHR T17.Z, T7.X, literal.y,
-; EG-NEXT: LSHR T0.W, T7.X, literal.z,
-; EG-NEXT: ASHR * T19.W, T7.Y, literal.w,
+; EG-NEXT: BFE_INT T11.X, T0.X, 0.0, literal.x,
+; EG-NEXT: ASHR T10.Z, T0.X, literal.y,
+; EG-NEXT: LSHR T1.W, T0.X, literal.z,
+; EG-NEXT: ASHR * T12.W, T0.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T17.X, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T18.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T19.Z, T7.Y, literal.z,
-; EG-NEXT: LSHR T0.W, T7.X, literal.x,
-; EG-NEXT: LSHR * T1.W, T7.Y, literal.w,
-; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
+; EG-NEXT: BFE_INT T10.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T12.Z, T0.Y, literal.z,
+; EG-NEXT: LSHR T1.W, T0.X, literal.x,
+; EG-NEXT: LSHR * T2.W, T0.Y, literal.w,
+; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T19.X, PS, 0.0, literal.x,
-; EG-NEXT: ASHR T17.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: ASHR * T20.W, T7.Z, literal.y,
+; EG-NEXT: BFE_INT T12.X, PS, 0.0, literal.x,
+; EG-NEXT: ASHR T10.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T11.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T13.W, T0.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T7.X, T7.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T20.Z, T7.Z, literal.z,
-; EG-NEXT: LSHR T1.W, T7.Z, literal.w,
-; EG-NEXT: ASHR * T21.W, T7.W, literal.y,
+; EG-NEXT: BFE_INT T0.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T13.Z, T0.Z, literal.z,
+; EG-NEXT: LSHR T2.W, T0.Z, literal.w,
+; EG-NEXT: ASHR * T14.W, T0.W, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T20.X, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T7.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T21.Z, T7.W, literal.z,
-; EG-NEXT: LSHR T1.W, T7.Z, literal.x,
-; EG-NEXT: LSHR * T2.W, T7.W, literal.w,
+; EG-NEXT: BFE_INT T13.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T0.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T14.Z, T0.W, literal.z,
+; EG-NEXT: LSHR T2.W, T0.Z, literal.x,
+; EG-NEXT: LSHR * T0.W, T0.W, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T21.X, PS, 0.0, literal.x,
-; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T18.W, T18.Z, literal.y,
-; EG-NEXT: ASHR * T14.W, T14.Z, literal.y,
+; EG-NEXT: BFE_INT T14.X, PS, 0.0, literal.x,
+; EG-NEXT: ASHR T13.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T11.W, T11.Z, literal.y,
+; EG-NEXT: ASHR * T7.W, T7.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT: LSHR T22.X, T0.W, literal.x,
-; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T7.W, PV.Z, literal.y,
-; EG-NEXT: ASHR * T13.W, T13.Z, literal.y,
+; EG-NEXT: LSHR T15.X, T1.W, literal.x,
+; EG-NEXT: ASHR T14.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T0.W, PV.Z, literal.y,
+; EG-NEXT: ASHR * T6.W, T6.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
@@ -7693,170 +7633,169 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @22
-; EG-NEXT: ALU 103, @27, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 33, @131, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T39.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T38.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T37.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T35.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T34.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T33.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T29.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 1
+; EG-NEXT: ALU 102, @27, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 33, @130, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T31.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T30.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T29.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T28.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T27.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T23.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T22.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T21.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T20.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T17.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T16.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 26:
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 27:
-; EG-NEXT: MOV * T0.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T13.X, T11.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T13.Z, T11.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T13.Y, 0.0,
-; EG-NEXT: BFE_UINT * T14.Z, T11.W, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T14.X, T11.W, literal.x,
-; EG-NEXT: MOV * T14.Y, 0.0,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T15.X, T11.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T15.Z, T11.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T15.Y, 0.0,
-; EG-NEXT: BFE_UINT * T16.Z, T11.Z, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T16.X, T11.Z, literal.x,
-; EG-NEXT: MOV * T16.Y, 0.0,
+; EG-NEXT: LSHR T2.Z, T1.W, literal.x,
+; EG-NEXT: MOV * T2.W, literal.y,
+; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T2.X, T1.W, literal.x, PV.W,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: BFE_UINT T3.Z, T1.W, literal.y, PV.W,
+; EG-NEXT: AND_INT * T3.X, T1.W, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T17.X, T11.Y, literal.x, T0.W,
-; EG-NEXT: LSHR * T17.Z, T11.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T17.Y, 0.0,
-; EG-NEXT: BFE_UINT * T18.Z, T11.Y, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T18.X, T11.Y, literal.x,
-; EG-NEXT: MOV * T18.Y, 0.0,
+; EG-NEXT: MOV T3.Y, 0.0,
+; EG-NEXT: LSHR * T4.Z, T1.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T4.X, T1.Z, literal.x, T2.W,
+; EG-NEXT: MOV T4.Y, 0.0,
+; EG-NEXT: BFE_UINT T5.Z, T1.Z, literal.y, T2.W,
+; EG-NEXT: AND_INT * T5.X, T1.Z, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T19.X, T11.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T19.Z, T11.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T19.Y, 0.0,
-; EG-NEXT: BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T11.X, T11.X, literal.x,
-; EG-NEXT: MOV * T11.Y, 0.0,
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: LSHR * T6.Z, T1.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T6.X, T1.Y, literal.x, T2.W,
+; EG-NEXT: MOV T6.Y, 0.0,
+; EG-NEXT: BFE_UINT T7.Z, T1.Y, literal.y, T2.W,
+; EG-NEXT: AND_INT * T7.X, T1.Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T20.X, T12.W, literal.x, T0.W,
-; EG-NEXT: LSHR * T20.Z, T12.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T20.Y, 0.0,
-; EG-NEXT: BFE_UINT * T21.Z, T12.W, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T21.X, T12.W, literal.x,
-; EG-NEXT: MOV * T21.Y, 0.0,
+; EG-NEXT: MOV T7.Y, 0.0,
+; EG-NEXT: LSHR * T8.Z, T1.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T8.X, T1.X, literal.x, T2.W,
+; EG-NEXT: MOV T8.Y, 0.0,
+; EG-NEXT: BFE_UINT T1.Z, T1.X, literal.y, T2.W,
+; EG-NEXT: AND_INT * T1.X, T1.X, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T22.X, T12.Z, literal.x, T0.W,
-; EG-NEXT: LSHR * T22.Z, T12.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T22.Y, 0.0,
-; EG-NEXT: BFE_UINT * T23.Z, T12.Z, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T23.X, T12.Z, literal.x,
-; EG-NEXT: MOV * T23.Y, 0.0,
+; EG-NEXT: MOV T1.Y, 0.0,
+; EG-NEXT: LSHR * T9.Z, T0.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T9.X, T0.W, literal.x, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: MOV T9.Y, 0.0,
+; EG-NEXT: BFE_UINT T10.Z, T0.W, literal.y, T2.W, BS:VEC_021/SCL_122
+; EG-NEXT: AND_INT * T10.X, T0.W, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T24.X, T12.Y, literal.x, T0.W,
-; EG-NEXT: LSHR * T24.Z, T12.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T24.Y, 0.0,
-; EG-NEXT: BFE_UINT * T25.Z, T12.Y, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T25.X, T12.Y, literal.x,
-; EG-NEXT: MOV * T25.Y, 0.0,
+; EG-NEXT: MOV T10.Y, 0.0,
+; EG-NEXT: LSHR * T11.Z, T0.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T11.X, T0.Z, literal.x, T2.W,
+; EG-NEXT: MOV T11.Y, 0.0,
+; EG-NEXT: BFE_UINT T12.Z, T0.Z, literal.y, T2.W,
+; EG-NEXT: AND_INT * T12.X, T0.Z, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T26.X, T12.X, literal.x, T0.W,
-; EG-NEXT: LSHR * T26.Z, T12.X, literal.y,
-; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
-; EG-NEXT: MOV T26.Y, 0.0,
-; EG-NEXT: BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T12.X, T12.X, literal.x,
; EG-NEXT: MOV T12.Y, 0.0,
+; EG-NEXT: LSHR * T13.Z, T0.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T13.X, T0.Y, literal.x, T2.W,
+; EG-NEXT: MOV T13.Y, 0.0,
+; EG-NEXT: BFE_UINT T14.Z, T0.Y, literal.y, T2.W,
+; EG-NEXT: AND_INT * T14.X, T0.Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T14.Y, 0.0,
+; EG-NEXT: LSHR * T15.Z, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T15.X, T0.X, literal.x, T2.W,
+; EG-NEXT: MOV T15.Y, 0.0,
+; EG-NEXT: BFE_UINT T0.Z, T0.X, literal.y, T2.W,
+; EG-NEXT: AND_INT * T0.X, T0.X, literal.z,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T2.W, 0.0,
+; EG-NEXT: MOV * T3.W, 0.0,
+; EG-NEXT: MOV T4.W, 0.0,
+; EG-NEXT: MOV * T5.W, 0.0,
+; EG-NEXT: MOV T6.W, 0.0,
+; EG-NEXT: MOV * T7.W, 0.0,
+; EG-NEXT: MOV T8.W, 0.0,
+; EG-NEXT: MOV * T1.W, 0.0,
+; EG-NEXT: MOV T9.W, 0.0,
+; EG-NEXT: MOV * T10.W, 0.0,
+; EG-NEXT: MOV T11.W, 0.0,
+; EG-NEXT: MOV * T12.W, 0.0,
; EG-NEXT: MOV T13.W, 0.0,
; EG-NEXT: MOV * T14.W, 0.0,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T15.W, 0.0,
-; EG-NEXT: MOV * T16.W, 0.0,
-; EG-NEXT: MOV T17.W, 0.0,
-; EG-NEXT: MOV * T18.W, 0.0,
-; EG-NEXT: MOV T19.W, 0.0,
-; EG-NEXT: MOV * T11.W, 0.0,
-; EG-NEXT: MOV T20.W, 0.0,
-; EG-NEXT: MOV * T21.W, 0.0,
-; EG-NEXT: MOV T22.W, 0.0,
-; EG-NEXT: MOV * T23.W, 0.0,
-; EG-NEXT: MOV T24.W, 0.0,
-; EG-NEXT: MOV * T25.W, 0.0,
-; EG-NEXT: MOV T26.W, 0.0,
-; EG-NEXT: MOV * T12.W, 0.0,
-; EG-NEXT: LSHR T27.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: MOV * T0.W, 0.0,
+; EG-NEXT: LSHR T16.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T28.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T17.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T29.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T18.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T30.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T19.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T20.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 131:
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ALU clause starting at 130:
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.x,
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T32.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T21.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: LSHR T33.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T22.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT: LSHR T34.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT: LSHR T35.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T24.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT: LSHR T36.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT: LSHR T37.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T26.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT: LSHR T38.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T27.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T28.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
-; EG-NEXT: LSHR T40.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T29.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
-; EG-NEXT: LSHR T41.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T30.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T16.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
-; EG-NEXT: LSHR * T42.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i64:
@@ -8558,187 +8497,187 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; EG-NEXT: TEX 1 @22
; EG-NEXT: ALU 84, @27, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 71, @112, KC0[], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T31.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T30.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T25.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T24.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T23.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T21.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T20.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T18.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T17.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T16.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T15.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T13.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T1.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T13.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T9.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T8.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T7.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T6.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T5.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T4.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T3.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
-; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT: ALU clause starting at 26:
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 27:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T3.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
-; EG-NEXT: LSHR T15.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T4.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
-; EG-NEXT: LSHR T16.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T5.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
-; EG-NEXT: LSHR T17.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T6.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
-; EG-NEXT: LSHR T18.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T7.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
-; EG-NEXT: LSHR T19.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T8.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
-; EG-NEXT: LSHR T20.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T9.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
-; EG-NEXT: LSHR T21.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T10.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
-; EG-NEXT: LSHR T22.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T11.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
-; EG-NEXT: LSHR T23.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
-; EG-NEXT: LSHR T24.X, PV.W, literal.x,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
-; EG-NEXT: LSHR * T25.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: BFE_INT * T26.X, T11.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T15.X, T0.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T27.X, T11.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T26.Y, PV.X, literal.y,
-; EG-NEXT: LSHR * T0.W, T11.W, literal.x,
+; EG-NEXT: BFE_INT T16.X, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T15.Y, PV.X, literal.y,
+; EG-NEXT: LSHR * T2.W, T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T28.X, T11.X, 0.0, literal.x,
-; EG-NEXT: ASHR T27.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T26.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: BFE_INT T17.X, T0.X, 0.0, literal.x,
+; EG-NEXT: ASHR T16.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T15.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T2.W, T0.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T29.X, T12.W, 0.0, literal.x,
-; EG-NEXT: ASHR T28.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T27.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR T0.W, T11.X, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: BFE_INT T18.X, T1.W, 0.0, literal.x,
+; EG-NEXT: ASHR T17.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T16.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T2.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR T30.X, PS, literal.x,
-; EG-NEXT: ASHR T29.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T28.Z, PV.W, 0.0, literal.z,
-; EG-NEXT: LSHR T0.W, T12.W, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
+; EG-NEXT: LSHR T19.X, PS, literal.x,
+; EG-NEXT: ASHR T18.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T17.Z, PV.W, 0.0, literal.z,
+; EG-NEXT: LSHR T2.W, T1.W, literal.z,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43)
-; EG-NEXT: LSHR T31.X, PS, literal.x,
-; EG-NEXT: BFE_INT T29.Z, PV.W, 0.0, literal.y,
-; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
-; EG-NEXT: ASHR * T32.W, T12.X, literal.w,
+; EG-NEXT: LSHR T20.X, PS, literal.x,
+; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.y,
+; EG-NEXT: ADD_INT T2.W, KC0[2].Y, literal.z,
+; EG-NEXT: ASHR * T21.W, T1.X, literal.w,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 240(3.363116e-43), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T33.X, T12.Z, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Y, T11.Z, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: ASHR T32.Z, T12.X, literal.y,
-; EG-NEXT: LSHR T1.W, T12.X, literal.z,
-; EG-NEXT: ASHR * T34.W, T12.Y, literal.w,
+; EG-NEXT: BFE_INT T22.X, T1.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR T21.Z, T1.X, literal.y,
+; EG-NEXT: LSHR T3.W, T1.X, literal.z,
+; EG-NEXT: ASHR * T23.W, T1.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T32.X, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T33.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T34.Z, T12.Y, literal.z,
-; EG-NEXT: LSHR T1.W, T12.Z, literal.x,
-; EG-NEXT: LSHR * T2.W, T12.Y, literal.w,
+; EG-NEXT: BFE_INT T21.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T22.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T23.Z, T1.Y, literal.z,
+; EG-NEXT: LSHR T3.W, T1.Z, literal.x,
+; EG-NEXT: LSHR * T4.W, T1.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT * T34.X, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T23.X, PS, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 112:
-; EG-NEXT: ASHR T32.Y, T32.X, literal.x,
-; EG-NEXT: BFE_INT T33.Z, T1.W, 0.0, literal.y,
-; EG-NEXT: LSHR T1.W, T11.W, literal.z, BS:VEC_120/SCL_212
-; EG-NEXT: ASHR * T35.W, T12.Z, literal.x,
+; EG-NEXT: ASHR T21.Y, T21.X, literal.x,
+; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.y,
+; EG-NEXT: LSHR T3.W, T0.W, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR * T24.W, T1.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T36.X, T12.X, 0.0, literal.x,
-; EG-NEXT: ASHR T34.Y, T34.X, literal.y, BS:VEC_120/SCL_212
-; EG-NEXT: ASHR T35.Z, T12.Z, literal.z,
-; EG-NEXT: LSHR T2.W, T12.Z, literal.w,
-; EG-NEXT: ASHR * T37.W, T12.W, literal.y,
+; EG-NEXT: BFE_INT T25.X, T1.X, 0.0, literal.x,
+; EG-NEXT: ASHR T23.Y, T23.X, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR T24.Z, T1.Z, literal.z,
+; EG-NEXT: LSHR T4.W, T1.Z, literal.w,
+; EG-NEXT: ASHR * T26.W, T1.W, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T35.X, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T36.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T37.Z, T12.W, literal.z,
-; EG-NEXT: LSHR T2.W, T12.X, literal.x,
-; EG-NEXT: LSHR * T3.W, T12.W, literal.w,
+; EG-NEXT: BFE_INT T24.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T25.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T26.Z, T1.W, literal.z,
+; EG-NEXT: LSHR T4.W, T1.X, literal.x,
+; EG-NEXT: LSHR * T1.W, T1.W, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T37.X, PS, 0.0, literal.x,
-; EG-NEXT: ASHR T35.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T36.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR T2.W, T11.Z, literal.z,
-; EG-NEXT: ASHR * T12.W, T11.X, literal.y,
+; EG-NEXT: BFE_INT T26.X, PS, 0.0, literal.x,
+; EG-NEXT: ASHR T24.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T25.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR T1.W, T0.Z, literal.z,
+; EG-NEXT: ASHR * T27.W, T0.X, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T38.X, T12.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T37.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T12.Z, T11.X, literal.z,
-; EG-NEXT: LSHR T3.W, T11.X, literal.w,
-; EG-NEXT: ASHR * T39.W, T11.Y, literal.y,
+; EG-NEXT: BFE_INT T28.X, T1.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T26.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T27.Z, T0.X, literal.z,
+; EG-NEXT: LSHR T4.W, T0.X, literal.w,
+; EG-NEXT: ASHR * T29.W, T0.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T12.X, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T38.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T39.Z, T11.Y, literal.z,
-; EG-NEXT: LSHR T3.W, T12.Y, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: LSHR * T4.W, T11.Y, literal.w,
+; EG-NEXT: BFE_INT T27.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T28.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T29.Z, T0.Y, literal.z,
+; EG-NEXT: LSHR T4.W, T1.Y, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T5.W, T0.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: BFE_INT T39.X, PS, 0.0, literal.x,
-; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T38.Z, PV.W, 0.0, literal.x,
-; EG-NEXT: ASHR T36.W, T36.Z, literal.y,
-; EG-NEXT: ASHR * T40.W, T11.Z, literal.y,
+; EG-NEXT: BFE_INT T29.X, PS, 0.0, literal.x,
+; EG-NEXT: ASHR T27.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T28.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR T25.W, T25.Z, literal.y,
+; EG-NEXT: ASHR * T30.W, T0.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT: BFE_INT T11.X, T11.Z, 0.0, literal.x,
-; EG-NEXT: ASHR T39.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T40.Z, T11.Z, literal.z,
-; EG-NEXT: ASHR T38.W, PV.Z, literal.y,
-; EG-NEXT: ASHR * T41.W, T11.W, literal.y,
+; EG-NEXT: BFE_INT T0.X, T0.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T29.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T30.Z, T0.Z, literal.z,
+; EG-NEXT: ASHR T28.W, PV.Z, literal.y,
+; EG-NEXT: ASHR * T31.W, T0.W, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T40.X, T2.W, 0.0, literal.x,
-; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T41.Z, T11.W, literal.z, BS:VEC_120/SCL_212
-; EG-NEXT: ASHR T33.W, T33.Z, literal.y,
-; EG-NEXT: ASHR * T29.W, T29.Z, literal.y,
+; EG-NEXT: BFE_INT T30.X, T1.W, 0.0, literal.x,
+; EG-NEXT: ASHR T0.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T31.Z, T0.W, literal.z, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR T22.W, T22.Z, literal.y,
+; EG-NEXT: ASHR * T18.W, T18.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T41.X, T1.W, 0.0, literal.x,
-; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
-; EG-NEXT: BFE_INT T11.Z, T0.Y, 0.0, literal.x,
-; EG-NEXT: ASHR T28.W, T28.Z, literal.y,
-; EG-NEXT: ASHR * T27.W, T27.Z, literal.y,
+; EG-NEXT: BFE_INT T31.X, T3.W, 0.0, literal.x,
+; EG-NEXT: ASHR T30.Y, PV.X, literal.y,
+; EG-NEXT: BFE_INT T0.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T17.W, T17.Z, literal.y,
+; EG-NEXT: ASHR * T16.W, T16.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
-; EG-NEXT: LSHR T42.X, T0.W, literal.x,
-; EG-NEXT: ASHR T41.Y, PV.X, literal.y,
-; EG-NEXT: ASHR T11.W, PV.Z, literal.y,
-; EG-NEXT: ASHR * T26.W, T26.Z, literal.y,
+; EG-NEXT: LSHR T1.X, T2.W, literal.x,
+; EG-NEXT: ASHR T31.Y, PV.X, literal.y,
+; EG-NEXT: ASHR T0.W, PV.Z, literal.y,
+; EG-NEXT: ASHR * T15.W, T15.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i64:
@@ -9280,21 +9219,21 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T5.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T4.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T1.W, T4.X, literal.y,
; EG-NEXT: 16711680(2.341805e-38), 255(3.573311e-43)
-; EG-NEXT: OR_INT T5.X, PS, PV.W,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T4.X, PS, PV.W,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
@@ -9384,16 +9323,16 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T5.X, literal.x,
+; EG-NEXT: AND_INT T0.W, T4.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
@@ -9407,8 +9346,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T5.X, PS, PV.W,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T4.X, PS, PV.W,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
@@ -9499,47 +9438,47 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T7.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T5.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
+; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T7.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T0.W, T5.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
+; EG-NEXT: MOV * T3.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T5.X, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.X,
+; EG-NEXT: MOV * T5.Y, T3.X,
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
@@ -9641,34 +9580,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T5.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -9676,18 +9607,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T7.X, literal.x,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: ASHR * T0.W, T5.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
+; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T6.Y, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
+; EG-NEXT: MOV T3.X, PV.Y,
+; EG-NEXT: MOV * T6.X, T2.X,
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
@@ -9808,77 +9747,77 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
+; EG-NEXT: VTX_READ_64 T7.XY, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T4.X,
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T11.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T7.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
+; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T1.W, T7.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T11.Y, literal.x,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHL * T1.W, T7.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: BFE_UINT * T0.W, T7.Y, literal.x, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: MOV * T0.Y, T4.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T7.X, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T7.X, PV.W, PS,
+; EG-NEXT: MOV T4.X, PV.X,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T7.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T2.X, PV.Z,
+; EG-NEXT: MOV T7.Y, T5.X,
+; EG-NEXT: MOV * T7.W, T3.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10025,24 +9964,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
+; EG-NEXT: VTX_READ_64 T7.XY, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T4.X,
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), -65536(nan)
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10050,9 +10001,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10060,55 +10011,43 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T11.X, literal.x,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: ASHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x,
+; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
+; EG-NEXT: MOV T5.X, PV.Y,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: ASHR * T0.W, T7.Y, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
+; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T8.W, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T8.X, T4.X,
+; EG-NEXT: MOV * T8.Z, T2.X,
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10303,144 +10242,145 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 103, @12, KC0[], KC1[]
-; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 104, @12, KC0[], KC1[]
+; EG-NEXT: ALU 20, @117, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: MOV * T11.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT T0.W, T19.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T11.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
+; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: MOV T8.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.X, literal.x,
+; EG-NEXT: MOV T9.X, PV.W,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: LSHL * T1.W, T11.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Y, literal.x,
+; EG-NEXT: MOV T6.X, PV.W,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: BFE_UINT * T1.W, T11.Y, literal.x, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: LSHL * T1.W, T11.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: BFE_UINT * T1.W, T11.Z, literal.x, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Y, literal.x,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHL * T1.W, T11.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: BFE_UINT * T0.W, T11.W, literal.x, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T9.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T11.X, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Z, literal.x,
+; EG-NEXT: OR_INT * T12.X, PV.W, PS,
+; EG-NEXT: MOV T8.X, PV.X,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Z, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T6.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T11.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
+; EG-NEXT: MOV T6.X, PV.Z,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T11.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T5.X, PV.W,
; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.W, literal.y,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T11.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.W, literal.x,
+; EG-NEXT: OR_INT * T11.X, PV.W, PS,
+; EG-NEXT: MOV T4.X, PV.X,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T11.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: ALU clause starting at 117:
+; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 116:
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR T0.W, T19.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T21.X, PS, literal.x,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
+; EG-NEXT: LSHR T13.X, PV.W, literal.x,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.y,
+; EG-NEXT: AND_INT * T1.W, T11.W, literal.z,
+; EG-NEXT: 2(2.802597e-45), -65536(nan)
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T2.X, PV.Z,
+; EG-NEXT: MOV T12.Y, T9.X,
+; EG-NEXT: MOV T12.W, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T11.Y, T5.X,
+; EG-NEXT: MOV * T11.W, T3.X,
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -10690,27 +10630,39 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 104, @12, KC0[], KC1[]
-; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 105, @12, KC0[], KC1[]
+; EG-NEXT: ALU 45, @118, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: MOV * T11.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x,
+; EG-NEXT: MOV T8.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), -65536(nan)
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: MOV T9.X, PV.W,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10718,9 +10670,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T6.X, PV.W,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10728,25 +10680,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T12.X,
-; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: LSHR * T0.W, T11.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10754,9 +10690,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T11.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10764,25 +10700,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, T8.X,
-; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHR * T0.W, T11.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10790,9 +10710,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T11.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -10800,61 +10720,81 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T8.X,
+; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T8.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: ASHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 117:
-; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W,
+; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
+; EG-NEXT: MOV T6.X, PV.W,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: ASHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 118:
+; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T12.W, PV.W, PS,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: BFE_INT * T0.W, T11.Z, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR T0.W, T19.W, literal.x,
+; EG-NEXT: ASHR * T0.W, T11.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T11.Y, PV.W, PS,
+; EG-NEXT: MOV T5.X, PV.Y,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: BFE_INT * T0.W, T11.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: ASHR T0.W, T11.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T21.X, PS, literal.x,
+; EG-NEXT: LSHR T13.X, PS, literal.x,
; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
+; EG-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T11.W, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T12.X, T8.X,
+; EG-NEXT: MOV * T12.Z, T6.X,
+; EG-NEXT: MOV T11.X, T4.X,
+; EG-NEXT: MOV * T11.Z, T2.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -11201,274 +11141,275 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 103, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @120, KC0[], KC1[]
-; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: ALU 107, @16, KC0[], KC1[]
+; EG-NEXT: ALU 101, @124, KC0[], KC1[]
+; EG-NEXT: ALU 41, @226, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: MOV * T19.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: AND_INT T0.W, T37.X, literal.x,
+; EG-NEXT: LSHL * T0.W, T20.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
+; EG-NEXT: 16711680(2.341805e-38), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: MOV T8.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T1.W, T20.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Y, literal.x,
+; EG-NEXT: MOV T9.X, PV.W,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: LSHL * T1.W, T20.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W,
+; EG-NEXT: MOV T6.X, PV.W,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: BFE_UINT * T1.W, T20.Y, literal.x, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Z, literal.x,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: LSHL * T1.W, T20.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: BFE_UINT * T1.W, T20.Z, literal.x, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Z, literal.x,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHL * T1.W, T20.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.W, literal.x,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: BFE_UINT * T1.W, T20.W, literal.x, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T16.X,
+; EG-NEXT: LSHL * T1.W, T19.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W,
+; EG-NEXT: MOV T16.X, PV.W,
+; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: BFE_UINT * T1.W, T19.X, literal.x, T0.W, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 120:
-; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.W, literal.x,
+; EG-NEXT: MOV T17.X, PV.W,
+; EG-NEXT: MOV T0.Y, T14.X,
+; EG-NEXT: LSHL * T1.W, T19.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T32.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.X, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.X, literal.x,
+; EG-NEXT: MOV T14.X, PV.W,
+; EG-NEXT: MOV T0.Y, T15.X,
+; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
+; EG-NEXT: MOV T15.X, PV.W,
+; EG-NEXT: MOV T0.Y, T12.X,
+; EG-NEXT: LSHL * T1.W, T19.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T12.X, PV.W,
+; EG-NEXT: MOV T0.Y, T13.X,
+; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT: -65536(nan), 0(0.000000e+00)
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T33.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.X, literal.x,
+; EG-NEXT: MOV T13.X, PV.W,
+; EG-NEXT: MOV T0.Y, T10.X,
+; EG-NEXT: LSHL * T1.W, T19.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: ALU clause starting at 124:
+; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
-; EG-NEXT: MOV T33.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T28.X,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOV T10.X, PV.W,
+; EG-NEXT: MOV T0.Y, T11.X,
+; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; EG-NEXT: -65536(nan), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
+; EG-NEXT: MOV T11.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: LSHR * T0.W, T20.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T9.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T20.X, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Y, literal.x,
+; EG-NEXT: OR_INT * T21.X, PV.W, PS,
+; EG-NEXT: MOV T8.X, PV.X,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T6.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T20.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T21.Z, PV.W, PS,
+; EG-NEXT: MOV T6.X, PV.Z,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
-; EG-NEXT: MOV T29.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T24.X,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T4.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T20.Z, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T20.X, PV.W, PS,
+; EG-NEXT: MOV T4.X, PV.X,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T20.W, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Z, literal.x,
+; EG-NEXT: OR_INT * T20.Z, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.Z,
+; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T25.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Z, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T17.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T16.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T19.X, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T22.X, PV.W, PS,
+; EG-NEXT: MOV T16.X, PV.X,
+; EG-NEXT: MOV T0.Y, T15.X,
+; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
-; EG-NEXT: MOV T25.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T20.X,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T15.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T14.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T19.Y, literal.y,
+; EG-NEXT: -65536(nan), 255(3.573311e-43)
+; EG-NEXT: OR_INT * T22.Z, PV.W, PS,
+; EG-NEXT: MOV T14.X, PV.Z,
+; EG-NEXT: MOV T0.Y, T13.X,
+; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.W, literal.y,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
+; EG-NEXT: ALU clause starting at 226:
+; EG-NEXT: OR_INT * T0.W, T1.W, T0.W,
+; EG-NEXT: MOV T13.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T12.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, T19.Z, literal.y,
; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: ALU clause starting at 225:
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: LSHL * T1.W, T35.W, literal.x,
+; EG-NEXT: OR_INT * T19.X, PV.W, PS,
+; EG-NEXT: MOV T12.X, PV.X,
+; EG-NEXT: MOV T0.Y, T11.X,
+; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T21.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T11.X, PV.W,
+; EG-NEXT: MOV T0.Y, T10.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T24.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.W, T35.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T25.X, PV.W, literal.x,
; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.z,
+; EG-NEXT: AND_INT T0.W, T19.W, literal.z,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44)
-; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), -65536(nan)
+; EG-NEXT: 255(3.573311e-43), 32(4.484155e-44)
+; EG-NEXT: LSHR T26.X, PS, literal.x,
+; EG-NEXT: OR_INT * T19.Z, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T10.X, PV.Z,
+; EG-NEXT: MOV T21.Y, T9.X,
+; EG-NEXT: MOV T21.W, T7.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T20.Y, T5.X,
+; EG-NEXT: MOV * T20.W, T3.X,
+; EG-NEXT: MOV T22.Y, T17.X,
+; EG-NEXT: MOV T22.W, T15.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T19.Y, T13.X,
+; EG-NEXT: MOV * T19.W, T11.X,
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
@@ -11929,27 +11870,39 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: ALU 104, @16, KC0[], KC1[]
; EG-NEXT: ALU 104, @121, KC0[], KC1[]
; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T26.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T24.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
+; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1
+; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.Y, T8.X,
+; EG-NEXT: MOV * T19.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T20.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.X, literal.x,
+; EG-NEXT: MOV T8.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: LSHR * T0.W, T20.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), -65536(nan)
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
+; EG-NEXT: MOV T9.X, PV.W,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -11957,9 +11910,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T6.X, PV.W,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -11967,25 +11920,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T12.X,
-; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -11993,9 +11930,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12003,25 +11940,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, T8.X,
-; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
+; EG-NEXT: MOV T5.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12029,9 +11950,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12039,26 +11960,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 121:
-; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T16.X,
+; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12066,9 +11970,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
+; EG-NEXT: MOV T16.X, PV.W,
+; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12076,35 +11980,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, T32.X,
-; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.X, literal.x,
+; EG-NEXT: MOV T17.X, PV.W,
+; EG-NEXT: MOV T0.Y, T14.X,
+; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: ALU clause starting at 121:
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T14.X, PV.W,
+; EG-NEXT: MOV T0.Y, T15.X,
+; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12112,25 +12001,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T33.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
-; EG-NEXT: MOV T33.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T28.X,
-; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
+; EG-NEXT: MOV T15.X, PV.W,
+; EG-NEXT: MOV T0.Y, T12.X,
+; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12138,9 +12011,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
+; EG-NEXT: MOV T12.X, PV.W,
+; EG-NEXT: MOV T0.Y, T13.X,
+; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12148,26 +12021,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 226:
-; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
-; EG-NEXT: MOV T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, T24.X,
-; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
+; EG-NEXT: MOV T13.X, PV.W,
+; EG-NEXT: MOV T0.Y, T10.X,
+; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12175,9 +12031,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
+; EG-NEXT: MOV T10.X, PV.W,
+; EG-NEXT: MOV T0.Y, T11.X,
+; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
@@ -12185,70 +12041,155 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T25.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.Z, literal.x,
+; EG-NEXT: MOV T11.X, PV.W,
+; EG-NEXT: MOV T0.Y, T8.X,
+; EG-NEXT: BFE_INT * T0.W, T20.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T8.X, PV.W,
+; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: ASHR * T0.W, T20.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
-; EG-NEXT: MOV T25.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x,
+; EG-NEXT: OR_INT * T21.Y, PV.W, PS,
+; EG-NEXT: MOV T9.X, PV.Y,
+; EG-NEXT: MOV T0.Y, T6.X,
+; EG-NEXT: BFE_INT * T0.W, T20.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
+; EG-NEXT: MOV T6.X, PV.W,
+; EG-NEXT: MOV T0.Y, T7.X,
+; EG-NEXT: ASHR * T0.W, T20.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T21.W, PV.W, PS,
+; EG-NEXT: MOV T7.X, PV.W,
+; EG-NEXT: MOV T0.Y, T4.X,
+; EG-NEXT: BFE_INT * T0.W, T20.Z, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T21.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T4.X, PV.W,
+; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: ASHR * T0.W, T20.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
+; EG-NEXT: MOV T5.X, PV.Y,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: BFE_INT * T0.W, T20.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: ALU clause starting at 226:
+; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: ASHR * T0.W, T20.W, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T20.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T16.X,
+; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T16.X, PV.W,
+; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: ASHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T22.Y, PV.W, PS,
+; EG-NEXT: MOV T17.X, PV.Y,
+; EG-NEXT: MOV T0.Y, T14.X,
+; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T14.X, PV.W,
+; EG-NEXT: MOV T0.Y, T15.X,
+; EG-NEXT: ASHR * T0.W, T19.Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T22.W, PV.W, PS,
+; EG-NEXT: MOV T15.X, PV.W,
+; EG-NEXT: MOV T0.Y, T12.X,
+; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T12.X, PV.W,
+; EG-NEXT: MOV T0.Y, T13.X,
+; EG-NEXT: ASHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
+; EG-NEXT: MOV T13.X, PV.Y,
+; EG-NEXT: MOV T0.Y, T10.X,
+; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T10.X, PV.W,
+; EG-NEXT: MOV T0.Y, T11.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T23.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T24.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, T35.W, literal.x,
+; EG-NEXT: ASHR T0.W, T19.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
+; EG-NEXT: LSHR T25.X, PS, literal.x,
; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
; EG-NEXT: LSHL T0.W, PV.W, literal.z,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT: LSHR T26.X, PS, literal.x,
+; EG-NEXT: OR_INT * T19.W, PV.Z, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV T11.X, PV.W,
+; EG-NEXT: MOV * T21.X, T8.X,
+; EG-NEXT: MOV * T21.Z, T6.X,
+; EG-NEXT: MOV T20.X, T4.X,
+; EG-NEXT: MOV T20.Z, T2.X, BS:VEC_120/SCL_212
+; EG-NEXT: MOV * T22.X, T16.X,
+; EG-NEXT: MOV * T22.Z, T14.X,
+; EG-NEXT: MOV T19.X, T12.X,
+; EG-NEXT: MOV * T19.Z, T10.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 64f1f45bf734cf..c0f377eccf4fab 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1206,18 +1206,18 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Y, T4.X, literal.x,
+; EG-NEXT: LSHR * T0.Y, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v2i16_to_v2i32:
@@ -1225,19 +1225,19 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
-; CM-NEXT: MOV * T4.X, KC0[2].Z,
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T4.Y, T4.X, literal.x,
+; CM-NEXT: LSHR * T0.Y, T0.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T4.X, T4.X, literal.x,
+; CM-NEXT: AND_INT * T0.X, T0.X, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <2 x i16>, ptr addrspace(1) %in
%ext = zext <2 x i16> %load to <2 x i32>
@@ -1304,41 +1304,40 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
+; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR T0.W, T4.X, literal.x,
-; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ASHR * T0.Y, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; CM-LABEL: global_sextload_v2i16_to_v2i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T4.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
-; CM-NEXT: MOV * T4.X, KC0[2].Z,
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
-; CM-NEXT: LSHR * T0.W, T4.X, literal.x,
+; CM-NEXT: ASHR * T0.Y, T0.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
-; CM-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
-; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <2 x i16>, ptr addrspace(1) %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -5575,20 +5574,20 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T4.Z, T4.X, literal.x,
+; EG-NEXT: LSHR * T0.Z, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
-; EG-NEXT: MOV T4.Y, 0.0,
-; EG-NEXT: MOV T4.W, 0.0,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T0.W, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v2i16_to_v2i64:
@@ -5596,21 +5595,21 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
-; CM-NEXT: MOV * T4.X, KC0[2].Z,
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: LSHR * T4.Z, T4.X, literal.x,
+; CM-NEXT: LSHR * T0.Z, T0.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: AND_INT T4.X, T4.X, literal.x,
-; CM-NEXT: MOV T4.Y, 0.0,
-; CM-NEXT: MOV * T4.W, 0.0,
+; CM-NEXT: AND_INT T0.X, T0.X, literal.x,
+; CM-NEXT: MOV T0.Y, 0.0,
+; CM-NEXT: MOV * T0.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <2 x i16>, ptr addrspace(1) %in
%ext = zext <2 x i16> %load to <2 x i64>
@@ -5686,22 +5685,22 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: ASHR * T4.W, T4.X, literal.x,
+; EG-NEXT: ASHR * T0.W, T0.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR * T4.Z, T4.X, literal.x,
+; EG-NEXT: ASHR * T0.Z, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T4.X, T4.X, 0.0, literal.x,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
-; EG-NEXT: ASHR * T4.Y, PV.X, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v2i16_to_v2i64:
@@ -5709,22 +5708,22 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
+; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
-; CM-NEXT: MOV * T4.X, KC0[2].Z,
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
-; CM-NEXT: ASHR * T4.W, T4.X, literal.x,
+; CM-NEXT: ASHR * T0.W, T0.X, literal.x,
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; CM-NEXT: ASHR * T4.Z, T4.X, literal.x,
+; CM-NEXT: ASHR * T0.Z, T0.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: BFE_INT * T4.X, T4.X, 0.0, literal.x,
+; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; CM-NEXT: ASHR * T4.Y, PV.X, literal.y,
+; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR * T0.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <2 x i16>, ptr addrspace(1) %in
%ext = sext <2 x i16> %load to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 1dd08c561b2ab4..bd84e753cbc84f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
@@ -7,240 +8,653 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
-; FUNC-LABEL: {{^}}local_load_i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_u16 v{{[0-9]+}}
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_i16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_u16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b16 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_load_i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_load_i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @0, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
entry:
%ld = load i16, ptr addrspace(3) %in
store i16 %ld, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_load_v2i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b32
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v2i16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b32 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b32 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_load_v2i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_load_v2i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
entry:
%ld = load <2 x i16>, ptr addrspace(3) %in
store <2 x i16> %ld, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_load_v3i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b16
-
-; EG-DAG: LDS_USHORT_READ_RET
-; EG-DAG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v3i16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b32 v2, v0
+; SI-NEXT: ds_write_b16 v2, v1 offset:4
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_load_v3i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b16 v2, v1 offset:4
+; GFX9-NEXT: ds_write_b32 v2, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_load_v3i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 19, @2, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.Z, OQAP,
+; EG-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.z,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: OR_INT T0.W, T0.Z, T0.W,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.Y,
+; EG-NEXT: RETURN
entry:
%ld = load <3 x i16>, ptr addrspace(3) %in
store <3 x i16> %ld, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_load_v4i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v4i16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_load_v4i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_load_v4i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 11, @3, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
entry:
%ld = load <4 x i16>, ptr addrspace(3) %in
store <4 x i16> %ld, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_load_v8i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v8i16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_load_v8i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_load_v8i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
entry:
%ld = load <8 x i16>, ptr addrspace(3) %in
store <8 x i16> %ld, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_load_v16i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v16i16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
+; SI-NEXT: v_mov_b32_e32 v8, s0
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_load_v16i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v8, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_load_v16i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 53, @5, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
entry:
%ld = load <16 x i16>, ptr addrspace(3) %in
store <16 x i16> %ld, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_u16
-; GCN: ds_write_b32
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_i16_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_u16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b32 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_i16_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_i16_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
%a = load i16, ptr addrspace(3) %in
%ext = zext i16 %a to i32
store i32 %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
-; GCN-NOT: s_wqm_b64
-
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_i16
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_i16_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_i16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b32 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_i16_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_i16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_i16_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 6, @7, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%a = load i16, ptr addrspace(3) %in
%ext = sext i16 %a to i32
store i32 %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_u16
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v1i16_to_v1i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_u16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b32 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v1i16_to_v1i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v1i16_to_v1i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
%load = load <1 x i16>, ptr addrspace(3) %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_i16
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v1i16_to_v1i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_i16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b32 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v1i16_to_v1i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_i16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write_b32 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v1i16_to_v1i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <1 x i16>, ptr addrspace(3) %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b32
-
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v2i16_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b32 v0, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v2i16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v2i16_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: AND_INT T0.W, PV.X, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <2 x i16>, ptr addrspace(3) %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b32
-
-; EG: LDS_READ_RET
-; EG: BFE_INT
-; EG: BFE_INT
define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v2i16_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b32 v0, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v2i16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v2i16_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <2 x i16>, ptr addrspace(3) %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-; SI-DAG: ds_write_b32
-; SI-DAG: ds_write_b64
-; CIVI-DAG: ds_write_b96
-; GFX9-DAG: ds_write_b96
-
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_local_zextload_v3i16_to_v3i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; SI-NEXT: ds_write_b32 v4, v0 offset:8
+; SI-NEXT: ds_write_b64 v4, v[2:3]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_local_zextload_v3i16_to_v3i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: ds_write_b96 v3, v[0:2]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_local_zextload_v3i16_to_v3i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 18, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.Z,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.Y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
entry:
%ld = load <3 x i16>, ptr addrspace(3) %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -248,23 +662,64 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-; SI-DAG: ds_write_b32
-; SI-DAG: ds_write_b64
-; CIVI-DAG: ds_write_b96
-; GFX9-DAG: ds_write_b96
-
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_local_sextload_v3i16_to_v3i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
+; SI-NEXT: v_bfe_i32 v2, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v1, 0, 16
+; SI-NEXT: ds_write_b32 v4, v0 offset:8
+; SI-NEXT: ds_write_b64 v4, v[2:3]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_local_sextload_v3i16_to_v3i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[3:4], v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v3
+; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v3, 0, 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: ds_write_b96 v3, v[0:2]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_local_sextload_v3i16_to_v3i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 22, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
entry:
%ld = load <3 x i16>, ptr addrspace(3) %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -272,698 +727,5078 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_local_zextload_v4i16_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_local_zextload_v4i16_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_local_zextload_v4i16_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 22, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <4 x i16>, ptr addrspace(3) %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v4i16_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v6, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
+; SI-NEXT: v_ashr_i64 v[4:5], v[0:1], 48
+; SI-NEXT: v_bfe_i32 v2, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v1, 0, 16
+; SI-NEXT: v_mov_b32_e32 v1, v4
+; SI-NEXT: ds_write2_b64 v6, v[2:3], v[0:1] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v4i16_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v1
+; GFX9-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 16
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v4i16_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 25, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <4 x i16>, ptr addrspace(3) %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v8i16_to_v8i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT: v_mov_b32_e32 v12, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3
+; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v8i16_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v10, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX9-NEXT: ds_write2_b64 v10, v[0:1], v[8:9] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v10, v[4:5], v[6:7] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v8i16_to_v8i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 46, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: AND_INT T1.W, T0.W, literal.x,
+; EG-NEXT: MOV * T2.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <8 x i16>, ptr addrspace(3) %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v8i16_to_v8i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT: v_mov_b32_e32 v12, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v1
+; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
+; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
+; SI-NEXT: v_bfe_i32 v4, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v6, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v8, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v10, v3, 0, 16
+; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v8i16_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v12, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v3
+; GFX9-NEXT: v_bfe_i32 v8, v2, 0, 16
+; GFX9-NEXT: v_bfe_i32 v10, v3, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 16, v1
+; GFX9-NEXT: v_bfe_i32 v4, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v6, v1, 0, 16
+; GFX9-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v8i16_to_v8i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 51, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: LSHR T1.Z, T0.W, literal.x,
+; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T2.Z, T0.Y, literal.x,
+; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.Z, T1.Y, literal.x,
+; EG-NEXT: BFE_INT T1.W, T2.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: MOV * T2.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <8 x i16>, ptr addrspace(3) %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-
-; GCN: ds_write2_b64
-; GCN: ds_write2_b64
-; GCN: ds_write2_b64
-; GCN: ds_write2_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v16i16_to_v16i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; SI-NEXT: v_mov_b32_e32 v24, s0
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v18, 0xffff, v4
+; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v6
+; SI-NEXT: ds_write2_b64 v24, v[22:23], v[20:21] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v24, v[18:19], v[16:17] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v24, v[10:11], v[8:9] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v16i16_to_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NEXT: v_mov_b32_e32 v16, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v16i16_to_v16i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 94, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: MOV * T2.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT: MOV * T3.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <16 x i16>, ptr addrspace(3) %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v16i16_to_v16i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; SI-NEXT: v_mov_b32_e32 v24, s0
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1
+; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v3
+; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5
+; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v4
+; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v7
+; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v6
+; SI-NEXT: v_bfe_i32 v8, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v10, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v14, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v16, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v18, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v20, v7, 0, 16
+; SI-NEXT: v_bfe_i32 v22, v6, 0, 16
+; SI-NEXT: ds_write2_b64 v24, v[22:23], v[20:21] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v24, v[18:19], v[16:17] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v24, v[10:11], v[8:9] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v16i16_to_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v21, 16, v7
+; GFX9-NEXT: v_ashrrev_i32_e32 v23, 16, v6
+; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v20, v7, 0, 16
+; GFX9-NEXT: v_bfe_i32 v22, v6, 0, 16
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v1
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 16, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 16, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 16, v5
+; GFX9-NEXT: v_ashrrev_i32_e32 v19, 16, v4
+; GFX9-NEXT: v_bfe_i32 v8, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v12, v3, 0, 16
+; GFX9-NEXT: v_bfe_i32 v14, v2, 0, 16
+; GFX9-NEXT: v_bfe_i32 v16, v5, 0, 16
+; GFX9-NEXT: v_bfe_i32 v18, v4, 0, 16
+; GFX9-NEXT: ds_write2_b64 v0, v[22:23], v[20:21] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v0, v[18:19], v[16:17] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v16i16_to_v16i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 95, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: MOV * T2.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: LSHR T2.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: LSHR * T3.Z, T2.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T2.W, T2.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T4.Z, T0.Y, literal.x,
+; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T3.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T4.Z, T0.W, literal.x,
+; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T3.Z, T1.Y, literal.x,
+; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T4.Z, T1.Z, literal.x,
+; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T3.Z, T2.Z, literal.x,
+; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T1.W, T2.Y, 0.0, literal.x,
+; EG-NEXT: MOV * T2.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T1.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ALU 7, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <16 x i16>, ptr addrspace(3) %in
%ext = sext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v32i16_to_v32i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v12, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
+; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; SI-NEXT: s_waitcnt lgkmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11
+; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13
+; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
+; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT: v_mov_b32_e32 v32, s0
+; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
+; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
+; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v32i16_to_v32i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v12, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; GFX9-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; GFX9-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; GFX9-NEXT: v_mov_b32_e32 v32, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v11
+; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX9-NEXT: ds_write2_b64 v32, v[12:13], v[30:31] offset0:12 offset1:13
+; GFX9-NEXT: ds_write2_b64 v32, v[14:15], v[28:29] offset0:14 offset1:15
+; GFX9-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
+; GFX9-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b64 v32, v[4:5], v[22:23] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v32, v[6:7], v[20:21] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v32, v[0:1], v[18:19] offset1:1
+; GFX9-NEXT: ds_write2_b64 v32, v[2:3], v[16:17] offset0:2 offset1:3
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v32i16_to_v32i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 105, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.W, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Y, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Z, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.W, OQAP,
+; EG-NEXT: MOV * T4.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Y, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Z, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.W, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.Y, OQAP,
+; EG-NEXT: LSHR T5.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: AND_INT T4.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T4.Y, literal.x,
+; EG-NEXT: MOV * T5.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT: ALU 84, @22, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T3.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T3.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T2.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 72(1.008935e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <32 x i16>, ptr addrspace(3) %in
%ext = zext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v32i16_to_v32i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v12, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
+; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; SI-NEXT: s_waitcnt lgkmcnt(3)
+; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1
+; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3
+; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2
+; SI-NEXT: v_bfe_i32 v16, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v18, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v20, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v22, v2, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5
+; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4
+; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v4, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7
+; SI-NEXT: v_bfe_i32 v4, v7, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6
+; SI-NEXT: v_bfe_i32 v6, v6, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v9
+; SI-NEXT: v_bfe_i32 v24, v9, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8
+; SI-NEXT: v_bfe_i32 v8, v8, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11
+; SI-NEXT: v_bfe_i32 v26, v11, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10
+; SI-NEXT: v_bfe_i32 v10, v10, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13
+; SI-NEXT: v_bfe_i32 v28, v13, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12
+; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15
+; SI-NEXT: v_bfe_i32 v30, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14
+; SI-NEXT: v_bfe_i32 v14, v14, 0, 16
+; SI-NEXT: v_mov_b32_e32 v32, s0
+; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
+; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
+; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v32i16_to_v32i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v12, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; GFX9-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; GFX9-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; GFX9-NEXT: v_mov_b32_e32 v32, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 16, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v19, 16, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v21, 16, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v31, 16, v13
+; GFX9-NEXT: v_bfe_i32 v30, v13, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 16, v12
+; GFX9-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v23, 16, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 16, v7
+; GFX9-NEXT: v_ashrrev_i32_e32 v27, 16, v6
+; GFX9-NEXT: v_bfe_i32 v16, v3, 0, 16
+; GFX9-NEXT: v_bfe_i32 v18, v2, 0, 16
+; GFX9-NEXT: v_bfe_i32 v20, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v22, v0, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v5
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v4
+; GFX9-NEXT: v_bfe_i32 v24, v7, 0, 16
+; GFX9-NEXT: v_bfe_i32 v26, v6, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v5, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v9
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 16, v8
+; GFX9-NEXT: v_bfe_i32 v4, v9, 0, 16
+; GFX9-NEXT: v_bfe_i32 v6, v8, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v11
+; GFX9-NEXT: v_bfe_i32 v8, v11, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v10
+; GFX9-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v29, 16, v15
+; GFX9-NEXT: v_bfe_i32 v28, v15, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 16, v14
+; GFX9-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX9-NEXT: ds_write2_b64 v32, v[12:13], v[30:31] offset0:12 offset1:13
+; GFX9-NEXT: ds_write2_b64 v32, v[14:15], v[28:29] offset0:14 offset1:15
+; GFX9-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:10 offset1:11
+; GFX9-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v32, v[26:27], v[24:25] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset1:1
+; GFX9-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:2 offset1:3
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v32i16_to_v32i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 101, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.W, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Y, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Z, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.W, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Y, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Z, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.W, OQAP,
+; EG-NEXT: LSHR * T5.W, T4.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T5.Y, OQAP,
+; EG-NEXT: LSHR T5.Z, T4.W, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T6.Z, T0.Y, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T6.Z, T0.W, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T1.Y, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T6.Z, T1.Z, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T1.W, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR * T6.Z, T2.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 89, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T2.Z, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T6.Z, T2.W, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T3.Y, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T6.Z, T3.Z, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T3.W, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T6.Z, T4.Y, literal.x,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.Z, T5.Y, literal.x,
+; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: BFE_INT T5.W, T4.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: BFE_INT T4.W, T4.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: BFE_INT T4.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: BFE_INT T4.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: MOV * T5.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
+; EG-NEXT: ALU 16, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <32 x i16>, ptr addrspace(3) %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v64i16_to_v64i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0xe8f000
+; SI-NEXT: s_add_u32 s12, s12, s11
+; SI-NEXT: s_addc_u32 s13, s13, 0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v24, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9
+; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:10 offset1:11
+; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13
+; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15
+; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1
+; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[36:39], v24 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[40:43], v24 offset0:6 offset1:7
+; SI-NEXT: s_waitcnt lgkmcnt(7)
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; SI-NEXT: s_waitcnt lgkmcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6
+; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1
+; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; SI-NEXT: v_and_b32_e32 v44, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v46, 0xffff, v6
+; SI-NEXT: s_waitcnt expcnt(0) lgkmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; SI-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14
+; SI-NEXT: s_waitcnt lgkmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; SI-NEXT: v_and_b32_e32 v12, 0xffff, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; SI-NEXT: v_and_b32_e32 v14, 0xffff, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: s_waitcnt lgkmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23
+; SI-NEXT: v_and_b32_e32 v50, 0xffff, v23
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17
+; SI-NEXT: v_and_b32_e32 v52, 0xffff, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; SI-NEXT: v_and_b32_e32 v54, 0xffff, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37
+; SI-NEXT: v_and_b32_e32 v56, 0xffff, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v39
+; SI-NEXT: v_and_b32_e32 v58, 0xffff, v39
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v41
+; SI-NEXT: v_and_b32_e32 v60, 0xffff, v41
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v43
+; SI-NEXT: v_and_b32_e32 v62, 0xffff, v43
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v42
+; SI-NEXT: v_and_b32_e32 v42, 0xffff, v42
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: ds_write2_b64 v0, v[42:43], v[62:63] offset0:14 offset1:15
+; SI-NEXT: ds_write2_b64 v0, v[40:41], v[60:61] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v0, v[38:39], v[58:59] offset0:10 offset1:11
+; SI-NEXT: ds_write2_b64 v0, v[36:37], v[56:57] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1
+; SI-NEXT: ds_write2_b64 v0, v[10:11], v[14:15] offset0:30 offset1:31
+; SI-NEXT: ds_write2_b64 v0, v[8:9], v[12:13] offset0:28 offset1:29
+; SI-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:26 offset1:27
+; SI-NEXT: ds_write2_b64 v0, v[2:3], v[24:25] offset0:24 offset1:25
+; SI-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:22 offset1:23
+; SI-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:20 offset1:21
+; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19
+; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v64i16_to_v64i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s14, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xe00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v56, s1
+; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:2 offset1:3
+; GFX9-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v17
+; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v18
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v16
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v19
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v18
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v21
+; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset0:4 offset1:5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v23
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v22
+; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:6 offset1:7
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18
+; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v18
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v22
+; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v17
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v16
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v19
+; GFX9-NEXT: v_and_b32_e32 v44, 0xffff, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v18
+; GFX9-NEXT: v_and_b32_e32 v46, 0xffff, v18
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v23
+; GFX9-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v22
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff, v22
+; GFX9-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NEXT: v_and_b32_e32 v56, 0xffff, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v23
+; GFX9-NEXT: v_and_b32_e32 v62, 0xffff, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v19
+; GFX9-NEXT: v_and_b32_e32 v58, 0xffff, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v21
+; GFX9-NEXT: v_and_b32_e32 v60, 0xffff, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX9-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31
+; GFX9-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29
+; GFX9-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27
+; GFX9-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25
+; GFX9-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23
+; GFX9-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21
+; GFX9-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19
+; GFX9-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17
+; GFX9-NEXT: ds_write2_b64 v0, v[38:39], v[36:37] offset0:14 offset1:15
+; GFX9-NEXT: ds_write2_b64 v0, v[34:35], v[32:33] offset0:12 offset1:13
+; GFX9-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:10 offset1:11
+; GFX9-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v64i16_to_v64i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 116, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.W, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Y, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Z, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.W, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Y, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Z, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.W, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.Y, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.Z, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.W, OQAP,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T6.Y, OQAP,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T6.Z, OQAP,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T6.W, OQAP,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T7.W
+; EG-NEXT: MOV T7.Y, OQAP,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T7.W
+; EG-NEXT: MOV T7.Z, OQAP,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T7.W
+; EG-NEXT: MOV T7.W, OQAP,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T8.W
+; EG-NEXT: MOV T8.Y, OQAP,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T8.W
+; EG-NEXT: MOV T8.Z, OQAP,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T8.W
+; EG-NEXT: MOV T8.W, OQAP,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T9.W
+; EG-NEXT: MOV T9.Y, OQAP,
+; EG-NEXT: MOV * T9.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T9.W
+; EG-NEXT: MOV T9.Z, OQAP,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: ALU 95, @27, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_READ_RET * OQAP, T9.W
+; EG-NEXT: MOV T9.W, OQAP,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T10.W
+; EG-NEXT: MOV T10.Y, OQAP,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T10.W
+; EG-NEXT: MOV T10.Z, OQAP,
+; EG-NEXT: LSHR T10.W, T10.Y, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: AND_INT T10.W, T10.Y, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T10.W, T10.Z, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: AND_INT T10.W, T10.Z, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T10.W, T9.W, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: AND_INT T9.W, T9.W, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: LSHR T9.W, T9.Z, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: AND_INT T9.W, T9.Z, literal.x,
+; EG-NEXT: MOV * T10.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: LSHR T9.W, T9.Y, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: AND_INT T9.W, T9.Y, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: LSHR T9.W, T8.W, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: AND_INT T8.W, T8.W, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T8.W,
+; EG-NEXT: LSHR T8.W, T8.Z, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T8.W,
+; EG-NEXT: AND_INT T8.W, T8.Z, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T8.W,
+; EG-NEXT: LSHR T8.W, T8.Y, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T8.W,
+; EG-NEXT: AND_INT T8.W, T8.Y, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T8.W,
+; EG-NEXT: LSHR T8.W, T7.W, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT: LDS_WRITE * T9.W, T8.W,
+; EG-NEXT: AND_INT T7.W, T7.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T8.W, T7.W,
+; EG-NEXT: LSHR T7.W, T7.Z, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT: LDS_WRITE * T8.W, T7.W,
+; EG-NEXT: AND_INT T7.W, T7.Z, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT: LDS_WRITE * T8.W, T7.W,
+; EG-NEXT: LSHR T7.W, T7.Y, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT: LDS_WRITE * T8.W, T7.W,
+; EG-NEXT: AND_INT * T7.W, T7.Y, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: ALU 93, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.x,
+; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T8.W, T7.W,
+; EG-NEXT: LSHR T7.W, T6.W, literal.x,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT: LDS_WRITE * T8.W, T7.W,
+; EG-NEXT: AND_INT T6.W, T6.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: LSHR T6.W, T6.Z, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: AND_INT T6.W, T6.Z, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: LSHR T6.W, T6.Y, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: AND_INT T6.W, T6.Y, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: LSHR T6.W, T5.W, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: AND_INT T5.W, T5.W, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.W, T5.Z, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: AND_INT T5.W, T5.Z, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: AND_INT T5.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 152(2.129974e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: LSHR T5.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: AND_INT T4.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 136(1.905766e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 184(2.578389e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T3.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43)
+; EG-NEXT: ALU 76, @29, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T3.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 168(2.354181e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T2.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 216(3.026805e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 200(2.802597e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 248(3.475220e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 232(3.251012e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <64 x i16>, ptr addrspace(3) %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v64i16_to_v64i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0xe8f000
+; SI-NEXT: s_add_u32 s12, s12, s11
+; SI-NEXT: s_addc_u32 s13, s13, 0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v20, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:8 offset1:9
+; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:10 offset1:11
+; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:12 offset1:13
+; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15
+; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1
+; SI-NEXT: ds_read2_b64 v[32:35], v20 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[36:39], v20 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[40:43], v20 offset0:6 offset1:7
+; SI-NEXT: s_waitcnt lgkmcnt(7)
+; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v9
+; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v8
+; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v11
+; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v10
+; SI-NEXT: s_waitcnt lgkmcnt(6)
+; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v5
+; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v4
+; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v7
+; SI-NEXT: v_bfe_i32 v20, v9, 0, 16
+; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT: v_bfe_i32 v22, v8, 0, 16
+; SI-NEXT: v_bfe_i32 v24, v11, 0, 16
+; SI-NEXT: v_bfe_i32 v26, v10, 0, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v6
+; SI-NEXT: s_waitcnt lgkmcnt(5)
+; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1
+; SI-NEXT: v_bfe_i32 v28, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v30, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v44, v7, 0, 16
+; SI-NEXT: v_bfe_i32 v20, v6, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v0
+; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v3
+; SI-NEXT: v_bfe_i32 v9, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v6, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v46, v3, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2
+; SI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(4)
+; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13
+; SI-NEXT: v_bfe_i32 v2, v13, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12
+; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v15
+; SI-NEXT: v_bfe_i32 v48, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14
+; SI-NEXT: v_bfe_i32 v14, v14, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(3)
+; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v17
+; SI-NEXT: v_bfe_i32 v50, v17, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16
+; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v19
+; SI-NEXT: v_bfe_i32 v52, v19, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18
+; SI-NEXT: v_bfe_i32 v18, v18, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33
+; SI-NEXT: v_bfe_i32 v54, v33, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32
+; SI-NEXT: v_bfe_i32 v32, v32, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35
+; SI-NEXT: v_bfe_i32 v56, v35, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34
+; SI-NEXT: v_bfe_i32 v34, v34, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37
+; SI-NEXT: v_bfe_i32 v58, v37, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36
+; SI-NEXT: v_bfe_i32 v36, v36, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39
+; SI-NEXT: v_bfe_i32 v60, v39, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38
+; SI-NEXT: v_bfe_i32 v38, v38, 0, 16
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41
+; SI-NEXT: v_bfe_i32 v62, v41, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40
+; SI-NEXT: v_bfe_i32 v40, v40, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v43
+; SI-NEXT: v_bfe_i32 v0, v43, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v42
+; SI-NEXT: v_bfe_i32 v42, v42, 0, 16
+; SI-NEXT: v_mov_b32_e32 v8, s0
+; SI-NEXT: ds_write2_b64 v8, v[42:43], v[0:1] offset0:14 offset1:15
+; SI-NEXT: ds_write2_b64 v8, v[40:41], v[62:63] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v8, v[38:39], v[60:61] offset0:10 offset1:11
+; SI-NEXT: ds_write2_b64 v8, v[36:37], v[58:59] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v8, v[34:35], v[56:57] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v8, v[32:33], v[54:55] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v8, v[18:19], v[52:53] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v8, v[16:17], v[50:51] offset1:1
+; SI-NEXT: ds_write2_b64 v8, v[14:15], v[48:49] offset0:30 offset1:31
+; SI-NEXT: ds_write2_b64 v8, v[12:13], v[2:3] offset0:28 offset1:29
+; SI-NEXT: ds_write2_b64 v8, v[4:5], v[46:47] offset0:26 offset1:27
+; SI-NEXT: ds_write2_b64 v8, v[6:7], v[9:10] offset0:24 offset1:25
+; SI-NEXT: ds_write2_b64 v8, v[20:21], v[44:45] offset0:22 offset1:23
+; SI-NEXT: ds_write2_b64 v8, v[30:31], v[28:29] offset0:20 offset1:21
+; SI-NEXT: ds_write2_b64 v8, v[26:27], v[24:25] offset0:18 offset1:19
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write2_b64 v8, v[22:23], v[0:1] offset0:16 offset1:17
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v64i16_to_v64i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s14, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xe00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v28, s1
+; GFX9-NEXT: ds_read2_b64 v[16:19], v28 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[20:23], v28 offset0:2 offset1:3
+; GFX9-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v17
+; GFX9-NEXT: v_bfe_i32 v0, v17, 0, 16
+; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: ds_read2_b64 v[24:27], v28 offset0:4 offset1:5
+; GFX9-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v16
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 16, v19
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 16, v18
+; GFX9-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 16, v21
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 16, v20
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 16, v23
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 16, v22
+; GFX9-NEXT: v_bfe_i32 v2, v16, 0, 16
+; GFX9-NEXT: v_bfe_i32 v4, v19, 0, 16
+; GFX9-NEXT: v_bfe_i32 v6, v18, 0, 16
+; GFX9-NEXT: v_bfe_i32 v8, v21, 0, 16
+; GFX9-NEXT: v_bfe_i32 v10, v20, 0, 16
+; GFX9-NEXT: v_bfe_i32 v12, v23, 0, 16
+; GFX9-NEXT: v_bfe_i32 v14, v22, 0, 16
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 16, v25
+; GFX9-NEXT: v_bfe_i32 v16, v25, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v19, 16, v24
+; GFX9-NEXT: v_bfe_i32 v18, v24, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v21, 16, v27
+; GFX9-NEXT: v_bfe_i32 v20, v27, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v23, 16, v26
+; GFX9-NEXT: v_bfe_i32 v22, v26, 0, 16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 16, v30
+; GFX9-NEXT: v_bfe_i32 v24, v30, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v27, 16, v29
+; GFX9-NEXT: v_bfe_i32 v26, v29, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v38, 16, v32
+; GFX9-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
+; GFX9-NEXT: v_bfe_i32 v37, v32, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v40, 16, v31
+; GFX9-NEXT: v_bfe_i32 v39, v31, 0, 16
+; GFX9-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_ashrrev_i32_e32 v42, 16, v34
+; GFX9-NEXT: v_bfe_i32 v41, v34, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v44, 16, v33
+; GFX9-NEXT: v_bfe_i32 v43, v33, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v46, 16, v36
+; GFX9-NEXT: v_bfe_i32 v45, v36, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v48, 16, v35
+; GFX9-NEXT: v_bfe_i32 v47, v35, 0, 16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v50, 16, v30
+; GFX9-NEXT: v_bfe_i32 v49, v30, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v52, 16, v29
+; GFX9-NEXT: v_bfe_i32 v51, v29, 0, 16
+; GFX9-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
+; GFX9-NEXT: v_ashrrev_i32_e32 v56, 16, v31
+; GFX9-NEXT: v_bfe_i32 v55, v31, 0, 16
+; GFX9-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; GFX9-NEXT: v_ashrrev_i32_e32 v54, 16, v32
+; GFX9-NEXT: v_bfe_i32 v53, v32, 0, 16
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_ashrrev_i32_e32 v58, 16, v34
+; GFX9-NEXT: v_bfe_i32 v57, v34, 0, 16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v32, 16, v31
+; GFX9-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v30
+; GFX9-NEXT: v_bfe_i32 v0, v30, 0, 16
+; GFX9-NEXT: v_mov_b32_e32 v30, s0
+; GFX9-NEXT: v_ashrrev_i32_e32 v34, 16, v33
+; GFX9-NEXT: v_bfe_i32 v33, v33, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v60, 16, v36
+; GFX9-NEXT: v_bfe_i32 v59, v36, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v36, 16, v35
+; GFX9-NEXT: v_bfe_i32 v35, v35, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v62, 16, v29
+; GFX9-NEXT: v_bfe_i32 v61, v29, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v29, 16, v28
+; GFX9-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX9-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
+; GFX9-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
+; GFX9-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
+; GFX9-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
+; GFX9-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
+; GFX9-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
+; GFX9-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
+; GFX9-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
+; GFX9-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
+; GFX9-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
+; GFX9-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
+; GFX9-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v64i16_to_v64i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 116, @30, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.W, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Y, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Z, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.W, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Y, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Z, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.W, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.Y, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.Z, OQAP,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.W, OQAP,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T6.Y, OQAP,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T6.Z, OQAP,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T6.W
+; EG-NEXT: MOV T6.W, OQAP,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T7.W
+; EG-NEXT: MOV T7.Y, OQAP,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T7.W
+; EG-NEXT: MOV T7.Z, OQAP,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T7.W
+; EG-NEXT: MOV T7.W, OQAP,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T8.W
+; EG-NEXT: MOV T8.Y, OQAP,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T8.W
+; EG-NEXT: MOV T8.Z, OQAP,
+; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T8.W
+; EG-NEXT: MOV T8.W, OQAP,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T9.W
+; EG-NEXT: MOV T9.Y, OQAP,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T9.W
+; EG-NEXT: MOV T9.Z, OQAP,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 85, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_READ_RET * OQAP, T9.W
+; EG-NEXT: MOV T9.W, OQAP,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T10.W
+; EG-NEXT: MOV T10.Y, OQAP,
+; EG-NEXT: LSHR T10.W, T9.W, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT: LDS_READ_RET * OQAP, T11.W
+; EG-NEXT: MOV T10.Z, OQAP,
+; EG-NEXT: LSHR * T11.Z, T10.Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T10.W, T10.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T0.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T0.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T0.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T1.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T1.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T1.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T2.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T2.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T2.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T3.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T3.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T3.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T4.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T4.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT: ALU 83, @32, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T4.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T5.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T5.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T5.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T6.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T6.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T6.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T7.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T7.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T7.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T8.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T8.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T8.W, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T9.Y, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T12.Z, T9.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: LSHR T11.Z, T10.Z, literal.x,
+; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43)
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43)
+; EG-NEXT: ALU 94, @33, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T11.W, T10.W,
+; EG-NEXT: BFE_INT T9.W, T9.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: BFE_INT T9.W, T10.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: BFE_INT T9.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: BFE_INT T9.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: MOV * T10.W, KC0[2].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T10.W, T9.W,
+; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T9.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T4.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T4.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T5.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T5.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T6.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T6.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T6.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 34, @34, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T7.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T7.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T8.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T8.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T8.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T9.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: BFE_INT T0.W, T10.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <64 x i16>, ptr addrspace(3) %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-
-; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]]
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: LDS_WRITE
define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_i16_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_u16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_i16_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 8, @35, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: MOV T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%a = load i16, ptr addrspace(3) %in
%ext = zext i16 %a to i64
store i64 %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; FIXME: Need to optimize this sequence to avoid an extra shift.
-; t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
-; t28: i64 = any_extend t25
-; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
-; SI: ds_read_i16 v[[LO:[0-9]+]],
-; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
-; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
-; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
-
-; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]]
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG-DAG: LDS_WRITE
-; EG-DAG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_i16_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_i16 v0, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_i16_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%a = load i16, ptr addrspace(3) %in
%ext = sext i16 %a to i64
store i64 %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: LDS_WRITE
define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v1i16_to_v1i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_u16 v0, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v1i16_to_v1i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v1i16_to_v1i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 8, @37, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: MOV T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <1 x i16>, ptr addrspace(3) %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG-DAG: LDS_WRITE
-; EG-DAG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v1i16_to_v1i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_i16 v0, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v1i16_to_v1i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v1i16_to_v1i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <1 x i16>, ptr addrspace(3) %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v2i16_to_v2i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b32 v2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v2i16_to_v2i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: s_mov_b32 s1, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v2i16_to_v2i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 17, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: AND_INT T0.W, PV.X, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%load = load <2 x i16>, ptr addrspace(3) %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v2i16_to_v2i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b32 v0, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v1, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v2i16_to_v2i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b32 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v2i16_to_v2i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 18, @40, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.X, OQAP,
+; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <2 x i16>, ptr addrspace(3) %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v4i16_to_v4i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v3, 0
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v10, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; SI-NEXT: ds_write2_b64 v10, v[4:5], v[2:3] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v10, v[6:7], v[8:9] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v4i16_to_v4i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: v_mov_b32_e32 v6, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: v_mov_b32_e32 v9, s0
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_sdwa v7, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v5, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX9-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v4i16_to_v4i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 35, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%load = load <4 x i16>, ptr addrspace(3) %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v4i16_to_v4i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read_b64 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v8, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_ashr_i64 v[4:5], v[0:1], 48
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v6, v3, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; SI-NEXT: ds_write2_b64 v8, v[2:3], v[4:5] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v4i16_to_v4i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: v_mov_b32_e32 v8, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_bfe_i32 v4, v3, 0, 16
+; GFX9-NEXT: v_bfe_i32 v6, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v4i16_to_v4i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 39, @42, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: BFE_INT * T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.Z, PV.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T1.W, PV.W, literal.y,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 20(2.802597e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.Z,
+; EG-NEXT: ASHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <4 x i16>, ptr addrspace(3) %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v8i16_to_v8i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT: v_mov_b32_e32 v5, 0
+; SI-NEXT: v_mov_b32_e32 v7, v5
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v11, v5
+; SI-NEXT: v_mov_b32_e32 v13, v5
+; SI-NEXT: v_mov_b32_e32 v15, v5
+; SI-NEXT: v_mov_b32_e32 v17, v5
+; SI-NEXT: v_mov_b32_e32 v19, v5
+; SI-NEXT: v_mov_b32_e32 v20, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; SI-NEXT: v_and_b32_e32 v14, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; SI-NEXT: ds_write2_b64 v20, v[8:9], v[6:7] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v20, v[12:13], v[4:5] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v20, v[10:11], v[16:17] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v20, v[14:15], v[18:19] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v8i16_to_v8i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v12, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, v12
+; GFX9-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: s_mov_b32 s1, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v13, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v3
+; GFX9-NEXT: v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, v12
+; GFX9-NEXT: ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7
+; GFX9-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-NEXT: v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v1
+; GFX9-NEXT: ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5
+; GFX9-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-NEXT: v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, v12
+; GFX9-NEXT: ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3
+; GFX9-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-NEXT: ds_write2_b64 v13, v[0:1], v[4:5] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v8i16_to_v8i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 71, @43, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: AND_INT T1.W, T0.W, literal.x,
+; EG-NEXT: MOV * T2.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%load = load <8 x i16>, ptr addrspace(3) %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v8i16_to_v8i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT: v_mov_b32_e32 v16, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; SI-NEXT: v_bfe_i32 v4, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v6, v1, 0, 16
+; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 48
+; SI-NEXT: v_bfe_i32 v8, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v10, v5, 0, 16
+; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], 48
+; SI-NEXT: v_bfe_i32 v12, v9, 0, 16
+; SI-NEXT: v_bfe_i32 v14, v7, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT: ds_write2_b64 v16, v[10:11], v[2:3] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v16, v[6:7], v[0:1] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v16, v[8:9], v[14:15] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v8i16_to_v8i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v16, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_bfe_i32 v14, v2, 0, 16
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_bfe_i32 v10, v9, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX9-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GFX9-NEXT: v_bfe_i32 v8, v7, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-NEXT: v_bfe_i32 v12, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v8i16_to_v8i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 80, @44, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV * T0.W, OQAP,
+; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: BFE_INT T1.Z, T0.W, 0.0, literal.x,
+; EG-NEXT: ASHR T2.W, T1.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: BFE_INT T2.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T2.W, T1.Z, literal.y,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: BFE_INT T3.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T2.W, T2.Z, literal.y,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: ASHR T2.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 52(7.286752e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: MOV * T2.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T1.Z,
+; EG-NEXT: ASHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T2.Z,
+; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T3.Z,
+; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: RETURN
%load = load <8 x i16>, ptr addrspace(3) %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v16i16_to_v16i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; SI-NEXT: v_mov_b32_e32 v9, 0
+; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
+; SI-NEXT: v_mov_b32_e32 v11, v9
+; SI-NEXT: v_mov_b32_e32 v13, v9
+; SI-NEXT: v_mov_b32_e32 v15, v9
+; SI-NEXT: v_mov_b32_e32 v16, v9
+; SI-NEXT: v_mov_b32_e32 v18, v9
+; SI-NEXT: v_mov_b32_e32 v19, v9
+; SI-NEXT: v_mov_b32_e32 v21, v9
+; SI-NEXT: v_mov_b32_e32 v22, v9
+; SI-NEXT: v_mov_b32_e32 v29, s0
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v1
+; SI-NEXT: ds_write2_b64 v29, v[17:18], v[14:15] offset0:10 offset1:11
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v14, v9
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3
+; SI-NEXT: ds_write2_b64 v29, v[20:21], v[12:13] offset0:14 offset1:15
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: v_mov_b32_e32 v24, v9
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; SI-NEXT: v_and_b32_e32 v18, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v15, 0xffff, v0
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; SI-NEXT: v_mov_b32_e32 v26, v9
+; SI-NEXT: v_mov_b32_e32 v28, v9
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
+; SI-NEXT: v_and_b32_e32 v21, 0xffff, v7
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_mov_b32_e32 v5, v9
+; SI-NEXT: ds_write2_b64 v29, v[21:22], v[10:11] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v29, v[13:14], v[8:9] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v29, v[15:16], v[23:24] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v29, v[18:19], v[25:26] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v29, v[0:1], v[27:28] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v29, v[2:3], v[4:5] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v16i16_to_v16i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v14, v8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NEXT: v_mov_b32_e32 v17, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT: ds_write2_b64 v17, v[13:14], v[15:16] offset0:10 offset1:11
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v4
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: ds_write2_b64 v17, v[15:16], v[7:8] offset0:8 offset1:9
+; GFX9-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v8
+; GFX9-NEXT: ds_write2_b64 v17, v[14:15], v[6:7] offset0:12 offset1:13
+; GFX9-NEXT: v_mov_b32_e32 v14, v8
+; GFX9-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX9-NEXT: ds_write2_b64 v17, v[5:6], v[13:14] offset0:14 offset1:15
+; GFX9-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: ds_write2_b64 v17, v[3:4], v[12:13] offset0:6 offset1:7
+; GFX9-NEXT: v_mov_b32_e32 v3, v8
+; GFX9-NEXT: v_mov_b32_e32 v12, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: ds_write2_b64 v17, v[2:3], v[11:12] offset0:4 offset1:5
+; GFX9-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-NEXT: v_mov_b32_e32 v11, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: ds_write2_b64 v17, v[1:2], v[10:11] offset0:2 offset1:3
+; GFX9-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-NEXT: ds_write2_b64 v17, v[0:1], v[9:10] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v16i16_to_v16i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 100, @45, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: MOV * T2.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT: MOV * T3.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: ALU 42, @46, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%load = load <16 x i16>, ptr addrspace(3) %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v16i16_to_v16i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
+; SI-NEXT: v_mov_b32_e32 v25, s0
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v13, v7
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; SI-NEXT: v_bfe_i32 v8, v4, 0, 16
+; SI-NEXT: v_ashr_i64 v[10:11], v[4:5], 48
+; SI-NEXT: v_bfe_i32 v4, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v6, 0, 16
+; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], 48
+; SI-NEXT: v_bfe_i32 v14, v13, 0, 16
+; SI-NEXT: v_bfe_i32 v16, v0, 0, 16
+; SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48
+; SI-NEXT: v_bfe_i32 v19, v9, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v25, v[19:20], v[17:18] offset0:14 offset1:15
+; SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48
+; SI-NEXT: v_bfe_i32 v0, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v19, v23, 0, 16
+; SI-NEXT: v_bfe_i32 v21, v21, 0, 16
+; SI-NEXT: v_bfe_i32 v23, v22, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: ds_write2_b64 v25, v[0:1], v[17:18] offset0:10 offset1:11
+; SI-NEXT: v_bfe_i32 v0, v15, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: ds_write2_b64 v25, v[14:15], v[6:7] offset0:6 offset1:7
+; SI-NEXT: ds_write2_b64 v25, v[4:5], v[10:11] offset0:2 offset1:3
+; SI-NEXT: ds_write2_b64 v25, v[2:3], v[0:1] offset0:12 offset1:13
+; SI-NEXT: ds_write2_b64 v25, v[16:17], v[23:24] offset0:8 offset1:9
+; SI-NEXT: ds_write2_b64 v25, v[12:13], v[21:22] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v25, v[8:9], v[19:20] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v16i16_to_v16i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NEXT: v_mov_b32_e32 v26, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; GFX9-NEXT: v_bfe_i32 v22, v22, 0, 16
+; GFX9-NEXT: v_bfe_i32 v24, v4, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GFX9-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: v_bfe_i32 v4, v5, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX9-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX9-NEXT: ds_write2_b64 v26, v[24:25], v[22:23] offset0:8 offset1:9
+; GFX9-NEXT: v_mov_b32_e32 v24, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX9-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX9-NEXT: v_bfe_i32 v10, v9, 0, 16
+; GFX9-NEXT: v_bfe_i32 v12, v11, 0, 16
+; GFX9-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX9-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NEXT: v_bfe_i32 v22, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-NEXT: ds_write2_b64 v26, v[4:5], v[20:21] offset0:10 offset1:11
+; GFX9-NEXT: v_bfe_i32 v4, v24, 0, 16
+; GFX9-NEXT: v_bfe_i32 v20, v7, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX9-NEXT: ds_write2_b64 v26, v[6:7], v[18:19] offset0:12 offset1:13
+; GFX9-NEXT: ds_write2_b64 v26, v[20:21], v[16:17] offset0:14 offset1:15
+; GFX9-NEXT: ds_write2_b64 v26, v[4:5], v[14:15] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v26, v[2:3], v[12:13] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v26, v[0:1], v[10:11] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v26, v[22:23], v[8:9] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v16i16_to_v16i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 101, @47, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: MOV * T1.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: BFE_INT T2.W, T1.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV * T2.Z, OQAP,
+; EG-NEXT: BFE_INT T3.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T2.W, literal.y,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: BFE_INT T4.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T3.Z, literal.y,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: BFE_INT T5.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T4.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: BFE_INT T6.Z, T0.W, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T5.Z, literal.y,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T6.Z, literal.y,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T7.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: BFE_INT T9.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T3.W, T8.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: ASHR T3.W, T9.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 116(1.625506e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: ASHR T3.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: ASHR T1.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T3.W, T1.W,
+; EG-NEXT: MOV * T1.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T1.W, T2.W,
+; EG-NEXT: ASHR T1.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T1.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T3.Z,
+; EG-NEXT: ASHR T1.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44)
+; EG-NEXT: ALU 62, @48, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T1.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T4.Z,
+; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T5.Z,
+; EG-NEXT: ASHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: ASHR T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T6.Z,
+; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T7.Z,
+; EG-NEXT: ASHR T0.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T8.Z,
+; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T9.Z,
+; EG-NEXT: RETURN
%load = load <16 x i16>, ptr addrspace(3) %in
%ext = sext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
+
define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v32i16_to_v32i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: v_mov_b32_e32 v4, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v13, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[5:8], v13 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[0:3], v13 offset1:1
+; SI-NEXT: v_mov_b32_e32 v17, v4
+; SI-NEXT: v_mov_b32_e32 v19, v4
+; SI-NEXT: v_mov_b32_e32 v21, v4
+; SI-NEXT: v_mov_b32_e32 v23, v4
+; SI-NEXT: ds_read2_b64 v[9:12], v13 offset0:4 offset1:5
+; SI-NEXT: ds_read2_b64 v[13:16], v13 offset0:6 offset1:7
+; SI-NEXT: v_mov_b32_e32 v28, s0
+; SI-NEXT: s_waitcnt lgkmcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v8
+; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:14 offset1:15
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v6
+; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:10 offset1:11
+; SI-NEXT: s_waitcnt lgkmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v3
+; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:6 offset1:7
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v1
+; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:2 offset1:3
+; SI-NEXT: s_waitcnt lgkmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v16
+; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:30 offset1:31
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; SI-NEXT: v_and_b32_e32 v20, 0xffff, v14
+; SI-NEXT: ds_write2_b64 v28, v[20:21], v[18:19] offset0:26 offset1:27
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7
+; SI-NEXT: v_and_b32_e32 v18, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12
+; SI-NEXT: v_and_b32_e32 v22, 0xffff, v12
+; SI-NEXT: ds_write2_b64 v28, v[22:23], v[16:17] offset0:22 offset1:23
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13
+; SI-NEXT: ds_write2_b64 v28, v[7:8], v[20:21] offset0:12 offset1:13
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9
+; SI-NEXT: ds_write2_b64 v28, v[18:19], v[5:6] offset0:8 offset1:9
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v13
+; SI-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; SI-NEXT: v_and_b32_e32 v24, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v26, 0xffff, v2
+; SI-NEXT: v_mov_b32_e32 v9, v4
+; SI-NEXT: ds_write2_b64 v28, v[8:9], v[3:4] offset0:18 offset1:19
+; SI-NEXT: v_mov_b32_e32 v27, v4
+; SI-NEXT: v_mov_b32_e32 v25, v4
+; SI-NEXT: v_mov_b32_e32 v11, v4
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v13, v4
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: v_mov_b32_e32 v15, v4
+; SI-NEXT: ds_write2_b64 v28, v[26:27], v[20:21] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v28, v[24:25], v[22:23] offset1:1
+; SI-NEXT: ds_write2_b64 v28, v[18:19], v[1:2] offset0:28 offset1:29
+; SI-NEXT: ds_write2_b64 v28, v[16:17], v[12:13] offset0:24 offset1:25
+; SI-NEXT: ds_write2_b64 v28, v[10:11], v[7:8] offset0:20 offset1:21
+; SI-NEXT: ds_write2_b64 v28, v[5:6], v[14:15] offset0:16 offset1:17
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_zextload_v32i16_to_v32i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v22, v1
+; GFX9-NEXT: v_mov_b32_e32 v20, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[2:5], v0 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[6:9], v0 offset0:4 offset1:5
+; GFX9-NEXT: ds_read2_b64 v[10:13], v0 offset0:6 offset1:7
+; GFX9-NEXT: v_mov_b32_e32 v23, s0
+; GFX9-NEXT: ds_read2_b64 v[14:17], v0 offset0:2 offset1:3
+; GFX9-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v12
+; GFX9-NEXT: ds_write2_b64 v23, v[21:22], v[19:20] offset0:28 offset1:29
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX9-NEXT: v_mov_b32_e32 v12, v1
+; GFX9-NEXT: ds_write2_b64 v23, v[11:12], v[19:20] offset0:26 offset1:27
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v10
+; GFX9-NEXT: ds_write2_b64 v23, v[19:20], v[11:12] offset0:24 offset1:25
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX9-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-NEXT: ds_write2_b64 v23, v[19:20], v[10:11] offset0:22 offset1:23
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v8
+; GFX9-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-NEXT: ds_write2_b64 v23, v[11:12], v[9:10] offset0:20 offset1:21
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v7
+; GFX9-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:18 offset1:19
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v6
+; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:16 offset1:17
+; GFX9-NEXT: s_waitcnt lgkmcnt(7)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v17
+; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:14 offset1:15
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16
+; GFX9-NEXT: ds_write2_b64 v23, v[10:11], v[8:9] offset0:12 offset1:13
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX9-NEXT: v_mov_b32_e32 v13, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX9-NEXT: ds_write2_b64 v23, v[12:13], v[10:11] offset0:8 offset1:9
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT: ds_write2_b64 v23, v[5:6], v[9:10] offset0:6 offset1:7
+; GFX9-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX9-NEXT: ds_write2_b64 v23, v[4:5], v[8:9] offset0:4 offset1:5
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v15
+; GFX9-NEXT: ds_write2_b64 v23, v[0:1], v[19:20] offset0:30 offset1:31
+; GFX9-NEXT: v_mov_b32_e32 v17, v1
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: ds_write2_b64 v23, v[3:4], v[7:8] offset0:2 offset1:3
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v19, v1
+; GFX9-NEXT: ds_write2_b64 v23, v[16:17], v[14:15] offset0:10 offset1:11
+; GFX9-NEXT: ds_write2_b64 v23, v[2:3], v[18:19] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_zextload_v32i16_to_v32i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 105, @49, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.W, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T1.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T2.W
+; EG-NEXT: MOV T2.W, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Y, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.Z, OQAP,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T3.W
+; EG-NEXT: MOV T3.W, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Y, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.Z, OQAP,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T4.W
+; EG-NEXT: MOV T4.W, OQAP,
+; EG-NEXT: MOV * T5.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV T5.Y, OQAP,
+; EG-NEXT: LSHR T5.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T5.W,
+; EG-NEXT: AND_INT T4.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T5.Y, literal.x,
+; EG-NEXT: MOV * T5.W, KC0[2].Y,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T4.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: LSHR T4.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT: LDS_WRITE * T5.W, T4.W,
+; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT: ALU 93, @50, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T3.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T3.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: LSHR T3.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T4.W, T3.W,
+; EG-NEXT: AND_INT T2.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: LSHR T2.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43)
+; EG-NEXT: LDS_WRITE * T3.W, T2.W,
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: LSHR T1.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43)
+; EG-NEXT: LDS_WRITE * T2.W, T1.W,
+; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 87, @51, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 156(2.186026e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 140(1.961818e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 188(2.634441e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 172(2.410233e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 220(3.082857e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 204(2.858649e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 252(3.531272e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 236(3.307064e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 228(3.194960e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%load = load <32 x i16>, ptr addrspace(3) %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v32i16_to_v32i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v12, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v12 offset0:2 offset1:3
+; SI-NEXT: ds_read2_b64 v[4:7], v12 offset1:1
+; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7
+; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5
+; SI-NEXT: v_mov_b32_e32 v16, s0
+; SI-NEXT: s_waitcnt lgkmcnt(3)
+; SI-NEXT: v_mov_b32_e32 v19, v3
+; SI-NEXT: s_waitcnt lgkmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v21, v7
+; SI-NEXT: s_waitcnt lgkmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v22, v11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v23, v15
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48
+; SI-NEXT: v_bfe_i32 v19, v19, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:14 offset1:15
+; SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48
+; SI-NEXT: v_bfe_i32 v19, v1, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:10 offset1:11
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10
+; SI-NEXT: v_ashr_i64 v[17:18], v[6:7], 48
+; SI-NEXT: v_bfe_i32 v19, v21, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:6 offset1:7
+; SI-NEXT: v_ashr_i64 v[17:18], v[4:5], 48
+; SI-NEXT: v_bfe_i32 v19, v5, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:2 offset1:3
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8
+; SI-NEXT: v_ashr_i64 v[17:18], v[10:11], 48
+; SI-NEXT: v_bfe_i32 v19, v22, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:30 offset1:31
+; SI-NEXT: v_ashr_i64 v[17:18], v[8:9], 48
+; SI-NEXT: v_bfe_i32 v19, v9, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:26 offset1:27
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; SI-NEXT: v_ashr_i64 v[17:18], v[14:15], 48
+; SI-NEXT: v_bfe_i32 v19, v23, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:22 offset1:23
+; SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48
+; SI-NEXT: v_bfe_i32 v19, v13, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:18 offset1:19
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; SI-NEXT: v_bfe_i32 v1, v12, 0, 16
+; SI-NEXT: v_bfe_i32 v3, v14, 0, 16
+; SI-NEXT: v_bfe_i32 v5, v8, 0, 16
+; SI-NEXT: v_bfe_i32 v7, v10, 0, 16
+; SI-NEXT: v_bfe_i32 v9, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v11, v6, 0, 16
+; SI-NEXT: v_bfe_i32 v12, v0, 0, 16
+; SI-NEXT: v_bfe_i32 v13, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v17, v15, 0, 16
+; SI-NEXT: v_bfe_i32 v19, v21, 0, 16
+; SI-NEXT: v_bfe_i32 v20, v24, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; SI-NEXT: ds_write2_b64 v16, v[13:14], v[20:21] offset0:12 offset1:13
+; SI-NEXT: v_bfe_i32 v14, v29, 0, 16
+; SI-NEXT: v_bfe_i32 v21, v28, 0, 16
+; SI-NEXT: v_bfe_i32 v23, v27, 0, 16
+; SI-NEXT: v_bfe_i32 v24, v25, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; SI-NEXT: ds_write2_b64 v16, v[12:13], v[24:25] offset0:8 offset1:9
+; SI-NEXT: v_bfe_i32 v25, v26, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
+; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
+; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21
+; SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; SI-NEXT: ds_write2_b64 v16, v[11:12], v[25:26] offset0:4 offset1:5
+; SI-NEXT: ds_write2_b64 v16, v[9:10], v[23:24] offset1:1
+; SI-NEXT: ds_write2_b64 v16, v[7:8], v[21:22] offset0:28 offset1:29
+; SI-NEXT: ds_write2_b64 v16, v[5:6], v[14:15] offset0:24 offset1:25
+; SI-NEXT: ds_write2_b64 v16, v[3:4], v[19:20] offset0:20 offset1:21
+; SI-NEXT: ds_write2_b64 v16, v[1:2], v[17:18] offset0:16 offset1:17
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_sextload_v32i16_to_v32i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v12, s1
+; GFX9-NEXT: ds_read2_b64 v[4:7], v12 offset1:1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v12 offset0:2 offset1:3
+; GFX9-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; GFX9-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; GFX9-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v7
+; GFX9-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX9-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GFX9-NEXT: v_bfe_i32 v25, v15, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GFX9-NEXT: v_mov_b32_e32 v15, s0
+; GFX9-NEXT: ds_write2_b64 v15, v[25:26], v[23:24] offset0:30 offset1:31
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v14
+; GFX9-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GFX9-NEXT: v_bfe_i32 v25, v14, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX9-NEXT: ds_write2_b64 v15, v[25:26], v[23:24] offset0:28 offset1:29
+; GFX9-NEXT: v_bfe_i32 v23, v14, 0, 16
+; GFX9-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v24, 31, v23
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v12
+; GFX9-NEXT: ds_write2_b64 v15, v[13:14], v[23:24] offset0:26 offset1:27
+; GFX9-NEXT: v_bfe_i32 v24, v12, 0, 16
+; GFX9-NEXT: v_bfe_i32 v26, v25, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[26:27] offset0:24 offset1:25
+; GFX9-NEXT: v_bfe_i32 v24, v12, 0, 16
+; GFX9-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-NEXT: ds_write2_b64 v15, v[11:12], v[24:25] offset0:22 offset1:23
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX9-NEXT: v_bfe_i32 v24, v10, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[11:12] offset0:20 offset1:21
+; GFX9-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX9-NEXT: v_bfe_i32 v24, v9, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[10:11] offset0:18 offset1:19
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v8
+; GFX9-NEXT: v_bfe_i32 v24, v8, 0, 16
+; GFX9-NEXT: v_bfe_i32 v26, v25, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[26:27] offset0:16 offset1:17
+; GFX9-NEXT: v_bfe_i32 v24, v8, 0, 16
+; GFX9-NEXT: v_bfe_i32 v26, v3, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: v_ashrrev_i32_e32 v27, 31, v26
+; GFX9-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX9-NEXT: ds_write2_b64 v15, v[26:27], v[24:25] offset0:14 offset1:15
+; GFX9-NEXT: v_mov_b32_e32 v26, v7
+; GFX9-NEXT: v_bfe_i32 v7, v2, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_bfe_i32 v13, v22, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; GFX9-NEXT: v_bfe_i32 v9, v28, 0, 16
+; GFX9-NEXT: ds_write2_b64 v15, v[7:8], v[11:12] offset0:12 offset1:13
+; GFX9-NEXT: v_bfe_i32 v11, v0, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX9-NEXT: v_bfe_i32 v18, v17, 0, 16
+; GFX9-NEXT: v_bfe_i32 v20, v19, 0, 16
+; GFX9-NEXT: v_bfe_i32 v22, v22, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-NEXT: v_bfe_i32 v3, v4, 0, 16
+; GFX9-NEXT: v_bfe_i32 v24, v5, 0, 16
+; GFX9-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX9-NEXT: v_bfe_i32 v7, v26, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NEXT: v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NEXT: v_ashrrev_i32_e32 v21, 31, v20
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-NEXT: v_ashrrev_i32_e32 v23, 31, v22
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v25, 31, v24
+; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v7
+; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-NEXT: ds_write2_b64 v15, v[0:1], v[9:10] offset0:10 offset1:11
+; GFX9-NEXT: ds_write2_b64 v15, v[11:12], v[22:23] offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b64 v15, v[7:8], v[13:14] offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b64 v15, v[5:6], v[20:21] offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b64 v15, v[24:25], v[18:19] offset0:2 offset1:3
+; GFX9-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_sextload_v32i16_to_v32i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 107, @52, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T1.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T1.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T1.W, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T2.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T2.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T2.W, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T3.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T3.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T3.W, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T4.Y, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T4.Z, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T4.W, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T5.Y, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T5.Z, OQAP,
+; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_READ_RET * OQAP, T5.W
+; EG-NEXT: MOV * T5.W, OQAP,
+; EG-NEXT: BFE_INT T0.Z, T5.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T0.W, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T6.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T0.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T6.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T7.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T9.Z, T1.W, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T8.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T10.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T9.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T11.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T10.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT * T12.Z, T2.W, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 98, @53, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ASHR T6.W, T11.Z, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 100(1.401298e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T13.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T12.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T14.Z, T3.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T13.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T15.Z, T3.W, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T14.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T16.Z, T4.Y, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T15.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T17.Z, T4.Z, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T16.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T18.Z, T4.W, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T17.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: BFE_INT T19.Z, T5.W, 0.0, literal.x,
+; EG-NEXT: ASHR T6.W, T18.Z, literal.y,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: ASHR T6.W, T19.Z, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 228(3.194960e-43)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: ASHR T6.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: ASHR T6.W, T5.Y, literal.x,
+; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT: LDS_WRITE * T7.W, T6.W,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ASHR T0.W, T5.Z, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ASHR T0.W, T5.Z, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.Z,
+; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T6.Z,
+; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T7.Z,
+; EG-NEXT: ASHR T0.W, T1.Z, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ASHR * T0.W, T1.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 99, @54, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x,
+; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T8.Z,
+; EG-NEXT: ASHR T0.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43)
+; EG-NEXT: LDS_WRITE * T6.W, T0.W,
+; EG-NEXT: ASHR T0.W, T1.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T9.Z,
+; EG-NEXT: ASHR T0.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T2.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T10.Z,
+; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T11.Z,
+; EG-NEXT: ASHR T0.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 156(2.186026e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T2.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T12.Z,
+; EG-NEXT: ASHR T0.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 140(1.961818e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T3.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T13.Z,
+; EG-NEXT: ASHR T0.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 188(2.634441e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T3.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T14.Z,
+; EG-NEXT: ASHR T0.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 172(2.410233e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T3.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T15.Z,
+; EG-NEXT: ASHR T0.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 220(3.082857e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T4.Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T16.Z,
+; EG-NEXT: ASHR T0.W, T4.Z, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 204(2.858649e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR * T0.W, T4.Z, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: ALU 27, @55, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 200(2.802597e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T17.Z,
+; EG-NEXT: ASHR T0.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 252(3.531272e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T4.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T18.Z,
+; EG-NEXT: ASHR T0.W, T5.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 31(4.344025e-44), 236(3.307064e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ASHR T0.W, T5.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43)
+; EG-NEXT: LDS_WRITE * T1.W, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T19.Z,
+; EG-NEXT: RETURN
%load = load <32 x i16>, ptr addrspace(3) %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(3) %out
ret void
}
-; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
-; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
-; %load = load <64 x i16>, ptr addrspace(3) %in
-; %ext = zext <64 x i16> %load to <64 x i64>
-; store <64 x i64> %ext, ptr addrspace(3) %out
-; ret void
-; }
-
-; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
-; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
-; %load = load <64 x i16>, ptr addrspace(3) %in
-; %ext = sext <64 x i16> %load to <64 x i64>
-; store <64 x i64> %ext, ptr addrspace(3) %out
-; ret void
-; }
-
-; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
-; FUNC-LABEL: {{^}}local_v8i16_to_128:
-
-; SI-NOT: ds_read_b128
-; SI-NOT: ds_write_b128
-
-; CIVI: ds_read_b128
-; CIVI: ds_write_b128
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_v8i16_to_128:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: local_v8i16_to_128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: local_v8i16_to_128:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 25, @56, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.W, KC0[2].Z,
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: MOV * T0.W, KC0[2].Y,
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_READ_RET * OQAP, T0.W
+; EG-NEXT: MOV T0.X, OQAP,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LDS_WRITE * T0.W, T0.X,
+; EG-NEXT: RETURN
%ld = load <8 x i16>, ptr addrspace(3) %in, align 16
store <8 x i16> %ld, ptr addrspace(3) %out, align 16
ret void
}
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CIVI: {{.*}}
+; FUNC: {{.*}}
+; GCN: {{.*}}
+; GFX89: {{.*}}
+; SICIVI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index af7f92798a9319..336e29b53dfbe4 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -496,52 +496,51 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @22, KC0[], KC1[]
; EG-NEXT: TEX 7 @6
-; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: ALU 29, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3
-; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3
-; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3
-; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3
-; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3
-; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3
-; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3
-; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 73, #3
+; EG-NEXT: VTX_READ_8 T2.X, T0.X, 111, #3
+; EG-NEXT: VTX_READ_8 T3.X, T0.X, 75, #3
+; EG-NEXT: VTX_READ_8 T4.X, T0.X, 108, #3
+; EG-NEXT: VTX_READ_8 T5.X, T0.X, 72, #3
+; EG-NEXT: VTX_READ_8 T6.X, T0.X, 110, #3
+; EG-NEXT: VTX_READ_8 T7.X, T0.X, 74, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 109, #3
; EG-NEXT: ALU clause starting at 22:
-; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 23:
-; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: BFE_INT T0.X, T7.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.Y, T6.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT * T1.Z, T5.X, 0.0, literal.x, BS:VEC_201
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T1.W, T4.X, 0.0, literal.x,
; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W,
; EG-NEXT: AND_INT T0.W, PS, literal.x,
-; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y,
+; EG-NEXT: MIN_INT * T1.W, T0.X, T0.Y,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, PS, literal.x,
+; EG-NEXT: AND_INT T0.X, PS, literal.x,
; EG-NEXT: LSHL T0.Y, PV.W, literal.y,
-; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z,
-; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
-; EG-NEXT: LSHL * T1.W, PV.Z, literal.w,
-; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
-; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: BFE_INT T1.Z, T3.X, 0.0, literal.y,
+; EG-NEXT: BFE_INT T0.W, T2.X, 0.0, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W,
; EG-NEXT: OR_INT T0.W, PS, PV.Y,
; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
-; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i8:
@@ -728,30 +727,30 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
-; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
-; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
-; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 46, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 42, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
+; EG-NEXT: BFE_INT T1.X, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.Y, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT * T0.Z, T3.X, 0.0, literal.x, BS:VEC_201
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W,
-; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y,
-; EG-NEXT: LSHL T1.W, PS, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T4.X, PV.W, PS,
-; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: MIN_INT * T1.W, T1.X, T0.Y,
+; EG-NEXT: AND_INT T1.W, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v2i16:
@@ -3977,37 +3976,34 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
-; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
+; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W,
; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: LSHR T1.W, T0.X, literal.x,
-; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x,
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T0.Y, T0.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: ASHR T1.W, T0.X, literal.x,
+; EG-NEXT: ASHR * T2.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z,
-; EG-NEXT: MIN_INT * T2.W, PV.Y, PV.X,
-; EG-NEXT: LSHL T2.W, PS, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T0.X, PS, PV.W,
+; EG-NEXT: MIN_INT T1.W, PS, PV.W,
+; EG-NEXT: MIN_INT * T2.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT T2.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_sle_v2i16:
@@ -4131,25 +4127,25 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W,
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: LSHR T1.W, T0.X, literal.x,
-; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
+; EG-NEXT: LSHR * T2.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
-; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T3.W, T1.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: MIN_UINT * T1.W, PS, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, literal.x,
@@ -4157,7 +4153,7 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.X, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_ule_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
index 5c0192d0d1af50..5c06506a542e8c 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll
@@ -141,35 +141,28 @@ define amdgpu_kernel void @v2i16_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @v4i16_extract_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; EG-LABEL: v4i16_extract_i8:
; EG: ; %bb.0:
-; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 17, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 6, #1
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: LSHL * T0.W, T6.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T5.X,
-; EG-NEXT: MOV * T3.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: MOV * T1.W, literal.y,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 8(1.121039e-44)
-; EG-NEXT: BFE_UINT T1.W, PV.Y, literal.x, PS,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
-; EG-NEXT: LSHL T5.X, PV.W, PS,
-; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, T1.W, PV.W,
+; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <4 x i16>, ptr addrspace(1) %in, align 2
%bc = bitcast <4 x i16> %load to <8 x i8>
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 6b4bca11d80c78..2fa1fcaaebd4d6 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -590,31 +590,31 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
; EG-NEXT: LSHR T0.W, T0.X, literal.y,
-; EG-NEXT: LSHR * T1.W, T7.X, literal.y,
+; EG-NEXT: LSHR * T1.W, T1.X, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHL T0.W, PS, PV.W,
-; EG-NEXT: LSHL * T1.W, T7.X, PV.Z,
+; EG-NEXT: LSHL * T1.W, T1.X, PV.Z,
; EG-NEXT: AND_INT T1.W, PS, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: OR_INT T0.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 9d550ec27a63bf..8d121988e98bf1 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -206,29 +206,27 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T6.XY, T6.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T6.X, KC0[2].Z,
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR * T0.W, T6.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR T0.Z, T6.Y, literal.x,
-; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T6.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: ASHR * T1.W, PV.Y, PV.Z,
-; EG-NEXT: LSHL T1.W, PS, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT T6.X, PS, PV.W,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, T0.Y, literal.x,
+; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.y,
+; EG-NEXT: LSHR T0.W, T0.Y, literal.y,
+; EG-NEXT: ASHR * T1.W, T0.X, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: ASHR T0.W, PS, PV.W,
+; EG-NEXT: ASHR * T1.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT T1.W, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in, i16 1
%a = load <2 x i16>, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 28a7dc046139b2..8c16adc5f23513 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -514,77 +514,133 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
; MIPS64-NEXT: jr $ra
; MIPS64-NEXT: nop
;
-; MIPS32R5-LABEL: i8_4:
-; MIPS32R5: # %bb.0:
-; MIPS32R5-NEXT: addiu $sp, $sp, -16
-; MIPS32R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS32R5-NEXT: sw $5, 8($sp)
-; MIPS32R5-NEXT: sw $4, 12($sp)
-; MIPS32R5-NEXT: lbu $1, 9($sp)
-; MIPS32R5-NEXT: lbu $2, 8($sp)
-; MIPS32R5-NEXT: insert.w $w0[0], $2
-; MIPS32R5-NEXT: insert.w $w0[1], $1
-; MIPS32R5-NEXT: lbu $1, 10($sp)
-; MIPS32R5-NEXT: insert.w $w0[2], $1
-; MIPS32R5-NEXT: lbu $1, 11($sp)
-; MIPS32R5-NEXT: insert.w $w0[3], $1
-; MIPS32R5-NEXT: lbu $1, 13($sp)
-; MIPS32R5-NEXT: lbu $2, 12($sp)
-; MIPS32R5-NEXT: insert.w $w1[0], $2
-; MIPS32R5-NEXT: insert.w $w1[1], $1
-; MIPS32R5-NEXT: lbu $1, 14($sp)
-; MIPS32R5-NEXT: insert.w $w1[2], $1
-; MIPS32R5-NEXT: lbu $1, 15($sp)
-; MIPS32R5-NEXT: insert.w $w1[3], $1
-; MIPS32R5-NEXT: addv.w $w0, $w1, $w0
-; MIPS32R5-NEXT: copy_s.w $1, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $4, $w0[3]
-; MIPS32R5-NEXT: sb $4, 7($sp)
-; MIPS32R5-NEXT: sb $3, 6($sp)
-; MIPS32R5-NEXT: sb $2, 5($sp)
-; MIPS32R5-NEXT: sb $1, 4($sp)
-; MIPS32R5-NEXT: lw $2, 4($sp)
-; MIPS32R5-NEXT: addiu $sp, $sp, 16
-; MIPS32R5-NEXT: jr $ra
-; MIPS32R5-NEXT: nop
+; MIPS32R5EB-LABEL: i8_4:
+; MIPS32R5EB: # %bb.0:
+; MIPS32R5EB-NEXT: srl $1, $5, 16
+; MIPS32R5EB-NEXT: srl $2, $5, 24
+; MIPS32R5EB-NEXT: insert.w $w0[0], $2
+; MIPS32R5EB-NEXT: insert.w $w0[1], $1
+; MIPS32R5EB-NEXT: srl $1, $5, 8
+; MIPS32R5EB-NEXT: insert.w $w0[2], $1
+; MIPS32R5EB-NEXT: insert.w $w0[3], $5
+; MIPS32R5EB-NEXT: srl $1, $4, 16
+; MIPS32R5EB-NEXT: srl $2, $4, 24
+; MIPS32R5EB-NEXT: insert.w $w1[0], $2
+; MIPS32R5EB-NEXT: insert.w $w1[1], $1
+; MIPS32R5EB-NEXT: srl $1, $4, 8
+; MIPS32R5EB-NEXT: insert.w $w1[2], $1
+; MIPS32R5EB-NEXT: insert.w $w1[3], $4
+; MIPS32R5EB-NEXT: addv.w $w0, $w1, $w0
+; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0]
+; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
+; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3]
+; MIPS32R5EB-NEXT: copy_s.w $4, $w0[2]
+; MIPS32R5EB-NEXT: andi $4, $4, 255
+; MIPS32R5EB-NEXT: ins $3, $4, 8, 24
+; MIPS32R5EB-NEXT: andi $2, $2, 255
+; MIPS32R5EB-NEXT: sll $2, $2, 16
+; MIPS32R5EB-NEXT: or $2, $3, $2
+; MIPS32R5EB-NEXT: sll $1, $1, 24
+; MIPS32R5EB-NEXT: or $2, $2, $1
+; MIPS32R5EB-NEXT: jr $ra
+; MIPS32R5EB-NEXT: nop
;
-; MIPS64R5-LABEL: i8_4:
-; MIPS64R5: # %bb.0:
-; MIPS64R5-NEXT: daddiu $sp, $sp, -16
-; MIPS64R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT: sw $5, 8($sp)
-; MIPS64R5-NEXT: sw $4, 12($sp)
-; MIPS64R5-NEXT: lbu $1, 9($sp)
-; MIPS64R5-NEXT: lbu $2, 8($sp)
-; MIPS64R5-NEXT: insert.w $w0[0], $2
-; MIPS64R5-NEXT: insert.w $w0[1], $1
-; MIPS64R5-NEXT: lbu $1, 10($sp)
-; MIPS64R5-NEXT: insert.w $w0[2], $1
-; MIPS64R5-NEXT: lbu $1, 11($sp)
-; MIPS64R5-NEXT: insert.w $w0[3], $1
-; MIPS64R5-NEXT: lbu $1, 13($sp)
-; MIPS64R5-NEXT: lbu $2, 12($sp)
-; MIPS64R5-NEXT: insert.w $w1[0], $2
-; MIPS64R5-NEXT: insert.w $w1[1], $1
-; MIPS64R5-NEXT: lbu $1, 14($sp)
-; MIPS64R5-NEXT: insert.w $w1[2], $1
-; MIPS64R5-NEXT: lbu $1, 15($sp)
-; MIPS64R5-NEXT: insert.w $w1[3], $1
-; MIPS64R5-NEXT: addv.w $w0, $w1, $w0
-; MIPS64R5-NEXT: copy_s.w $1, $w0[0]
-; MIPS64R5-NEXT: copy_s.w $2, $w0[1]
-; MIPS64R5-NEXT: copy_s.w $3, $w0[2]
-; MIPS64R5-NEXT: copy_s.w $4, $w0[3]
-; MIPS64R5-NEXT: sb $4, 7($sp)
-; MIPS64R5-NEXT: sb $3, 6($sp)
-; MIPS64R5-NEXT: sb $2, 5($sp)
-; MIPS64R5-NEXT: sb $1, 4($sp)
-; MIPS64R5-NEXT: lw $2, 4($sp)
-; MIPS64R5-NEXT: daddiu $sp, $sp, 16
-; MIPS64R5-NEXT: jr $ra
-; MIPS64R5-NEXT: nop
+; MIPS64R5EB-LABEL: i8_4:
+; MIPS64R5EB: # %bb.0:
+; MIPS64R5EB-NEXT: sll $1, $5, 0
+; MIPS64R5EB-NEXT: srl $2, $1, 16
+; MIPS64R5EB-NEXT: srl $3, $1, 24
+; MIPS64R5EB-NEXT: insert.w $w0[0], $3
+; MIPS64R5EB-NEXT: insert.w $w0[1], $2
+; MIPS64R5EB-NEXT: srl $2, $1, 8
+; MIPS64R5EB-NEXT: insert.w $w0[2], $2
+; MIPS64R5EB-NEXT: sll $2, $4, 0
+; MIPS64R5EB-NEXT: insert.w $w0[3], $1
+; MIPS64R5EB-NEXT: srl $1, $2, 16
+; MIPS64R5EB-NEXT: srl $3, $2, 24
+; MIPS64R5EB-NEXT: insert.w $w1[0], $3
+; MIPS64R5EB-NEXT: insert.w $w1[1], $1
+; MIPS64R5EB-NEXT: srl $1, $2, 8
+; MIPS64R5EB-NEXT: insert.w $w1[2], $1
+; MIPS64R5EB-NEXT: insert.w $w1[3], $2
+; MIPS64R5EB-NEXT: addv.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT: copy_s.w $1, $w0[0]
+; MIPS64R5EB-NEXT: copy_s.w $2, $w0[1]
+; MIPS64R5EB-NEXT: copy_s.w $3, $w0[3]
+; MIPS64R5EB-NEXT: copy_s.w $4, $w0[2]
+; MIPS64R5EB-NEXT: andi $4, $4, 255
+; MIPS64R5EB-NEXT: ins $3, $4, 8, 24
+; MIPS64R5EB-NEXT: andi $2, $2, 255
+; MIPS64R5EB-NEXT: sll $2, $2, 16
+; MIPS64R5EB-NEXT: or $2, $3, $2
+; MIPS64R5EB-NEXT: sll $1, $1, 24
+; MIPS64R5EB-NEXT: or $2, $2, $1
+; MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
+;
+; MIPS32R5EL-LABEL: i8_4:
+; MIPS32R5EL: # %bb.0:
+; MIPS32R5EL-NEXT: srl $1, $5, 8
+; MIPS32R5EL-NEXT: insert.w $w0[0], $5
+; MIPS32R5EL-NEXT: insert.w $w0[1], $1
+; MIPS32R5EL-NEXT: srl $1, $5, 16
+; MIPS32R5EL-NEXT: insert.w $w0[2], $1
+; MIPS32R5EL-NEXT: srl $1, $5, 24
+; MIPS32R5EL-NEXT: insert.w $w0[3], $1
+; MIPS32R5EL-NEXT: srl $1, $4, 8
+; MIPS32R5EL-NEXT: insert.w $w1[0], $4
+; MIPS32R5EL-NEXT: insert.w $w1[1], $1
+; MIPS32R5EL-NEXT: srl $1, $4, 16
+; MIPS32R5EL-NEXT: insert.w $w1[2], $1
+; MIPS32R5EL-NEXT: srl $1, $4, 24
+; MIPS32R5EL-NEXT: insert.w $w1[3], $1
+; MIPS32R5EL-NEXT: addv.w $w0, $w1, $w0
+; MIPS32R5EL-NEXT: copy_s.w $1, $w0[3]
+; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2]
+; MIPS32R5EL-NEXT: copy_s.w $3, $w0[0]
+; MIPS32R5EL-NEXT: copy_s.w $4, $w0[1]
+; MIPS32R5EL-NEXT: andi $4, $4, 255
+; MIPS32R5EL-NEXT: ins $3, $4, 8, 24
+; MIPS32R5EL-NEXT: andi $2, $2, 255
+; MIPS32R5EL-NEXT: sll $2, $2, 16
+; MIPS32R5EL-NEXT: or $2, $3, $2
+; MIPS32R5EL-NEXT: sll $1, $1, 24
+; MIPS32R5EL-NEXT: or $2, $2, $1
+; MIPS32R5EL-NEXT: jr $ra
+; MIPS32R5EL-NEXT: nop
+;
+; MIPS64R5EL-LABEL: i8_4:
+; MIPS64R5EL: # %bb.0:
+; MIPS64R5EL-NEXT: sll $1, $5, 0
+; MIPS64R5EL-NEXT: srl $2, $1, 8
+; MIPS64R5EL-NEXT: insert.w $w0[0], $1
+; MIPS64R5EL-NEXT: insert.w $w0[1], $2
+; MIPS64R5EL-NEXT: srl $2, $1, 16
+; MIPS64R5EL-NEXT: insert.w $w0[2], $2
+; MIPS64R5EL-NEXT: sll $2, $4, 0
+; MIPS64R5EL-NEXT: srl $1, $1, 24
+; MIPS64R5EL-NEXT: insert.w $w0[3], $1
+; MIPS64R5EL-NEXT: srl $1, $2, 8
+; MIPS64R5EL-NEXT: insert.w $w1[0], $2
+; MIPS64R5EL-NEXT: insert.w $w1[1], $1
+; MIPS64R5EL-NEXT: srl $1, $2, 16
+; MIPS64R5EL-NEXT: insert.w $w1[2], $1
+; MIPS64R5EL-NEXT: srl $1, $2, 24
+; MIPS64R5EL-NEXT: insert.w $w1[3], $1
+; MIPS64R5EL-NEXT: addv.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT: copy_s.w $1, $w0[3]
+; MIPS64R5EL-NEXT: copy_s.w $2, $w0[2]
+; MIPS64R5EL-NEXT: copy_s.w $3, $w0[0]
+; MIPS64R5EL-NEXT: copy_s.w $4, $w0[1]
+; MIPS64R5EL-NEXT: andi $4, $4, 255
+; MIPS64R5EL-NEXT: ins $3, $4, 8, 24
+; MIPS64R5EL-NEXT: andi $2, $2, 255
+; MIPS64R5EL-NEXT: sll $2, $2, 16
+; MIPS64R5EL-NEXT: or $2, $3, $2
+; MIPS64R5EL-NEXT: sll $1, $1, 24
+; MIPS64R5EL-NEXT: or $2, $2, $1
+; MIPS64R5EL-NEXT: jr $ra
+; MIPS64R5EL-NEXT: nop
%1 = add <4 x i8> %a, %b
ret <4 x i8> %1
}
@@ -771,65 +827,80 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
-; MIPS64R5-LABEL: i8_8:
-; MIPS64R5: # %bb.0:
-; MIPS64R5-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT: sd $5, 16($sp)
-; MIPS64R5-NEXT: lbu $1, 17($sp)
-; MIPS64R5-NEXT: lbu $2, 16($sp)
-; MIPS64R5-NEXT: sd $4, 24($sp)
-; MIPS64R5-NEXT: insert.h $w0[0], $2
-; MIPS64R5-NEXT: insert.h $w0[1], $1
-; MIPS64R5-NEXT: lbu $1, 18($sp)
-; MIPS64R5-NEXT: insert.h $w0[2], $1
-; MIPS64R5-NEXT: lbu $1, 19($sp)
-; MIPS64R5-NEXT: insert.h $w0[3], $1
-; MIPS64R5-NEXT: lbu $1, 20($sp)
-; MIPS64R5-NEXT: insert.h $w0[4], $1
-; MIPS64R5-NEXT: lbu $1, 25($sp)
-; MIPS64R5-NEXT: lbu $2, 24($sp)
-; MIPS64R5-NEXT: insert.h $w1[0], $2
-; MIPS64R5-NEXT: insert.h $w1[1], $1
-; MIPS64R5-NEXT: lbu $1, 21($sp)
-; MIPS64R5-NEXT: lbu $2, 26($sp)
-; MIPS64R5-NEXT: insert.h $w1[2], $2
-; MIPS64R5-NEXT: insert.h $w0[5], $1
-; MIPS64R5-NEXT: lbu $1, 27($sp)
-; MIPS64R5-NEXT: lbu $2, 23($sp)
-; MIPS64R5-NEXT: lbu $3, 22($sp)
-; MIPS64R5-NEXT: lbu $4, 31($sp)
-; MIPS64R5-NEXT: insert.h $w0[6], $3
-; MIPS64R5-NEXT: insert.h $w0[7], $2
-; MIPS64R5-NEXT: insert.h $w1[3], $1
-; MIPS64R5-NEXT: lbu $1, 28($sp)
-; MIPS64R5-NEXT: insert.h $w1[4], $1
-; MIPS64R5-NEXT: lbu $1, 29($sp)
-; MIPS64R5-NEXT: insert.h $w1[5], $1
-; MIPS64R5-NEXT: lbu $1, 30($sp)
-; MIPS64R5-NEXT: insert.h $w1[6], $1
-; MIPS64R5-NEXT: insert.h $w1[7], $4
-; MIPS64R5-NEXT: addv.h $w0, $w1, $w0
-; MIPS64R5-NEXT: copy_s.h $1, $w0[0]
-; MIPS64R5-NEXT: copy_s.h $2, $w0[1]
-; MIPS64R5-NEXT: copy_s.h $3, $w0[2]
-; MIPS64R5-NEXT: copy_s.h $4, $w0[3]
-; MIPS64R5-NEXT: copy_s.h $5, $w0[4]
-; MIPS64R5-NEXT: copy_s.h $6, $w0[5]
-; MIPS64R5-NEXT: copy_s.h $7, $w0[6]
-; MIPS64R5-NEXT: copy_s.h $8, $w0[7]
-; MIPS64R5-NEXT: sb $8, 15($sp)
-; MIPS64R5-NEXT: sb $7, 14($sp)
-; MIPS64R5-NEXT: sb $6, 13($sp)
-; MIPS64R5-NEXT: sb $5, 12($sp)
-; MIPS64R5-NEXT: sb $4, 11($sp)
-; MIPS64R5-NEXT: sb $3, 10($sp)
-; MIPS64R5-NEXT: sb $2, 9($sp)
-; MIPS64R5-NEXT: sb $1, 8($sp)
-; MIPS64R5-NEXT: ld $2, 8($sp)
-; MIPS64R5-NEXT: daddiu $sp, $sp, 32
-; MIPS64R5-NEXT: jr $ra
-; MIPS64R5-NEXT: nop
+; MIPS64R5EB-LABEL: i8_8:
+; MIPS64R5EB: # %bb.0:
+; MIPS64R5EB-NEXT: dsrl $1, $5, 48
+; MIPS64R5EB-NEXT: dsrl $2, $5, 40
+; MIPS64R5EB-NEXT: dsrl $3, $4, 48
+; MIPS64R5EB-NEXT: sll $1, $1, 0
+; MIPS64R5EB-NEXT: dsrl $6, $5, 56
+; MIPS64R5EB-NEXT: sll $6, $6, 0
+; MIPS64R5EB-NEXT: insert.h $w0[0], $6
+; MIPS64R5EB-NEXT: insert.h $w0[1], $1
+; MIPS64R5EB-NEXT: sll $1, $2, 0
+; MIPS64R5EB-NEXT: sll $2, $3, 0
+; MIPS64R5EB-NEXT: dsrl $3, $4, 56
+; MIPS64R5EB-NEXT: sll $3, $3, 0
+; MIPS64R5EB-NEXT: insert.h $w1[0], $3
+; MIPS64R5EB-NEXT: insert.h $w1[1], $2
+; MIPS64R5EB-NEXT: insert.h $w0[2], $1
+; MIPS64R5EB-NEXT: dsrl $1, $4, 40
+; MIPS64R5EB-NEXT: sll $1, $1, 0
+; MIPS64R5EB-NEXT: dsrl $2, $5, 32
+; MIPS64R5EB-NEXT: sll $2, $2, 0
+; MIPS64R5EB-NEXT: insert.h $w0[3], $2
+; MIPS64R5EB-NEXT: insert.h $w1[2], $1
+; MIPS64R5EB-NEXT: dsrl $1, $5, 24
+; MIPS64R5EB-NEXT: dsrl $2, $4, 24
+; MIPS64R5EB-NEXT: sll $1, $1, 0
+; MIPS64R5EB-NEXT: dsrl $3, $4, 32
+; MIPS64R5EB-NEXT: sll $3, $3, 0
+; MIPS64R5EB-NEXT: insert.h $w1[3], $3
+; MIPS64R5EB-NEXT: insert.h $w0[4], $1
+; MIPS64R5EB-NEXT: sll $1, $5, 0
+; MIPS64R5EB-NEXT: srl $3, $1, 16
+; MIPS64R5EB-NEXT: insert.h $w0[5], $3
+; MIPS64R5EB-NEXT: sll $2, $2, 0
+; MIPS64R5EB-NEXT: srl $3, $1, 8
+; MIPS64R5EB-NEXT: insert.h $w0[6], $3
+; MIPS64R5EB-NEXT: insert.h $w0[7], $1
+; MIPS64R5EB-NEXT: insert.h $w1[4], $2
+; MIPS64R5EB-NEXT: sll $1, $4, 0
+; MIPS64R5EB-NEXT: srl $2, $1, 16
+; MIPS64R5EB-NEXT: insert.h $w1[5], $2
+; MIPS64R5EB-NEXT: srl $2, $1, 8
+; MIPS64R5EB-NEXT: insert.h $w1[6], $2
+; MIPS64R5EB-NEXT: insert.h $w1[7], $1
+; MIPS64R5EB-NEXT: addv.h $w0, $w1, $w0
+; MIPS64R5EB-NEXT: copy_s.h $1, $w0[1]
+; MIPS64R5EB-NEXT: copy_s.h $2, $w0[0]
+; MIPS64R5EB-NEXT: copy_s.h $3, $w0[2]
+; MIPS64R5EB-NEXT: copy_s.h $4, $w0[3]
+; MIPS64R5EB-NEXT: copy_s.h $5, $w0[4]
+; MIPS64R5EB-NEXT: copy_s.h $6, $w0[5]
+; MIPS64R5EB-NEXT: copy_s.h $7, $w0[6]
+; MIPS64R5EB-NEXT: copy_s.h $8, $w0[7]
+; MIPS64R5EB-NEXT: andi $7, $7, 255
+; MIPS64R5EB-NEXT: dinsm $8, $7, 8, 56
+; MIPS64R5EB-NEXT: andi $6, $6, 255
+; MIPS64R5EB-NEXT: dsll $6, $6, 16
+; MIPS64R5EB-NEXT: or $6, $8, $6
+; MIPS64R5EB-NEXT: andi $5, $5, 255
+; MIPS64R5EB-NEXT: dsll $5, $5, 24
+; MIPS64R5EB-NEXT: or $5, $6, $5
+; MIPS64R5EB-NEXT: andi $4, $4, 255
+; MIPS64R5EB-NEXT: dsll $4, $4, 32
+; MIPS64R5EB-NEXT: or $4, $5, $4
+; MIPS64R5EB-NEXT: andi $3, $3, 255
+; MIPS64R5EB-NEXT: dsll $3, $3, 40
+; MIPS64R5EB-NEXT: or $3, $4, $3
+; MIPS64R5EB-NEXT: andi $1, $1, 255
+; MIPS64R5EB-NEXT: dsll $1, $1, 48
+; MIPS64R5EB-NEXT: or $1, $3, $1
+; MIPS64R5EB-NEXT: dsll $2, $2, 56
+; MIPS64R5EB-NEXT: or $2, $1, $2
+; MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
;
; MIPS32R5EL-LABEL: i8_8:
; MIPS32R5EL: # %bb.0:
@@ -909,6 +980,85 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
+;
+; MIPS64R5EL-LABEL: i8_8:
+; MIPS64R5EL: # %bb.0:
+; MIPS64R5EL-NEXT: dsrl $1, $5, 8
+; MIPS64R5EL-NEXT: dsrl $2, $4, 8
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: sll $3, $5, 0
+; MIPS64R5EL-NEXT: insert.h $w0[0], $3
+; MIPS64R5EL-NEXT: insert.h $w0[1], $1
+; MIPS64R5EL-NEXT: sll $1, $2, 0
+; MIPS64R5EL-NEXT: sll $2, $4, 0
+; MIPS64R5EL-NEXT: dsrl $3, $5, 16
+; MIPS64R5EL-NEXT: sll $3, $3, 0
+; MIPS64R5EL-NEXT: insert.h $w0[2], $3
+; MIPS64R5EL-NEXT: insert.h $w1[0], $2
+; MIPS64R5EL-NEXT: insert.h $w1[1], $1
+; MIPS64R5EL-NEXT: dsrl $1, $4, 16
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: insert.h $w1[2], $1
+; MIPS64R5EL-NEXT: dsrl $1, $5, 32
+; MIPS64R5EL-NEXT: dsrl $2, $4, 32
+; MIPS64R5EL-NEXT: dsrl $3, $5, 24
+; MIPS64R5EL-NEXT: sll $3, $3, 0
+; MIPS64R5EL-NEXT: insert.h $w0[3], $3
+; MIPS64R5EL-NEXT: dsrl $3, $5, 56
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: dsrl $6, $4, 24
+; MIPS64R5EL-NEXT: sll $6, $6, 0
+; MIPS64R5EL-NEXT: insert.h $w1[3], $6
+; MIPS64R5EL-NEXT: insert.h $w0[4], $1
+; MIPS64R5EL-NEXT: sll $1, $2, 0
+; MIPS64R5EL-NEXT: dsrl $2, $5, 48
+; MIPS64R5EL-NEXT: dsrl $5, $5, 40
+; MIPS64R5EL-NEXT: sll $5, $5, 0
+; MIPS64R5EL-NEXT: dsrl $6, $4, 56
+; MIPS64R5EL-NEXT: dsrl $7, $4, 48
+; MIPS64R5EL-NEXT: insert.h $w0[5], $5
+; MIPS64R5EL-NEXT: sll $2, $2, 0
+; MIPS64R5EL-NEXT: insert.h $w0[6], $2
+; MIPS64R5EL-NEXT: sll $2, $3, 0
+; MIPS64R5EL-NEXT: insert.h $w0[7], $2
+; MIPS64R5EL-NEXT: insert.h $w1[4], $1
+; MIPS64R5EL-NEXT: dsrl $1, $4, 40
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: insert.h $w1[5], $1
+; MIPS64R5EL-NEXT: sll $1, $7, 0
+; MIPS64R5EL-NEXT: insert.h $w1[6], $1
+; MIPS64R5EL-NEXT: sll $1, $6, 0
+; MIPS64R5EL-NEXT: insert.h $w1[7], $1
+; MIPS64R5EL-NEXT: addv.h $w0, $w1, $w0
+; MIPS64R5EL-NEXT: copy_s.h $1, $w0[6]
+; MIPS64R5EL-NEXT: copy_s.h $2, $w0[7]
+; MIPS64R5EL-NEXT: copy_s.h $3, $w0[5]
+; MIPS64R5EL-NEXT: copy_s.h $4, $w0[4]
+; MIPS64R5EL-NEXT: copy_s.h $5, $w0[3]
+; MIPS64R5EL-NEXT: copy_s.h $6, $w0[2]
+; MIPS64R5EL-NEXT: copy_s.h $7, $w0[1]
+; MIPS64R5EL-NEXT: copy_s.h $8, $w0[0]
+; MIPS64R5EL-NEXT: andi $7, $7, 255
+; MIPS64R5EL-NEXT: dinsm $8, $7, 8, 56
+; MIPS64R5EL-NEXT: andi $6, $6, 255
+; MIPS64R5EL-NEXT: dsll $6, $6, 16
+; MIPS64R5EL-NEXT: or $6, $8, $6
+; MIPS64R5EL-NEXT: andi $5, $5, 255
+; MIPS64R5EL-NEXT: dsll $5, $5, 24
+; MIPS64R5EL-NEXT: or $5, $6, $5
+; MIPS64R5EL-NEXT: andi $4, $4, 255
+; MIPS64R5EL-NEXT: dsll $4, $4, 32
+; MIPS64R5EL-NEXT: or $4, $5, $4
+; MIPS64R5EL-NEXT: andi $3, $3, 255
+; MIPS64R5EL-NEXT: dsll $3, $3, 40
+; MIPS64R5EL-NEXT: or $3, $4, $3
+; MIPS64R5EL-NEXT: andi $1, $1, 255
+; MIPS64R5EL-NEXT: dsll $1, $1, 48
+; MIPS64R5EL-NEXT: or $1, $3, $1
+; MIPS64R5EL-NEXT: dsll $2, $2, 56
+; MIPS64R5EL-NEXT: or $2, $1, $2
+; MIPS64R5EL-NEXT: jr $ra
+; MIPS64R5EL-NEXT: nop
%1 = add <8 x i8> %a, %b
ret <8 x i8> %1
}
@@ -1221,102 +1371,101 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
;
; MIPS32R5EB-LABEL: i16_2:
; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -64
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 64
-; MIPS32R5EB-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -48
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48
+; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill
; MIPS32R5EB-NEXT: .cfi_offset 31, -4
; MIPS32R5EB-NEXT: .cfi_offset 30, -8
; MIPS32R5EB-NEXT: move $fp, $sp
; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
; MIPS32R5EB-NEXT: addiu $1, $zero, -16
; MIPS32R5EB-NEXT: and $sp, $sp, $1
-; MIPS32R5EB-NEXT: sw $5, 48($sp)
-; MIPS32R5EB-NEXT: sw $4, 52($sp)
-; MIPS32R5EB-NEXT: lhu $1, 50($sp)
-; MIPS32R5EB-NEXT: sw $1, 28($sp)
-; MIPS32R5EB-NEXT: lhu $1, 48($sp)
+; MIPS32R5EB-NEXT: sw $5, 28($sp)
+; MIPS32R5EB-NEXT: srl $1, $5, 16
; MIPS32R5EB-NEXT: sw $1, 20($sp)
-; MIPS32R5EB-NEXT: lhu $1, 54($sp)
-; MIPS32R5EB-NEXT: sw $1, 12($sp)
-; MIPS32R5EB-NEXT: lhu $1, 52($sp)
+; MIPS32R5EB-NEXT: sw $4, 12($sp)
+; MIPS32R5EB-NEXT: srl $1, $4, 16
; MIPS32R5EB-NEXT: sw $1, 4($sp)
; MIPS32R5EB-NEXT: ld.d $w0, 16($sp)
; MIPS32R5EB-NEXT: ld.d $w1, 0($sp)
; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1]
; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT: sh $2, 46($sp)
-; MIPS32R5EB-NEXT: sh $1, 44($sp)
-; MIPS32R5EB-NEXT: lw $2, 44($sp)
+; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1]
+; MIPS32R5EB-NEXT: ins $2, $1, 16, 16
; MIPS32R5EB-NEXT: move $sp, $fp
-; MIPS32R5EB-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 64
+; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 48
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
-; MIPS64R5-LABEL: i16_2:
-; MIPS64R5: # %bb.0:
-; MIPS64R5-NEXT: daddiu $sp, $sp, -16
-; MIPS64R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT: sw $5, 8($sp)
-; MIPS64R5-NEXT: sw $4, 12($sp)
-; MIPS64R5-NEXT: lh $1, 10($sp)
-; MIPS64R5-NEXT: lh $2, 8($sp)
-; MIPS64R5-NEXT: insert.d $w0[0], $2
-; MIPS64R5-NEXT: insert.d $w0[1], $1
-; MIPS64R5-NEXT: lh $1, 14($sp)
-; MIPS64R5-NEXT: lh $2, 12($sp)
-; MIPS64R5-NEXT: insert.d $w1[0], $2
-; MIPS64R5-NEXT: insert.d $w1[1], $1
-; MIPS64R5-NEXT: addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT: copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT: sh $2, 6($sp)
-; MIPS64R5-NEXT: sh $1, 4($sp)
-; MIPS64R5-NEXT: lw $2, 4($sp)
-; MIPS64R5-NEXT: daddiu $sp, $sp, 16
-; MIPS64R5-NEXT: jr $ra
-; MIPS64R5-NEXT: nop
+; MIPS64R5EB-LABEL: i16_2:
+; MIPS64R5EB: # %bb.0:
+; MIPS64R5EB-NEXT: sll $1, $5, 0
+; MIPS64R5EB-NEXT: srl $1, $1, 16
+; MIPS64R5EB-NEXT: insert.d $w0[0], $1
+; MIPS64R5EB-NEXT: insert.d $w0[1], $5
+; MIPS64R5EB-NEXT: sll $1, $4, 0
+; MIPS64R5EB-NEXT: srl $1, $1, 16
+; MIPS64R5EB-NEXT: insert.d $w1[0], $1
+; MIPS64R5EB-NEXT: insert.d $w1[1], $4
+; MIPS64R5EB-NEXT: addv.d $w0, $w1, $w0
+; MIPS64R5EB-NEXT: shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT: copy_s.w $2, $w0[3]
+; MIPS64R5EB-NEXT: copy_s.w $1, $w0[1]
+; MIPS64R5EB-NEXT: ins $2, $1, 16, 16
+; MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
;
; MIPS32R5EL-LABEL: i16_2:
; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -64
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 64
-; MIPS32R5EL-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -48
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48
+; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill
; MIPS32R5EL-NEXT: .cfi_offset 31, -4
; MIPS32R5EL-NEXT: .cfi_offset 30, -8
; MIPS32R5EL-NEXT: move $fp, $sp
; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
; MIPS32R5EL-NEXT: addiu $1, $zero, -16
; MIPS32R5EL-NEXT: and $sp, $sp, $1
-; MIPS32R5EL-NEXT: sw $5, 48($sp)
-; MIPS32R5EL-NEXT: sw $4, 52($sp)
-; MIPS32R5EL-NEXT: lhu $1, 50($sp)
+; MIPS32R5EL-NEXT: sw $5, 16($sp)
+; MIPS32R5EL-NEXT: srl $1, $5, 16
; MIPS32R5EL-NEXT: sw $1, 24($sp)
-; MIPS32R5EL-NEXT: lhu $1, 48($sp)
-; MIPS32R5EL-NEXT: sw $1, 16($sp)
-; MIPS32R5EL-NEXT: lhu $1, 54($sp)
+; MIPS32R5EL-NEXT: sw $4, 0($sp)
+; MIPS32R5EL-NEXT: srl $1, $4, 16
; MIPS32R5EL-NEXT: sw $1, 8($sp)
-; MIPS32R5EL-NEXT: lhu $1, 52($sp)
-; MIPS32R5EL-NEXT: sw $1, 0($sp)
; MIPS32R5EL-NEXT: ld.d $w0, 16($sp)
; MIPS32R5EL-NEXT: ld.d $w1, 0($sp)
; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0
-; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT: sh $2, 46($sp)
-; MIPS32R5EL-NEXT: sh $1, 44($sp)
-; MIPS32R5EL-NEXT: lw $2, 44($sp)
+; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0]
+; MIPS32R5EL-NEXT: copy_s.w $1, $w0[2]
+; MIPS32R5EL-NEXT: ins $2, $1, 16, 16
; MIPS32R5EL-NEXT: move $sp, $fp
-; MIPS32R5EL-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 64
+; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
+;
+; MIPS64R5EL-LABEL: i16_2:
+; MIPS64R5EL: # %bb.0:
+; MIPS64R5EL-NEXT: sll $1, $5, 0
+; MIPS64R5EL-NEXT: srl $1, $1, 16
+; MIPS64R5EL-NEXT: insert.d $w0[0], $5
+; MIPS64R5EL-NEXT: insert.d $w0[1], $1
+; MIPS64R5EL-NEXT: sll $1, $4, 0
+; MIPS64R5EL-NEXT: srl $1, $1, 16
+; MIPS64R5EL-NEXT: insert.d $w1[0], $4
+; MIPS64R5EL-NEXT: insert.d $w1[1], $1
+; MIPS64R5EL-NEXT: addv.d $w0, $w1, $w0
+; MIPS64R5EL-NEXT: copy_s.w $2, $w0[0]
+; MIPS64R5EL-NEXT: copy_s.w $1, $w0[2]
+; MIPS64R5EL-NEXT: ins $2, $1, 16, 16
+; MIPS64R5EL-NEXT: jr $ra
+; MIPS64R5EL-NEXT: nop
%1 = add <2 x i16> %a, %b
ret <2 x i16> %1
}
@@ -1427,41 +1576,44 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) {
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
-; MIPS64R5-LABEL: i16_4:
-; MIPS64R5: # %bb.0:
-; MIPS64R5-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT: sd $5, 16($sp)
-; MIPS64R5-NEXT: sd $4, 24($sp)
-; MIPS64R5-NEXT: lhu $1, 18($sp)
-; MIPS64R5-NEXT: lhu $2, 16($sp)
-; MIPS64R5-NEXT: insert.w $w0[0], $2
-; MIPS64R5-NEXT: insert.w $w0[1], $1
-; MIPS64R5-NEXT: lhu $1, 20($sp)
-; MIPS64R5-NEXT: insert.w $w0[2], $1
-; MIPS64R5-NEXT: lhu $1, 22($sp)
-; MIPS64R5-NEXT: insert.w $w0[3], $1
-; MIPS64R5-NEXT: lhu $1, 26($sp)
-; MIPS64R5-NEXT: lhu $2, 24($sp)
-; MIPS64R5-NEXT: insert.w $w1[0], $2
-; MIPS64R5-NEXT: insert.w $w1[1], $1
-; MIPS64R5-NEXT: lhu $1, 28($sp)
-; MIPS64R5-NEXT: insert.w $w1[2], $1
-; MIPS64R5-NEXT: lhu $1, 30($sp)
-; MIPS64R5-NEXT: insert.w $w1[3], $1
-; MIPS64R5-NEXT: addv.w $w0, $w1, $w0
-; MIPS64R5-NEXT: copy_s.w $1, $w0[0]
-; MIPS64R5-NEXT: copy_s.w $2, $w0[1]
-; MIPS64R5-NEXT: copy_s.w $3, $w0[2]
-; MIPS64R5-NEXT: copy_s.w $4, $w0[3]
-; MIPS64R5-NEXT: sh $4, 14($sp)
-; MIPS64R5-NEXT: sh $3, 12($sp)
-; MIPS64R5-NEXT: sh $2, 10($sp)
-; MIPS64R5-NEXT: sh $1, 8($sp)
-; MIPS64R5-NEXT: ld $2, 8($sp)
-; MIPS64R5-NEXT: daddiu $sp, $sp, 32
-; MIPS64R5-NEXT: jr $ra
-; MIPS64R5-NEXT: nop
+; MIPS64R5EB-LABEL: i16_4:
+; MIPS64R5EB: # %bb.0:
+; MIPS64R5EB-NEXT: dsrl $1, $5, 32
+; MIPS64R5EB-NEXT: sll $1, $1, 0
+; MIPS64R5EB-NEXT: dsrl $2, $5, 48
+; MIPS64R5EB-NEXT: sll $2, $2, 0
+; MIPS64R5EB-NEXT: insert.w $w0[0], $2
+; MIPS64R5EB-NEXT: insert.w $w0[1], $1
+; MIPS64R5EB-NEXT: dsrl $1, $4, 32
+; MIPS64R5EB-NEXT: dsrl $2, $5, 16
+; MIPS64R5EB-NEXT: sll $2, $2, 0
+; MIPS64R5EB-NEXT: insert.w $w0[2], $2
+; MIPS64R5EB-NEXT: sll $1, $1, 0
+; MIPS64R5EB-NEXT: dsrl $2, $4, 48
+; MIPS64R5EB-NEXT: sll $2, $2, 0
+; MIPS64R5EB-NEXT: sll $3, $5, 0
+; MIPS64R5EB-NEXT: insert.w $w0[3], $3
+; MIPS64R5EB-NEXT: insert.w $w1[0], $2
+; MIPS64R5EB-NEXT: insert.w $w1[1], $1
+; MIPS64R5EB-NEXT: dsrl $1, $4, 16
+; MIPS64R5EB-NEXT: sll $1, $1, 0
+; MIPS64R5EB-NEXT: insert.w $w1[2], $1
+; MIPS64R5EB-NEXT: sll $1, $4, 0
+; MIPS64R5EB-NEXT: insert.w $w1[3], $1
+; MIPS64R5EB-NEXT: addv.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT: copy_s.w $1, $w0[0]
+; MIPS64R5EB-NEXT: copy_s.w $2, $w0[1]
+; MIPS64R5EB-NEXT: copy_s.w $3, $w0[2]
+; MIPS64R5EB-NEXT: copy_s.w $4, $w0[3]
+; MIPS64R5EB-NEXT: andi $3, $3, 65535
+; MIPS64R5EB-NEXT: dinsm $4, $3, 16, 48
+; MIPS64R5EB-NEXT: andi $2, $2, 65535
+; MIPS64R5EB-NEXT: dsll $2, $2, 32
+; MIPS64R5EB-NEXT: or $2, $4, $2
+; MIPS64R5EB-NEXT: dsll $1, $1, 48
+; MIPS64R5EB-NEXT: or $2, $2, $1
+; MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
;
; MIPS32R5EL-LABEL: i16_4:
; MIPS32R5EL: # %bb.0:
@@ -1517,6 +1669,45 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) {
; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
+;
+; MIPS64R5EL-LABEL: i16_4:
+; MIPS64R5EL: # %bb.0:
+; MIPS64R5EL-NEXT: dsrl $1, $5, 16
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: sll $2, $5, 0
+; MIPS64R5EL-NEXT: insert.w $w0[0], $2
+; MIPS64R5EL-NEXT: insert.w $w0[1], $1
+; MIPS64R5EL-NEXT: dsrl $1, $4, 16
+; MIPS64R5EL-NEXT: dsrl $2, $5, 32
+; MIPS64R5EL-NEXT: sll $2, $2, 0
+; MIPS64R5EL-NEXT: insert.w $w0[2], $2
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: sll $2, $4, 0
+; MIPS64R5EL-NEXT: dsrl $3, $5, 48
+; MIPS64R5EL-NEXT: sll $3, $3, 0
+; MIPS64R5EL-NEXT: insert.w $w0[3], $3
+; MIPS64R5EL-NEXT: insert.w $w1[0], $2
+; MIPS64R5EL-NEXT: insert.w $w1[1], $1
+; MIPS64R5EL-NEXT: dsrl $1, $4, 32
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: insert.w $w1[2], $1
+; MIPS64R5EL-NEXT: dsrl $1, $4, 48
+; MIPS64R5EL-NEXT: sll $1, $1, 0
+; MIPS64R5EL-NEXT: insert.w $w1[3], $1
+; MIPS64R5EL-NEXT: addv.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT: copy_s.w $1, $w0[3]
+; MIPS64R5EL-NEXT: copy_s.w $2, $w0[2]
+; MIPS64R5EL-NEXT: copy_s.w $3, $w0[1]
+; MIPS64R5EL-NEXT: copy_s.w $4, $w0[0]
+; MIPS64R5EL-NEXT: andi $3, $3, 65535
+; MIPS64R5EL-NEXT: dinsm $4, $3, 16, 48
+; MIPS64R5EL-NEXT: andi $2, $2, 65535
+; MIPS64R5EL-NEXT: dsll $2, $2, 32
+; MIPS64R5EL-NEXT: or $2, $4, $2
+; MIPS64R5EL-NEXT: dsll $1, $1, 48
+; MIPS64R5EL-NEXT: or $2, $2, $1
+; MIPS64R5EL-NEXT: jr $ra
+; MIPS64R5EL-NEXT: nop
%1 = add <4 x i16> %a, %b
ret <4 x i16> %1
}
@@ -1749,8 +1940,6 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
;
; MIPS64R5EB-LABEL: i32_2:
; MIPS64R5EB: # %bb.0:
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32
; MIPS64R5EB-NEXT: dsrl $1, $5, 32
; MIPS64R5EB-NEXT: insert.d $w0[0], $1
; MIPS64R5EB-NEXT: insert.d $w0[1], $5
@@ -1758,12 +1947,12 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
; MIPS64R5EB-NEXT: insert.d $w1[0], $1
; MIPS64R5EB-NEXT: insert.d $w1[1], $4
; MIPS64R5EB-NEXT: addv.d $w0, $w1, $w0
-; MIPS64R5EB-NEXT: copy_s.d $1, $w0[0]
-; MIPS64R5EB-NEXT: copy_s.d $2, $w0[1]
-; MIPS64R5EB-NEXT: sw $2, 12($sp)
-; MIPS64R5EB-NEXT: sw $1, 8($sp)
-; MIPS64R5EB-NEXT: ld $2, 8($sp)
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT: shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT: copy_s.w $1, $w0[1]
+; MIPS64R5EB-NEXT: copy_s.w $2, $w0[3]
+; MIPS64R5EB-NEXT: dext $2, $2, 0, 32
+; MIPS64R5EB-NEXT: dsll $1, $1, 32
+; MIPS64R5EB-NEXT: or $2, $2, $1
; MIPS64R5EB-NEXT: jr $ra
; MIPS64R5EB-NEXT: nop
;
@@ -1797,23 +1986,18 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
;
; MIPS64R5EL-LABEL: i32_2:
; MIPS64R5EL: # %bb.0:
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT: sd $5, 16($sp)
-; MIPS64R5EL-NEXT: sd $4, 24($sp)
-; MIPS64R5EL-NEXT: lw $1, 20($sp)
+; MIPS64R5EL-NEXT: dsrl $1, $5, 32
; MIPS64R5EL-NEXT: insert.d $w0[0], $5
; MIPS64R5EL-NEXT: insert.d $w0[1], $1
-; MIPS64R5EL-NEXT: lw $1, 28($sp)
+; MIPS64R5EL-NEXT: dsrl $1, $4, 32
; MIPS64R5EL-NEXT: insert.d $w1[0], $4
; MIPS64R5EL-NEXT: insert.d $w1[1], $1
; MIPS64R5EL-NEXT: addv.d $w0, $w1, $w0
-; MIPS64R5EL-NEXT: copy_s.d $1, $w0[0]
-; MIPS64R5EL-NEXT: copy_s.d $2, $w0[1]
-; MIPS64R5EL-NEXT: sw $2, 12($sp)
-; MIPS64R5EL-NEXT: sw $1, 8($sp)
-; MIPS64R5EL-NEXT: ld $2, 8($sp)
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT: copy_s.w $1, $w0[2]
+; MIPS64R5EL-NEXT: copy_s.w $2, $w0[0]
+; MIPS64R5EL-NEXT: dext $2, $2, 0, 32
+; MIPS64R5EL-NEXT: dsll $1, $1, 32
+; MIPS64R5EL-NEXT: or $2, $2, $1
; MIPS64R5EL-NEXT: jr $ra
; MIPS64R5EL-NEXT: nop
%1 = add <2 x i32> %a, %b
@@ -3424,9 +3608,9 @@ define void @call_i8_4() {
;
; MIPS32R5EB-LABEL: call_i8_4:
; MIPS32R5EB: # %bb.0: # %entry
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -24
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24
+; MIPS32R5EB-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MIPS32R5EB-NEXT: .cfi_offset 31, -4
; MIPS32R5EB-NEXT: lui $1, 1543
; MIPS32R5EB-NEXT: ori $4, $1, 2314
@@ -3436,17 +3620,17 @@ define void @call_i8_4() {
; MIPS32R5EB-NEXT: nop
; MIPS32R5EB-NEXT: lui $1, %hi(gv4i8)
; MIPS32R5EB-NEXT: sw $2, %lo(gv4i8)($1)
-; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 24
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
; MIPS64R5EB-LABEL: call_i8_4:
; MIPS64R5EB: # %bb.0: # %entry
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EB-NEXT: .cfi_offset 31, -8
; MIPS64R5EB-NEXT: .cfi_offset 28, -16
; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_4)))
@@ -3461,9 +3645,9 @@ define void @call_i8_4() {
; MIPS64R5EB-NEXT: nop
; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i8)($gp)
; MIPS64R5EB-NEXT: sw $2, 0($1)
-; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EB-NEXT: jr $ra
; MIPS64R5EB-NEXT: nop
;
@@ -3512,9 +3696,9 @@ define void @call_i8_4() {
;
; MIPS32R5EL-LABEL: call_i8_4:
; MIPS32R5EL: # %bb.0: # %entry
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -24
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24
+; MIPS32R5EL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MIPS32R5EL-NEXT: .cfi_offset 31, -4
; MIPS32R5EL-NEXT: lui $1, 2569
; MIPS32R5EL-NEXT: ori $4, $1, 1798
@@ -3523,17 +3707,17 @@ define void @call_i8_4() {
; MIPS32R5EL-NEXT: nop
; MIPS32R5EL-NEXT: lui $1, %hi(gv4i8)
; MIPS32R5EL-NEXT: sw $2, %lo(gv4i8)($1)
-; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: addiu $sp, $sp, 24
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
;
; MIPS64R5EL-LABEL: call_i8_4:
; MIPS64R5EL: # %bb.0: # %entry
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EL-NEXT: .cfi_offset 31, -8
; MIPS64R5EL-NEXT: .cfi_offset 28, -16
; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_4)))
@@ -3547,9 +3731,9 @@ define void @call_i8_4() {
; MIPS64R5EL-NEXT: nop
; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i8)($gp)
; MIPS64R5EL-NEXT: sw $2, 0($1)
-; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EL-NEXT: jr $ra
; MIPS64R5EL-NEXT: nop
entry:
@@ -3641,10 +3825,10 @@ define void @call_i8_8() {
;
; MIPS64R5EB-LABEL: call_i8_8:
; MIPS64R5EB: # %bb.0: # %entry
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EB-NEXT: .cfi_offset 31, -8
; MIPS64R5EB-NEXT: .cfi_offset 28, -16
; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_8)))
@@ -3667,9 +3851,9 @@ define void @call_i8_8() {
; MIPS64R5EB-NEXT: nop
; MIPS64R5EB-NEXT: ld $1, %got_disp(gv8i8)($gp)
; MIPS64R5EB-NEXT: sd $2, 0($1)
-; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EB-NEXT: jr $ra
; MIPS64R5EB-NEXT: nop
;
@@ -3748,10 +3932,10 @@ define void @call_i8_8() {
;
; MIPS64R5EL-LABEL: call_i8_8:
; MIPS64R5EL: # %bb.0: # %entry
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EL-NEXT: .cfi_offset 31, -8
; MIPS64R5EL-NEXT: .cfi_offset 28, -16
; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(call_i8_8)))
@@ -3769,9 +3953,9 @@ define void @call_i8_8() {
; MIPS64R5EL-NEXT: nop
; MIPS64R5EL-NEXT: ld $1, %got_disp(gv8i8)($gp)
; MIPS64R5EL-NEXT: sd $2, 0($1)
-; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EL-NEXT: jr $ra
; MIPS64R5EL-NEXT: nop
entry:
@@ -4059,9 +4243,9 @@ define void @calli16_2() {
;
; MIPS32R5EB-LABEL: calli16_2:
; MIPS32R5EB: # %bb.0: # %entry
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -24
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24
+; MIPS32R5EB-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MIPS32R5EB-NEXT: .cfi_offset 31, -4
; MIPS32R5EB-NEXT: lui $1, 6
; MIPS32R5EB-NEXT: ori $4, $1, 7
@@ -4071,17 +4255,17 @@ define void @calli16_2() {
; MIPS32R5EB-NEXT: nop
; MIPS32R5EB-NEXT: lui $1, %hi(gv2i16)
; MIPS32R5EB-NEXT: sw $2, %lo(gv2i16)($1)
-; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 24
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
; MIPS64R5EB-LABEL: calli16_2:
; MIPS64R5EB: # %bb.0: # %entry
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EB-NEXT: .cfi_offset 31, -8
; MIPS64R5EB-NEXT: .cfi_offset 28, -16
; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_2)))
@@ -4096,9 +4280,9 @@ define void @calli16_2() {
; MIPS64R5EB-NEXT: nop
; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2i16)($gp)
; MIPS64R5EB-NEXT: sw $2, 0($1)
-; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EB-NEXT: jr $ra
; MIPS64R5EB-NEXT: nop
;
@@ -4149,9 +4333,9 @@ define void @calli16_2() {
;
; MIPS32R5EL-LABEL: calli16_2:
; MIPS32R5EL: # %bb.0: # %entry
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -24
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24
+; MIPS32R5EL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
; MIPS32R5EL-NEXT: .cfi_offset 31, -4
; MIPS32R5EL-NEXT: lui $1, 7
; MIPS32R5EL-NEXT: ori $4, $1, 6
@@ -4161,17 +4345,17 @@ define void @calli16_2() {
; MIPS32R5EL-NEXT: nop
; MIPS32R5EL-NEXT: lui $1, %hi(gv2i16)
; MIPS32R5EL-NEXT: sw $2, %lo(gv2i16)($1)
-; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: addiu $sp, $sp, 24
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
;
; MIPS64R5EL-LABEL: calli16_2:
; MIPS64R5EL: # %bb.0: # %entry
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EL-NEXT: .cfi_offset 31, -8
; MIPS64R5EL-NEXT: .cfi_offset 28, -16
; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_2)))
@@ -4186,9 +4370,9 @@ define void @calli16_2() {
; MIPS64R5EL-NEXT: nop
; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2i16)($gp)
; MIPS64R5EL-NEXT: sw $2, 0($1)
-; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EL-NEXT: jr $ra
; MIPS64R5EL-NEXT: nop
entry:
@@ -4282,10 +4466,10 @@ define void @calli16_4() {
;
; MIPS64R5EB-LABEL: calli16_4:
; MIPS64R5EB: # %bb.0: # %entry
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EB-NEXT: .cfi_offset 31, -8
; MIPS64R5EB-NEXT: .cfi_offset 28, -16
; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_4)))
@@ -4308,9 +4492,9 @@ define void @calli16_4() {
; MIPS64R5EB-NEXT: nop
; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i16)($gp)
; MIPS64R5EB-NEXT: sd $2, 0($1)
-; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EB-NEXT: jr $ra
; MIPS64R5EB-NEXT: nop
;
@@ -4398,10 +4582,10 @@ define void @calli16_4() {
;
; MIPS64R5EL-LABEL: calli16_4:
; MIPS64R5EL: # %bb.0: # %entry
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EL-NEXT: .cfi_offset 31, -8
; MIPS64R5EL-NEXT: .cfi_offset 28, -16
; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_4)))
@@ -4424,9 +4608,9 @@ define void @calli16_4() {
; MIPS64R5EL-NEXT: nop
; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i16)($gp)
; MIPS64R5EL-NEXT: sd $2, 0($1)
-; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EL-NEXT: jr $ra
; MIPS64R5EL-NEXT: nop
entry:
@@ -4807,10 +4991,10 @@ define void @calli32_2() {
;
; MIPS64R5EB-LABEL: calli32_2:
; MIPS64R5EB: # %bb.0: # %entry
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EB-NEXT: .cfi_offset 31, -8
; MIPS64R5EB-NEXT: .cfi_offset 28, -16
; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_2)))
@@ -4826,9 +5010,9 @@ define void @calli32_2() {
; MIPS64R5EB-NEXT: nop
; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2i32)($gp)
; MIPS64R5EB-NEXT: sd $2, 0($1)
-; MIPS64R5EB-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EB-NEXT: jr $ra
; MIPS64R5EB-NEXT: nop
;
@@ -4862,10 +5046,10 @@ define void @calli32_2() {
;
; MIPS64R5EL-LABEL: calli32_2:
; MIPS64R5EL: # %bb.0: # %entry
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
; MIPS64R5EL-NEXT: .cfi_offset 31, -8
; MIPS64R5EL-NEXT: .cfi_offset 28, -16
; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_2)))
@@ -4882,9 +5066,9 @@ define void @calli32_2() {
; MIPS64R5EL-NEXT: nop
; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2i32)($gp)
; MIPS64R5EL-NEXT: sd $2, 0($1)
-; MIPS64R5EL-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
; MIPS64R5EL-NEXT: jr $ra
; MIPS64R5EL-NEXT: nop
entry:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 6cb98557c9bc13..97fc0634bfd53a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -281,7 +281,6 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: bic r3, r3, #3
@@ -316,20 +315,14 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vcmp.u32 cs, q1, q4
; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: vmrs r4, p0
-; CHECK-NEXT: and r2, r4, #1
-; CHECK-NEXT: rsbs r5, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r5, #0, #1
-; CHECK-NEXT: ubfx r5, r4, #4, #1
-; CHECK-NEXT: rsbs r5, r5, #0
-; CHECK-NEXT: bfi r2, r5, #1, #1
-; CHECK-NEXT: ubfx r5, r4, #8, #1
-; CHECK-NEXT: ubfx r4, r4, #12, #1
-; CHECK-NEXT: rsbs r5, r5, #0
-; CHECK-NEXT: bfi r2, r5, #2, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #3, #1
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: and r5, r2, #1
+; CHECK-NEXT: ubfx r4, r2, #4, #1
+; CHECK-NEXT: orr.w r4, r5, r4, lsl #1
+; CHECK-NEXT: ubfx r5, r2, #8, #1
+; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: orr.w r4, r4, r5, lsl #2
+; CHECK-NEXT: orr.w r2, r4, r2, lsl #3
; CHECK-NEXT: lsls r4, r2, #31
; CHECK-NEXT: bne .LBB2_12
; CHECK-NEXT: @ %bb.4: @ %else
@@ -352,20 +345,14 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr
; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1
; CHECK-NEXT: vcmp.u32 cs, q2, q4
; CHECK-NEXT: @ implicit-def: $q6
-; CHECK-NEXT: vmrs r4, p0
-; CHECK-NEXT: and r2, r4, #1
-; CHECK-NEXT: rsbs r5, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r5, #0, #1
-; CHECK-NEXT: ubfx r5, r4, #4, #1
-; CHECK-NEXT: rsbs r5, r5, #0
-; CHECK-NEXT: bfi r2, r5, #1, #1
-; CHECK-NEXT: ubfx r5, r4, #8, #1
-; CHECK-NEXT: ubfx r4, r4, #12, #1
-; CHECK-NEXT: rsbs r5, r5, #0
-; CHECK-NEXT: bfi r2, r5, #2, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #3, #1
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: and r5, r2, #1
+; CHECK-NEXT: ubfx r4, r2, #4, #1
+; CHECK-NEXT: orr.w r4, r5, r4, lsl #1
+; CHECK-NEXT: ubfx r5, r2, #8, #1
+; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: orr.w r4, r4, r5, lsl #2
+; CHECK-NEXT: orr.w r2, r4, r2, lsl #3
; CHECK-NEXT: lsls r4, r2, #31
; CHECK-NEXT: bne .LBB2_15
; CHECK-NEXT: @ %bb.9: @ %else15
@@ -432,7 +419,6 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(ptr nocapture readonly %b, ptr
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vadd.f32 q0, q0, r0
-; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index bcd92f81911b26..595bf8b3b294a6 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -277,10 +277,10 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext
; CHECK-LABEL: test_width2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: beq .LBB5_3
-; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r7, pc}
+; CHECK-NEXT: .LBB5_1: @ %for.body.preheader
; CHECK-NEXT: adds r0, r2, #1
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: bic r0, r0, #1
@@ -291,32 +291,24 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.64 r2
; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: subs r2, #2
-; CHECK-NEXT: vmrs r3, p0
-; CHECK-NEXT: and r0, r3, #1
-; CHECK-NEXT: ubfx r3, r3, #8, #1
-; CHECK-NEXT: rsb.w r12, r0, #0
-; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r0, r12, #0, #1
; CHECK-NEXT: sub.w r12, r1, #8
-; CHECK-NEXT: bfi r0, r3, #1, #1
-; CHECK-NEXT: lsls r3, r0, #31
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: subs r2, #2
+; CHECK-NEXT: ubfx r3, r0, #8, #1
+; CHECK-NEXT: and r0, r0, #1
+; CHECK-NEXT: orr.w r3, r0, r3, lsl #1
+; CHECK-NEXT: lsls r0, r3, #31
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne.w r3, [r12]
-; CHECK-NEXT: vmovne.32 q0[0], r3
-; CHECK-NEXT: lsls r0, r0, #30
+; CHECK-NEXT: ldrne.w r0, [r12]
+; CHECK-NEXT: vmovne.32 q0[0], r0
+; CHECK-NEXT: lsls r0, r3, #30
; CHECK-NEXT: itt mi
; CHECK-NEXT: ldrmi.w r0, [r12, #4]
; CHECK-NEXT: vmovmi.32 q0[2], r0
-; CHECK-NEXT: vmrs r3, p0
-; CHECK-NEXT: and r0, r3, #1
-; CHECK-NEXT: ubfx r3, r3, #8, #1
-; CHECK-NEXT: rsb.w r12, r0, #0
-; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r0, r12, #0, #1
-; CHECK-NEXT: bfi r0, r3, #1, #1
+; CHECK-NEXT: vmrs r0, p0
+; CHECK-NEXT: ubfx r3, r0, #8, #1
+; CHECK-NEXT: and r0, r0, #1
+; CHECK-NEXT: orr.w r0, r0, r3, lsl #1
; CHECK-NEXT: lsls r3, r0, #31
; CHECK-NEXT: itt ne
; CHECK-NEXT: vmovne r3, s0
@@ -327,8 +319,7 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext
; CHECK-NEXT: strmi r0, [r1, #4]
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: le lr, .LBB5_2
-; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i8 %m, 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index 4934d223209037..00c72e47165f33 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -91,72 +91,76 @@ define void @foo_sext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r4, r5, r7, lr}
; CHECK-LE-NEXT: push {r4, r5, r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: ldrd r12, lr, [r1]
; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: @ implicit-def: $q1
+; CHECK-LE-NEXT: @ implicit-def: $q0
; CHECK-LE-NEXT: rsbs.w r3, r12, #0
-; CHECK-LE-NEXT: vmov q0[2], q0[0], r12, lr
+; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr
; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31
; CHECK-LE-NEXT: csetm r3, lt
; CHECK-LE-NEXT: rsbs.w r4, lr, #0
; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: bfi r1, r3, #0, #8
; CHECK-LE-NEXT: csetm r3, lt
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: movs r4, #0
+; CHECK-LE-NEXT: bfi r1, r3, #8, #8
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1
; CHECK-LE-NEXT: lsls r3, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrne r3, [r2]
-; CHECK-LE-NEXT: vmovne.32 q1[0], r3
+; CHECK-LE-NEXT: vmovne.32 q0[0], r3
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: ldrmi r1, [r2, #4]
-; CHECK-LE-NEXT: vmovmi.32 q1[2], r1
-; CHECK-LE-NEXT: vmov r2, s6
-; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: vmovmi.32 q0[2], r1
+; CHECK-LE-NEXT: vmov r1, s2
+; CHECK-LE-NEXT: vmov r2, s4
; CHECK-LE-NEXT: vmov r3, s0
-; CHECK-LE-NEXT: vmov r4, s4
-; CHECK-LE-NEXT: vmov q1[2], q1[0], r4, r2
-; CHECK-LE-NEXT: rsbs r5, r3, #0
-; CHECK-LE-NEXT: asr.w r12, r2, #31
-; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31
-; CHECK-LE-NEXT: vmov r3, s2
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: asr.w lr, r4, #31
-; CHECK-LE-NEXT: vmov q1[3], q1[1], lr, r12
-; CHECK-LE-NEXT: rsbs r5, r3, #0
-; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1
+; CHECK-LE-NEXT: rsbs r5, r2, #0
+; CHECK-LE-NEXT: asr.w r12, r1, #31
+; CHECK-LE-NEXT: sbcs.w r1, r4, r2, asr #31
+; CHECK-LE-NEXT: vmov r2, s6
+; CHECK-LE-NEXT: asr.w lr, r3, #31
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT: bfi r4, r1, #0, #8
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: bfi r4, r1, #8, #8
+; CHECK-LE-NEXT: and r2, r4, #1
+; CHECK-LE-NEXT: ubfx r1, r4, #8, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: it ne
-; CHECK-LE-NEXT: vstrne d2, [r0]
+; CHECK-LE-NEXT: vstrne d0, [r0]
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
-; CHECK-LE-NEXT: vstrmi d3, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
;
; CHECK-BE-LABEL: foo_sext_v2i64_v2i32:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r4, r5, r7, lr}
; CHECK-BE-NEXT: push {r4, r5, r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: ldrd r12, lr, [r1]
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
-; CHECK-BE-NEXT: mov.w r1, #0
-; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31
-; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr
-; CHECK-BE-NEXT: csetm lr, lt
-; CHECK-BE-NEXT: rsbs.w r3, r12, #0
+; CHECK-BE-NEXT: ldrd lr, r12, [r1]
+; CHECK-BE-NEXT: movs r3, #0
; CHECK-BE-NEXT: @ implicit-def: $q2
-; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31
-; CHECK-BE-NEXT: bfi r1, lr, #0, #1
-; CHECK-BE-NEXT: csetm r3, lt
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: csetm lr, lt
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: bfi r3, lr, #0, #8
+; CHECK-BE-NEXT: csetm r1, lt
+; CHECK-BE-NEXT: bfi r3, r1, #8, #8
+; CHECK-BE-NEXT: and r1, r3, #1
+; CHECK-BE-NEXT: ubfx r3, r3, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
; CHECK-BE-NEXT: lsls r3, r1, #30
; CHECK-BE-NEXT: bpl .LBB5_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
@@ -175,31 +179,33 @@ define void @foo_sext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: .LBB5_4: @ %else2
; CHECK-BE-NEXT: vrev64.32 q0, q2
; CHECK-BE-NEXT: vrev64.32 q2, q1
-; CHECK-BE-NEXT: vmov r2, s3
-; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: vmov r2, s9
+; CHECK-BE-NEXT: movs r4, #0
+; CHECK-BE-NEXT: vmov r1, s3
; CHECK-BE-NEXT: vmov r3, s1
-; CHECK-BE-NEXT: vmov r4, s11
-; CHECK-BE-NEXT: asr.w r12, r2, #31
+; CHECK-BE-NEXT: rsbs r5, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: vmov r2, s11
+; CHECK-BE-NEXT: asr.w r12, r1, #31
; CHECK-BE-NEXT: asr.w lr, r3, #31
-; CHECK-BE-NEXT: rsbs r5, r4, #0
; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12
-; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31
-; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2
-; CHECK-BE-NEXT: vmov r3, s9
-; CHECK-BE-NEXT: csetm r2, lt
+; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1
+; CHECK-BE-NEXT: csetm r1, lt
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: rsbs r5, r3, #0
-; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
-; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: bfi r4, r1, #0, #8
+; CHECK-BE-NEXT: csetm r1, lt
+; CHECK-BE-NEXT: bfi r4, r1, #8, #8
+; CHECK-BE-NEXT: and r1, r4, #1
+; CHECK-BE-NEXT: ubfx r2, r4, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: it mi
; CHECK-BE-NEXT: vstrmi d0, [r0]
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: vstrne d1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r4, r5, r7, pc}
entry:
%0 = load <2 x i32>, ptr %mask, align 4
@@ -215,8 +221,6 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r4, r5, r7, lr}
; CHECK-LE-NEXT: push {r4, r5, r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: ldrd r12, lr, [r1]
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: @ implicit-def: $q0
@@ -226,9 +230,13 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: csetm r3, lt
; CHECK-LE-NEXT: rsbs.w r4, lr, #0
; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: bfi r1, r3, #0, #8
; CHECK-LE-NEXT: csetm r3, lt
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: movs r4, #0
+; CHECK-LE-NEXT: bfi r1, r3, #8, #8
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1
; CHECK-LE-NEXT: lsls r3, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrne r3, [r2]
@@ -237,23 +245,25 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: ldrmi r1, [r2, #4]
; CHECK-LE-NEXT: vmovmi.32 q0[2], r1
-; CHECK-LE-NEXT: vmov r2, s2
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: vmov r3, s4
-; CHECK-LE-NEXT: vmov r4, s0
-; CHECK-LE-NEXT: vmov q0[2], q0[0], r4, r2
-; CHECK-LE-NEXT: rsbs r5, r3, #0
-; CHECK-LE-NEXT: asr.w r12, r2, #31
-; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31
-; CHECK-LE-NEXT: vmov r3, s6
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: asr.w lr, r4, #31
+; CHECK-LE-NEXT: vmov r1, s2
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov r3, s0
+; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1
+; CHECK-LE-NEXT: rsbs r5, r2, #0
+; CHECK-LE-NEXT: asr.w r12, r1, #31
+; CHECK-LE-NEXT: sbcs.w r1, r4, r2, asr #31
+; CHECK-LE-NEXT: vmov r2, s6
+; CHECK-LE-NEXT: asr.w lr, r3, #31
+; CHECK-LE-NEXT: csetm r1, lt
; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12
-; CHECK-LE-NEXT: rsbs r5, r3, #0
-; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT: bfi r4, r1, #0, #8
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: bfi r4, r1, #8, #8
+; CHECK-LE-NEXT: and r2, r4, #1
+; CHECK-LE-NEXT: ubfx r1, r4, #8, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, r3, d0
@@ -262,27 +272,27 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, r2, d1
; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
;
; CHECK-BE-LABEL: foo_sext_v2i64_v2i32_unaligned:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r4, r5, r7, lr}
; CHECK-BE-NEXT: push {r4, r5, r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: ldrd r12, lr, [r1]
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
-; CHECK-BE-NEXT: mov.w r1, #0
-; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31
-; CHECK-BE-NEXT: vmov q0[3], q0[1], r12, lr
-; CHECK-BE-NEXT: csetm lr, lt
-; CHECK-BE-NEXT: rsbs.w r3, r12, #0
+; CHECK-BE-NEXT: ldrd lr, r12, [r1]
+; CHECK-BE-NEXT: movs r3, #0
; CHECK-BE-NEXT: @ implicit-def: $q2
-; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31
-; CHECK-BE-NEXT: bfi r1, lr, #0, #1
-; CHECK-BE-NEXT: csetm r3, lt
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: vmov q0[3], q0[1], lr, r12
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: csetm lr, lt
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: bfi r3, lr, #0, #8
+; CHECK-BE-NEXT: csetm r1, lt
+; CHECK-BE-NEXT: bfi r3, r1, #8, #8
+; CHECK-BE-NEXT: and r1, r3, #1
+; CHECK-BE-NEXT: ubfx r3, r3, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
; CHECK-BE-NEXT: lsls r3, r1, #30
; CHECK-BE-NEXT: bpl .LBB6_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
@@ -301,24 +311,27 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: .LBB6_4: @ %else2
; CHECK-BE-NEXT: vrev64.32 q0, q2
; CHECK-BE-NEXT: vrev64.32 q2, q1
-; CHECK-BE-NEXT: vmov r2, s3
-; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: vmov r2, s9
+; CHECK-BE-NEXT: movs r4, #0
+; CHECK-BE-NEXT: vmov r1, s3
; CHECK-BE-NEXT: vmov r3, s1
-; CHECK-BE-NEXT: vmov r4, s11
-; CHECK-BE-NEXT: asr.w r12, r2, #31
+; CHECK-BE-NEXT: rsbs r5, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: vmov r2, s11
+; CHECK-BE-NEXT: asr.w r12, r1, #31
; CHECK-BE-NEXT: asr.w lr, r3, #31
-; CHECK-BE-NEXT: rsbs r5, r4, #0
; CHECK-BE-NEXT: vmov q1[2], q1[0], lr, r12
-; CHECK-BE-NEXT: sbcs.w r4, r1, r4, asr #31
-; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r2
-; CHECK-BE-NEXT: vmov r3, s9
-; CHECK-BE-NEXT: csetm r2, lt
+; CHECK-BE-NEXT: vmov q1[3], q1[1], r3, r1
+; CHECK-BE-NEXT: csetm r1, lt
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: rsbs r5, r3, #0
-; CHECK-BE-NEXT: sbcs.w r3, r1, r3, asr #31
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
-; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: bfi r4, r1, #0, #8
+; CHECK-BE-NEXT: csetm r1, lt
+; CHECK-BE-NEXT: bfi r4, r1, #8, #8
+; CHECK-BE-NEXT: and r1, r4, #1
+; CHECK-BE-NEXT: ubfx r2, r4, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, r3, d0
@@ -327,7 +340,6 @@ define void @foo_sext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, r2, d1
; CHECK-BE-NEXT: strdne r2, r1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r4, r5, r7, pc}
entry:
%0 = load <2 x i32>, ptr %mask, align 4
@@ -343,8 +355,6 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r4, lr}
; CHECK-LE-NEXT: push {r4, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: ldrd r12, lr, [r1]
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: @ implicit-def: $q0
@@ -355,9 +365,12 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: csetm r3, lt
; CHECK-LE-NEXT: rsbs.w r4, lr, #0
; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: bfi r1, r3, #0, #8
; CHECK-LE-NEXT: csetm r3, lt
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: bfi r1, r3, #8, #8
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1
; CHECK-LE-NEXT: lsls r3, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrne r3, [r2]
@@ -375,36 +388,39 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: csetm r2, lt
; CHECK-LE-NEXT: rsbs r4, r3, #0
; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
+; CHECK-LE-NEXT: bfi r1, r2, #0, #8
; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: bfi r1, r2, #8, #8
+; CHECK-LE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: it ne
; CHECK-LE-NEXT: vstrne d0, [r0]
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r4, pc}
;
; CHECK-BE-LABEL: foo_zext_v2i64_v2i32:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: ldrd r12, lr, [r1]
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
-; CHECK-BE-NEXT: mov.w r1, #0
-; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31
-; CHECK-BE-NEXT: vmov q1[3], q1[1], r12, lr
-; CHECK-BE-NEXT: csetm lr, lt
-; CHECK-BE-NEXT: rsbs.w r3, r12, #0
+; CHECK-BE-NEXT: ldrd lr, r12, [r1]
+; CHECK-BE-NEXT: movs r3, #0
; CHECK-BE-NEXT: @ implicit-def: $q0
-; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31
-; CHECK-BE-NEXT: bfi r1, lr, #0, #1
-; CHECK-BE-NEXT: csetm r3, lt
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: vmov q1[3], q1[1], lr, r12
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: csetm lr, lt
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: bfi r3, lr, #0, #8
+; CHECK-BE-NEXT: csetm r1, lt
+; CHECK-BE-NEXT: bfi r3, r1, #8, #8
+; CHECK-BE-NEXT: and r1, r3, #1
+; CHECK-BE-NEXT: ubfx r3, r3, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
; CHECK-BE-NEXT: lsls r3, r1, #30
; CHECK-BE-NEXT: bpl .LBB7_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
@@ -425,24 +441,26 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: movs r1, #0
; CHECK-BE-NEXT: vrev64.32 q3, q1
; CHECK-BE-NEXT: vrev64.32 q1, q2
-; CHECK-BE-NEXT: vmov r2, s7
+; CHECK-BE-NEXT: vmov r2, s5
; CHECK-BE-NEXT: vand q0, q0, q3
; CHECK-BE-NEXT: rsbs r3, r2, #0
-; CHECK-BE-NEXT: vmov r3, s5
+; CHECK-BE-NEXT: vmov r3, s7
; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31
; CHECK-BE-NEXT: csetm r12, lt
; CHECK-BE-NEXT: rsbs r2, r3, #0
; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31
-; CHECK-BE-NEXT: bfi r1, r12, #0, #1
+; CHECK-BE-NEXT: bfi r1, r12, #0, #8
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: bfi r1, r2, #8, #8
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: it mi
; CHECK-BE-NEXT: vstrmi d0, [r0]
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: vstrne d1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%0 = load <2 x i32>, ptr %mask, align 4
@@ -458,8 +476,6 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r4, lr}
; CHECK-LE-NEXT: push {r4, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: ldrd r12, lr, [r1]
; CHECK-LE-NEXT: movs r1, #0
; CHECK-LE-NEXT: @ implicit-def: $q0
@@ -470,9 +486,12 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: csetm r3, lt
; CHECK-LE-NEXT: rsbs.w r4, lr, #0
; CHECK-LE-NEXT: sbcs.w r4, r1, lr, asr #31
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: bfi r1, r3, #0, #8
; CHECK-LE-NEXT: csetm r3, lt
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: bfi r1, r3, #8, #8
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #1
; CHECK-LE-NEXT: lsls r3, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrne r3, [r2]
@@ -490,9 +509,12 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: csetm r2, lt
; CHECK-LE-NEXT: rsbs r4, r3, #0
; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
+; CHECK-LE-NEXT: bfi r1, r2, #0, #8
; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: bfi r1, r2, #8, #8
+; CHECK-LE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, r3, d0
@@ -501,27 +523,27 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, r2, d1
; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r4, pc}
;
; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: ldrd r12, lr, [r1]
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
-; CHECK-BE-NEXT: mov.w r1, #0
-; CHECK-BE-NEXT: sbcs.w r3, r1, lr, asr #31
-; CHECK-BE-NEXT: vmov q1[3], q1[1], r12, lr
-; CHECK-BE-NEXT: csetm lr, lt
-; CHECK-BE-NEXT: rsbs.w r3, r12, #0
+; CHECK-BE-NEXT: ldrd lr, r12, [r1]
+; CHECK-BE-NEXT: movs r3, #0
; CHECK-BE-NEXT: @ implicit-def: $q0
-; CHECK-BE-NEXT: sbcs.w r3, r1, r12, asr #31
-; CHECK-BE-NEXT: bfi r1, lr, #0, #1
-; CHECK-BE-NEXT: csetm r3, lt
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: vmov q1[3], q1[1], lr, r12
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: csetm lr, lt
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: bfi r3, lr, #0, #8
+; CHECK-BE-NEXT: csetm r1, lt
+; CHECK-BE-NEXT: bfi r3, r1, #8, #8
+; CHECK-BE-NEXT: and r1, r3, #1
+; CHECK-BE-NEXT: ubfx r3, r3, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
; CHECK-BE-NEXT: lsls r3, r1, #30
; CHECK-BE-NEXT: bpl .LBB8_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
@@ -542,17 +564,20 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: movs r1, #0
; CHECK-BE-NEXT: vrev64.32 q3, q1
; CHECK-BE-NEXT: vrev64.32 q1, q2
-; CHECK-BE-NEXT: vmov r2, s7
+; CHECK-BE-NEXT: vmov r2, s5
; CHECK-BE-NEXT: vand q0, q0, q3
; CHECK-BE-NEXT: rsbs r3, r2, #0
-; CHECK-BE-NEXT: vmov r3, s5
+; CHECK-BE-NEXT: vmov r3, s7
; CHECK-BE-NEXT: sbcs.w r2, r1, r2, asr #31
; CHECK-BE-NEXT: csetm r12, lt
; CHECK-BE-NEXT: rsbs r2, r3, #0
; CHECK-BE-NEXT: sbcs.w r2, r1, r3, asr #31
-; CHECK-BE-NEXT: bfi r1, r12, #0, #1
+; CHECK-BE-NEXT: bfi r1, r12, #0, #8
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: bfi r1, r2, #8, #8
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, r3, d0
@@ -561,7 +586,6 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, r2, d1
; CHECK-BE-NEXT: strdne r2, r1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%0 = load <2 x i32>, ptr %mask, align 4
@@ -724,27 +748,17 @@ entry:
define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-LABEL: foo_v4f32_v4f16:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .save {r7, lr}
-; CHECK-LE-NEXT: push {r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vldrh.s32 q0, [r1]
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
-; CHECK-LE-NEXT: vmrs lr, p0
-; CHECK-LE-NEXT: and r1, lr, #1
-; CHECK-LE-NEXT: ubfx r3, lr, #4, #1
-; CHECK-LE-NEXT: rsb.w r12, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r12, #0, #1
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, lr, #8, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: ubfx r3, lr, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r12, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r12, r3, r12, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r3, r12, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #3
; CHECK-LE-NEXT: lsls r3, r1, #31
; CHECK-LE-NEXT: bne .LBB18_6
; CHECK-LE-NEXT: @ %bb.1: @ %else
@@ -760,24 +774,18 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: vldr.16 s2, [r2, #6]
; CHECK-LE-NEXT: vins.f16 s1, s2
; CHECK-LE-NEXT: .LBB18_5: @ %else8
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: vmrs r1, p0
; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1
; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1
; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0
; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-LE-NEXT: and r3, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, s0
@@ -794,8 +802,7 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, s3
; CHECK-LE-NEXT: strmi r1, [r0, #12]
-; CHECK-LE-NEXT: add sp, #4
-; CHECK-LE-NEXT: pop {r7, pc}
+; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .LBB18_6: @ %cond.load
; CHECK-LE-NEXT: vldr.16 s0, [r2]
; CHECK-LE-NEXT: lsls r3, r1, #30
@@ -815,27 +822,17 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) {
;
; CHECK-BE-LABEL: foo_v4f32_v4f16:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .save {r7, lr}
-; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vldrh.s32 q0, [r1]
; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-BE-NEXT: @ implicit-def: $q0
-; CHECK-BE-NEXT: vmrs lr, p0
-; CHECK-BE-NEXT: ubfx r1, lr, #12, #1
-; CHECK-BE-NEXT: ubfx r3, lr, #8, #1
-; CHECK-BE-NEXT: rsb.w r12, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r12, #0, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, lr, #4, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: and r3, lr, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r12, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r12, r3, r12, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r3, r12, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #3
; CHECK-BE-NEXT: lsls r3, r1, #28
; CHECK-BE-NEXT: bmi .LBB18_6
; CHECK-BE-NEXT: @ %bb.1: @ %else
@@ -851,24 +848,18 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: vldr.16 s2, [r2, #6]
; CHECK-BE-NEXT: vins.f16 s1, s2
; CHECK-BE-NEXT: .LBB18_5: @ %else8
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: vmrs r1, p0
; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1
; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1
; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0
; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, s0
@@ -885,8 +876,7 @@ define void @foo_v4f32_v4f16(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, s3
; CHECK-BE-NEXT: strne r1, [r0, #12]
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: pop {r7, pc}
+; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .LBB18_6: @ %cond.load
; CHECK-BE-NEXT: vldr.16 s0, [r2]
; CHECK-BE-NEXT: lsls r3, r1, #29
@@ -915,27 +905,17 @@ entry:
define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-LABEL: foo_v4f32_v4f16_unaligned:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .save {r7, lr}
-; CHECK-LE-NEXT: push {r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vldrh.s32 q0, [r1]
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
-; CHECK-LE-NEXT: vmrs lr, p0
-; CHECK-LE-NEXT: and r1, lr, #1
-; CHECK-LE-NEXT: ubfx r3, lr, #4, #1
-; CHECK-LE-NEXT: rsb.w r12, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r12, #0, #1
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, lr, #8, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: ubfx r3, lr, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r12, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r12, r3, r12, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r3, r12, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #3
; CHECK-LE-NEXT: lsls r3, r1, #31
; CHECK-LE-NEXT: bne .LBB19_6
; CHECK-LE-NEXT: @ %bb.1: @ %else
@@ -951,24 +931,18 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: vldr.16 s2, [r2, #6]
; CHECK-LE-NEXT: vins.f16 s1, s2
; CHECK-LE-NEXT: .LBB19_5: @ %else8
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: vmrs r1, p0
; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1
; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1
; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0
; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-LE-NEXT: and r3, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, s0
@@ -985,8 +959,7 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, s3
; CHECK-LE-NEXT: strmi r1, [r0, #12]
-; CHECK-LE-NEXT: add sp, #4
-; CHECK-LE-NEXT: pop {r7, pc}
+; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .LBB19_6: @ %cond.load
; CHECK-LE-NEXT: vldr.16 s0, [r2]
; CHECK-LE-NEXT: lsls r3, r1, #30
@@ -1006,27 +979,17 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) {
;
; CHECK-BE-LABEL: foo_v4f32_v4f16_unaligned:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .save {r7, lr}
-; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vldrh.s32 q0, [r1]
; CHECK-BE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-BE-NEXT: @ implicit-def: $q0
-; CHECK-BE-NEXT: vmrs lr, p0
-; CHECK-BE-NEXT: ubfx r1, lr, #12, #1
-; CHECK-BE-NEXT: ubfx r3, lr, #8, #1
-; CHECK-BE-NEXT: rsb.w r12, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r12, #0, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, lr, #4, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: and r3, lr, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r12, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r12, r3, r12, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r3, r12, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #3
; CHECK-BE-NEXT: lsls r3, r1, #28
; CHECK-BE-NEXT: bmi .LBB19_6
; CHECK-BE-NEXT: @ %bb.1: @ %else
@@ -1042,24 +1005,18 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: vldr.16 s2, [r2, #6]
; CHECK-BE-NEXT: vins.f16 s1, s2
; CHECK-BE-NEXT: .LBB19_5: @ %else8
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: vmrs r1, p0
; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1
; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1
; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0
; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, s0
@@ -1076,8 +1033,7 @@ define void @foo_v4f32_v4f16_unaligned(ptr %dest, ptr %mask, ptr %src) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, s3
; CHECK-BE-NEXT: strne r1, [r0, #12]
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: pop {r7, pc}
+; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .LBB19_6: @ %cond.load
; CHECK-BE-NEXT: vldr.16 s0, [r2]
; CHECK-BE-NEXT: lsls r3, r1, #29
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index b0a3a6354daa70..9e3c63718e9a58 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -45,24 +45,16 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r1, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrne r2, [r0]
@@ -79,30 +71,21 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(ptr %dest, <4 x i32>
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: ldrmi r0, [r0, #12]
; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i32_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
; CHECK-BE-NEXT: @ implicit-def: $q1
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r1, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: ldrmi r2, [r0]
@@ -120,7 +103,6 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(ptr %dest, <4 x i32>
; CHECK-BE-NEXT: ldrne r0, [r0, #12]
; CHECK-BE-NEXT: vmovne.32 q1[3], r0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -195,24 +177,16 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: zext16_masked_v4i32_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r1, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrhne r2, [r0]
@@ -230,30 +204,21 @@ define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(ptr %dest, <4
; CHECK-LE-NEXT: ldrhmi r0, [r0, #6]
; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
; CHECK-LE-NEXT: vmovlb.s16 q0, q0
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: zext16_masked_v4i32_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: @ implicit-def: $q0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r1, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: ldrhmi r2, [r0]
@@ -272,7 +237,6 @@ define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(ptr %dest, <4
; CHECK-BE-NEXT: vmovne.32 q0[3], r0
; CHECK-BE-NEXT: vmovlb.s16 q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -353,24 +317,16 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: sext16_masked_v4i32_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r1, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrhne r2, [r0]
@@ -388,30 +344,21 @@ define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(ptr %dest, <4
; CHECK-LE-NEXT: ldrhmi r0, [r0, #6]
; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
; CHECK-LE-NEXT: vmovlb.s16 q0, q0
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: sext16_masked_v4i32_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: @ implicit-def: $q0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r1, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: ldrhmi r2, [r0]
@@ -430,7 +377,6 @@ define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(ptr %dest, <4
; CHECK-BE-NEXT: vmovne.32 q0[3], r0
; CHECK-BE-NEXT: vmovlb.s16 q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -553,38 +499,25 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
; CHECK-LE-NEXT: vmrs r1, p0
-; CHECK-LE-NEXT: and r2, r1, #1
-; CHECK-LE-NEXT: rsbs r3, r2, #0
-; CHECK-LE-NEXT: movs r2, #0
-; CHECK-LE-NEXT: bfi r2, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #6, #1
-; CHECK-LE-NEXT: rsbs r1, r1, #0
-; CHECK-LE-NEXT: bfi r2, r1, #7, #1
-; CHECK-LE-NEXT: uxtb r1, r2
-; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrhne r2, [r0]
; CHECK-LE-NEXT: vmovne.16 q0[0], r2
@@ -616,44 +549,30 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16>
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: ldrhmi r0, [r0, #14]
; CHECK-LE-NEXT: vmovmi.16 q0[7], r0
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr
; CHECK-BE-NEXT: @ implicit-def: $q1
; CHECK-BE-NEXT: vmrs r1, p0
-; CHECK-BE-NEXT: ubfx r2, r1, #14, #1
-; CHECK-BE-NEXT: rsbs r3, r2, #0
-; CHECK-BE-NEXT: movs r2, #0
-; CHECK-BE-NEXT: bfi r2, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r2, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #14, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
; CHECK-BE-NEXT: and r1, r1, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #6, #1
-; CHECK-BE-NEXT: rsbs r1, r1, #0
-; CHECK-BE-NEXT: bfi r2, r1, #7, #1
-; CHECK-BE-NEXT: uxtb r1, r2
-; CHECK-BE-NEXT: lsls r2, r2, #24
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-BE-NEXT: lsls r2, r1, #24
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: ldrhmi r2, [r0]
; CHECK-BE-NEXT: vmovmi.16 q1[0], r2
@@ -686,7 +605,6 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(ptr %dest, <8 x i16>
; CHECK-BE-NEXT: ldrhne r0, [r0, #14]
; CHECK-BE-NEXT: vmovne.16 q1[7], r0
; CHECK-BE-NEXT: vrev64.16 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -1221,24 +1139,16 @@ entry:
define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r1, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: ldrne r2, [r0]
@@ -1255,30 +1165,21 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(ptr %dest, <4 x i3
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: ldrmi r0, [r0, #12]
; CHECK-LE-NEXT: vmovmi s3, r0
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
; CHECK-BE-NEXT: @ implicit-def: $q1
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r1, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: ldrmi r2, [r0]
@@ -1296,7 +1197,6 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(ptr %dest, <4 x i3
; CHECK-BE-NEXT: ldrne r0, [r0, #12]
; CHECK-BE-NEXT: vmovne s7, r0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -1417,38 +1317,27 @@ entry:
define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #36
-; CHECK-LE-NEXT: sub sp, #36
+; CHECK-LE-NEXT: .pad #32
+; CHECK-LE-NEXT: sub sp, #32
; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr
; CHECK-LE-NEXT: @ implicit-def: $q0
; CHECK-LE-NEXT: vmrs r1, p0
-; CHECK-LE-NEXT: and r2, r1, #1
-; CHECK-LE-NEXT: rsbs r3, r2, #0
-; CHECK-LE-NEXT: movs r2, #0
-; CHECK-LE-NEXT: bfi r2, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #6, #1
-; CHECK-LE-NEXT: rsbs r1, r1, #0
-; CHECK-LE-NEXT: bfi r2, r1, #7, #1
-; CHECK-LE-NEXT: uxtb r1, r2
-; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: bne .LBB45_9
; CHECK-LE-NEXT: @ %bb.1: @ %else
; CHECK-LE-NEXT: lsls r2, r1, #30
@@ -1472,7 +1361,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16
; CHECK-LE-NEXT: lsls r1, r1, #24
; CHECK-LE-NEXT: bmi .LBB45_16
; CHECK-LE-NEXT: .LBB45_8: @ %else20
-; CHECK-LE-NEXT: add sp, #36
+; CHECK-LE-NEXT: add sp, #32
; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .LBB45_9: @ %cond.load
; CHECK-LE-NEXT: ldrh r2, [r0]
@@ -1530,44 +1419,33 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16
; CHECK-LE-NEXT: strh.w r0, [sp]
; CHECK-LE-NEXT: vldr.16 s4, [sp]
; CHECK-LE-NEXT: vins.f16 s3, s4
-; CHECK-LE-NEXT: add sp, #36
+; CHECK-LE-NEXT: add sp, #32
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #36
-; CHECK-BE-NEXT: sub sp, #36
+; CHECK-BE-NEXT: .pad #32
+; CHECK-BE-NEXT: sub sp, #32
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr
; CHECK-BE-NEXT: @ implicit-def: $q1
; CHECK-BE-NEXT: vmrs r1, p0
-; CHECK-BE-NEXT: ubfx r2, r1, #14, #1
-; CHECK-BE-NEXT: rsbs r3, r2, #0
-; CHECK-BE-NEXT: movs r2, #0
-; CHECK-BE-NEXT: bfi r2, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r2, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #14, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
; CHECK-BE-NEXT: and r1, r1, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #6, #1
-; CHECK-BE-NEXT: rsbs r1, r1, #0
-; CHECK-BE-NEXT: bfi r2, r1, #7, #1
-; CHECK-BE-NEXT: uxtb r1, r2
-; CHECK-BE-NEXT: lsls r2, r2, #24
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-BE-NEXT: lsls r2, r1, #24
; CHECK-BE-NEXT: bmi .LBB45_10
; CHECK-BE-NEXT: @ %bb.1: @ %else
; CHECK-BE-NEXT: lsls r2, r1, #25
@@ -1597,7 +1475,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(ptr %dest, <8 x i16
; CHECK-BE-NEXT: vins.f16 s7, s0
; CHECK-BE-NEXT: .LBB45_9: @ %else20
; CHECK-BE-NEXT: vrev64.16 q0, q1
-; CHECK-BE-NEXT: add sp, #36
+; CHECK-BE-NEXT: add sp, #32
; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .LBB45_10: @ %cond.load
; CHECK-BE-NEXT: ldrh r2, [r0]
@@ -1732,19 +1610,20 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64>
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: vmov r2, r3, d0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: vmov r12, lr, d1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: sbcs.w r2, r1, r3
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: rsbs.w r3, r12, #0
-; CHECK-LE-NEXT: sbcs.w r3, r1, lr
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: vmov r1, lr, d0
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vmov r3, r12, d1
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, lr
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sbcs.w r3, r2, r12
+; CHECK-LE-NEXT: bfi r2, r1, #0, #8
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: bfi r2, r1, #8, #8
+; CHECK-LE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-LE-NEXT: and r2, r2, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: beq .LBB49_2
; CHECK-LE-NEXT: @ %bb.1: @ %cond.load
@@ -1757,7 +1636,6 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64>
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vldrmi d1, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r7, pc}
; CHECK-LE-NEXT: .p2align 3
; CHECK-LE-NEXT: @ %bb.4:
@@ -1769,20 +1647,21 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64>
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vmov r2, r3, d3
-; CHECK-BE-NEXT: vmov r12, lr, d2
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: sbcs.w r2, r1, r2
+; CHECK-BE-NEXT: vmov lr, r2, d2
+; CHECK-BE-NEXT: vmov r12, r3, d3
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r1, lr
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
+; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: sbcs.w r3, r1, r12
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
+; CHECK-BE-NEXT: bfi r1, r2, #0, #8
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: bfi r1, r2, #8, #8
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: bpl .LBB49_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
@@ -1795,7 +1674,6 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64>
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: vldrne d1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r7, pc}
; CHECK-BE-NEXT: .p2align 3
; CHECK-BE-NEXT: @ %bb.4:
@@ -1813,19 +1691,20 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: vmov r2, r3, d2
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: vmov r12, lr, d3
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: sbcs.w r2, r1, r3
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: rsbs.w r3, r12, #0
-; CHECK-LE-NEXT: sbcs.w r3, r1, lr
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: vmov r1, lr, d2
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vmov r3, r12, d3
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, lr
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sbcs.w r3, r2, r12
+; CHECK-LE-NEXT: bfi r2, r1, #0, #8
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: bfi r2, r1, #8, #8
+; CHECK-LE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-LE-NEXT: and r2, r2, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: beq .LBB50_2
; CHECK-LE-NEXT: @ %bb.1: @ %cond.load
@@ -1838,7 +1717,6 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vldrmi d1, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r7, pc}
; CHECK-LE-NEXT: .p2align 3
; CHECK-LE-NEXT: @ %bb.4:
@@ -1850,20 +1728,21 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vmov r2, r3, d1
-; CHECK-BE-NEXT: vmov r12, lr, d0
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: sbcs.w r2, r1, r2
+; CHECK-BE-NEXT: vmov lr, r2, d0
+; CHECK-BE-NEXT: vmov r12, r3, d1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r1, lr
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
+; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: sbcs.w r3, r1, r12
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
+; CHECK-BE-NEXT: bfi r1, r2, #0, #8
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: bfi r1, r2, #8, #8
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: bpl .LBB50_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
@@ -1876,7 +1755,6 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x do
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: vldrne d1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r7, pc}
; CHECK-BE-NEXT: .p2align 3
; CHECK-BE-NEXT: @ %bb.4:
@@ -1912,30 +1790,22 @@ entry:
define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: anyext_v4i16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT: mov.w r12, #0
-; CHECK-LE-NEXT: vmrs r3, p0
-; CHECK-LE-NEXT: and r1, r3, #1
-; CHECK-LE-NEXT: rsbs r2, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: ubfx r2, r3, #4, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
-; CHECK-LE-NEXT: ubfx r2, r3, #8, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #2, #1
-; CHECK-LE-NEXT: ubfx r2, r3, #12, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: beq .LBB52_2
; CHECK-LE-NEXT: @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT: ldrh r2, [r0]
-; CHECK-LE-NEXT: vdup.32 q0, r12
-; CHECK-LE-NEXT: vmov.32 q0[0], r2
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: ldrh r3, [r0]
+; CHECK-LE-NEXT: vdup.32 q0, r2
+; CHECK-LE-NEXT: vmov.32 q0[0], r3
; CHECK-LE-NEXT: b .LBB52_3
; CHECK-LE-NEXT: .LBB52_2:
; CHECK-LE-NEXT: vmov.i32 q0, #0x0
@@ -1952,36 +1822,27 @@ define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(ptr %dest, <4 x i32> %a) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: ldrhmi r0, [r0, #6]
; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: anyext_v4i16_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: mov.w r12, #0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT: vmrs r3, p0
-; CHECK-BE-NEXT: ubfx r1, r3, #12, #1
-; CHECK-BE-NEXT: rsbs r2, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
-; CHECK-BE-NEXT: ubfx r2, r3, #8, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
-; CHECK-BE-NEXT: ubfx r2, r3, #4, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #2, #1
-; CHECK-BE-NEXT: and r2, r3, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: bpl .LBB52_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT: ldrh r2, [r0]
-; CHECK-BE-NEXT: vdup.32 q1, r12
-; CHECK-BE-NEXT: vmov.32 q1[0], r2
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: ldrh r3, [r0]
+; CHECK-BE-NEXT: vdup.32 q1, r2
+; CHECK-BE-NEXT: vmov.32 q1[0], r3
; CHECK-BE-NEXT: b .LBB52_3
; CHECK-BE-NEXT: .LBB52_2:
; CHECK-BE-NEXT: vmov.i32 q1, #0x0
@@ -1999,7 +1860,6 @@ define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(ptr %dest, <4 x i32> %a) {
; CHECK-BE-NEXT: ldrhne r0, [r0, #6]
; CHECK-BE-NEXT: vmovne.32 q1[3], r0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
index 9012fada2bee24..73e7827b7046c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -24,23 +24,15 @@ entry:
define arm_aapcs_vfpcc void @masked_v4i32_align1(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r1, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, s0
@@ -57,29 +49,20 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(ptr %dest, <4 x i32> %a) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, s3
; CHECK-LE-NEXT: strmi r1, [r0, #12]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i32_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r1, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, s4
@@ -96,7 +79,6 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(ptr %dest, <4 x i32> %a) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, s7
; CHECK-BE-NEXT: strne r1, [r0, #12]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -181,37 +163,24 @@ entry:
define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr
; CHECK-LE-NEXT: vmrs r1, p0
-; CHECK-LE-NEXT: and r2, r1, #1
-; CHECK-LE-NEXT: rsbs r3, r2, #0
-; CHECK-LE-NEXT: movs r2, #0
-; CHECK-LE-NEXT: bfi r2, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #6, #1
-; CHECK-LE-NEXT: rsbs r1, r1, #0
-; CHECK-LE-NEXT: bfi r2, r1, #7, #1
-; CHECK-LE-NEXT: uxtb r1, r2
-; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne.u16 r2, q0[0]
; CHECK-LE-NEXT: strhne r2, [r0]
@@ -243,43 +212,29 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7]
; CHECK-LE-NEXT: strhmi r1, [r0, #14]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr
; CHECK-BE-NEXT: vmrs r1, p0
-; CHECK-BE-NEXT: ubfx r2, r1, #14, #1
-; CHECK-BE-NEXT: rsbs r3, r2, #0
-; CHECK-BE-NEXT: movs r2, #0
-; CHECK-BE-NEXT: bfi r2, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r2, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #14, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
; CHECK-BE-NEXT: and r1, r1, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #6, #1
-; CHECK-BE-NEXT: rsbs r1, r1, #0
-; CHECK-BE-NEXT: bfi r2, r1, #7, #1
-; CHECK-BE-NEXT: uxtb r1, r2
-; CHECK-BE-NEXT: lsls r2, r2, #24
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-BE-NEXT: lsls r2, r1, #24
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi.u16 r2, q1[0]
; CHECK-BE-NEXT: strhmi r2, [r0]
@@ -311,7 +266,6 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(ptr %dest, <8 x i16> %a) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne.u16 r1, q1[7]
; CHECK-BE-NEXT: strhne r1, [r0, #14]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -471,23 +425,15 @@ entry:
define arm_aapcs_vfpcc void @masked_v4f32_align1(ptr %dest, <4 x float> %a, <4 x i32> %b) {
; CHECK-LE-LABEL: masked_v4f32_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r3, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, s0
@@ -504,30 +450,21 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(ptr %dest, <4 x float> %a, <4 x
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, s3
; CHECK-LE-NEXT: strmi r1, [r0, #12]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q2, q1
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, s4
@@ -544,7 +481,6 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(ptr %dest, <4 x float> %a, <4 x
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, s7
; CHECK-BE-NEXT: strne r1, [r0, #12]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp ugt <4 x i32> %b, zeroinitializer
@@ -630,37 +566,26 @@ entry:
define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x i16> %b) {
; CHECK-LE-LABEL: masked_v8f16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #36
-; CHECK-LE-NEXT: sub sp, #36
+; CHECK-LE-NEXT: .pad #32
+; CHECK-LE-NEXT: sub sp, #32
; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr
-; CHECK-LE-NEXT: movs r2, #0
; CHECK-LE-NEXT: vmrs r1, p0
; CHECK-LE-NEXT: and r3, r1, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r2, r3, #6, #1
-; CHECK-LE-NEXT: rsbs r1, r1, #0
-; CHECK-LE-NEXT: bfi r2, r1, #7, #1
-; CHECK-LE-NEXT: uxtb r1, r2
-; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: bne .LBB16_9
; CHECK-LE-NEXT: @ %bb.1: @ %else
; CHECK-LE-NEXT: lsls r2, r1, #30
@@ -684,7 +609,7 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x
; CHECK-LE-NEXT: lsls r1, r1, #24
; CHECK-LE-NEXT: bmi .LBB16_16
; CHECK-LE-NEXT: .LBB16_8: @ %else14
-; CHECK-LE-NEXT: add sp, #36
+; CHECK-LE-NEXT: add sp, #32
; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .LBB16_9: @ %cond.store
; CHECK-LE-NEXT: vstr.16 s0, [sp, #28]
@@ -736,44 +661,33 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x
; CHECK-LE-NEXT: vstr.16 s0, [sp]
; CHECK-LE-NEXT: ldrh.w r1, [sp]
; CHECK-LE-NEXT: strh r1, [r0, #14]
-; CHECK-LE-NEXT: add sp, #36
+; CHECK-LE-NEXT: add sp, #32
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #36
-; CHECK-BE-NEXT: sub sp, #36
+; CHECK-BE-NEXT: .pad #32
+; CHECK-BE-NEXT: sub sp, #32
; CHECK-BE-NEXT: vrev64.16 q2, q1
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr
; CHECK-BE-NEXT: vmrs r1, p0
-; CHECK-BE-NEXT: ubfx r2, r1, #14, #1
-; CHECK-BE-NEXT: rsbs r3, r2, #0
-; CHECK-BE-NEXT: movs r2, #0
-; CHECK-BE-NEXT: bfi r2, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r2, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #14, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #3
; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #4
; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #5
; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
; CHECK-BE-NEXT: and r1, r1, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r2, r3, #6, #1
-; CHECK-BE-NEXT: rsbs r1, r1, #0
-; CHECK-BE-NEXT: bfi r2, r1, #7, #1
-; CHECK-BE-NEXT: uxtb r1, r2
-; CHECK-BE-NEXT: lsls r2, r2, #24
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #6
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #7
+; CHECK-BE-NEXT: lsls r2, r1, #24
; CHECK-BE-NEXT: bmi .LBB16_9
; CHECK-BE-NEXT: @ %bb.1: @ %else
; CHECK-BE-NEXT: lsls r2, r1, #25
@@ -797,7 +711,7 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: bne .LBB16_16
; CHECK-BE-NEXT: .LBB16_8: @ %else14
-; CHECK-BE-NEXT: add sp, #36
+; CHECK-BE-NEXT: add sp, #32
; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .LBB16_9: @ %cond.store
; CHECK-BE-NEXT: vstr.16 s4, [sp, #28]
@@ -849,7 +763,7 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(ptr %dest, <8 x half> %a, <8 x
; CHECK-BE-NEXT: vstr.16 s0, [sp]
; CHECK-BE-NEXT: ldrh.w r1, [sp]
; CHECK-BE-NEXT: strh r1, [r0, #14]
-; CHECK-BE-NEXT: add sp, #36
+; CHECK-BE-NEXT: add sp, #32
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp ugt <8 x i16> %b, zeroinitializer
@@ -917,53 +831,53 @@ define arm_aapcs_vfpcc void @masked_v2i64(ptr %dest, <2 x i64> %a) {
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: vmov r2, r3, d0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: vmov r12, lr, d1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: sbcs.w r2, r1, r3
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: rsbs.w r3, r12, #0
-; CHECK-LE-NEXT: sbcs.w r3, r1, lr
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: vmov r1, lr, d0
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vmov r3, r12, d1
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, lr
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sbcs.w r3, r2, r12
+; CHECK-LE-NEXT: bfi r2, r1, #0, #8
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: bfi r2, r1, #8, #8
+; CHECK-LE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-LE-NEXT: and r2, r2, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: it ne
; CHECK-LE-NEXT: vstrne d0, [r0]
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r7, pc}
;
; CHECK-BE-LABEL: masked_v2i64:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vmov r2, r3, d3
-; CHECK-BE-NEXT: vmov r12, lr, d2
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: sbcs.w r2, r1, r2
+; CHECK-BE-NEXT: vmov lr, r2, d2
+; CHECK-BE-NEXT: vmov r12, r3, d3
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r1, lr
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
+; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: sbcs.w r3, r1, r12
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
+; CHECK-BE-NEXT: bfi r1, r2, #0, #8
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: bfi r1, r2, #8, #8
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: it mi
; CHECK-BE-NEXT: vstrmi d0, [r0]
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: vstrne d1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%c = icmp sgt <2 x i64> %a, zeroinitializer
@@ -976,53 +890,53 @@ define arm_aapcs_vfpcc void @masked_v2f64(ptr %dest, <2 x double> %a, <2 x i64>
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: .save {r7, lr}
; CHECK-LE-NEXT: push {r7, lr}
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: vmov r2, r3, d2
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: vmov r12, lr, d3
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: sbcs.w r2, r1, r3
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: rsbs.w r3, r12, #0
-; CHECK-LE-NEXT: sbcs.w r3, r1, lr
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, lt
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: vmov r1, lr, d2
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vmov r3, r12, d3
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, lr
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sbcs.w r3, r2, r12
+; CHECK-LE-NEXT: bfi r2, r1, #0, #8
+; CHECK-LE-NEXT: csetm r1, lt
+; CHECK-LE-NEXT: bfi r2, r1, #8, #8
+; CHECK-LE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-LE-NEXT: and r2, r2, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: it ne
; CHECK-LE-NEXT: vstrne d0, [r0]
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: pop {r7, pc}
;
; CHECK-BE-LABEL: masked_v2f64:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r7, lr}
; CHECK-BE-NEXT: push {r7, lr}
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q2, q1
; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vmov r2, r3, d5
-; CHECK-BE-NEXT: vmov r12, lr, d4
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: sbcs.w r2, r1, r2
+; CHECK-BE-NEXT: vmov lr, r2, d4
+; CHECK-BE-NEXT: vmov r12, r3, d5
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r1, lr
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: rsbs.w r3, lr, #0
+; CHECK-BE-NEXT: rsbs r3, r3, #0
; CHECK-BE-NEXT: sbcs.w r3, r1, r12
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
+; CHECK-BE-NEXT: bfi r1, r2, #0, #8
; CHECK-BE-NEXT: csetm r2, lt
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: bfi r1, r2, #8, #8
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #1
; CHECK-BE-NEXT: lsls r2, r1, #30
; CHECK-BE-NEXT: it mi
; CHECK-BE-NEXT: vstrmi d0, [r0]
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: it ne
; CHECK-BE-NEXT: vstrne d1, [r0, #8]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: pop {r7, pc}
entry:
%c = icmp sgt <2 x i64> %b, zeroinitializer
@@ -1093,23 +1007,15 @@ entry:
define arm_aapcs_vfpcc void @masked_v4i16_align1(ptr %dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-LE-NEXT: vmrs r2, p0
-; CHECK-LE-NEXT: and r1, r2, #1
-; CHECK-LE-NEXT: rsbs r3, r1, #0
-; CHECK-LE-NEXT: movs r1, #0
-; CHECK-LE-NEXT: bfi r1, r3, #0, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #1, #1
-; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
-; CHECK-LE-NEXT: rsbs r3, r3, #0
-; CHECK-LE-NEXT: bfi r1, r3, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
+; CHECK-LE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
+; CHECK-LE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: itt ne
; CHECK-LE-NEXT: vmovne r2, s0
@@ -1126,29 +1032,20 @@ define arm_aapcs_vfpcc void @masked_v4i16_align1(ptr %dest, <4 x i32> %a) {
; CHECK-LE-NEXT: itt mi
; CHECK-LE-NEXT: vmovmi r1, s3
; CHECK-LE-NEXT: strhmi r1, [r0, #6]
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i16_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
-; CHECK-BE-NEXT: vmrs r2, p0
-; CHECK-BE-NEXT: ubfx r1, r2, #12, #1
-; CHECK-BE-NEXT: rsbs r3, r1, #0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: bfi r1, r3, #0, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #1, #1
-; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
-; CHECK-BE-NEXT: and r2, r2, #1
-; CHECK-BE-NEXT: rsbs r3, r3, #0
-; CHECK-BE-NEXT: bfi r1, r3, #2, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: orr.w r2, r3, r2, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: and r1, r1, #1
+; CHECK-BE-NEXT: orr.w r2, r2, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: itt mi
; CHECK-BE-NEXT: vmovmi r2, s4
@@ -1165,7 +1062,6 @@ define arm_aapcs_vfpcc void @masked_v4i16_align1(ptr %dest, <4 x i32> %a) {
; CHECK-BE-NEXT: itt ne
; CHECK-BE-NEXT: vmovne r1, s7
; CHECK-BE-NEXT: strhne r1, [r0, #6]
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -1177,120 +1073,122 @@ entry:
define arm_aapcs_vfpcc void @masked_v4f16_align4(ptr %dest, <4 x float> %a) {
; CHECK-LE-LABEL: masked_v4f16_align4:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.f32 s0, #0
-; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: movs r2, #0
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s1, #0
; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0
; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2
; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1
; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: bfi r2, r1, #0, #4
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s3, #0
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: bfi r2, r1, #4, #4
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-LE-NEXT: bfi r1, r2, #2, #1
-; CHECK-LE-NEXT: csetm r2, gt
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: bfi r2, r1, #8, #4
+; CHECK-LE-NEXT: csetm r1, gt
+; CHECK-LE-NEXT: bfi r2, r1, #12, #4
+; CHECK-LE-NEXT: and r3, r2, #1
+; CHECK-LE-NEXT: ubfx r1, r2, #4, #1
+; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
-; CHECK-LE-NEXT: bne .LBB25_5
+; CHECK-LE-NEXT: bne .LBB25_6
; CHECK-LE-NEXT: @ %bb.1: @ %else
; CHECK-LE-NEXT: lsls r2, r1, #30
-; CHECK-LE-NEXT: bmi .LBB25_6
+; CHECK-LE-NEXT: bmi .LBB25_7
; CHECK-LE-NEXT: .LBB25_2: @ %else2
; CHECK-LE-NEXT: lsls r2, r1, #29
-; CHECK-LE-NEXT: bmi .LBB25_7
-; CHECK-LE-NEXT: .LBB25_3: @ %else4
+; CHECK-LE-NEXT: bpl .LBB25_4
+; CHECK-LE-NEXT: .LBB25_3: @ %cond.store3
+; CHECK-LE-NEXT: vstr.16 s5, [r0, #4]
+; CHECK-LE-NEXT: .LBB25_4: @ %else4
; CHECK-LE-NEXT: lsls r1, r1, #28
-; CHECK-LE-NEXT: bmi .LBB25_8
-; CHECK-LE-NEXT: .LBB25_4: @ %else6
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: it pl
+; CHECK-LE-NEXT: bxpl lr
+; CHECK-LE-NEXT: .LBB25_5: @ %cond.store5
+; CHECK-LE-NEXT: vmovx.f16 s0, s5
+; CHECK-LE-NEXT: vstr.16 s0, [r0, #6]
; CHECK-LE-NEXT: bx lr
-; CHECK-LE-NEXT: .LBB25_5: @ %cond.store
+; CHECK-LE-NEXT: .LBB25_6: @ %cond.store
; CHECK-LE-NEXT: vstr.16 s4, [r0]
; CHECK-LE-NEXT: lsls r2, r1, #30
; CHECK-LE-NEXT: bpl .LBB25_2
-; CHECK-LE-NEXT: .LBB25_6: @ %cond.store1
+; CHECK-LE-NEXT: .LBB25_7: @ %cond.store1
; CHECK-LE-NEXT: vmovx.f16 s0, s4
; CHECK-LE-NEXT: vstr.16 s0, [r0, #2]
; CHECK-LE-NEXT: lsls r2, r1, #29
-; CHECK-LE-NEXT: bpl .LBB25_3
-; CHECK-LE-NEXT: .LBB25_7: @ %cond.store3
-; CHECK-LE-NEXT: vstr.16 s5, [r0, #4]
-; CHECK-LE-NEXT: lsls r1, r1, #28
-; CHECK-LE-NEXT: bpl .LBB25_4
-; CHECK-LE-NEXT: .LBB25_8: @ %cond.store5
-; CHECK-LE-NEXT: vmovx.f16 s0, s5
-; CHECK-LE-NEXT: vstr.16 s0, [r0, #6]
-; CHECK-LE-NEXT: add sp, #4
-; CHECK-LE-NEXT: bx lr
+; CHECK-LE-NEXT: bmi .LBB25_3
+; CHECK-LE-NEXT: b .LBB25_4
;
; CHECK-BE-LABEL: masked_v4f16_align4:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vcmp.f32 s7, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: vcmp.f32 s4, #0
; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s6, #0
+; CHECK-BE-NEXT: vcmp.f32 s5, #0
; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6
; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5
; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s5, #0
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: vcmp.f32 s6, #0
+; CHECK-BE-NEXT: bfi r2, r1, #0, #4
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s4, #0
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: vcmp.f32 s7, #0
+; CHECK-BE-NEXT: bfi r2, r1, #4, #4
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: bfi r1, r2, #2, #1
-; CHECK-BE-NEXT: csetm r2, gt
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: bfi r2, r1, #8, #4
+; CHECK-BE-NEXT: csetm r1, gt
+; CHECK-BE-NEXT: bfi r2, r1, #12, #4
+; CHECK-BE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: and r2, r2, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
-; CHECK-BE-NEXT: bmi .LBB25_5
+; CHECK-BE-NEXT: bmi .LBB25_6
; CHECK-BE-NEXT: @ %bb.1: @ %else
; CHECK-BE-NEXT: lsls r2, r1, #29
-; CHECK-BE-NEXT: bmi .LBB25_6
+; CHECK-BE-NEXT: bmi .LBB25_7
; CHECK-BE-NEXT: .LBB25_2: @ %else2
; CHECK-BE-NEXT: lsls r2, r1, #30
-; CHECK-BE-NEXT: bmi .LBB25_7
-; CHECK-BE-NEXT: .LBB25_3: @ %else4
+; CHECK-BE-NEXT: bpl .LBB25_4
+; CHECK-BE-NEXT: .LBB25_3: @ %cond.store3
+; CHECK-BE-NEXT: vstr.16 s1, [r0, #4]
+; CHECK-BE-NEXT: .LBB25_4: @ %else4
; CHECK-BE-NEXT: lsls r1, r1, #31
-; CHECK-BE-NEXT: bne .LBB25_8
-; CHECK-BE-NEXT: .LBB25_4: @ %else6
-; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: it eq
+; CHECK-BE-NEXT: bxeq lr
+; CHECK-BE-NEXT: .LBB25_5: @ %cond.store5
+; CHECK-BE-NEXT: vmovx.f16 s0, s1
+; CHECK-BE-NEXT: vstr.16 s0, [r0, #6]
; CHECK-BE-NEXT: bx lr
-; CHECK-BE-NEXT: .LBB25_5: @ %cond.store
+; CHECK-BE-NEXT: .LBB25_6: @ %cond.store
; CHECK-BE-NEXT: vstr.16 s0, [r0]
; CHECK-BE-NEXT: lsls r2, r1, #29
; CHECK-BE-NEXT: bpl .LBB25_2
-; CHECK-BE-NEXT: .LBB25_6: @ %cond.store1
+; CHECK-BE-NEXT: .LBB25_7: @ %cond.store1
; CHECK-BE-NEXT: vmovx.f16 s0, s0
; CHECK-BE-NEXT: vstr.16 s0, [r0, #2]
; CHECK-BE-NEXT: lsls r2, r1, #30
-; CHECK-BE-NEXT: bpl .LBB25_3
-; CHECK-BE-NEXT: .LBB25_7: @ %cond.store3
-; CHECK-BE-NEXT: vstr.16 s1, [r0, #4]
-; CHECK-BE-NEXT: lsls r1, r1, #31
-; CHECK-BE-NEXT: beq .LBB25_4
-; CHECK-BE-NEXT: .LBB25_8: @ %cond.store5
-; CHECK-BE-NEXT: vmovx.f16 s0, s1
-; CHECK-BE-NEXT: vstr.16 s0, [r0, #6]
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: bx lr
+; CHECK-BE-NEXT: bmi .LBB25_3
+; CHECK-BE-NEXT: b .LBB25_4
entry:
%c = fcmp ogt <4 x float> %a, zeroinitializer
%trunc = fptrunc <4 x float> %a to <4 x half>
@@ -1301,120 +1199,122 @@ entry:
define arm_aapcs_vfpcc void @masked_v4f16_align2(ptr %dest, <4 x float> %a) {
; CHECK-LE-LABEL: masked_v4f16_align2:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.f32 s0, #0
-; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: movs r2, #0
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s1, #0
; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0
; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2
; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1
; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: bfi r2, r1, #0, #4
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s3, #0
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: bfi r2, r1, #4, #4
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-LE-NEXT: bfi r1, r2, #2, #1
-; CHECK-LE-NEXT: csetm r2, gt
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: bfi r2, r1, #8, #4
+; CHECK-LE-NEXT: csetm r1, gt
+; CHECK-LE-NEXT: bfi r2, r1, #12, #4
+; CHECK-LE-NEXT: and r3, r2, #1
+; CHECK-LE-NEXT: ubfx r1, r2, #4, #1
+; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
-; CHECK-LE-NEXT: bne .LBB26_5
+; CHECK-LE-NEXT: bne .LBB26_6
; CHECK-LE-NEXT: @ %bb.1: @ %else
; CHECK-LE-NEXT: lsls r2, r1, #30
-; CHECK-LE-NEXT: bmi .LBB26_6
+; CHECK-LE-NEXT: bmi .LBB26_7
; CHECK-LE-NEXT: .LBB26_2: @ %else2
; CHECK-LE-NEXT: lsls r2, r1, #29
-; CHECK-LE-NEXT: bmi .LBB26_7
-; CHECK-LE-NEXT: .LBB26_3: @ %else4
+; CHECK-LE-NEXT: bpl .LBB26_4
+; CHECK-LE-NEXT: .LBB26_3: @ %cond.store3
+; CHECK-LE-NEXT: vstr.16 s5, [r0, #4]
+; CHECK-LE-NEXT: .LBB26_4: @ %else4
; CHECK-LE-NEXT: lsls r1, r1, #28
-; CHECK-LE-NEXT: bmi .LBB26_8
-; CHECK-LE-NEXT: .LBB26_4: @ %else6
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: it pl
+; CHECK-LE-NEXT: bxpl lr
+; CHECK-LE-NEXT: .LBB26_5: @ %cond.store5
+; CHECK-LE-NEXT: vmovx.f16 s0, s5
+; CHECK-LE-NEXT: vstr.16 s0, [r0, #6]
; CHECK-LE-NEXT: bx lr
-; CHECK-LE-NEXT: .LBB26_5: @ %cond.store
+; CHECK-LE-NEXT: .LBB26_6: @ %cond.store
; CHECK-LE-NEXT: vstr.16 s4, [r0]
; CHECK-LE-NEXT: lsls r2, r1, #30
; CHECK-LE-NEXT: bpl .LBB26_2
-; CHECK-LE-NEXT: .LBB26_6: @ %cond.store1
+; CHECK-LE-NEXT: .LBB26_7: @ %cond.store1
; CHECK-LE-NEXT: vmovx.f16 s0, s4
; CHECK-LE-NEXT: vstr.16 s0, [r0, #2]
; CHECK-LE-NEXT: lsls r2, r1, #29
-; CHECK-LE-NEXT: bpl .LBB26_3
-; CHECK-LE-NEXT: .LBB26_7: @ %cond.store3
-; CHECK-LE-NEXT: vstr.16 s5, [r0, #4]
-; CHECK-LE-NEXT: lsls r1, r1, #28
-; CHECK-LE-NEXT: bpl .LBB26_4
-; CHECK-LE-NEXT: .LBB26_8: @ %cond.store5
-; CHECK-LE-NEXT: vmovx.f16 s0, s5
-; CHECK-LE-NEXT: vstr.16 s0, [r0, #6]
-; CHECK-LE-NEXT: add sp, #4
-; CHECK-LE-NEXT: bx lr
+; CHECK-LE-NEXT: bmi .LBB26_3
+; CHECK-LE-NEXT: b .LBB26_4
;
; CHECK-BE-LABEL: masked_v4f16_align2:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vcmp.f32 s7, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: vcmp.f32 s4, #0
; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s6, #0
+; CHECK-BE-NEXT: vcmp.f32 s5, #0
; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6
; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5
; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s5, #0
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: vcmp.f32 s6, #0
+; CHECK-BE-NEXT: bfi r2, r1, #0, #4
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s4, #0
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: vcmp.f32 s7, #0
+; CHECK-BE-NEXT: bfi r2, r1, #4, #4
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: bfi r1, r2, #2, #1
-; CHECK-BE-NEXT: csetm r2, gt
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: bfi r2, r1, #8, #4
+; CHECK-BE-NEXT: csetm r1, gt
+; CHECK-BE-NEXT: bfi r2, r1, #12, #4
+; CHECK-BE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: and r2, r2, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
-; CHECK-BE-NEXT: bmi .LBB26_5
+; CHECK-BE-NEXT: bmi .LBB26_6
; CHECK-BE-NEXT: @ %bb.1: @ %else
; CHECK-BE-NEXT: lsls r2, r1, #29
-; CHECK-BE-NEXT: bmi .LBB26_6
+; CHECK-BE-NEXT: bmi .LBB26_7
; CHECK-BE-NEXT: .LBB26_2: @ %else2
; CHECK-BE-NEXT: lsls r2, r1, #30
-; CHECK-BE-NEXT: bmi .LBB26_7
-; CHECK-BE-NEXT: .LBB26_3: @ %else4
+; CHECK-BE-NEXT: bpl .LBB26_4
+; CHECK-BE-NEXT: .LBB26_3: @ %cond.store3
+; CHECK-BE-NEXT: vstr.16 s1, [r0, #4]
+; CHECK-BE-NEXT: .LBB26_4: @ %else4
; CHECK-BE-NEXT: lsls r1, r1, #31
-; CHECK-BE-NEXT: bne .LBB26_8
-; CHECK-BE-NEXT: .LBB26_4: @ %else6
-; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: it eq
+; CHECK-BE-NEXT: bxeq lr
+; CHECK-BE-NEXT: .LBB26_5: @ %cond.store5
+; CHECK-BE-NEXT: vmovx.f16 s0, s1
+; CHECK-BE-NEXT: vstr.16 s0, [r0, #6]
; CHECK-BE-NEXT: bx lr
-; CHECK-BE-NEXT: .LBB26_5: @ %cond.store
+; CHECK-BE-NEXT: .LBB26_6: @ %cond.store
; CHECK-BE-NEXT: vstr.16 s0, [r0]
; CHECK-BE-NEXT: lsls r2, r1, #29
; CHECK-BE-NEXT: bpl .LBB26_2
-; CHECK-BE-NEXT: .LBB26_6: @ %cond.store1
+; CHECK-BE-NEXT: .LBB26_7: @ %cond.store1
; CHECK-BE-NEXT: vmovx.f16 s0, s0
; CHECK-BE-NEXT: vstr.16 s0, [r0, #2]
; CHECK-BE-NEXT: lsls r2, r1, #30
-; CHECK-BE-NEXT: bpl .LBB26_3
-; CHECK-BE-NEXT: .LBB26_7: @ %cond.store3
-; CHECK-BE-NEXT: vstr.16 s1, [r0, #4]
-; CHECK-BE-NEXT: lsls r1, r1, #31
-; CHECK-BE-NEXT: beq .LBB26_4
-; CHECK-BE-NEXT: .LBB26_8: @ %cond.store5
-; CHECK-BE-NEXT: vmovx.f16 s0, s1
-; CHECK-BE-NEXT: vstr.16 s0, [r0, #6]
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: bx lr
+; CHECK-BE-NEXT: bmi .LBB26_3
+; CHECK-BE-NEXT: b .LBB26_4
entry:
%c = fcmp ogt <4 x float> %a, zeroinitializer
%trunc = fptrunc <4 x float> %a to <4 x half>
@@ -1425,29 +1325,36 @@ entry:
define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) {
; CHECK-LE-LABEL: masked_v4f16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #20
-; CHECK-LE-NEXT: sub sp, #20
+; CHECK-LE-NEXT: .pad #16
+; CHECK-LE-NEXT: sub sp, #16
; CHECK-LE-NEXT: vcmp.f32 s0, #0
-; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: movs r2, #0
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s1, #0
; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0
; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2
; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1
; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s2, #0
-; CHECK-LE-NEXT: bfi r1, r2, #0, #1
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: bfi r2, r1, #0, #4
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-LE-NEXT: vcmp.f32 s3, #0
-; CHECK-LE-NEXT: bfi r1, r2, #1, #1
-; CHECK-LE-NEXT: csetm r2, gt
+; CHECK-LE-NEXT: bfi r2, r1, #4, #4
+; CHECK-LE-NEXT: csetm r1, gt
; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-LE-NEXT: bfi r1, r2, #2, #1
-; CHECK-LE-NEXT: csetm r2, gt
-; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: bfi r2, r1, #8, #4
+; CHECK-LE-NEXT: csetm r1, gt
+; CHECK-LE-NEXT: bfi r2, r1, #12, #4
+; CHECK-LE-NEXT: and r3, r2, #1
+; CHECK-LE-NEXT: ubfx r1, r2, #4, #1
+; CHECK-LE-NEXT: orr.w r1, r3, r1, lsl #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r3, lsl #2
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3
; CHECK-LE-NEXT: lsls r2, r1, #31
; CHECK-LE-NEXT: bne .LBB27_5
; CHECK-LE-NEXT: @ %bb.1: @ %else
@@ -1460,7 +1367,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) {
; CHECK-LE-NEXT: lsls r1, r1, #28
; CHECK-LE-NEXT: bmi .LBB27_8
; CHECK-LE-NEXT: .LBB27_4: @ %else6
-; CHECK-LE-NEXT: add sp, #20
+; CHECK-LE-NEXT: add sp, #16
; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .LBB27_5: @ %cond.store
; CHECK-LE-NEXT: vstr.16 s4, [sp, #12]
@@ -1486,35 +1393,42 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) {
; CHECK-LE-NEXT: vstr.16 s0, [sp]
; CHECK-LE-NEXT: ldrh.w r1, [sp]
; CHECK-LE-NEXT: strh r1, [r0, #6]
-; CHECK-LE-NEXT: add sp, #20
+; CHECK-LE-NEXT: add sp, #16
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f16_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #20
-; CHECK-BE-NEXT: sub sp, #20
+; CHECK-BE-NEXT: .pad #16
+; CHECK-BE-NEXT: sub sp, #16
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: movs r1, #0
-; CHECK-BE-NEXT: vcmp.f32 s7, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: vcmp.f32 s4, #0
; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s6, #0
+; CHECK-BE-NEXT: vcmp.f32 s5, #0
; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6
; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5
; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s5, #0
-; CHECK-BE-NEXT: bfi r1, r2, #0, #1
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: vcmp.f32 s6, #0
+; CHECK-BE-NEXT: bfi r2, r1, #0, #4
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: vcmp.f32 s4, #0
-; CHECK-BE-NEXT: bfi r1, r2, #1, #1
-; CHECK-BE-NEXT: csetm r2, gt
+; CHECK-BE-NEXT: vcmp.f32 s7, #0
+; CHECK-BE-NEXT: bfi r2, r1, #4, #4
+; CHECK-BE-NEXT: csetm r1, gt
; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-BE-NEXT: bfi r1, r2, #2, #1
-; CHECK-BE-NEXT: csetm r2, gt
-; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: bfi r2, r1, #8, #4
+; CHECK-BE-NEXT: csetm r1, gt
+; CHECK-BE-NEXT: bfi r2, r1, #12, #4
+; CHECK-BE-NEXT: ubfx r1, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #12, #1
+; CHECK-BE-NEXT: orr.w r1, r3, r1, lsl #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: and r2, r2, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r3, lsl #2
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3
; CHECK-BE-NEXT: lsls r2, r1, #28
; CHECK-BE-NEXT: bmi .LBB27_5
; CHECK-BE-NEXT: @ %bb.1: @ %else
@@ -1527,7 +1441,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) {
; CHECK-BE-NEXT: lsls r1, r1, #31
; CHECK-BE-NEXT: bne .LBB27_8
; CHECK-BE-NEXT: .LBB27_4: @ %else6
-; CHECK-BE-NEXT: add sp, #20
+; CHECK-BE-NEXT: add sp, #16
; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .LBB27_5: @ %cond.store
; CHECK-BE-NEXT: vstr.16 s0, [sp, #12]
@@ -1553,7 +1467,7 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) {
; CHECK-BE-NEXT: vstr.16 s0, [sp]
; CHECK-BE-NEXT: ldrh.w r1, [sp]
; CHECK-BE-NEXT: strh r1, [r0, #6]
-; CHECK-BE-NEXT: add sp, #20
+; CHECK-BE-NEXT: add sp, #16
; CHECK-BE-NEXT: bx lr
entry:
%c = fcmp ogt <4 x float> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
index 0d0e45956080de..6aea38cde4e692 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll
@@ -5,47 +5,44 @@
define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) {
; CHECK-LE-LABEL: bitcast_to_v4i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: and r0, r0, #15
-; CHECK-LE-NEXT: vmov.i8 q1, #0x0
-; CHECK-LE-NEXT: vmov.i8 q2, #0xff
-; CHECK-LE-NEXT: vmsr p0, r0
-; CHECK-LE-NEXT: vpsel q1, q2, q1
-; CHECK-LE-NEXT: vmov.u8 r0, q1[2]
-; CHECK-LE-NEXT: vmov.u8 r1, q1[0]
-; CHECK-LE-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-LE-NEXT: vmov.u8 r0, q1[3]
-; CHECK-LE-NEXT: vmov.u8 r1, q1[1]
+; CHECK-LE-NEXT: and r3, r0, #1
+; CHECK-LE-NEXT: and r1, r0, #8
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sub.w r1, r2, r1, lsr #3
+; CHECK-LE-NEXT: bfi r2, r3, #0, #4
+; CHECK-LE-NEXT: ubfx r3, r0, #1, #1
+; CHECK-LE-NEXT: ubfx r0, r0, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: rsbs r0, r0, #0
+; CHECK-LE-NEXT: bfi r2, r3, #4, #4
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
-; CHECK-LE-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr
+; CHECK-LE-NEXT: bfi r2, r0, #8, #4
+; CHECK-LE-NEXT: bfi r2, r1, #12, #4
+; CHECK-LE-NEXT: vmsr p0, r2
; CHECK-LE-NEXT: vpsel q0, q0, q1
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v4i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: rbit r0, r0
-; CHECK-BE-NEXT: vmov.i8 q1, #0x0
-; CHECK-BE-NEXT: vmov.i8 q2, #0xff
-; CHECK-BE-NEXT: lsrs r0, r0, #28
-; CHECK-BE-NEXT: vmsr p0, r0
-; CHECK-BE-NEXT: vpsel q1, q2, q1
-; CHECK-BE-NEXT: vmov.u8 r0, q1[2]
-; CHECK-BE-NEXT: vmov.u8 r1, q1[0]
-; CHECK-BE-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-BE-NEXT: vmov.u8 r0, q1[3]
-; CHECK-BE-NEXT: vmov.u8 r1, q1[1]
+; CHECK-BE-NEXT: and r1, r0, #8
+; CHECK-BE-NEXT: movs r2, #0
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov q2[3], q2[1], r1, r0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
-; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr
+; CHECK-BE-NEXT: sub.w r1, r2, r1, lsr #3
+; CHECK-BE-NEXT: bfi r2, r1, #0, #4
+; CHECK-BE-NEXT: ubfx r1, r0, #2, #1
+; CHECK-BE-NEXT: rsbs r1, r1, #0
+; CHECK-BE-NEXT: bfi r2, r1, #4, #4
+; CHECK-BE-NEXT: ubfx r1, r0, #1, #1
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: rsbs r1, r1, #0
+; CHECK-BE-NEXT: bfi r2, r1, #8, #4
+; CHECK-BE-NEXT: rsbs r0, r0, #0
+; CHECK-BE-NEXT: bfi r2, r0, #12, #4
+; CHECK-BE-NEXT: vmsr p0, r2
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i4 %b to <4 x i1>
@@ -56,68 +53,68 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) {
; CHECK-LE-LABEL: bitcast_to_v8i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: uxtb r0, r0
-; CHECK-LE-NEXT: vmov.i8 q1, #0x0
-; CHECK-LE-NEXT: vmov.i8 q2, #0xff
-; CHECK-LE-NEXT: vmsr p0, r0
-; CHECK-LE-NEXT: vpsel q2, q2, q1
-; CHECK-LE-NEXT: vmov.u8 r0, q2[0]
-; CHECK-LE-NEXT: vmov.16 q1[0], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[1]
-; CHECK-LE-NEXT: vmov.16 q1[1], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[2]
-; CHECK-LE-NEXT: vmov.16 q1[2], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[3]
-; CHECK-LE-NEXT: vmov.16 q1[3], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[4]
-; CHECK-LE-NEXT: vmov.16 q1[4], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[5]
-; CHECK-LE-NEXT: vmov.16 q1[5], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[6]
-; CHECK-LE-NEXT: vmov.16 q1[6], r0
-; CHECK-LE-NEXT: vmov.u8 r0, q2[7]
-; CHECK-LE-NEXT: vmov.16 q1[7], r0
-; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr
+; CHECK-LE-NEXT: and r3, r0, #1
+; CHECK-LE-NEXT: uxtb r2, r0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sub.w r2, r1, r2, lsr #7
+; CHECK-LE-NEXT: bfi r1, r3, #0, #2
+; CHECK-LE-NEXT: ubfx r3, r0, #1, #1
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #2
+; CHECK-LE-NEXT: ubfx r3, r0, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #4, #2
+; CHECK-LE-NEXT: ubfx r3, r0, #3, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #6, #2
+; CHECK-LE-NEXT: ubfx r3, r0, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #8, #2
+; CHECK-LE-NEXT: ubfx r3, r0, #5, #1
+; CHECK-LE-NEXT: ubfx r0, r0, #6, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #10, #2
+; CHECK-LE-NEXT: rsbs r0, r0, #0
+; CHECK-LE-NEXT: bfi r1, r0, #12, #2
+; CHECK-LE-NEXT: bfi r1, r2, #14, #2
+; CHECK-LE-NEXT: vmsr p0, r1
; CHECK-LE-NEXT: vpsel q0, q0, q1
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v8i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: uxtb r0, r0
-; CHECK-BE-NEXT: vmov.i8 q1, #0x0
-; CHECK-BE-NEXT: rbit r0, r0
-; CHECK-BE-NEXT: vmov.i8 q2, #0xff
-; CHECK-BE-NEXT: lsrs r0, r0, #24
-; CHECK-BE-NEXT: vmsr p0, r0
-; CHECK-BE-NEXT: vpsel q2, q2, q1
-; CHECK-BE-NEXT: vmov.u8 r0, q2[0]
-; CHECK-BE-NEXT: vmov.16 q1[0], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[1]
-; CHECK-BE-NEXT: vmov.16 q1[1], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[2]
-; CHECK-BE-NEXT: vmov.16 q1[2], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[3]
-; CHECK-BE-NEXT: vmov.16 q1[3], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[4]
-; CHECK-BE-NEXT: vmov.16 q1[4], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[5]
-; CHECK-BE-NEXT: vmov.16 q1[5], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[6]
-; CHECK-BE-NEXT: vmov.16 q1[6], r0
-; CHECK-BE-NEXT: vmov.u8 r0, q2[7]
-; CHECK-BE-NEXT: vmov.16 q1[7], r0
-; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr
+; CHECK-BE-NEXT: uxtb r2, r0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: sub.w r2, r1, r2, lsr #7
; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: bfi r1, r2, #0, #2
+; CHECK-BE-NEXT: ubfx r2, r0, #6, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
+; CHECK-BE-NEXT: bfi r1, r2, #2, #2
+; CHECK-BE-NEXT: ubfx r2, r0, #5, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #4, #2
+; CHECK-BE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #6, #2
+; CHECK-BE-NEXT: ubfx r2, r0, #3, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #8, #2
+; CHECK-BE-NEXT: ubfx r2, r0, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #10, #2
+; CHECK-BE-NEXT: ubfx r2, r0, #1, #1
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #12, #2
+; CHECK-BE-NEXT: rsbs r0, r0, #0
+; CHECK-BE-NEXT: bfi r1, r0, #14, #2
+; CHECK-BE-NEXT: vmsr p0, r1
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.16 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i8 %b to <8 x i1>
@@ -128,27 +125,116 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) {
; CHECK-LE-LABEL: bitcast_to_v16i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: vmsr p0, r0
+; CHECK-LE-NEXT: and r3, r0, #1
+; CHECK-LE-NEXT: uxth r2, r0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: sub.w r2, r1, r2, lsr #15
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #1, #1
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #3, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #3, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #4, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #5, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #5, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #6, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #6, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #7, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #7, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #8, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #8, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #9, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #9, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #10, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #10, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #11, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #11, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #12, #1
+; CHECK-LE-NEXT: ubfx r3, r0, #13, #1
+; CHECK-LE-NEXT: ubfx r0, r0, #14, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #13, #1
+; CHECK-LE-NEXT: rsbs r0, r0, #0
+; CHECK-LE-NEXT: bfi r1, r0, #14, #1
+; CHECK-LE-NEXT: bfi r1, r2, #15, #1
+; CHECK-LE-NEXT: vmsr p0, r1
; CHECK-LE-NEXT: vpsel q0, q0, q1
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v16i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: uxth r0, r0
+; CHECK-BE-NEXT: uxth r2, r0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: sub.w r2, r1, r2, lsr #15
; CHECK-BE-NEXT: vrev64.8 q1, q0
-; CHECK-BE-NEXT: rbit r0, r0
+; CHECK-BE-NEXT: bfi r1, r2, #0, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #14, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
-; CHECK-BE-NEXT: lsrs r0, r0, #16
-; CHECK-BE-NEXT: vmsr p0, r0
+; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #13, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #2, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #12, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #11, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #4, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #10, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #5, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #9, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #6, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #8, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #7, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #7, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #6, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #9, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #5, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #10, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #11, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #3, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #12, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #13, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #1, #1
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #14, #1
+; CHECK-BE-NEXT: rsbs r0, r0, #0
+; CHECK-BE-NEXT: bfi r1, r0, #15, #1
+; CHECK-BE-NEXT: vmsr p0, r1
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.8 q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i16 %b to <16 x i1>
@@ -159,41 +245,30 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) {
; CHECK-LE-LABEL: bitcast_to_v2i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
-; CHECK-LE-NEXT: and r0, r0, #3
-; CHECK-LE-NEXT: vmov.i8 q1, #0x0
-; CHECK-LE-NEXT: vmov.i8 q2, #0xff
-; CHECK-LE-NEXT: vmsr p0, r0
-; CHECK-LE-NEXT: vpsel q1, q2, q1
-; CHECK-LE-NEXT: vmov.u8 r0, q1[1]
-; CHECK-LE-NEXT: vmov.u8 r1, q1[0]
-; CHECK-LE-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-LE-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-LE-NEXT: and r1, r0, #2
+; CHECK-LE-NEXT: and r0, r0, #1
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: rsbs r0, r0, #0
+; CHECK-LE-NEXT: sub.w r1, r2, r1, lsr #1
+; CHECK-LE-NEXT: bfi r2, r0, #0, #8
+; CHECK-LE-NEXT: bfi r2, r1, #8, #8
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
+; CHECK-LE-NEXT: vmsr p0, r2
; CHECK-LE-NEXT: vpsel q0, q0, q1
-; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_to_v2i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
-; CHECK-BE-NEXT: rbit r0, r0
-; CHECK-BE-NEXT: vmov.i8 q1, #0x0
-; CHECK-BE-NEXT: vmov.i8 q2, #0xff
-; CHECK-BE-NEXT: lsrs r0, r0, #30
-; CHECK-BE-NEXT: vmsr p0, r0
-; CHECK-BE-NEXT: vpsel q1, q2, q1
-; CHECK-BE-NEXT: vmov.u8 r0, q1[1]
-; CHECK-BE-NEXT: vmov.u8 r1, q1[0]
-; CHECK-BE-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-BE-NEXT: vmov q1[3], q1[1], r1, r0
-; CHECK-BE-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-BE-NEXT: and r1, r0, #2
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: rsbs r0, r0, #0
+; CHECK-BE-NEXT: sub.w r1, r2, r1, lsr #1
; CHECK-BE-NEXT: vmov.i32 q1, #0x0
+; CHECK-BE-NEXT: bfi r2, r1, #0, #8
+; CHECK-BE-NEXT: bfi r2, r0, #8, #8
+; CHECK-BE-NEXT: vmsr p0, r2
; CHECK-BE-NEXT: vpsel q0, q0, q1
-; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = bitcast i2 %b to <2 x i1>
@@ -205,47 +280,29 @@ entry:
define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) {
; CHECK-LE-LABEL: bitcast_from_v4i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr
-; CHECK-LE-NEXT: vmrs r1, p0
-; CHECK-LE-NEXT: and r0, r1, #1
-; CHECK-LE-NEXT: rsbs r2, r0, #0
-; CHECK-LE-NEXT: movs r0, #0
-; CHECK-LE-NEXT: bfi r0, r2, #0, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #1, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #8, #1
-; CHECK-LE-NEXT: ubfx r1, r1, #12, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #2, #1
-; CHECK-LE-NEXT: rsbs r1, r1, #0
-; CHECK-LE-NEXT: bfi r0, r1, #3, #1
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: vmrs r0, p0
+; CHECK-LE-NEXT: and r2, r0, #1
+; CHECK-LE-NEXT: ubfx r1, r0, #4, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
+; CHECK-LE-NEXT: ubfx r2, r0, #8, #1
+; CHECK-LE-NEXT: ubfx r0, r0, #12, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #2
+; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #3
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v4i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr
-; CHECK-BE-NEXT: vmrs r1, p0
-; CHECK-BE-NEXT: ubfx r0, r1, #12, #1
-; CHECK-BE-NEXT: rsbs r2, r0, #0
-; CHECK-BE-NEXT: movs r0, #0
-; CHECK-BE-NEXT: bfi r0, r2, #0, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #1, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #4, #1
-; CHECK-BE-NEXT: and r1, r1, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #2, #1
-; CHECK-BE-NEXT: rsbs r1, r1, #0
-; CHECK-BE-NEXT: bfi r0, r1, #3, #1
-; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: vmrs r0, p0
+; CHECK-BE-NEXT: ubfx r1, r0, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #12, #1
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1
+; CHECK-BE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #2
+; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #3
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <4 x i32> %a, zeroinitializer
@@ -256,73 +313,45 @@ entry:
define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) {
; CHECK-LE-LABEL: bitcast_from_v8i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr
-; CHECK-LE-NEXT: vmrs r1, p0
-; CHECK-LE-NEXT: and r0, r1, #1
-; CHECK-LE-NEXT: rsbs r2, r0, #0
-; CHECK-LE-NEXT: movs r0, #0
-; CHECK-LE-NEXT: bfi r0, r2, #0, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #2, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #1, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #4, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #2, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #6, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #3, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #8, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #4, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #10, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #5, #1
-; CHECK-LE-NEXT: ubfx r2, r1, #12, #1
-; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
-; CHECK-LE-NEXT: rsbs r2, r2, #0
-; CHECK-LE-NEXT: bfi r0, r2, #6, #1
-; CHECK-LE-NEXT: rsbs r1, r1, #0
-; CHECK-LE-NEXT: bfi r0, r1, #7, #1
-; CHECK-LE-NEXT: uxtb r0, r0
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: vmrs r0, p0
+; CHECK-LE-NEXT: and r2, r0, #1
+; CHECK-LE-NEXT: ubfx r1, r0, #2, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
+; CHECK-LE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #2
+; CHECK-LE-NEXT: ubfx r2, r0, #6, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3
+; CHECK-LE-NEXT: ubfx r2, r0, #8, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-LE-NEXT: ubfx r2, r0, #10, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #5
+; CHECK-LE-NEXT: ubfx r2, r0, #12, #1
+; CHECK-LE-NEXT: ubfx r0, r0, #14, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #6
+; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #7
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v8i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr
-; CHECK-BE-NEXT: vmrs r1, p0
-; CHECK-BE-NEXT: ubfx r0, r1, #14, #1
-; CHECK-BE-NEXT: rsbs r2, r0, #0
-; CHECK-BE-NEXT: movs r0, #0
-; CHECK-BE-NEXT: bfi r0, r2, #0, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #12, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #1, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #10, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #2, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #8, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #3, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #6, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #4, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #4, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #5, #1
-; CHECK-BE-NEXT: ubfx r2, r1, #2, #1
-; CHECK-BE-NEXT: and r1, r1, #1
-; CHECK-BE-NEXT: rsbs r2, r2, #0
-; CHECK-BE-NEXT: bfi r0, r2, #6, #1
-; CHECK-BE-NEXT: rsbs r1, r1, #0
-; CHECK-BE-NEXT: bfi r0, r1, #7, #1
-; CHECK-BE-NEXT: uxtb r0, r0
-; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: vmrs r0, p0
+; CHECK-BE-NEXT: ubfx r1, r0, #12, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #14, #1
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1
+; CHECK-BE-NEXT: ubfx r2, r0, #10, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #2
+; CHECK-BE-NEXT: ubfx r2, r0, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3
+; CHECK-BE-NEXT: ubfx r2, r0, #6, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-BE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #5
+; CHECK-BE-NEXT: ubfx r2, r0, #2, #1
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #6
+; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #7
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %a, zeroinitializer
@@ -333,24 +362,77 @@ entry:
define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) {
; CHECK-LE-LABEL: bitcast_from_v16i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr
; CHECK-LE-NEXT: vmrs r0, p0
-; CHECK-LE-NEXT: uxth r0, r0
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: and r2, r0, #1
+; CHECK-LE-NEXT: ubfx r1, r0, #1, #1
+; CHECK-LE-NEXT: orr.w r1, r2, r1, lsl #1
+; CHECK-LE-NEXT: ubfx r2, r0, #2, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #2
+; CHECK-LE-NEXT: ubfx r2, r0, #3, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #3
+; CHECK-LE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-LE-NEXT: ubfx r2, r0, #5, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #5
+; CHECK-LE-NEXT: ubfx r2, r0, #6, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #6
+; CHECK-LE-NEXT: ubfx r2, r0, #7, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #7
+; CHECK-LE-NEXT: ubfx r2, r0, #8, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #8
+; CHECK-LE-NEXT: ubfx r2, r0, #9, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #9
+; CHECK-LE-NEXT: ubfx r2, r0, #10, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #10
+; CHECK-LE-NEXT: ubfx r2, r0, #11, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #11
+; CHECK-LE-NEXT: ubfx r2, r0, #12, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #12
+; CHECK-LE-NEXT: ubfx r2, r0, #13, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #13
+; CHECK-LE-NEXT: ubfx r2, r0, #14, #1
+; CHECK-LE-NEXT: ubfx r0, r0, #15, #1
+; CHECK-LE-NEXT: orr.w r1, r1, r2, lsl #14
+; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #15
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v16i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr
; CHECK-BE-NEXT: vmrs r0, p0
-; CHECK-BE-NEXT: rbit r0, r0
-; CHECK-BE-NEXT: lsrs r0, r0, #16
-; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: ubfx r1, r0, #14, #1
+; CHECK-BE-NEXT: ubfx r2, r0, #15, #1
+; CHECK-BE-NEXT: orr.w r1, r2, r1, lsl #1
+; CHECK-BE-NEXT: ubfx r2, r0, #13, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #2
+; CHECK-BE-NEXT: ubfx r2, r0, #12, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #3
+; CHECK-BE-NEXT: ubfx r2, r0, #11, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #4
+; CHECK-BE-NEXT: ubfx r2, r0, #10, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #5
+; CHECK-BE-NEXT: ubfx r2, r0, #9, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #6
+; CHECK-BE-NEXT: ubfx r2, r0, #8, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #7
+; CHECK-BE-NEXT: ubfx r2, r0, #7, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #8
+; CHECK-BE-NEXT: ubfx r2, r0, #6, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #9
+; CHECK-BE-NEXT: ubfx r2, r0, #5, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #10
+; CHECK-BE-NEXT: ubfx r2, r0, #4, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #11
+; CHECK-BE-NEXT: ubfx r2, r0, #3, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #12
+; CHECK-BE-NEXT: ubfx r2, r0, #2, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #13
+; CHECK-BE-NEXT: ubfx r2, r0, #1, #1
+; CHECK-BE-NEXT: and r0, r0, #1
+; CHECK-BE-NEXT: orr.w r1, r1, r2, lsl #14
+; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #15
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <16 x i8> %a, zeroinitializer
@@ -361,35 +443,35 @@ entry:
define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) {
; CHECK-LE-LABEL: bitcast_from_v2i1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: .pad #4
-; CHECK-LE-NEXT: sub sp, #4
; CHECK-LE-NEXT: vmov r0, r1, d0
; CHECK-LE-NEXT: orrs r0, r1
-; CHECK-LE-NEXT: csetm r1, eq
-; CHECK-LE-NEXT: movs r0, #0
-; CHECK-LE-NEXT: bfi r0, r1, #0, #1
-; CHECK-LE-NEXT: vmov r1, r2, d1
-; CHECK-LE-NEXT: orrs r1, r2
-; CHECK-LE-NEXT: csetm r1, eq
-; CHECK-LE-NEXT: bfi r0, r1, #1, #1
-; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: mov.w r1, #0
+; CHECK-LE-NEXT: csetm r0, eq
+; CHECK-LE-NEXT: bfi r1, r0, #0, #8
+; CHECK-LE-NEXT: vmov r0, r2, d1
+; CHECK-LE-NEXT: orrs r0, r2
+; CHECK-LE-NEXT: csetm r0, eq
+; CHECK-LE-NEXT: bfi r1, r0, #8, #8
+; CHECK-LE-NEXT: ubfx r0, r1, #8, #1
+; CHECK-LE-NEXT: and r1, r1, #1
+; CHECK-LE-NEXT: orr.w r0, r1, r0, lsl #1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: bitcast_from_v2i1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .pad #4
-; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r0, r1, d3
+; CHECK-BE-NEXT: vmov r0, r1, d2
; CHECK-BE-NEXT: orrs r0, r1
-; CHECK-BE-NEXT: csetm r1, eq
-; CHECK-BE-NEXT: movs r0, #0
-; CHECK-BE-NEXT: bfi r0, r1, #0, #1
-; CHECK-BE-NEXT: vmov r1, r2, d2
-; CHECK-BE-NEXT: orrs r1, r2
-; CHECK-BE-NEXT: csetm r1, eq
-; CHECK-BE-NEXT: bfi r0, r1, #1, #1
-; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: mov.w r1, #0
+; CHECK-BE-NEXT: csetm r0, eq
+; CHECK-BE-NEXT: bfi r1, r0, #0, #8
+; CHECK-BE-NEXT: vmov r0, r2, d3
+; CHECK-BE-NEXT: orrs r0, r2
+; CHECK-BE-NEXT: csetm r0, eq
+; CHECK-BE-NEXT: bfi r1, r0, #8, #8
+; CHECK-BE-NEXT: and r0, r1, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #8, #1
+; CHECK-BE-NEXT: orr.w r0, r1, r0, lsl #1
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp eq <2 x i64> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll
index ef87ac31fcf48c..2285150620b1a9 100644
--- a/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxneconvert-intrinsics.ll
@@ -219,111 +219,93 @@ define <8 x bfloat> @select(i8 %x, <8 x bfloat> %y) nounwind {
; X64-LABEL: select:
; X64: # %bb.0:
; X64-NEXT: vmovaps %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x28,0xc8]
-; X64-NEXT: movb %dil, %al # encoding: [0x40,0x88,0xf8]
-; X64-NEXT: movb %al, -{{[0-9]+}}(%rsp) # encoding: [0x88,0x44,0x24,0xff]
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0xff]
-; X64-NEXT: movl %eax, %ecx # encoding: [0x89,0xc1]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1]
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X64-NEXT: shrb %cl # encoding: [0xd0,0xe9]
-; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X64-NEXT: shrb $2, %cl # encoding: [0xc0,0xe9,0x02]
-; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X64-NEXT: shrb $3, %cl # encoding: [0xc0,0xe9,0x03]
-; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
-; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X64-NEXT: shrb $4, %cl # encoding: [0xc0,0xe9,0x04]
-; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X64-NEXT: shrb $5, %cl # encoding: [0xc0,0xe9,0x05]
-; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X64-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X64-NEXT: shrb $6, %cl # encoding: [0xc0,0xe9,0x06]
-; X64-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X64-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X64-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X64-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X64-NEXT: shrb $7, %al # encoding: [0xc0,0xe8,0x07]
-; X64-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; X64-NEXT: negl %eax # encoding: [0xf7,0xd8]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: movb %dil, %cl # encoding: [0x40,0x88,0xf9]
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: shrb %dl # encoding: [0xd0,0xea]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X64-NEXT: # implicit-def: $edx
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: vmovd %edx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc2]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: shrb $2, %dl # encoding: [0xc0,0xea,0x02]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: shrb $3, %dl # encoding: [0xc0,0xea,0x03]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: shrb $4, %dl # encoding: [0xc0,0xea,0x04]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: shrb $5, %dl # encoding: [0xc0,0xea,0x05]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; X64-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X64-NEXT: shrb $6, %dl # encoding: [0xc0,0xea,0x06]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; X64-NEXT: shrb $7, %cl # encoding: [0xc0,0xe9,0x07]
+; X64-NEXT: # implicit-def: $eax
+; X64-NEXT: movb %cl, %al # encoding: [0x88,0xc8]
+; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; X64-NEXT: vpsllw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
+; X64-NEXT: vpsraw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xe0,0x0f]
; X64-NEXT: vpandn %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdf,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: select:
; X86: # %bb.0:
-; X86-NEXT: pushl %eax # encoding: [0x50]
; X86-NEXT: vmovaps %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x28,0xc8]
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x08]
-; X86-NEXT: movb %al, {{[0-9]+}}(%esp) # encoding: [0x88,0x44,0x24,0x03]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x03]
-; X86-NEXT: movl %eax, %ecx # encoding: [0x89,0xc1]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X86-NEXT: shrb %cl # encoding: [0xd0,0xe9]
-; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X86-NEXT: shrb $2, %cl # encoding: [0xc0,0xe9,0x02]
-; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X86-NEXT: shrb $3, %cl # encoding: [0xc0,0xe9,0x03]
-; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
-; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X86-NEXT: shrb $4, %cl # encoding: [0xc0,0xe9,0x04]
-; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X86-NEXT: shrb $5, %cl # encoding: [0xc0,0xe9,0x05]
-; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT: movb %al, %cl # encoding: [0x88,0xc1]
-; X86-NEXT: shrb $6, %cl # encoding: [0xc0,0xe9,0x06]
-; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT: andl $1, %ecx # encoding: [0x83,0xe1,0x01]
-; X86-NEXT: negl %ecx # encoding: [0xf7,0xd9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: shrb $7, %al # encoding: [0xc0,0xe8,0x07]
-; X86-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; X86-NEXT: negl %eax # encoding: [0xf7,0xd8]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl # encoding: [0x8a,0x4c,0x24,0x04]
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: shrb %dl # encoding: [0xd0,0xea]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X86-NEXT: # implicit-def: $edx
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: vmovd %edx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc2]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: shrb $2, %dl # encoding: [0xc0,0xea,0x02]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: shrb $3, %dl # encoding: [0xc0,0xea,0x03]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: shrb $4, %dl # encoding: [0xc0,0xea,0x04]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: shrb $5, %dl # encoding: [0xc0,0xea,0x05]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; X86-NEXT: movb %cl, %dl # encoding: [0x88,0xca]
+; X86-NEXT: shrb $6, %dl # encoding: [0xc0,0xea,0x06]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %dl, %al # encoding: [0x88,0xd0]
+; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; X86-NEXT: shrb $7, %cl # encoding: [0xc0,0xe9,0x07]
+; X86-NEXT: # implicit-def: $eax
+; X86-NEXT: movb %cl, %al # encoding: [0x88,0xc8]
+; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; X86-NEXT: vpsllw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
+; X86-NEXT: vpsraw $15, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xe0,0x0f]
; X86-NEXT: vpandn %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdf,0xc1]
-; X86-NEXT: popl %eax # encoding: [0x58]
; X86-NEXT: retl # encoding: [0xc3]
%1 = bitcast i8 %x to <8 x i1>
%2 = select <8 x i1> %1, <8 x bfloat> zeroinitializer, <8 x bfloat> %y
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 501e73c46af9cf..599a2514683d2f 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -140,26 +140,36 @@ define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind {
}
define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
-; SSE-LABEL: bitcast_v8i16_to_v2i4:
-; SSE: # %bb.0:
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrb $4, %cl
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: # kill: def $al killed $al killed $eax
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: bitcast_v8i16_to_v2i4:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrb $4, %al
+; SSE2-SSSE3-NEXT: movzbl %al, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: bitcast_v8i16_to_v2i4:
+; SSE41: # %bb.0:
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %ecx
+; SSE41-NEXT: movl %ecx, %eax
+; SSE41-NEXT: shrb $4, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: retq
;
; AVX12-LABEL: bitcast_v8i16_to_v2i4:
; AVX12: # %bb.0:
; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrb $4, %cl
-; AVX12-NEXT: andb $15, %al
+; AVX12-NEXT: vpmovmskb %xmm0, %ecx
+; AVX12-NEXT: movl %ecx, %eax
+; AVX12-NEXT: shrb $4, %al
; AVX12-NEXT: addb %cl, %al
-; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
;
; AVX512-LABEL: bitcast_v8i16_to_v2i4:
@@ -362,26 +372,41 @@ define i1 @trunc_v4i64_cmp(<4 x i64> %a0) nounwind {
}
define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
-; SSE-LABEL: bitcast_v8i32_to_v2i4:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrb $4, %cl
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: # kill: def $al killed $al killed $eax
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: bitcast_v8i32_to_v2i4:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrb $4, %al
+; SSE2-SSSE3-NEXT: movzbl %al, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: bitcast_v8i32_to_v2i4:
+; SSE41: # %bb.0:
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %ecx
+; SSE41-NEXT: movl %ecx, %eax
+; SSE41-NEXT: shrb $4, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: bitcast_v8i32_to_v2i4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovmskps %ymm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrb $4, %cl
-; AVX-NEXT: andb $15, %al
+; AVX-NEXT: vmovmskps %ymm0, %ecx
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: shrb $4, %al
; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%1 = icmp slt <8 x i32> %a0, zeroinitializer
@@ -632,19 +657,46 @@ define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind {
;
define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
-; SSE-LABEL: bitcast_v8i64_to_v2i4:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packssdw %xmm2, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrb $4, %cl
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: # kill: def $al killed $al killed $eax
-; SSE-NEXT: retq
+; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4
+; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrb $4, %al
+; SSE2-SSSE3-NEXT: movzbl %al, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: bitcast_v8i64_to_v2i4:
+; SSE41: # %bb.0:
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: packssdw %xmm2, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %ecx
+; SSE41-NEXT: movl %ecx, %eax
+; SSE41-NEXT: shrb $4, %al
+; SSE41-NEXT: addb %cl, %al
+; SSE41-NEXT: retq
;
; AVX1-LABEL: bitcast_v8i64_to_v2i4:
; AVX1: # %bb.0:
@@ -656,12 +708,10 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovmskps %ymm0, %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrb $4, %cl
-; AVX1-NEXT: andb $15, %al
+; AVX1-NEXT: vmovmskps %ymm0, %ecx
+; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: shrb $4, %al
; AVX1-NEXT: addb %cl, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -669,12 +719,10 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrb $4, %cl
-; AVX2-NEXT: andb $15, %al
+; AVX2-NEXT: vmovmskps %ymm0, %ecx
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: shrb $4, %al
; AVX2-NEXT: addb %cl, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pr64655.ll b/llvm/test/CodeGen/X86/pr64655.ll
index f2929527c88f2e..350a7e7d30b680 100644
--- a/llvm/test/CodeGen/X86/pr64655.ll
+++ b/llvm/test/CodeGen/X86/pr64655.ll
@@ -7,32 +7,32 @@ define void @f(ptr %0) {
; AVX2: # %bb.0:
; AVX2-NEXT: movzbl (%rdi), %eax
; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrb $2, %cl
-; AVX2-NEXT: andb $1, %cl
; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: andb $1, %dl
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrb $3, %cl
-; AVX2-NEXT: andb $1, %cl
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrb $4, %cl
-; AVX2-NEXT: andb $1, %cl
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrb $5, %cl
-; AVX2-NEXT: andb $1, %cl
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: movl %eax, %esi
+; AVX2-NEXT: movl %eax, %r8d
+; AVX2-NEXT: movl %eax, %r9d
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: movl $1, %r10d
+; AVX2-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shrb $2, %r9b
+; AVX2-NEXT: movzbl %r9b, %r9d
+; AVX2-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shrb $3, %r8b
+; AVX2-NEXT: movzbl %r8b, %r8d
+; AVX2-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shrb $4, %sil
+; AVX2-NEXT: movzbl %sil, %esi
+; AVX2-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
+; AVX2-NEXT: shrb $5, %dl
+; AVX2-NEXT: movzbl %dl, %edx
+; AVX2-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; AVX2-NEXT: shrb $6, %cl
-; AVX2-NEXT: andb $1, %cl
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; AVX2-NEXT: shrb $7, %al
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
>From 87530e40910359367712e44a79f32277ceb15b24 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Wed, 22 Jan 2025 20:13:17 +0000
Subject: [PATCH 12/15] Code review fixes
---
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 27 ++++++++++++-------
1 file changed, 17 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 3ffb4427d77d58..4b6f66b13de4dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -14,6 +14,7 @@
#include "LegalizeTypes.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -934,11 +935,16 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), ElemBits * NumElems);
SDValue Packed = DAG.getConstant(0, DL, PackVT);
+ EVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+
for (unsigned I = 0; I < NumElems; ++I) {
unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
- SDValue Elem =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, N->getOperand(0),
- DAG.getIntPtrConstant(ElementIndex, DL));
+
+ SDValue Index = DAG.getConstant(ElementIndex, DL, IdxTy);
+
+ SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+ N->getOperand(0), Index);
+
SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, PackVT, Elem);
SDValue ShiftAmount =
DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
@@ -949,8 +955,9 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
}
return DAG.getBitcast(ToVT, Packed);
+ }
- } else if (FromVT.isScalarInteger() && ToVT.isVector()) {
+ if (FromVT.isScalarInteger() && ToVT.isVector()) {
EVT ElemVT = ToVT.getVectorElementType();
unsigned NumElems = ToVT.getVectorNumElements();
@@ -960,20 +967,20 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
assert(PackedBits >= ElemBits * NumElems &&
"Vector does not have enough bits to unpack scalar type.");
- SmallVector<SDValue, 8> Elements;
- Elements.reserve(NumElems);
+ SmallVector<SDValue, 8> Elements(NumElems);
+
+ EVT ShiftTy = TLI.getShiftAmountTy(FromVT, DAG.getDataLayout());
for (unsigned I = 0; I < NumElems; ++I) {
unsigned ElementIndex = IsBigEndian ? (NumElems - 1 - I) : I;
unsigned ShiftAmountVal = ElemBits * ElementIndex;
- SDValue ShiftAmount =
- DAG.getShiftAmountConstant(ShiftAmountVal, FromVT, DL);
+ SDValue ShiftAmount = DAG.getConstant(ShiftAmountVal, DL, ShiftTy);
SDValue Shifted =
DAG.getNode(ISD::SRL, DL, FromVT, N->getOperand(0), ShiftAmount);
SDValue Element = DAG.getNode(ISD::TRUNCATE, DL, ElemVT, Shifted);
- Elements.push_back(Element);
- }
+ Elements[I] = Element;
+ }
return DAG.getBuildVector(ToVT, DL, Elements);
}
>From 7a5cfe4e073ac15111c28c9ac91e5704654e5604 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Wed, 22 Jan 2025 20:14:13 +0000
Subject: [PATCH 13/15] Applied formatting
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 4b6f66b13de4dd..f3505018ca42c8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -944,7 +944,7 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
N->getOperand(0), Index);
-
+
SDValue ExtElem = DAG.getNode(ISD::ZERO_EXTEND, DL, PackVT, Elem);
SDValue ShiftAmount =
DAG.getShiftAmountConstant(ElemBits * I, PackVT, DL);
@@ -980,7 +980,7 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
DAG.getNode(ISD::SRL, DL, FromVT, N->getOperand(0), ShiftAmount);
SDValue Element = DAG.getNode(ISD::TRUNCATE, DL, ElemVT, Shifted);
Elements[I] = Element;
- }
+ }
return DAG.getBuildVector(ToVT, DL, Elements);
}
>From 66bfc9900dbc286b10e1b72bacefa096eca94b82 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Sat, 25 Jan 2025 15:47:26 +0000
Subject: [PATCH 14/15] Addressed code review comments
---
.../lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index f3505018ca42c8..3e7bb87f27f8e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -924,15 +924,25 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
if (FromVT.isVector() && ToVT.isScalarInteger()) {
+ if (!IsBigEndian) {
+
+ EVT ToVecVT = EVT::getVectorVT(*DAG.getContext(), ToVT, 1);
+ // If ISD::EXTRACT_VECTOR_ELT is a legal or custom op then return
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ToVecVT))
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ToVT,
+ DAG.getBitcast(ToVecVT, N->getOperand(0)),
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
EVT ElemVT = FromVT.getVectorElementType();
unsigned NumElems = FromVT.getVectorNumElements();
unsigned ElemBits = ElemVT.getSizeInBits();
-
+ unsigned NeededBits = ElemBits * NumElems;
unsigned PackedBits = ToVT.getSizeInBits();
- assert(PackedBits >= ElemBits * NumElems &&
- "Scalar type does not have enough bits to pack vector values.");
- EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), ElemBits * NumElems);
+ assert(PackedBits >= NeededBits && "Scalar type does not have enough bits to pack vector values.");
+
+ EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), PackedBits);
SDValue Packed = DAG.getConstant(0, DL, PackVT);
EVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
>From f278d564d875a48f640db48985998e1b6d8e3f55 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <oliver61 at live.co.uk>
Date: Sat, 25 Jan 2025 15:47:49 +0000
Subject: [PATCH 15/15] Formatting
---
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 3e7bb87f27f8e8..5eb1470a0957b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -940,7 +940,8 @@ SDValue DAGTypeLegalizer::LowerBitcastInRegister(SDNode *N) const {
unsigned NeededBits = ElemBits * NumElems;
unsigned PackedBits = ToVT.getSizeInBits();
- assert(PackedBits >= NeededBits && "Scalar type does not have enough bits to pack vector values.");
+ assert(PackedBits >= NeededBits &&
+ "Scalar type does not have enough bits to pack vector values.");
EVT PackVT = EVT::getIntegerVT(*DAG.getContext(), PackedBits);
SDValue Packed = DAG.getConstant(0, DL, PackVT);
More information about the llvm-commits
mailing list