[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
Petar Avramovic via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 5 05:31:31 PST 2024
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882
From f563ccef965406f4df3a4481142795de32e596f4 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 30 Oct 2024 15:37:59 +0100
Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load
Add IDs for bit widths that cover multiple LLTs: B32, B64, etc.
Add a "Predicate" wrapper class for boolean predicate functions, used to
write readable rules. Predicates can be combined using &&, || and !.
Add lowering for splitting and widening loads.
Write rules for loads so that existing MIR tests from the old
regbankselect are not changed.
---
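A minimal, self-contained sketch (not part of the patch) of the predicate
combinator idea described above: overloading '&&', '||' and '!' so boolean
predicates compose into a single callable rule guard. The names isAlign4 and
isVolatileMMO mirror predicates defined in the patch; MIStub, main and the
closure-based combinators are illustrative assumptions, while the real patch
flattens the formula into a jump table over MachineInstr predicates (see
AMDGPURegBankLegalizeRules.cpp below).

#include <cassert>
#include <functional>
#include <utility>

// Stand-in for MachineInstr; only the fields the sample predicates inspect.
struct MIStub {
  unsigned Align;
  bool Volatile;
};

// Simplified Predicate wrapper: wraps a bool(const MIStub &) function and
// overloads !, && and || so predicates can be combined when writing rules.
class Predicate {
  std::function<bool(const MIStub &)> Fn;

public:
  Predicate(std::function<bool(const MIStub &)> F) : Fn(std::move(F)) {}

  bool operator()(const MIStub &MI) const { return Fn(MI); }

  Predicate operator!() const {
    return Predicate([P = *this](const MIStub &MI) { return !P(MI); });
  }
  Predicate operator&&(const Predicate &RHS) const {
    return Predicate(
        [L = *this, R = RHS](const MIStub &MI) { return L(MI) && R(MI); });
  }
  Predicate operator||(const Predicate &RHS) const {
    return Predicate(
        [L = *this, R = RHS](const MIStub &MI) { return L(MI) || R(MI); });
  }
};

int main() {
  Predicate isAlign4([](const MIStub &MI) { return MI.Align >= 4; });
  Predicate isVolatileMMO([](const MIStub &MI) { return MI.Volatile; });

  // Combined the same way the load rules below combine their predicates.
  Predicate Guard = isAlign4 && !isVolatileMMO;

  MIStub AlignedMI{/*Align=*/16, /*Volatile=*/false};
  MIStub UnderAlignedMI{/*Align=*/2, /*Volatile=*/false};
  MIStub VolatileMI{/*Align=*/16, /*Volatile=*/true};

  assert(Guard(AlignedMI));       // aligned and not volatile -> true
  assert(!Guard(UnderAlignedMI)); // under-aligned -> false
  assert(!Guard(VolatileMI));     // volatile -> false
  return 0;
}

Unlike this sketch, the patch stores the formula as a flat SmallVector of
jump-table entries, so combining predicates appends entries rather than
nesting closures.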
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 288 +++++++++++++++-
.../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 +
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 278 ++++++++++++++-
.../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++-
.../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++++++++++++++---
.../GlobalISel/regbankselect-zextload.mir | 9 +-
6 files changed, 900 insertions(+), 65 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 6d9cf487c6dd25..6e78e29555ee11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}
+void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
+ ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
+ MachineFunction &MF = B.getMF();
+ assert(MI.getNumMemOperands() == 1);
+ MachineMemOperand &BaseMMO = **MI.memoperands_begin();
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
+ Register Base = MI.getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(Base);
+ const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
+ LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
+ SmallVector<Register, 4> LoadPartRegs;
+
+ unsigned ByteOffset = 0;
+ for (LLT PartTy : LLTBreakdown) {
+ Register BasePlusOffset;
+ if (ByteOffset == 0) {
+ BasePlusOffset = Base;
+ } else {
+ auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
+ BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
+ }
+ auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
+ auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
+ LoadPartRegs.push_back(LoadPart.getReg(0));
+ ByteOffset += PartTy.getSizeInBytes();
+ }
+
+ if (!MergeTy.isValid()) {
+    // Loads are all of the same size; concat or merge them together.
+ B.buildMergeLikeInstr(Dst, LoadPartRegs);
+ } else {
+    // Loads are not all of the same size: unmerge them into smaller pieces
+    // of MergeTy type, then merge the pieces into Dst.
+ SmallVector<Register, 4> MergeTyParts;
+ for (Register Reg : LoadPartRegs) {
+ if (MRI.getType(Reg) == MergeTy) {
+ MergeTyParts.push_back(Reg);
+ } else {
+ auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
+ for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
+ MergeTyParts.push_back(Unmerge.getReg(i));
+ }
+ }
+ B.buildMergeLikeInstr(Dst, MergeTyParts);
+ }
+ MI.eraseFromParent();
+}
+
+void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
+ LLT MergeTy) {
+ MachineFunction &MF = B.getMF();
+ assert(MI.getNumMemOperands() == 1);
+ MachineMemOperand &BaseMMO = **MI.memoperands_begin();
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
+ Register Base = MI.getOperand(1).getReg();
+
+ MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
+ auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
+
+ if (WideTy.isScalar()) {
+ B.buildTrunc(Dst, WideLoad);
+ } else {
+ SmallVector<Register, 4> MergeTyParts;
+ auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
+
+ LLT DstTy = MRI.getType(Dst);
+ unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ MergeTyParts.push_back(Unmerge.getReg(i));
+ }
+ B.buildMergeLikeInstr(Dst, MergeTyParts);
+ }
+ MI.eraseFromParent();
+}
+
void RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -116,6 +193,54 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
MI.eraseFromParent();
break;
}
+ case SplitLoad: {
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ unsigned Size = DstTy.getSizeInBits();
+    // Split evenly into 128-bit loads.
+ if (Size > 128) {
+ LLT B128;
+ if (DstTy.isVector()) {
+ LLT EltTy = DstTy.getElementType();
+ B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
+ } else {
+ B128 = LLT::scalar(128);
+ }
+ if (Size / 128 == 2)
+ splitLoad(MI, {B128, B128});
+ else if (Size / 128 == 4)
+ splitLoad(MI, {B128, B128, B128, B128});
+ else {
+ LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
+ llvm_unreachable("SplitLoad type not supported for MI");
+ }
+ }
+ // 64 and 32 bit load
+ else if (DstTy == S96)
+ splitLoad(MI, {S64, S32}, S32);
+ else if (DstTy == V3S32)
+ splitLoad(MI, {V2S32, S32}, S32);
+ else if (DstTy == V6S16)
+ splitLoad(MI, {V4S16, V2S16}, V2S16);
+ else {
+ LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
+ llvm_unreachable("SplitLoad type not supported for MI");
+ }
+ break;
+ }
+ case WidenLoad: {
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (DstTy == S96)
+ widenLoad(MI, S128);
+ else if (DstTy == V3S32)
+ widenLoad(MI, V4S32, S32);
+ else if (DstTy == V6S16)
+ widenLoad(MI, V8S16, V2S16);
+ else {
+ LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
+ llvm_unreachable("WidenLoad type not supported for MI");
+ }
+ break;
+ }
}
// TODO: executeInWaterfallLoop(... WaterfallSgprs)
@@ -139,12 +264,73 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMapingApplyID ID) {
case Sgpr64:
case Vgpr64:
return LLT::scalar(64);
+ case SgprP1:
+ case VgprP1:
+ return LLT::pointer(1, 64);
+ case SgprP3:
+ case VgprP3:
+ return LLT::pointer(3, 32);
+ case SgprP4:
+ case VgprP4:
+ return LLT::pointer(4, 64);
+ case SgprP5:
+ case VgprP5:
+ return LLT::pointer(5, 32);
case SgprV4S32:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
- case VgprP1:
- return LLT::pointer(1, 64);
+ default:
+ return LLT();
+ }
+}
+
+LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMapingApplyID ID, LLT Ty) {
+ switch (ID) {
+ case SgprB32:
+ case VgprB32:
+ case UniInVgprB32:
+ if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
+ Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) ||
+ Ty == LLT::pointer(6, 32))
+ return Ty;
+ return LLT();
+ case SgprB64:
+ case VgprB64:
+ case UniInVgprB64:
+ if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
+ Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(0, 64) ||
+ Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64))
+ return Ty;
+ return LLT();
+ case SgprB96:
+ case VgprB96:
+ case UniInVgprB96:
+ if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
+ Ty == LLT::fixed_vector(6, 16))
+ return Ty;
+ return LLT();
+ case SgprB128:
+ case VgprB128:
+ case UniInVgprB128:
+ if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
+ Ty == LLT::fixed_vector(2, 64))
+ return Ty;
+ return LLT();
+ case SgprB256:
+ case VgprB256:
+ case UniInVgprB256:
+ if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
+ Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
+ return Ty;
+ return LLT();
+ case SgprB512:
+ case VgprB512:
+ case UniInVgprB512:
+ if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
+ Ty == LLT::fixed_vector(8, 64))
+ return Ty;
+ return LLT();
default:
return LLT();
}
@@ -158,10 +344,26 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMapingApplyID ID) {
case Sgpr16:
case Sgpr32:
case Sgpr64:
+ case SgprP1:
+ case SgprP3:
+ case SgprP4:
+ case SgprP5:
case SgprV4S32:
+ case SgprB32:
+ case SgprB64:
+ case SgprB96:
+ case SgprB128:
+ case SgprB256:
+ case SgprB512:
case UniInVcc:
case UniInVgprS32:
case UniInVgprV4S32:
+ case UniInVgprB32:
+ case UniInVgprB64:
+ case UniInVgprB96:
+ case UniInVgprB128:
+ case UniInVgprB256:
+ case UniInVgprB512:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -170,7 +372,16 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMapingApplyID ID) {
case Vgpr32:
case Vgpr64:
case VgprP1:
+ case VgprP3:
+ case VgprP4:
+ case VgprP5:
case VgprV4S32:
+ case VgprB32:
+ case VgprB64:
+ case VgprB96:
+ case VgprB128:
+ case VgprB256:
+ case VgprB512:
return VgprRB;
default:
return nullptr;
@@ -195,16 +406,40 @@ void RegBankLegalizeHelper::applyMappingDst(
case Sgpr16:
case Sgpr32:
case Sgpr64:
+ case SgprP1:
+ case SgprP3:
+ case SgprP4:
+ case SgprP5:
case SgprV4S32:
case Vgpr32:
case Vgpr64:
case VgprP1:
+ case VgprP3:
+ case VgprP4:
+ case VgprP5:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
break;
}
- // uniform in vcc/vgpr: scalars and vectors
+ // sgpr and vgpr B-types
+ case SgprB32:
+ case SgprB64:
+ case SgprB96:
+ case SgprB128:
+ case SgprB256:
+ case SgprB512:
+ case VgprB32:
+ case VgprB64:
+ case VgprB96:
+ case VgprB128:
+ case VgprB256:
+ case VgprB512: {
+ assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
+ assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
+ break;
+ }
+ // uniform in vcc/vgpr: scalars, vectors and B-types
case UniInVcc: {
assert(Ty == S1);
assert(RB == SgprRB);
@@ -224,6 +459,19 @@ void RegBankLegalizeHelper::applyMappingDst(
buildReadAnyLane(B, Reg, NewVgprDst, RBI);
break;
}
+ case UniInVgprB32:
+ case UniInVgprB64:
+ case UniInVgprB96:
+ case UniInVgprB128:
+ case UniInVgprB256:
+ case UniInVgprB512: {
+ assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
+ assert(RB == SgprRB);
+ Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
+ Op.setReg(NewVgprDst);
+ AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
+ break;
+ }
// sgpr trunc
case Sgpr32Trunc: {
assert(Ty.getSizeInBits() < 32);
@@ -272,15 +520,33 @@ void RegBankLegalizeHelper::applyMappingSrc(
case Sgpr16:
case Sgpr32:
case Sgpr64:
+ case SgprP1:
+ case SgprP3:
+ case SgprP4:
+ case SgprP5:
case SgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
assert(RB == getRegBankFromID(MethodIDs[i]));
break;
}
+ // sgpr B-types
+ case SgprB32:
+ case SgprB64:
+ case SgprB96:
+ case SgprB128:
+ case SgprB256:
+ case SgprB512: {
+ assert(Ty == getBTyFromID(MethodIDs[i], Ty));
+ assert(RB == getRegBankFromID(MethodIDs[i]));
+ break;
+ }
// vgpr scalars, pointers and vectors
case Vgpr32:
case Vgpr64:
case VgprP1:
+ case VgprP3:
+ case VgprP4:
+ case VgprP5:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
if (RB != VgprRB) {
@@ -289,6 +555,20 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
+ // vgpr B-types
+ case VgprB32:
+ case VgprB64:
+ case VgprB96:
+ case VgprB128:
+ case VgprB256:
+ case VgprB512: {
+ assert(Ty == getBTyFromID(MethodIDs[i], Ty));
+ if (RB != VgprRB) {
+ auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
+ Op.setReg(CopyToVgpr.getReg(0));
+ }
+ break;
+ }
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.
@@ -361,7 +641,7 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
// We accept all types that can fit in some register class.
// Uniform G_PHIs have all sgpr registers.
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
- if (Ty == LLT::scalar(32)) {
+ if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) {
return;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index d76896dc31310a..63d35907109c9f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -90,6 +90,7 @@ class RegBankLegalizeHelper {
SmallSet<Register, 4> &SgprOperandRegs);
LLT getTyFromID(RegBankLLTMapingApplyID ID);
+ LLT getBTyFromID(RegBankLLTMapingApplyID ID, LLT Ty);
const RegisterBank *getRegBankFromID(RegBankLLTMapingApplyID ID);
@@ -102,6 +103,10 @@ class RegBankLegalizeHelper {
const SmallVectorImpl<RegBankLLTMapingApplyID> &MethodIDs,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
+ void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown,
+ LLT MergeTy = LLT());
+ void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT());
+
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index ed2e3458eea452..cb3c561bb7411e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -14,9 +14,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPURegBankLegalizeRules.h"
+#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-regbanklegalize"
@@ -49,6 +51,24 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(64);
case P1:
return MRI.getType(Reg) == LLT::pointer(1, 64);
+ case P3:
+ return MRI.getType(Reg) == LLT::pointer(3, 32);
+ case P4:
+ return MRI.getType(Reg) == LLT::pointer(4, 64);
+ case P5:
+ return MRI.getType(Reg) == LLT::pointer(5, 32);
+ case B32:
+ return MRI.getType(Reg).getSizeInBits() == 32;
+ case B64:
+ return MRI.getType(Reg).getSizeInBits() == 64;
+ case B96:
+ return MRI.getType(Reg).getSizeInBits() == 96;
+ case B128:
+ return MRI.getType(Reg).getSizeInBits() == 128;
+ case B256:
+ return MRI.getType(Reg).getSizeInBits() == 256;
+ case B512:
+ return MRI.getType(Reg).getSizeInBits() == 512;
case UniS1:
return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg);
case UniS16:
@@ -57,6 +77,26 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
case UniS64:
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
+ case UniP1:
+ return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
+ case UniP3:
+ return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg);
+ case UniP4:
+ return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
+ case UniP5:
+ return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+ case UniB32:
+ return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
+ case UniB64:
+ return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg);
+ case UniB96:
+ return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg);
+ case UniB128:
+ return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg);
+ case UniB256:
+ return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
+ case UniB512:
+ return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
case DivS1:
return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
case DivS32:
@@ -65,6 +105,24 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
case DivP1:
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
+ case DivP3:
+ return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg);
+ case DivP4:
+ return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
+ case DivP5:
+ return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
+ case DivB32:
+ return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
+ case DivB64:
+ return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg);
+ case DivB96:
+ return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg);
+ case DivB128:
+ return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg);
+ case DivB256:
+ return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
+ case DivB512:
+ return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
case _:
return true;
default:
@@ -122,6 +180,22 @@ UniformityLLTOpPredicateID LLTToId(LLT Ty) {
return _;
}
+UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
+ if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
+ Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) ||
+ Ty == LLT::pointer(6, 32))
+ return B32;
+ if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
+ Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(1, 64) ||
+ Ty == LLT::pointer(4, 64))
+ return B64;
+ if (Ty == LLT::fixed_vector(3, 32))
+ return B96;
+ if (Ty == LLT::fixed_vector(4, 32))
+ return B128;
+ return _;
+}
+
const RegBankLLTMapping &
SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
const MachineRegisterInfo &MRI,
@@ -132,7 +206,12 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
// returned which results in failure, does not search "Slow Rules".
if (FastTypes != NoFastRules) {
Register Reg = MI.getOperand(0).getReg();
- int Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
+ int Slot;
+ if (FastTypes == StandardB)
+ Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
+ else
+ Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
+
if (Slot != -1)
return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot];
}
@@ -181,7 +260,22 @@ int SetOfRulesForOpcode::getFastPredicateSlot(
default:
return -1;
}
- case Vector:
+ }
+ case StandardB: {
+ switch (Ty) {
+ case B32:
+ return 0;
+ case B64:
+ return 1;
+ case B96:
+ return 2;
+ case B128:
+ return 3;
+ default:
+ return -1;
+ }
+ }
+ case Vector: {
switch (Ty) {
case S32:
return 0;
@@ -233,6 +327,95 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
return GRules.at(GRulesAlias.at(Opc));
}
+// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
+class Predicate {
+private:
+ struct Elt {
+ // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
+ // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
+ // Sequences of && and || will be represented by jumps, for example:
+ // (A && B && ... X) or (A && B && ... X) || Y
+ // A == true jump to B
+ // A == false jump to end or Y, result is A(false) or Y
+ // (A || B || ... X) or (A || B || ... X) && Y
+ // A == true jump to end or Y, result is A(true) or Y
+ // A == false jump to B
+ // Notice that when negating expression, we simply flip Neg on each Pred
+ // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
+ std::function<bool(const MachineInstr &)> Pred;
+ bool Neg; // Neg of Pred is calculated before jump
+ unsigned TJumpOffset;
+ unsigned FJumpOffset;
+ };
+
+ SmallVector<Elt, 8> Expression;
+
+ Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };
+
+public:
+ Predicate(std::function<bool(const MachineInstr &)> Pred) {
+ Expression.push_back({Pred, false, 1, 1});
+ };
+
+ bool operator()(const MachineInstr &MI) const {
+ unsigned Idx = 0;
+ unsigned ResultIdx = Expression.size();
+ bool Result;
+ do {
+ Result = Expression[Idx].Pred(MI);
+ Result = Expression[Idx].Neg ? !Result : Result;
+ if (Result) {
+ Idx += Expression[Idx].TJumpOffset;
+ } else {
+ Idx += Expression[Idx].FJumpOffset;
+ }
+ } while ((Idx != ResultIdx));
+
+ return Result;
+ };
+
+ Predicate operator!() const {
+ SmallVector<Elt, 8> NegExpression;
+ for (const Elt &ExprElt : Expression) {
+ NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
+ ExprElt.TJumpOffset});
+ }
+ return Predicate(std::move(NegExpression));
+ };
+
+ Predicate operator&&(const Predicate &RHS) const {
+ SmallVector<Elt, 8> AndExpression = Expression;
+
+ unsigned RHSSize = RHS.Expression.size();
+ unsigned ResultIdx = Expression.size();
+ for (unsigned i = 0; i < ResultIdx; ++i) {
+ // LHS results in false, whole expression results in false.
+ if (i + AndExpression[i].FJumpOffset == ResultIdx)
+ AndExpression[i].FJumpOffset += RHSSize;
+ }
+
+ AndExpression.append(RHS.Expression);
+
+ return Predicate(std::move(AndExpression));
+ }
+
+ Predicate operator||(const Predicate &RHS) const {
+ SmallVector<Elt, 8> OrExpression = Expression;
+
+ unsigned RHSSize = RHS.Expression.size();
+ unsigned ResultIdx = Expression.size();
+ for (unsigned i = 0; i < ResultIdx; ++i) {
+ // LHS results in true, whole expression results in true.
+ if (i + OrExpression[i].TJumpOffset == ResultIdx)
+ OrExpression[i].TJumpOffset += RHSSize;
+ }
+
+ OrExpression.append(RHS.Expression);
+
+ return Predicate(std::move(OrExpression));
+ }
+};
+
// Initialize rules
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
MachineRegisterInfo &_MRI)
@@ -244,10 +427,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
- addRulesForGOpcs({G_XOR, G_OR, G_AND}, Standard)
+ addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
.Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
- .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, SplitTo32});
+ .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
addRulesForGOpcs({G_SHL}, Standard)
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
@@ -272,9 +455,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
- addRulesForGOpcs({G_SELECT}, Standard)
- .Div(S32, {{Vgpr32}, {Vcc, Vgpr32, Vgpr32}})
- .Uni(S32, {{Sgpr32}, {Sgpr32AExtBoolInReg, Sgpr32, Sgpr32}});
+ addRulesForGOpcs({G_SELECT}, StandardB)
+ .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
+ .Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}});
addRulesForGOpcs({G_ANYEXT}).Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}});
@@ -290,7 +473,86 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
- addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}});
+ bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
+ bool hasSMRDSmall = ST->hasScalarSubwordLoads();
+
+ Predicate isAlign16([](const MachineInstr &MI) -> bool {
+ return (*MI.memoperands_begin())->getAlign() >= Align(16);
+ });
+
+ Predicate isAlign4([](const MachineInstr &MI) -> bool {
+ return (*MI.memoperands_begin())->getAlign() >= Align(4);
+ });
+
+ Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
+ return (*MI.memoperands_begin())->isAtomic();
+ });
+
+ Predicate isUniMMO([](const MachineInstr &MI) -> bool {
+ return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin());
+ });
+
+ Predicate isConst([](const MachineInstr &MI) -> bool {
+    // Address space in the MMO may differ from the address space of the pointer.
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned AS = MMO->getAddrSpace();
+ return AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+ });
+
+ Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
+ return (*MI.memoperands_begin())->isVolatile();
+ });
+
+ Predicate isInvMMO([](const MachineInstr &MI) -> bool {
+ return (*MI.memoperands_begin())->isInvariant();
+ });
+
+ Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
+ return (*MI.memoperands_begin())->getFlags() & MONoClobber;
+ });
+
+ Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned MemSize = 8 * MMO->getSize().getValue();
+ return (MemSize == 16 && MMO->getAlign() >= Align(2)) ||
+ (MemSize == 8 && MMO->getAlign() >= Align(1));
+ });
+
+ auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
+ (isConst || isInvMMO || isNoClobberMMO);
+
+ // clang-format off
+ addRulesForGOpcs({G_LOAD})
+ .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
+ .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
+ .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
+ .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
+ .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
+
+ .Any({{DivB32, UniP3}, {{VgprB32}, {VgprP3}}})
+ .Any({{{UniB32, UniP3}, isAlign4 && isUL}, {{SgprB32}, {SgprP3}}})
+ .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
+
+ .Any({{{DivB256, DivP4}}, {{VgprB256}, {VgprP4}, SplitLoad}})
+ .Any({{{UniB32, UniP4}, isNaturalAlignedSmall && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // i8 and i16 load
+ .Any({{{UniB32, UniP4}, isAlign4 && isUL}, {{SgprB32}, {SgprP4}}})
+ .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
+ .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
+ .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
+ .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
+ .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
+ .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
+ .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}})
+ .Any({{{UniB256, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP4}, SplitLoad}})
+ .Any({{{UniB512, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP4}, SplitLoad}})
+
+ .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}});
+
+ addRulesForGOpcs({G_ZEXTLOAD}) // i8 and i16 zero-extending loads
+ .Any({{{UniB32, UniP3}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP3}}})
+ .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
+ // clang-format on
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
.Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index afd953377fcd9e..1468cbedc784b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -42,14 +42,47 @@ enum UniformityLLTOpPredicateID {
// pointers
P1,
+ P3,
+ P4,
+ P5,
+
+ UniP1,
+ UniP3,
+ UniP4,
+ UniP5,
DivP1,
+ DivP3,
+ DivP4,
+ DivP5,
// vectors
V2S16,
V2S32,
V3S32,
V4S32,
+
+ // B types
+ B32,
+ B64,
+ B96,
+ B128,
+ B256,
+ B512,
+
+ UniB32,
+ UniB64,
+ UniB96,
+ UniB128,
+ UniB256,
+ UniB512,
+
+ DivB32,
+ DivB64,
+ DivB96,
+ DivB128,
+ DivB256,
+ DivB512,
};
// How to apply register bank on register operand.
@@ -67,18 +100,43 @@ enum RegBankLLTMapingApplyID {
Sgpr16,
Sgpr32,
Sgpr64,
+ SgprP1,
+ SgprP3,
+ SgprP4,
+ SgprP5,
SgprV4S32,
+ SgprB32,
+ SgprB64,
+ SgprB96,
+ SgprB128,
+ SgprB256,
+ SgprB512,
// vgpr scalars, pointers, vectors and B-types
Vgpr32,
Vgpr64,
VgprP1,
+ VgprP3,
+ VgprP4,
+ VgprP5,
+ VgprB32,
+ VgprB64,
+ VgprB96,
+ VgprB128,
+ VgprB256,
+ VgprB512,
VgprV4S32,
// Dst only modifiers: read-any-lane and truncs
UniInVcc,
UniInVgprS32,
UniInVgprV4S32,
+ UniInVgprB32,
+ UniInVgprB64,
+ UniInVgprB96,
+ UniInVgprB128,
+ UniInVgprB256,
+ UniInVgprB512,
Sgpr32Trunc,
@@ -100,12 +158,15 @@ enum LoweringMethodID {
SplitTo32,
Ext32To64,
UniCstExt,
+ SplitLoad,
+ WidenLoad,
};
enum FastRulesTypes {
NoFastRules,
- Standard, // S16, S32, S64, V2S16
- Vector, // S32, V2S32, V3S32, V4S32
+ Standard, // S16, S32, S64, V2S16
+ StandardB, // B32, B64, B96, B128
+ Vector, // S32, V2S32, V3S32, V4S32
};
struct RegBankLLTMapping {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index b66560710e37e2..bf1dcad80e4ec7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -1,7 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -check-prefixes=GCN,GFX7
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s -check-prefixes=GCN,GFX7
-# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX7
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12
--- |
define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) {
@@ -110,6 +109,7 @@
---
name: load_global_v8i32_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -119,11 +119,21 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v8i32 + 16, basealign 32, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.global.not.uniform.v8i32)
...
@@ -131,6 +141,7 @@ body: |
---
name: load_global_v4i64_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -141,11 +152,29 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v4i64 + 16, basealign 32, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
+ ; GCN-NEXT: [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
+ ; GCN-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32)
+ ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32)
+ ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.global.not.uniform.v4i64)
...
@@ -153,6 +182,7 @@ body: |
---
name: load_global_v16i32_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -162,17 +192,35 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 16, basealign 64, addrspace 1)
; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 1)
; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<4 x s32>) from %ir.global.not.uniform.v16i32 + 48, basealign 64, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<16 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32), [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32), [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32), [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32), [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.global.not.uniform.v16i32)
...
@@ -180,6 +228,7 @@ body: |
---
name: load_global_v8i64_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -189,17 +238,51 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 16, basealign 64, addrspace 1)
; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 1)
; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY1]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p1) :: (load (<2 x s64>) from %ir.global.not.uniform.v8i64 + 48, basealign 64, addrspace 1)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
+ ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
+ ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32)
+ ; GCN-NEXT: [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]]
+ ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32)
+ ; GCN-NEXT: [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]]
+ ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
+ ; GCN-NEXT: [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV4]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV16]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV17]]
+ ; GCN-NEXT: [[MV4:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32)
+ ; GCN-NEXT: [[UV18:%[0-9]+]]:vgpr(s32), [[UV19:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV5]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV18]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV19]]
+ ; GCN-NEXT: [[MV5:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32)
+ ; GCN-NEXT: [[UV20:%[0-9]+]]:vgpr(s32), [[UV21:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV6]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV20]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV21]]
+ ; GCN-NEXT: [[MV6:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32)
+ ; GCN-NEXT: [[UV22:%[0-9]+]]:vgpr(s32), [[UV23:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV7]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV22]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV23]]
+ ; GCN-NEXT: [[MV7:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32)
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
%0:_(p1) = COPY $sgpr0_sgpr1
%1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.global.not.uniform.v8i64)
...
@@ -207,6 +290,7 @@ body: |
---
name: load_global_v8i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -223,6 +307,7 @@ body: |
---
name: load_global_v4i64_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -239,6 +324,7 @@ body: |
---
name: load_global_v16i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -255,6 +341,7 @@ body: |
---
name: load_global_v8i64_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -271,6 +358,7 @@ body: |
---
name: load_constant_v8i32_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -280,11 +368,21 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v8i32 + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>) from %ir.constant.not.uniform.v8i32)
...
@@ -292,6 +390,7 @@ body: |
---
name: load_constant_i256_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -301,11 +400,21 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY1]](p4) :: (load (s128) from %ir.constant.not.uniform, align 32, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load (s128) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s256)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s256) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s256) = G_LOAD %0 :: (load (s256) from %ir.constant.not.uniform)
...
@@ -313,6 +422,7 @@ body: |
---
name: load_constant_v16i16_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -323,11 +433,21 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY1]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform, align 32, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load (<8 x s16>) from %ir.constant.not.uniform + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>), [[UV2:%[0-9]+]]:vgpr(<2 x s16>), [[UV3:%[0-9]+]]:vgpr(<2 x s16>), [[UV4:%[0-9]+]]:vgpr(<2 x s16>), [[UV5:%[0-9]+]]:vgpr(<2 x s16>), [[UV6:%[0-9]+]]:vgpr(<2 x s16>), [[UV7:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:sgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_READANYLANE]](<2 x s16>), [[AMDGPU_READANYLANE1]](<2 x s16>), [[AMDGPU_READANYLANE2]](<2 x s16>), [[AMDGPU_READANYLANE3]](<2 x s16>), [[AMDGPU_READANYLANE4]](<2 x s16>), [[AMDGPU_READANYLANE5]](<2 x s16>), [[AMDGPU_READANYLANE6]](<2 x s16>), [[AMDGPU_READANYLANE7]](<2 x s16>)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<16 x s16>) = G_LOAD %0 :: (load (<16 x s16>) from %ir.constant.not.uniform)
...
@@ -335,6 +455,7 @@ body: |
---
name: load_constant_v4i64_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -344,11 +465,29 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v4i64 + 16, basealign 32, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
+ ; GCN-NEXT: [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
+ ; GCN-NEXT: [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32)
+ ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32)
+ ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>) from %ir.constant.not.uniform.v4i64)
...
@@ -356,6 +495,7 @@ body: |
---
name: load_constant_v16i32_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -365,17 +505,35 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 16, basealign 64, addrspace 4)
; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 32, align 32, basealign 64, addrspace 4)
; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<4 x s32>) from %ir.constant.not.uniform.v16i32 + 48, basealign 64, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV4]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV5]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV6]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV7]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]]
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<16 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32), [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32), [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32), [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32), [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32), [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32), [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>) from %ir.constant.not.uniform.v16i32)
...
@@ -383,6 +541,7 @@ body: |
---
name: load_constant_v8i64_non_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -392,17 +551,51 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
- ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
+ ; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
; GCN-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
- ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 16, basealign 64, addrspace 4)
; GCN-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
- ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C1]](s64)
; GCN-NEXT: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 32, align 32, basealign 64, addrspace 4)
; GCN-NEXT: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
- ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
+ ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY1]], [[C2]](s64)
; GCN-NEXT: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load (<2 x s64>) from %ir.constant.not.uniform.v8i64 + 48, basealign 64, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>)
+ ; GCN-NEXT: [[UV:%[0-9]+]]:vgpr(s64), [[UV1:%[0-9]+]]:vgpr(s64), [[UV2:%[0-9]+]]:vgpr(s64), [[UV3:%[0-9]+]]:vgpr(s64), [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64), [[UV6:%[0-9]+]]:vgpr(s64), [[UV7:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
+ ; GCN-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV8]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV9]]
+ ; GCN-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
+ ; GCN-NEXT: [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV1]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV10]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV11]]
+ ; GCN-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32)
+ ; GCN-NEXT: [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV2]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE4:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV12]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE5:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV13]]
+ ; GCN-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE4]](s32), [[AMDGPU_READANYLANE5]](s32)
+ ; GCN-NEXT: [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV3]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE6:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV14]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE7:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV15]]
+ ; GCN-NEXT: [[MV3:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE6]](s32), [[AMDGPU_READANYLANE7]](s32)
+ ; GCN-NEXT: [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV4]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE8:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV16]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE9:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV17]]
+ ; GCN-NEXT: [[MV4:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE8]](s32), [[AMDGPU_READANYLANE9]](s32)
+ ; GCN-NEXT: [[UV18:%[0-9]+]]:vgpr(s32), [[UV19:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV5]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE10:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV18]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE11:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV19]]
+ ; GCN-NEXT: [[MV5:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE10]](s32), [[AMDGPU_READANYLANE11]](s32)
+ ; GCN-NEXT: [[UV20:%[0-9]+]]:vgpr(s32), [[UV21:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV6]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE12:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV20]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE13:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV21]]
+ ; GCN-NEXT: [[MV6:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE12]](s32), [[AMDGPU_READANYLANE13]](s32)
+ ; GCN-NEXT: [[UV22:%[0-9]+]]:vgpr(s32), [[UV23:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV7]](s64)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE14:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV22]]
+ ; GCN-NEXT: [[AMDGPU_READANYLANE15:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV23]]
+ ; GCN-NEXT: [[MV7:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE14]](s32), [[AMDGPU_READANYLANE15]](s32)
+ ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>) from %ir.constant.not.uniform.v8i64)
...
@@ -410,6 +603,7 @@ body: |
---
name: load_constant_v8i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -426,6 +620,7 @@ body: |
---
name: load_constant_v16i16_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -442,6 +637,7 @@ body: |
---
name: load_constant_v4i64_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -458,6 +654,7 @@ body: |
---
name: load_constant_v16i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -474,6 +671,7 @@ body: |
---
name: load_constant_v8i64_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -490,6 +688,7 @@ body: |
---
name: load_local_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
@@ -500,6 +699,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load (s32), addrspace 3)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 3)
@@ -507,6 +707,7 @@ body: |
---
name: load_region_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
@@ -525,6 +726,7 @@ body: |
---
name: extload_constant_i8_to_i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -535,6 +737,7 @@ body: |
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
;
; GFX12-LABEL: name: extload_constant_i8_to_i32_uniform
; GFX12: liveins: $sgpr0_sgpr1
@@ -548,6 +751,7 @@ body: |
---
name: extload_global_i8_to_i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -559,6 +763,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s8), addrspace 1, align 1)
...
@@ -566,6 +771,7 @@ body: |
---
name: extload_constant_i16_to_i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -577,6 +783,7 @@ body: |
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; GFX7-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; GFX7-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
;
; GFX12-LABEL: name: extload_constant_i16_to_i32_uniform
; GFX12: liveins: $sgpr0_sgpr1
@@ -590,6 +797,7 @@ body: |
---
name: extload_global_i16_to_i32_uniform
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -601,6 +809,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s16), addrspace 1, align 2)
...
@@ -608,6 +817,7 @@ body: |
---
name: load_constant_i32_uniform_align4
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -624,6 +834,7 @@ body: |
---
name: load_constant_i32_uniform_align2
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -635,6 +846,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 2, addrspace 4)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 2)
...
@@ -642,6 +854,7 @@ body: |
---
name: load_constant_i32_uniform_align1
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -653,6 +866,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; GCN-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_LOAD %0 :: (load (s32), addrspace 4, align 1)
...
@@ -660,6 +874,7 @@ body: |
---
name: load_private_uniform_sgpr_i32
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -706,10 +921,10 @@ body: |
; GCN-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
- ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY $vgpr2_vgpr3
; GCN-NEXT: G_BR %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
@@ -721,14 +936,14 @@ body: |
; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64)
; GCN-NEXT: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load (<4 x s32>) from unknown-address + 16, addrspace 4)
; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4)
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr(p4) = COPY [[COPY1]](p4)
; GCN-NEXT: G_BR %bb.1
bb.0:
- liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- %0:_(p4) = COPY $sgpr0_sgpr1
- %1:_(p4) = COPY $sgpr2_sgpr3
+ %0:_(p4) = COPY $vgpr0_vgpr1
+ %1:_(p4) = COPY $vgpr2_vgpr3
G_BR %bb.1
bb.1:
@@ -741,6 +956,7 @@ body: |
---
name: load_constant_v3i32_align4
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -771,6 +987,7 @@ body: |
---
name: load_constant_v3i32_align8
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -801,6 +1018,7 @@ body: |
---
name: load_constant_v3i32_align16
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -828,6 +1046,7 @@ body: |
---
name: load_constant_v6i16_align4
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -840,10 +1059,9 @@ body: |
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, addrspace 4)
- ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
- ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>)
- ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
- ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+ ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[LOAD1]](<2 x s16>)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
;
; GFX12-LABEL: name: load_constant_v6i16_align4
; GFX12: liveins: $sgpr0_sgpr1
@@ -859,6 +1077,7 @@ body: |
---
name: load_constant_v6i16_align8
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -871,10 +1090,9 @@ body: |
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 8
; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; GFX7-NEXT: [[LOAD1:%[0-9]+]]:sgpr(<2 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (invariant load (<2 x s16>) from unknown-address + 8, align 8, addrspace 4)
- ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
- ; GFX7-NEXT: [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD1]](<2 x s16>)
- ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
- ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+ ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[LOAD1]](<2 x s16>)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
;
; GFX12-LABEL: name: load_constant_v6i16_align8
; GFX12: liveins: $sgpr0_sgpr1
@@ -890,6 +1108,7 @@ body: |
---
name: load_constant_v6i16_align16
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -899,9 +1118,9 @@ body: |
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (invariant load (<8 x s16>), addrspace 4)
- ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16), [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16), [[UV4:%[0-9]+]]:sgpr(s16), [[UV5:%[0-9]+]]:sgpr(s16), [[UV6:%[0-9]+]]:sgpr(s16), [[UV7:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
- ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16)
- ; GFX7-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s16>)
+ ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(<2 x s16>), [[UV1:%[0-9]+]]:sgpr(<2 x s16>), [[UV2:%[0-9]+]]:sgpr(<2 x s16>), [[UV3:%[0-9]+]]:sgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>), [[UV2]](<2 x s16>)
+ ; GFX7-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
;
; GFX12-LABEL: name: load_constant_v6i16_align16
; GFX12: liveins: $sgpr0_sgpr1
@@ -917,6 +1136,7 @@ body: |
---
name: load_constant_i96_align4
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -947,6 +1167,7 @@ body: |
---
name: load_constant_i96_align8
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
@@ -977,6 +1198,7 @@ body: |
---
name: load_constant_i96_align16
legalized: true
+tracksRegLiveness: true
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
index 821b78f8810b6f..29db4cf9eedf5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-zextload.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -verify-machineinstrs -o - %s | FileCheck %s
---
name: zextload_constant_i8_to_i32_uniform
@@ -15,6 +14,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 4)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 4, align 1)
...
@@ -33,6 +33,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s8), addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 1, align 1)
...
@@ -51,6 +52,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 4)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 4, align 2)
...
@@ -69,6 +71,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4)
; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p4) :: (load (s16), addrspace 1)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p4) = COPY $sgpr0_sgpr1
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 1, align 2)
...
@@ -86,6 +89,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p3) :: (load (s8), addrspace 3)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s8), addrspace 3, align 1)
...
@@ -104,6 +108,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3)
; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p3) :: (load (s16), addrspace 3)
+ ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[ZEXTLOAD]]
%0:_(p3) = COPY $sgpr0
%1:_(s32) = G_ZEXTLOAD %0 :: (load (s16), addrspace 3, align 2)
...
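
For reference, the G_AMDGPU_READANYLANE sequences in the checks above all follow one shape: a uniform load that the rules route through VGPRs is copied back to the SGPR bank by unmerging the loaded value into 32-bit pieces, reading each piece with G_AMDGPU_READANYLANE, and re-merging the SGPR pieces. The C++ below is a minimal sketch of that copy-back step for a single s64 value, written only with MachineIRBuilder/MachineRegisterInfo calls; the helper name readAnyLaneS64 is illustrative and is not a function from the patch.

// Illustrative sketch only; not the patch's actual helper.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBank.h"
#include <cassert>

using namespace llvm;

// Copy a lane-uniform s64 value produced in VGPRs back to SGPRs:
// unmerge into s32 pieces, read each piece with G_AMDGPU_READANYLANE,
// then merge the SGPR pieces into the final s64 (the pattern the
// CHECK lines above expect). AMDGPU::G_AMDGPU_READANYLANE is the
// target-generated opcode used in those checks.
static Register readAnyLaneS64(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const RegisterBank &SgprRB, Register VgprSrc) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  assert(MRI.getType(VgprSrc) == S64 && "sketch handles s64 only");

  // %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
  auto Unmerge = B.buildUnmerge(S32, VgprSrc);

  SmallVector<Register, 2> SgprParts;
  for (unsigned I = 0; I < 2; ++I) {
    Register Part = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(Part, SgprRB);
    // %part:sgpr(s32) = G_AMDGPU_READANYLANE %piece:vgpr(s32)
    B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {Part}, {Unmerge.getReg(I)});
    SgprParts.push_back(Part);
  }

  // %dst:sgpr(s64) = G_MERGE_VALUES %lo:sgpr(s32), %hi:sgpr(s32)
  Register Dst = MRI.createGenericVirtualRegister(S64);
  MRI.setRegBank(Dst, SgprRB);
  B.buildMergeLikeInstr(Dst, SgprParts);
  return Dst;
}

Wider results such as the <4 x s64> and <8 x s64> cases in the checks simply repeat this per element before the final G_BUILD_VECTOR, and plain s32 loads need only the single G_AMDGPU_READANYLANE with no unmerge/merge around it.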