[llvm] r365586 - AMDGPU/GlobalISel: Add support for wide loads >= 256-bits
Tom Stellard via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 9 17:22:41 PDT 2019
Author: tstellar
Date: Tue Jul 9 17:22:41 2019
New Revision: 365586
URL: http://llvm.org/viewvc/llvm-project?rev=365586&view=rev
Log:
AMDGPU/GlobalISel: Add support for wide loads >= 256-bits
Summary:
This adds support for the most commonly used wide load types:
<8xi32>, <16xi32>, <4xi64>, and <8xi64>
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: hiraditya, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, kristof.beyls, dstuttard, tpr, t-tye, volkan, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D57399
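As a rough sketch of the splitting strategy (illustrative C++ only, not code from this patch): loads wider than 128 bits with a divergent (VGPR) pointer are broken into 128-bit pieces, since 128 bits is the widest non-SMRD load. MaxNonSmrdLoadSize mirrors the constant used in applyMappingWideLoad below; the other names are hypothetical.

#include <cassert>
#include <cstdio>

static const unsigned MaxNonSmrdLoadSize = 128; // bits; widest non-SMRD load

struct Split { unsigned NumLoads, EltsPerLoad; };

// Compute how a <NumElts x sScalarBits> load is broken into 128-bit pieces.
static Split splitWideLoad(unsigned NumElts, unsigned ScalarBits) {
  unsigned LoadSize = NumElts * ScalarBits;
  assert(LoadSize > MaxNonSmrdLoadSize && LoadSize % MaxNonSmrdLoadSize == 0);
  return {LoadSize / MaxNonSmrdLoadSize, MaxNonSmrdLoadSize / ScalarBits};
}

int main() {
  Split S = splitWideLoad(8, 32);  // <8 x s32>, 256 bits
  std::printf("%u loads of <%u x s32>\n", S.NumLoads, S.EltsPerLoad); // 2 loads of <4 x s32>
  S = splitWideLoad(8, 64);        // <8 x s64>, 512 bits
  std::printf("%u loads of <%u x s64>\n", S.NumLoads, S.EltsPerLoad); // 4 loads of <2 x s64>
}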
Added:
llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def?rev=365586&r1=365585&r2=365586&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def Tue Jul 9 17:22:41 2019
@@ -161,5 +161,77 @@ const RegisterBankInfo::ValueMapping *ge
return &ValMappingsSGPR64OnlyVGPR32[2];
}
+const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
+ /* 256-bit load */ {0, 256, SGPRRegBank},
+ /* 512-bit load */ {0, 512, SGPRRegBank},
+ /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+ {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+ {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+ {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+ /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+ {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+ {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+ {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+ {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
+ {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
+ {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
+ {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
+ /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+ {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+ /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+ {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+ {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
+ {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
+
+ /* FIXME: The generic register bank select does not support complex
+ * break downs where the number of vector elements does not equal the
+ * number of breakdowns.
+ * FIXME: register bank select now tries to handle complex break downs,
+ * but it emits an illegal instruction:
+ * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
+ */
+ /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+ /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+ {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
+ /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1},
+ /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1},
+ /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8},
+ /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
+ /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4},
+ /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8}
+};
+
+const RegisterBankInfo::ValueMapping *
+getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
+ unsigned Size = SizeTy.getSizeInBits();
+ if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
+ return getValueMapping(BankID, Size);
+
+ assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
+
+  // Default to using the non-split ValueMappings; these are used if the
+  // register bank is SGPR or if we don't know how to handle the vector
+  // type.
+ unsigned Idx = Size == 256 ? 0 : 1;
+
+ // We need to split this load if it has a vgpr pointer.
+ if (BankID == AMDGPU::VGPRRegBankID) {
+ if (SizeTy == LLT::vector(8, 32))
+ Idx = 2;
+ else if (SizeTy == LLT::vector(16, 32))
+ Idx = 3;
+ else if (SizeTy == LLT::vector(4, 64))
+ Idx = 4;
+ else if (SizeTy == LLT::vector(8, 64))
+ Idx = 5;
+ }
+
+ return &ValMappingsLoadSGPROnly[Idx];
+}
+
+
} // End AMDGPU namespace.
} // End llvm namespace.
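For readers unfamiliar with the break-down tables above, here is a small standalone mock (plain structs and made-up names, not the LLVM types) of how each ValueMapping entry points at a contiguous run of PartialMapping entries whose parts tile the full value:

#include <cassert>
#include <cstdio>

struct PartialMapping { unsigned StartIdx, Length; };   // bit offset, bit width
struct ValueMapping   { const PartialMapping *BreakDown; unsigned NumBreakDowns; };

static const PartialMapping BreakDowns[] = {
  /* [0] 256-bit */ {0, 256},
  /* [1] 512-bit */ {0, 512},
  /* [2..9] 8 x 32-bit */ {0,32},{32,32},{64,32},{96,32},
                          {128,32},{160,32},{192,32},{224,32},
};

static const ValueMapping Mappings[] = {
  {&BreakDowns[0], 1}, // whole 256-bit value, one part
  {&BreakDowns[2], 8}, // <8 x s32> split into eight 32-bit parts
};

int main() {
  const ValueMapping &V8 = Mappings[1];
  unsigned Total = 0;
  for (unsigned I = 0; I != V8.NumBreakDowns; ++I) {
    assert(V8.BreakDown[I].StartIdx == Total && "parts must tile the value");
    Total += V8.BreakDown[I].Length;
  }
  std::printf("covered %u bits in %u parts\n", Total, V8.NumBreakDowns); // 256 bits, 8 parts
}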
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp?rev=365586&r1=365585&r2=365586&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp Tue Jul 9 17:22:41 2019
@@ -517,7 +517,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo
case 256:
case 512:
- // TODO: constant loads
+    // TODO: Possibly support loads of i256 and i512. This will require
+    // adding i256 and i512 types to MVT in order to be able to use
+    // TableGen.
+ // TODO: Add support for other vector types, this will require
+ // defining more value mappings for the new types.
+ return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
+ Ty0.getScalarType().getSizeInBits() == 64);
+
default:
return false;
}
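A hedged restatement of the new legality rule, with a toy stand-in for LLT (FakeLLT and isLegalWideLoad are hypothetical names, not LLVM API): 256- and 512-bit loads stay legal only when the result is a vector of 32- or 64-bit scalars.

#include <cstdio>

struct FakeLLT { bool IsVector; unsigned NumElts, ScalarBits; };

static bool isLegalWideLoad(const FakeLLT &Ty0) {
  unsigned Bits = (Ty0.IsVector ? Ty0.NumElts : 1) * Ty0.ScalarBits;
  switch (Bits) {
  case 256:
  case 512:
    // Matches the patch: vectors of 32- or 64-bit scalars only.
    return Ty0.IsVector && (Ty0.ScalarBits == 32 || Ty0.ScalarBits == 64);
  default:
    return false; // narrower cases are handled earlier in the real code
  }
}

int main() {
  std::printf("<8 x s32>:  %d\n", isLegalWideLoad({true, 8, 32}));  // 1
  std::printf("<16 x s16>: %d\n", isLegalWideLoad({true, 16, 16})); // 0: 256 bits, 16-bit elts
  std::printf("s256:       %d\n", isLegalWideLoad({false, 1, 256})); // 0: not a vector
}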
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp?rev=365586&r1=365585&r2=365586&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp Tue Jul 9 17:22:41 2019
@@ -37,22 +37,23 @@ using namespace llvm;
namespace {
// Observer to apply a register bank to new registers created by LegalizerHelper.
-class ApplySALUMapping final : public GISelChangeObserver {
+class ApplyRegBankMapping final : public GISelChangeObserver {
private:
MachineRegisterInfo &MRI;
+ const RegisterBank *NewBank;
SmallVector<MachineInstr *, 4> NewInsts;
public:
- ApplySALUMapping(MachineRegisterInfo &MRI_)
- : MRI(MRI_) {}
+ ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
+ : MRI(MRI_), NewBank(RB) {}
- ~ApplySALUMapping() {
+ ~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
- applySALUBank(*MI);
+ applyBank(*MI);
}
/// Set any registers that don't have a set register class or bank to SALU.
- void applySALUBank(MachineInstr &MI) {
+ void applyBank(MachineInstr &MI) {
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
@@ -61,10 +62,13 @@ public:
if (MRI.getRegClassOrRegBank(Reg))
continue;
+ const RegisterBank *RB = NewBank;
// FIXME: This might not be enough to detect when SCC should be used.
- const RegisterBank &RB = MRI.getType(Reg) == LLT::scalar(1) ?
- AMDGPU::SCCRegBank : AMDGPU::SGPRRegBank;
- MRI.setRegBank(Reg, RB);
+ if (MRI.getType(Reg) == LLT::scalar(1))
+ RB = (NewBank == &AMDGPU::SGPRRegBank ?
+ &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
+
+ MRI.setRegBank(Reg, *RB);
}
}
@@ -80,7 +84,6 @@ public:
};
}
-
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
@@ -128,6 +131,12 @@ unsigned AMDGPURegisterBankInfo::copyCos
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const ValueMapping &ValMapping,
const RegisterBank *CurBank) const {
+ // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
+ // VGPR.
+ // FIXME: Is there a better way to do this?
+ if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
+ return 10; // This is expensive.
+
assert(ValMapping.NumBreakDowns == 2 &&
ValMapping.BreakDown[0].Length == 32 &&
ValMapping.BreakDown[0].StartIdx == 0 &&
@@ -302,6 +311,14 @@ AMDGPURegisterBankInfo::getInstrAlternat
}
}
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
@@ -356,29 +373,29 @@ AMDGPURegisterBankInfo::getInstrAlternat
}
case TargetOpcode::G_LOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
// FIXME: Should we be hard coding the size for these mappings?
- const InstructionMapping &SSMapping = getInstructionMapping(
- 1, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
- 2); // Num Operands
- AltMappings.push_back(&SSMapping);
+ if (isInstrUniform(MI)) {
+ const InstructionMapping &SSMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.push_back(&SSMapping);
+ }
const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
- // FIXME: Should this be the pointer-size (64-bits) or the size of the
- // register that will hold the bufffer resourc (128-bits).
- const InstructionMapping &VSMapping = getInstructionMapping(
- 3, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
- 2); // Num Operands
- AltMappings.push_back(&VSMapping);
+    // It may be possible to have a vgpr = load sgpr mapping here, because
+    // the mubuf instructions support this kind of load, but probably only for
+    // gfx7 and older. However, the addressing mode matching in the instruction
+    // selector should be able to do a better job of detecting and selecting
+    // these kinds of loads from the vgpr = load vgpr mapping.
return AltMappings;
@@ -874,6 +891,91 @@ void AMDGPURegisterBankInfo::constrainOp
MI.getOperand(OpIdx).setReg(SGPR);
}
+// When regbankselect repairs registers, it will insert a repair instruction
+// which defines the repaired register. Then it calls applyMapping and expects
+// that the targets will either delete or rewrite the instruction that
+// originally wrote to the repaired registers. Because of this, we end up in a
+// situation where we have 2 instructions defining the same registers.
+static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
+ Register Reg,
+ const MachineInstr &MI) {
+ // Is there some way we can assert that there are exactly 2 def instructions?
+ for (MachineInstr &Other : MRI.def_instructions(Reg)) {
+ if (&Other != &MI)
+ return &Other;
+ }
+
+ return nullptr;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT LoadTy = MRI.getType(DstReg);
+ unsigned LoadSize = LoadTy.getSizeInBits();
+ const unsigned MaxNonSmrdLoadSize = 128;
+ // 128-bit loads are supported for all instruction types.
+ if (LoadSize <= MaxNonSmrdLoadSize)
+ return false;
+
+ SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
+
+ // If the pointer is an SGPR, we have nothing to do.
+ if (SrcRegs.empty())
+ return false;
+
+ assert(LoadSize % MaxNonSmrdLoadSize == 0);
+
+ // We want to get the repair instruction now, because it will help us
+ // determine which instruction the legalizer inserts that will also
+ // write to DstReg.
+ MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
+
+ // RegBankSelect only emits scalar types, so we need to reset the pointer
+ // operand to a pointer type.
+ Register BasePtrReg = SrcRegs[0];
+ LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
+ MRI.setType(BasePtrReg, PtrTy);
+
+ MachineIRBuilder B(MI);
+
+ unsigned SplitElts =
+ MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
+ const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
+ ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+ LegalizerHelper Helper(B.getMF(), Observer, B);
+ if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+
+ // At this point, the legalizer has split the original load into smaller
+ // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
+  // that combines the outputs of the smaller loads and writes the result to
+  // DstReg. The register bank selector has also added the RepairInst, which
+  // writes to DstReg as well.
+
+ MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
+
+ // Replace the output of the LegalizedInst with a temporary register, since
+ // RepairInst already defines DstReg.
+ Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
+ LegalizedInst->getOperand(0).setReg(TmpReg);
+ B.setInsertPt(*RepairInst->getParent(), RepairInst);
+
+ for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
+ Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ B.buildConstant(IdxReg, DefIdx);
+ MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
+ B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
+ }
+
+ MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+ return true;
+}
+
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
@@ -1008,7 +1110,7 @@ void AMDGPURegisterBankInfo::applyMappin
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
- ApplySALUMapping ApplySALU(MRI);
+ ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
@@ -1028,7 +1130,7 @@ void AMDGPURegisterBankInfo::applyMappin
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
- ApplySALUMapping ApplySALU(MRI);
+ ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
@@ -1212,6 +1314,11 @@ void AMDGPURegisterBankInfo::applyMappin
}
break;
}
+ case AMDGPU::G_LOAD: {
+ if (applyMappingWideLoad(MI, OpdMapper, MRI))
+ return;
+ break;
+ }
default:
break;
}
@@ -1219,14 +1326,6 @@ void AMDGPURegisterBankInfo::applyMappin
return applyDefaultMapping(OpdMapper);
}
-static bool isInstrUniform(const MachineInstr &MI) {
- if (!MI.hasOneMemOperand())
- return false;
-
- const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPUInstrInfo::isUniformMMO(MMO);
-}
-
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1322,6 +1421,7 @@ AMDGPURegisterBankInfo::getInstrMappingF
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
const ValueMapping *ValMapping;
@@ -1332,7 +1432,7 @@ AMDGPURegisterBankInfo::getInstrMappingF
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
- ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
// FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
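To make the repair dance in applyMappingWideLoad above easier to follow, here is a minimal mock (illustrative types, names, and register numbers, not the LLVM API) of the rewiring step: the legalizer's combining instruction gets a fresh destination, and per-element extracts then feed the repair registers.

#include <cstdio>
#include <vector>

struct MockInstr { const char *Name; int DefReg; };

int main() {
  int DstReg = 1, TmpReg = 2;
  // After legalization, two instructions define DstReg: the legalizer's
  // combine (e.g. G_CONCAT_VECTORS) and RegBankSelect's repair instruction.
  MockInstr Legalized{"G_CONCAT_VECTORS", DstReg};
  MockInstr Repair{"REPAIR", DstReg};

  // Step 1: redirect the legalized combine to a temporary register so that
  // only the repair instruction still defines DstReg.
  Legalized.DefReg = TmpReg;

  // Step 2: just before the repair instruction, extract each element of the
  // temporary into the per-part registers (DefRegs in the real code).
  std::vector<int> DefRegs = {10, 11, 12, 13};
  for (unsigned I = 0; I != DefRegs.size(); ++I)
    std::printf("%%%d = G_EXTRACT_VECTOR_ELT %%%d, %u\n", DefRegs[I], TmpReg, I);
  (void)Repair; // still defines DstReg; kept as-is
}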
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h?rev=365586&r1=365585&r2=365586&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h Tue Jul 9 17:22:41 2019
@@ -44,6 +44,9 @@ class AMDGPURegisterBankInfo : public AM
void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
unsigned OpIdx) const;
+ bool applyMappingWideLoad(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir?rev=365586&r1=365585&r2=365586&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir Tue Jul 9 17:22:41 2019
@@ -4,6 +4,7 @@
--- |
define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void }
+ define amdgpu_kernel void @smrd_wide() { ret void }
...
---
@@ -155,3 +156,32 @@ body: |
...
---
+
+name: smrd_wide
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+  liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ %0:sgpr(p4) = COPY $sgpr0_sgpr1
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+
+ ; CHECK: [[CONSTANT_PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[GLOBAL_PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK: s_load_dwordx8 [[CONSTANT_PTR]]
+ %2:sgpr(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+ $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %2
+
+ ; CHECK: s_load_dwordx16 [[CONSTANT_PTR]]
+ %3:sgpr(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+ $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %3
+
+ ; CHECK: s_load_dwordx8 [[GLOBAL_PTR]]
+ %4:sgpr(<8 x s32>) = G_LOAD %1 :: (load 32, addrspace 1)
+ $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %4
+
+  ; CHECK: s_load_dwordx16 [[GLOBAL_PTR]]
+ %5:sgpr(<16 x s32>) = G_LOAD %1 :: (load 64, addrspace 1)
+ $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %5
+...
Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir?rev=365586&r1=365585&r2=365586&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir Tue Jul 9 17:22:41 2019
@@ -390,3 +390,33 @@ body: |
%1:_(<3 x s32>) = G_LOAD %0 :: (load 12, addrspace 1, align 16)
$vgpr0_vgpr1_vgpr2 = COPY %1
...
+
+---
+name: test_load_constant_v8i32
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+  ; CHECK-LABEL: name: test_load_constant_v8i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load 32, addrspace 4)
+ ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[LOAD]](<8 x s32>)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+ $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
+...
+
+---
+name: test_load_constant_v16i32
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+  ; CHECK-LABEL: name: test_load_constant_v16i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load 64, addrspace 4)
+ ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[LOAD]](<16 x s32>)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+ $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
+...
Added: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir?rev=365586&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir Tue Jul 9 17:22:41 2019
@@ -0,0 +1,488 @@
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+# REQUIRES: global-isel
+
+--- |
+ define amdgpu_kernel void @load_global_v8i32_non_uniform(<8 x i32> addrspace(1)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %global.not.uniform.v8i32 = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tmp0
+ %tmp2 = load <8 x i32>, <8 x i32> addrspace(1)* %global.not.uniform.v8i32
+ ret void
+ }
+ define amdgpu_kernel void @load_global_v4i64_non_uniform(<4 x i64> addrspace(1)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %global.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tmp0
+ %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %global.not.uniform.v4i64
+ ret void
+ }
+ define amdgpu_kernel void @load_global_v16i32_non_uniform(<16 x i32> addrspace(1)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %global.not.uniform.v16i32 = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tmp0
+ %tmp2 = load <16 x i32>, <16 x i32> addrspace(1)* %global.not.uniform.v16i32
+ ret void
+ }
+ define amdgpu_kernel void @load_global_v8i64_non_uniform(<8 x i64> addrspace(1)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %global.not.uniform.v8i64 = getelementptr <8 x i64>, <8 x i64> addrspace(1)* %in, i32 %tmp0
+ %tmp2 = load <8 x i64>, <8 x i64> addrspace(1)* %global.not.uniform.v8i64
+ ret void
+ }
+ define amdgpu_kernel void @load_global_v8i32_uniform() {ret void}
+ define amdgpu_kernel void @load_global_v4i64_uniform() {ret void}
+ define amdgpu_kernel void @load_global_v16i32_uniform() {ret void}
+ define amdgpu_kernel void @load_global_v8i64_uniform() {ret void}
+ define amdgpu_kernel void @load_constant_v8i32_non_uniform(<8 x i32> addrspace(4)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %constant.not.uniform.v8i32 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp0
+ %tmp2 = load <8 x i32>, <8 x i32> addrspace(4)* %constant.not.uniform.v8i32
+ ret void
+ }
+ define amdgpu_kernel void @load_constant_v4i64_non_uniform(<4 x i64> addrspace(4)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %constant.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(4)* %in, i32 %tmp0
+ %tmp2 = load <4 x i64>, <4 x i64> addrspace(4)* %constant.not.uniform.v4i64
+ ret void
+ }
+ define amdgpu_kernel void @load_constant_v16i32_non_uniform(<16 x i32> addrspace(4)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %constant.not.uniform.v16i32 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp0
+ %tmp2 = load <16 x i32>, <16 x i32> addrspace(4)* %constant.not.uniform.v16i32
+ ret void
+ }
+ define amdgpu_kernel void @load_constant_v8i64_non_uniform(<8 x i64> addrspace(4)* %in) {
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %constant.not.uniform.v8i64 = getelementptr <8 x i64>, <8 x i64> addrspace(4)* %in, i32 %tmp0
+ %tmp2 = load <8 x i64>, <8 x i64> addrspace(4)* %constant.not.uniform.v8i64
+ ret void
+ }
+ define amdgpu_kernel void @load_constant_v8i32_uniform() {ret void}
+ define amdgpu_kernel void @load_constant_v4i64_uniform() {ret void}
+ define amdgpu_kernel void @load_constant_v16i32_uniform() {ret void}
+ define amdgpu_kernel void @load_constant_v8i64_uniform() {ret void}
+ declare i32 @llvm.amdgcn.workitem.id.x() #0
+ attributes #0 = { nounwind readnone }
+...
+
+---
+name: load_global_v8i32_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v8i32_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v8i32 + 16, align 32, addrspace 1)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX3]]
+ ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX4]]
+ ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX5]]
+ ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6
+ ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX6]]
+ ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7
+ ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX7]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v8i32)
+...
+
+---
+name: load_global_v4i64_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v4i64_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v4i64 + 16, align 32, addrspace 1)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX3]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64)
+
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v4i64)
+...
+
+---
+name: load_global_v16i32_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v16i32_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 16, align 64, addrspace 1)
+ ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET32]](s64)
+ ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP32]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 32, align 64, addrspace 1)
+ ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET48]](s64)
+ ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP48]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 48, align 64, addrspace 1)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>), [[LOAD32]](<4 x s32>), [[LOAD48]](<4 x s32>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX3]]
+ ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX4]]
+ ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX5]]
+ ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6
+ ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX6]]
+ ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7
+ ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX7]]
+ ; CHECK: [[IDX8:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8
+ ; CHECK: [[OUT8:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX8]]
+ ; CHECK: [[IDX9:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 9
+ ; CHECK: [[OUT9:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX9]]
+ ; CHECK: [[IDX10:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10
+ ; CHECK: [[OUT10:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX10]]
+ ; CHECK: [[IDX11:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 11
+ ; CHECK: [[OUT11:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX11]]
+ ; CHECK: [[IDX12:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK: [[OUT12:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX12]]
+ ; CHECK: [[IDX13:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 13
+ ; CHECK: [[OUT13:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX13]]
+ ; CHECK: [[IDX14:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 14
+ ; CHECK: [[OUT14:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX14]]
+ ; CHECK: [[IDX15:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 15
+ ; CHECK: [[OUT15:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX15]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32), [[OUT8]](s32), [[OUT9]](s32), [[OUT10]](s32), [[OUT11]](s32), [[OUT12]](s32), [[OUT13]](s32), [[OUT14]](s32), [[OUT15]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v16i32)
+...
+
+---
+name: load_global_v8i64_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v8i64_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 16, align 64, addrspace 1)
+ ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET32]](s64)
+ ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP32]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 32, align 64, addrspace 1)
+ ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET48]](s64)
+ ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP48]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 48, align 64, addrspace 1)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>), [[LOAD32]](<2 x s64>), [[LOAD48]](<2 x s64>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX3]]
+ ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX4]]
+ ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX5]]
+ ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6
+ ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX6]]
+ ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7
+ ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX7]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64), [[OUT4]](s64), [[OUT5]](s64), [[OUT6]](s64), [[OUT7]](s64)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v8i64)
+...
+
+---
+name: load_global_v8i32_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v8i32_uniform
+ ; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (load 32, addrspace 1)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 1)
+...
+
+---
+name: load_global_v4i64_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v4i64_uniform
+ ; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (load 32, addrspace 1)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 1)
+...
+
+---
+name: load_global_v16i32_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v16i32_uniform
+ ; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (load 64, addrspace 1)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 1)
+...
+
+---
+name: load_global_v8i64_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_global_v8i64_uniform
+ ; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (load 64, addrspace 1)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 1)
+...
+
+---
+name: load_constant_v8i32_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v8i32_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32 + 16, align 32, addrspace 4)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX3]]
+ ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX4]]
+ ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX5]]
+ ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6
+ ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX6]]
+ ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7
+ ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX7]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v8i32)
+...
+
+---
+name: load_constant_v4i64_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v4i64_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64 + 16, align 32, addrspace 4)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX3]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v4i64)
+...
+
+---
+name: load_constant_v16i32_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v16i32_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 16, align 64, addrspace 4)
+ ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET32]](s64)
+ ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP32]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 32, align 64, addrspace 4)
+ ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET48]](s64)
+ ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP48]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 48, align 64, addrspace 4)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>), [[LOAD32]](<4 x s32>), [[LOAD48]](<4 x s32>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX3]]
+ ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX4]]
+ ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX5]]
+ ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6
+ ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX6]]
+ ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7
+ ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX7]]
+ ; CHECK: [[IDX8:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8
+ ; CHECK: [[OUT8:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX8]]
+    ; CHECK: [[IDX9:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 9
+ ; CHECK: [[OUT9:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX9]]
+ ; CHECK: [[IDX10:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10
+ ; CHECK: [[OUT10:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX10]]
+ ; CHECK: [[IDX11:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 11
+ ; CHECK: [[OUT11:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX11]]
+ ; CHECK: [[IDX12:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK: [[OUT12:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX12]]
+ ; CHECK: [[IDX13:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 13
+ ; CHECK: [[OUT13:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX13]]
+ ; CHECK: [[IDX14:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 14
+ ; CHECK: [[OUT14:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX14]]
+ ; CHECK: [[IDX15:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 15
+ ; CHECK: [[OUT15:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX15]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32), [[OUT8]](s32), [[OUT9]](s32), [[OUT10]](s32), [[OUT11]](s32), [[OUT12]](s32), [[OUT13]](s32), [[OUT14]](s32), [[OUT15]](s32)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v16i32)
+...
+
+---
+name: load_constant_v8i64_non_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v8i64_non_uniform
+ ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
+ ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+ ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64)
+ ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 16, align 64, addrspace 4)
+ ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+ ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET32]](s64)
+ ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP32]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 32, align 64, addrspace 4)
+ ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+ ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET48]](s64)
+ ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP48]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 48, align 64, addrspace 4)
+ ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>), [[LOAD32]](<2 x s64>), [[LOAD48]](<2 x s64>)
+ ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX0]]
+ ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX1]]
+ ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX2]]
+ ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX3]]
+ ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4
+ ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX4]]
+ ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX5]]
+ ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6
+ ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX6]]
+ ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7
+ ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX7]]
+ ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64), [[OUT4]](s64), [[OUT5]](s64), [[OUT6]](s64), [[OUT7]](s64)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v8i64)
+...
+
+---
+name: load_constant_v8i32_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v8i32_uniform
+    ; CHECK: (<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+...
+
+---
+name: load_constant_v4i64_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v4i64_uniform
+    ; CHECK: (<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 4)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 4)
+...
+
+---
+name: load_constant_v16i32_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v16i32_uniform
+    ; CHECK: (<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+...
+
+---
+name: load_constant_v8i64_uniform
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; CHECK-LABEL: name: load_constant_v8i64_uniform
+    ; CHECK: (<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 4)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 4)
+...