[llvm] 533d650 - AMDGPU/GlobalISel: Move llvm.amdgcn.raw.buffer.store handling
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 11:59:40 PST 2020
Author: Matt Arsenault
Date: 2020-01-27T14:59:30-05:00
New Revision: 533d650e947a2f7216a315aeb8c79ac1d4740e5f
URL: https://github.com/llvm/llvm-project/commit/533d650e947a2f7216a315aeb8c79ac1d4740e5f
DIFF: https://github.com/llvm/llvm-project/commit/533d650e947a2f7216a315aeb8c79ac1d4740e5f.diff
LOG: AMDGPU/GlobalISel: Move llvm.amdgcn.raw.buffer.store handling
Treat this the same way as loads. There's less value to the
intermediate nodes, but it's good to be consistent.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/BUFInstructions.td
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index f314f9064d98..b48db0e8163a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -139,6 +139,11 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>;
def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT_D16, SItbuffer_load_d16>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE, SIbuffer_store>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_SHORT, SIbuffer_store_short>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_BYTE, SIbuffer_store_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT, SIbuffer_store_format>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT_D16, SIbuffer_store_format_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9e7a70caeb4a..93c06745a76f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -856,177 +856,6 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
return Ret;
}
-static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
- int64_t C;
- if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
- return true;
-
- // FIXME: matcher should ignore copies
- return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
-}
-
-static unsigned extractGLC(unsigned AuxiliaryData) {
- return AuxiliaryData & 1;
-}
-
-static unsigned extractSLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 1) & 1;
-}
-
-static unsigned extractDLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 2) & 1;
-}
-
-static unsigned extractSWZ(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 3) & 1;
-}
-
-static unsigned getBufferStoreOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- const int Size = Ty.getSizeInBits();
- switch (8 * MemSize) {
- case 8:
- return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
- AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
- case 16:
- return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
- AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
- default:
- unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
- AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
- if (Size > 32)
- Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
- return Opc;
- }
-}
-
-static unsigned getBufferStoreFormatOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
- bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
- int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
-
- if (IsD16Packed) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- if (IsD16Unpacked) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
- default:
- return -1;
- }
-
- llvm_unreachable("unhandled buffer store");
-}
-
-// TODO: Move this to combiner
-// Returns base register, imm offset, total constant offset.
-std::tuple<Register, unsigned, unsigned>
-AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
- Register OrigOffset) const {
- const unsigned MaxImm = 4095;
- Register BaseReg;
- unsigned TotalConstOffset;
- MachineInstr *OffsetDef;
-
- std::tie(BaseReg, TotalConstOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset);
-
- unsigned ImmOffset = TotalConstOffset;
-
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.f
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
- unsigned Overflow = ImmOffset & ~MaxImm;
- ImmOffset -= Overflow;
- if ((int32_t)Overflow < 0) {
- Overflow += ImmOffset;
- ImmOffset = 0;
- }
-
- if (Overflow != 0) {
- // In case this is in a waterfall loop, insert offset code at the def point
- // of the offset, not inside the loop.
- MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
- MachineBasicBlock &OldMBB = B.getMBB();
- B.setInstr(*OffsetDef);
-
- if (!BaseReg) {
- BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(BaseReg)
- .addImm(Overflow);
- } else {
- Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(OverflowVal)
- .addImm(Overflow);
-
- Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
- .addReg(BaseReg)
- .addReg(OverflowVal, RegState::Kill)
- .addImm(0);
- BaseReg = NewBaseReg;
- }
-
- B.setInsertPt(OldMBB, OldInsPt);
- }
-
- return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
-}
-
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
@@ -1042,60 +871,6 @@ bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
return true;
}
-bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
- bool IsFormat) const {
- MachineIRBuilder B(MI);
- MachineFunction &MF = B.getMF();
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI->getType(VData);
-
- int Size = Ty.getSizeInBits();
- if (Size % 32 != 0)
- return false;
-
- // FIXME: Verifier should enforce 1 MMO for these intrinsics.
- MachineMemOperand *MMO = *MI.memoperands_begin();
- const int MemSize = MMO->getSize();
-
- Register RSrc = MI.getOperand(2).getReg();
- Register VOffset = MI.getOperand(3).getReg();
- Register SOffset = MI.getOperand(4).getReg();
- unsigned AuxiliaryData = MI.getOperand(5).getImm();
- unsigned ImmOffset;
- unsigned TotalOffset;
-
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
-
- const bool Offen = !isZero(VOffset, *MRI);
-
- int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
- getBufferStoreOpcode(Ty, MemSize, Offen);
- if (Opc == -1)
- return false;
-
- MachineInstrBuilder MIB = B.buildInstr(Opc)
- .addUse(VData);
-
- if (Offen)
- MIB.addUse(VOffset);
-
- MIB.addUse(RSrc)
- .addUse(SOffset)
- .addImm(ImmOffset)
- .addImm(extractGLC(AuxiliaryData))
- .addImm(extractSLC(AuxiliaryData))
- .addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractDLC(AuxiliaryData))
- .addImm(extractSWZ(AuxiliaryData))
- .addMemOperand(MMO);
-
- MI.eraseFromParent();
-
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
switch (MF.getFunction().getCallingConv()) {
case CallingConv::AMDGPU_PS:
@@ -1325,10 +1100,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
switch (IntrinsicID) {
case Intrinsic::amdgcn_end_cf:
return selectEndCfIntrinsic(I);
- case Intrinsic::amdgcn_raw_buffer_store:
- return selectStoreIntrinsic(I, false);
- case Intrinsic::amdgcn_raw_buffer_store_format:
- return selectStoreIntrinsic(I, true);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 3ba1b61cd545..cc4d987979f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -100,11 +100,7 @@ class AMDGPUInstructionSelector : public InstructionSelector {
bool selectInterpP1F16(MachineInstr &MI) const;
bool selectG_INTRINSIC(MachineInstr &I) const;
- std::tuple<Register, unsigned, unsigned>
- splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
-
bool selectEndCfIntrinsic(MachineInstr &MI) const;
- bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ab5c9ee2f585..d03eee69c195 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2402,37 +2402,119 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
-bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- bool IsFormat) const {
- // TODO: Reject f16 format on targets where unsupported.
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI.getType(VData);
+Register AMDGPULegalizerInfo::fixStoreSourceType(
+ MachineIRBuilder &B, Register VData, bool IsFormat) const {
+ MachineRegisterInfo *MRI = B.getMRI();
+ LLT Ty = MRI->getType(VData);
- B.setInstr(MI);
-
- const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
// Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) || Ty == S16) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
- MI.getOperand(1).setReg(AnyExt);
- return true;
+ return AnyExt;
}
if (Ty.isVector()) {
if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
if (IsFormat)
- MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
- return true;
+ return handleD16VData(B, *MRI, VData);
}
+ }
+
+ return VData;
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsTyped,
+ bool IsFormat) const {
+ B.setInstr(MI);
+
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+ LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
+ const LLT S32 = LLT::scalar(32);
+
+ VData = fixStoreSourceType(B, VData, IsFormat);
+ Register RSrc = MI.getOperand(2).getReg();
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
- return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ // The typed intrinsics add an immediate after the registers.
+ const unsigned NumVIndexOps = IsTyped ? 8 : 7;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(3).getReg();
+ OpOffset = 1;
}
- return Ty == S32;
+ Register VOffset = MI.getOperand(3 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(4 + OpOffset).getReg();
+
+ unsigned Format = 0;
+ if (IsTyped) {
+ Format = MI.getOperand(5 + OpOffset).getImm();
+ ++OpOffset;
+ }
+
+ unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ unsigned Opc;
+ if (IsTyped) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
+ } else if (IsFormat) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
+ } else {
+ switch (MemSize) {
+ case 1:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
+ break;
+ default:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
+ break;
+ }
+ }
+
+ if (!VIndex)
+ VIndex = B.buildConstant(S32, 0).getReg(0);
+
+ auto MIB = B.buildInstr(Opc)
+ .addUse(VData) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset); // offset(imm)
+
+ if (IsTyped)
+ MIB.addImm(Format);
+
+ MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+ return true;
}
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
@@ -2688,9 +2770,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
return true;
}
case Intrinsic::amdgcn_raw_buffer_store:
- return legalizeRawBufferStore(MI, MRI, B, false);
+ return legalizeBufferStore(MI, MRI, B, false, false);
case Intrinsic::amdgcn_raw_buffer_store_format:
- return legalizeRawBufferStore(MI, MRI, B, true);
+ return legalizeBufferStore(MI, MRI, B, false, true);
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
return legalizeBufferLoad(MI, MRI, B, false, false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index c889575058b6..f393d9990a11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -114,9 +114,15 @@ class AMDGPULegalizerInfo : public LegalizerInfo {
MachineIRBuilder &B, bool IsFormat) const;
bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsFormat) const;
+ Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
+ bool IsFormat) const;
+
+ bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsTyped,
+ bool IsFormat) const;
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool IsFormat,
- bool IsTyped) const;
+ MachineIRBuilder &B, bool IsTyped,
+ bool IsFormat) const;
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
bool IsInc) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index e09831caf1a0..c7408b45f8a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2251,7 +2251,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
- case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: {
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {1, 4});
return;
@@ -3077,7 +3082,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
- case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: {
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
// rsrc
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index e14433cb0ce6..eeefa40ba38b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1258,7 +1258,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1266,8 +1266,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1275,8 +1275,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1285,9 +1285,9 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
- $vdata,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary),
+ getVregSrcForVT<vt>.ret:$vdata,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1305,6 +1305,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMA
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1312,6 +1313,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3b9d30e1a63b..ccb4e9ef2bfd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2174,6 +2174,32 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
// operand Expects a MachineMemOperand in addition to explicit
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
index ce71a89adacb..b5372ef76a7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4)
+ ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4)
; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
; PACKED: bb.1 (%ir-block.0):
@@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
+ ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16
; PACKED: bb.1 (%ir-block.0):
@@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
+ ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
; PACKED: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
ret void
@@ -378,13 +378,14 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; UNPACKED: %23:vgpr_32, dead %24:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
+ ; UNPACKED: %11:vgpr_32, dead %24:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
- ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096
; PACKED: bb.1 (%ir-block.0):
@@ -397,9 +398,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; PACKED: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; PACKED: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -422,30 +424,31 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; UNPACKED: %48:vgpr_32, dead %49:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
- ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec
+ ; UNPACKED: %13:vgpr_32, dead %49:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec
+ ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec
; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
- ; UNPACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; UNPACKED: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; UNPACKED: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; UNPACKED: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; UNPACKED: bb.2:
; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
- ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+ ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
+ ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
- ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
- ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
- ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
+ ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY11]], implicit $exec
+ ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
+ ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
- ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
+ ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec
; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -468,24 +471,25 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; PACKED: %32:vgpr_32, dead %33:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; PACKED: %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; PACKED: bb.2:
; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
- ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
+ ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+ ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
- ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
- ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
- ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+ ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
+ ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+ ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
- ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec
+ ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec
; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
index aea37fd08b40..94360c0d1868 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
@@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
ret void
@@ -241,9 +241,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -269,24 +270,25 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: %34:vgpr_32, dead %35:sreg_64 = V_ADD_I32_e64 [[COPY8]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: %15:vgpr_32, dead %35:sreg_64_xexec = V_ADD_I32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec
+ ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY11]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index c5aa36df8675..4473d64dfa2a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
ret void
@@ -515,8 +515,9 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0)
ret void
@@ -572,9 +573,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -667,9 +669,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -690,24 +693,25 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: %30:vgpr_32, dead %31:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec
- ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %13, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -734,23 +738,24 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; CHECK: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY6]], implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec
- ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec
; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
- ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4)
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
More information about the llvm-commits
mailing list