[llvm] c3075e6 - AMDGPU/GlobalISel: Select buffer atomics
Matt Arsenault via llvm-commits
llvm-commits@lists.llvm.org
Mon Jan 27 12:23:00 PST 2020
Author: Matt Arsenault
Date: 2020-01-27T15:16:44-05:00
New Revision: c3075e6171b428ddec8b0ff04c45be6d0455bd54
URL: https://github.com/llvm/llvm-project/commit/c3075e6171b428ddec8b0ff04c45be6d0455bd54
DIFF: https://github.com/llvm/llvm-project/commit/c3075e6171b428ddec8b0ff04c45be6d0455bd54.diff
LOG: AMDGPU/GlobalISel: Select buffer atomics
The cmpswap handling is incomplete and fails to select.
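For illustration, a minimal IR sketch of the kind of raw buffer atomic call that now selects through GlobalISel. It mirrors the added tests (function name is illustrative); with the tests' run line (-mcpu=fiji) this selects to BUFFER_ATOMIC_ADD_OFFEN_RTN:

define amdgpu_ps float @example_raw_buffer_atomic_add(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; Atomic add on a buffer resource; returns the pre-op value. Last operand is cachepolicy = 0.
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %cast = bitcast i32 %old to float
  ret float %cast
}

declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32 immarg)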
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/BUFInstructions.td
llvm/lib/Target/AMDGPU/SIInstructions.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 3d793d6660fe..ce65717f2b29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -153,6 +153,20 @@ def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB, SIbuffer_atomic_sub>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMIN, SIbuffer_atomic_smin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMIN, SIbuffer_atomic_umin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMAX, SIbuffer_atomic_smax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMAX, SIbuffer_atomic_umax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_AND, SIbuffer_atomic_and>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
+
class GISelSop2Pat <
SDPatternOperator node,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 678f69873aec..5ea35bd0da11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2654,6 +2654,114 @@ bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
return true;
}
+static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
+ MachineIRBuilder &B,
+ Intrinsic::ID IID) const {
+ B.setInstr(MI);
+
+ const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
+ IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register VData = MI.getOperand(2).getReg();
+
+ Register CmpVal;
+ int OpOffset = 0;
+
+ if (IsCmpSwap) {
+ CmpVal = MI.getOperand(3 + OpOffset).getReg();
+ ++OpOffset;
+ }
+
+ Register RSrc = MI.getOperand(3 + OpOffset).getReg();
+ const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(4).getReg();
+ ++OpOffset;
+ }
+
+ Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(5 + OpOffset).getReg();
+ unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
+
+ if (!VIndex)
+ VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+
+ auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
+ .addDef(Dst)
+ .addUse(VData); // vdata
+
+ if (IsCmpSwap)
+ MIB.addReg(CmpVal);
+
+ MIB.addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset) // offset(imm)
+ .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
// FIXME: Needs observer like custom
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
@@ -2787,6 +2895,33 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_struct_tbuffer_load:
return legalizeBufferLoad(MI, MRI, B, true, true);
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_atomic_inc:
return legalizeAtomicIncDec(MI, B, true);
case Intrinsic::amdgcn_atomic_dec:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index f393d9990a11..7a610a812b7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -123,6 +123,8 @@ class AMDGPULegalizerInfo : public LegalizerInfo {
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsTyped,
bool IsFormat) const;
+ bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
+ Intrinsic::ID IID) const;
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
bool IsInc) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5ee80d93c319..476ab798d004 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2263,6 +2263,27 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(MI, MRI, {1, 4});
return;
}
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 5});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {3, 6});
+ return;
+ }
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_s_buffer_load: {
@@ -3095,6 +3116,40 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// initialized.
break;
}
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ // vdata_out
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // vdata_in
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
default:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 38d5aa4dbe84..42d96b71a31f 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1342,37 +1342,37 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
+ getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, timm)),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
+ i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm)),
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
+ i32:$soffset, timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
- $vdata_in,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+ getVregSrcForVT<vt>.ret:$vdata_in,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_slc $cachepolicy))
>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ccb4e9ef2bfd..d6f69faa4f38 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2216,3 +2216,36 @@ let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
}
+
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+
+def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
+ type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
new file mode 100644
index 000000000000..38b6b55b39d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+
+; Natural mapping
+define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+define amdgpu_ps float @raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; CHECK: [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub0
+ ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub1
+ ; CHECK: $vgpr0 = COPY [[COPY8]]
+ ; CHECK: $vgpr1 = COPY [[COPY9]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i64 %ret to <2 x float>
+ ret <2 x float> %cast
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; CHECK: [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: S_ENDPGM 0
+ %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+; All operands need regbank legalization
+define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
+ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+ ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+ ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; CHECK: bb.3:
+ ; CHECK: successors: %bb.4(0x80000000)
+ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; CHECK: bb.4:
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+; All operands need regbank legalization
+define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec
+ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+ ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+ ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; CHECK: bb.3:
+ ; CHECK: successors: %bb.4(0x80000000)
+ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; CHECK: bb.4:
+ ; CHECK: S_ENDPGM 0
+ %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %voffset = add i32 %voffset.base, 4095
+ %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+; Natural mapping + slc
+define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0
+declare i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
new file mode 100644
index 000000000000..26792cd0d135
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+
+; Natural mapping
+define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub0
+ ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub1
+ ; CHECK: $vgpr0 = COPY [[COPY9]]
+ ; CHECK: $vgpr1 = COPY [[COPY10]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i64 %ret to <2 x float>
+ ret <2 x float> %cast
+}
+
+define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: S_ENDPGM 0
+ %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+; All register operands need legalization
+define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec
+ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+ ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+ ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; CHECK: bb.3:
+ ; CHECK: successors: %bb.4(0x80000000)
+ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; CHECK: bb.4:
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+; All register operands need legalization
+define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i32 inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec
+ ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec
+ ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec
+ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
+ ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
+ ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; CHECK: bb.3:
+ ; CHECK: successors: %bb.4(0x80000000)
+ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; CHECK: bb.4:
+ ; CHECK: S_ENDPGM 0
+ %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+; Natural mapping + slc
+define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+ ; CHECK-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; CHECK: [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
+ ; CHECK: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+ %cast = bitcast i32 %ret to float
+ ret float %cast
+}
+
+declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg) #0
+declare i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i32, i32 immarg) #0
+
+attributes #0 = { nounwind }