[llvm] r347008 - [AMDGPU] Add FixupVectorISel pass; currently supports SREGs in GLOBAL LD/ST
Ron Lieberman via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 15 17:13:34 PST 2018
Author: ronlieb
Date: Thu Nov 15 17:13:34 2018
New Revision: 347008
URL: http://llvm.org/viewvc/llvm-project?rev=347008&view=rev
Log:
[AMDGPU] Add FixupVectorISel pass; currently supports SREGs in GLOBAL LD/ST
Add a pass to fix up various vector ISel issues.
Currently it converts GLOBAL_{LOAD|STORE}_* and GLOBAL_ATOMIC_*
instructions into their _SADDR variants, feeding the sreg into the
saddr field of the new instruction. At the ISA level this turns, for
example, 'global_load_dword v0, v[0:1], off' into
'global_load_dword v0, v[2:3], s[0:1]', moving the uniform base
address out of VGPRs and into SGPRs.
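For orientation, the heart of the new pass is the per-instruction rewrite
below, shown here as a condensed sketch of fixupGlobalSaddr from
SIFixupVectorISel.cpp (names match the patch; operand rebuilding and
special cases are omitted):

  // Look up the _SADDR twin of this opcode in the TableGen-generated table.
  int NewOpcode = AMDGPU::getGlobalSaddrOp(MI.getOpcode()); // -1 if none
  if (NewOpcode < 0)
    continue;
  // Try to split the 64-bit vaddr into an SGPR base and a VGPR index.
  unsigned BaseReg = 0, IndexReg = 0;
  MachineOperand *VAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!findSRegBaseAndIndex(VAddr, BaseReg, IndexReg, MRI, TRI))
    continue; // not (sgpr base + vgpr index); leave the instruction alone
  // Rebuild the memory op with the base feeding the new saddr operand.
  MachineInstr *NewMI =
      BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcode));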
Added:
llvm/trunk/lib/Target/AMDGPU/SIFixupVectorISel.cpp
llvm/trunk/test/CodeGen/AMDGPU/global-load-store-atomics.mir
llvm/trunk/test/CodeGen/AMDGPU/global-saddr.ll
llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll
llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
llvm/trunk/test/CodeGen/AMDGPU/madak.ll
llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll
llvm/trunk/test/CodeGen/AMDGPU/memory_clause.ll
llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Thu Nov 15 17:13:34 2018
@@ -41,6 +41,7 @@ FunctionPass *createSIAnnotateControlFlo
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@@ -122,6 +123,9 @@ extern char &SIFixSGPRCopiesID;
void initializeSIFixVGPRCopiesPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Thu Nov 15 17:13:34 2018
@@ -161,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarg
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
+ initializeSIFixupVectorISelPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
@@ -813,6 +814,7 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
+ addPass(createSIFixupVectorISelPass());
return false;
}
Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Thu Nov 15 17:13:34 2018
@@ -96,6 +96,7 @@ add_llvm_target(AMDGPUCodeGen
SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp
SIFixSGPRCopies.cpp
+ SIFixupVectorISel.cpp
SIFixVGPRCopies.cpp
SIFixWWMLiveness.cpp
SIFoldOperands.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td Thu Nov 15 17:13:34 2018
@@ -121,6 +121,11 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo
let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
}
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+ bit IsSaddr = is_saddr;
+ string SaddrOp = Name;
+}
+
// TODO: Is exec allowed for saddr? The disabled value 0x7f is the
// same encoding value as exec_hi, so it isn't possible to use that if
// saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,19 @@ class FLAT_Store_Pseudo <string opName,
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
let is_flat_global = 1 in {
- def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
- def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>;
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>,
+ GlobalSaddrTable<1, opName>;
}
}
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
let is_flat_global = 1 in {
- def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
- def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>;
+ def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>,
+ GlobalSaddrTable<1, opName>;
}
}
@@ -262,6 +271,7 @@ multiclass FLAT_Atomic_Pseudo<
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
+ GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
}
@@ -272,6 +282,7 @@ multiclass FLAT_Atomic_Pseudo<
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>;
}
@@ -287,6 +298,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
+ GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
let PseudoInstr = NAME;
@@ -296,6 +308,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
+ GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
let enabled_saddr = 1;
@@ -317,6 +330,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN
" $vdst, $vaddr, $vdata, off$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
}
@@ -325,6 +339,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN
(outs vdst_rc:$vdst),
(ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+ GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
let enabled_saddr = 1;
Added: llvm/trunk/lib/Target/AMDGPU/SIFixupVectorISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFixupVectorISel.cpp?rev=347008&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFixupVectorISel.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/SIFixupVectorISel.cpp Thu Nov 15 17:13:34 2018
@@ -0,0 +1,224 @@
+//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// The SIFixupVectorISel pass cleans up post-ISel vector issues.
+/// Currently it converts GLOBAL_{LOAD|STORE}_*
+/// and GLOBAL_ATOMIC_* instructions into their _SADDR variants,
+/// feeding the sreg into the saddr field of the new instruction.
+/// It handles a REG_SEQUENCE feeding the vaddr
+/// and decomposes it into a base and an index.
+///
+/// Transform:
+/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
+/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
+/// %24:vgpr_32, %19:sreg_64_xexec
+/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
+/// %11:vreg_64 = COPY %16:vreg_64
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
+/// Into:
+/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
+/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFixupVectorISel() : MachineFunctionPass(ID) {
+ initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+ "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+ return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+ unsigned &BaseReg,
+ unsigned &IndexReg,
+ MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI) {
+ SmallVector<MachineOperand *, 8> Worklist;
+ Worklist.push_back(Op);
+ while (!Worklist.empty()) {
+ MachineOperand *WOp = Worklist.pop_back_val();
+ if (!WOp->isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+ continue;
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+ switch (DefInst->getOpcode()) {
+ default:
+ continue;
+ case AMDGPU::COPY:
+ Worklist.push_back(&DefInst->getOperand(1));
+ break;
+ case AMDGPU::REG_SEQUENCE:
+ if (DefInst->getNumOperands() != 5)
+ continue;
+ Worklist.push_back(&DefInst->getOperand(1));
+ Worklist.push_back(&DefInst->getOperand(3));
+ break;
+ case AMDGPU::V_ADD_I32_e64:
+ // The V_ADD_* and its analogous V_ADDC_* are generated by
+ // a previous pass which lowered from an ADD_64_PSEUDO,
+ // which generates subregs to break up the 64-bit args.
+ if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+ continue;
+ BaseReg = DefInst->getOperand(2).getReg();
+ if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+ continue;
+ IndexReg = DefInst->getOperand(3).getReg();
+ // Chase the IndexReg.
+ MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+ if (!MI || !MI->isCopy())
+ continue;
+ // Make sure the register class is 64-bit for the Index.
+ // If the Index register is a subreg, we want it to reference
+ // a 64-bit register which we will use as the Index reg.
+ const TargetRegisterClass *IdxRC, *BaseRC;
+ IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
+ if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
+ continue;
+ IndexReg = MI->getOperand(1).getReg();
+ // Chase the BaseReg.
+ MI = MRI.getUniqueVRegDef(BaseReg);
+ if (!MI || !MI->isCopy())
+ continue;
+ // Make sure the register class is 64-bit for the Base.
+ BaseReg = MI->getOperand(1).getReg();
+ BaseRC = MRI.getRegClass(BaseReg);
+ if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
+ continue;
+ // Make sure the Base is an SGPR and the Index is a VGPR.
+ if (!TRI->isSGPRReg(MRI, BaseReg))
+ return false;
+ if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+ return false;
+ // Clear any kill flags on the Index and Base regs; they are used later.
+ MRI.clearKillFlags(IndexReg);
+ MRI.clearKillFlags(BaseReg);
+ return true;
+ }
+ }
+ return false;
+}
+
+// Identify global LOAD/STORE/ATOMIC instructions and try to convert them to _SADDR.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ MachineRegisterInfo &MRI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI) {
+ bool FuncModified = false;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+ int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+ if (NewOpcd < 0)
+ continue;
+ // Update our statistics on opportunities seen.
+ ++NumSGPRGlobalOccurs;
+ LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+ // Need a Base and an Index or we can't transform to _SADDR.
+ unsigned BaseReg = 0;
+ unsigned IndexReg = 0;
+ MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+ continue;
+ ++NumSGPRGlobalSaddrs;
+ FuncModified = true;
+ // Create the new _SADDR Memory instruction.
+ bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+ MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ MachineInstr *NewGlob = nullptr;
+ NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+ if (HasVdst)
+ NewGlob->addOperand(MF, MI.getOperand(0));
+ NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+ if (VData)
+ NewGlob->addOperand(MF, *VData);
+ NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+ NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+ MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+ // Atomics don't have a GLC operand, so omit the field if it is not there.
+ if (Glc)
+ NewGlob->addOperand(MF, *Glc);
+ NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+ // _D16 variants have a vdst_in operand; copy it in.
+ MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+ AMDGPU::OpName::vdst_in);
+ if (VDstInOp)
+ NewGlob->addOperand(MF, *VDstInOp);
+ NewGlob->copyImplicitOps(MF, MI);
+ NewGlob->cloneMemRefs(MF, MI);
+ // Remove the old Global Memop instruction.
+ MI.eraseFromParent();
+ LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+ }
+ return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool FuncModified = false;
+ for (MachineBasicBlock &MBB : MF) {
+ // Clean up missed saddr opportunities from ISel.
+ FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
+ }
+ return FuncModified;
+}
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Thu Nov 15 17:13:34 2018
@@ -954,6 +954,9 @@ namespace AMDGPU {
LLVM_READONLY
int getSOPKOp(uint16_t Opcode);
+ LLVM_READONLY
+ int getGlobalSaddrOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Thu Nov 15 17:13:34 2018
@@ -2017,6 +2017,15 @@ def getAtomicNoRetOp : InstrMapping {
let ValueCols = [["0"]];
}
+// Maps a GLOBAL to its SADDR form.
+def getGlobalSaddrOp : InstrMapping {
+ let FilterClass = "GlobalSaddrTable";
+ let RowFields = ["SaddrOp"];
+ let ColFields = ["IsSaddr"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
Modified: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp Thu Nov 15 17:13:34 2018
@@ -755,6 +755,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VS_64RegClassID:
case AMDGPU::SReg_64RegClassID:
case AMDGPU::VReg_64RegClassID:
+ case AMDGPU::SReg_64_XEXECRegClassID:
return 64;
case AMDGPU::VReg_96RegClassID:
return 96;
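This new case matters because the pass validates the base and index with
AMDGPU::getRegBitWidth, and the SGPR base it discovers is typically defined
into an sreg_64_xexec register (for example by S_LOAD_DWORDX2_IMM in the MIR
test below), a class this switch did not previously cover. A minimal
illustration of the assumed behavior:

  //   %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
  assert(AMDGPU::getRegBitWidth(AMDGPU::SReg_64_XEXECRegClassID) == 64);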
Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll Thu Nov 15 17:13:34 2018
@@ -31,8 +31,8 @@ define amdgpu_kernel void @simple_write2
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
@@ -177,8 +177,8 @@ define amdgpu_kernel void @simple_write2
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
@@ -362,8 +362,8 @@ define amdgpu_kernel void @misaligned_si
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll Thu Nov 15 17:13:34 2018
@@ -30,8 +30,8 @@ define amdgpu_kernel void @simple_write2
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
@@ -59,8 +59,8 @@ define amdgpu_kernel void @simple_write2
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
@@ -87,8 +87,8 @@ define amdgpu_kernel void @simple_write2
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
Added: llvm/trunk/test/CodeGen/AMDGPU/global-load-store-atomics.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/global-load-store-atomics.mir?rev=347008&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/global-load-store-atomics.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/global-load-store-atomics.mir Thu Nov 15 17:13:34 2018
@@ -0,0 +1,249 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+# Coverage tests for converting GLOBAL_* instructions to their _SADDR equivalents.
+
+# GCN-LABEL: name: global_load_store_atomics
+# GCN: GLOBAL_LOAD_DWORD_SADDR
+# GCN: GLOBAL_STORE_DWORD_SADDR
+# GCN: GLOBAL_LOAD_DWORDX2_SADDR
+# GCN: GLOBAL_STORE_DWORDX2_SADDR
+# GCN: GLOBAL_LOAD_DWORDX3_SADDR
+# GCN: GLOBAL_STORE_DWORDX3_SADDR
+# GCN: GLOBAL_LOAD_DWORDX4_SADDR
+# GCN: GLOBAL_STORE_DWORDX4_SADDR
+# GCN: GLOBAL_LOAD_SSHORT_SADDR
+# GCN: GLOBAL_STORE_SHORT_SADDR
+# GCN: GLOBAL_LOAD_USHORT_SADDR
+# GCN: GLOBAL_STORE_SHORT_SADDR
+# GCN: GLOBAL_LOAD_UBYTE_SADDR
+# GCN: GLOBAL_STORE_BYTE_SADDR
+# GCN: GLOBAL_LOAD_SBYTE_SADDR
+# GCN: GLOBAL_STORE_BYTE_SADDR
+# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR
+# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR
+# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR
+# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR
+# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
+# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR
+# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR
+# GCN: GLOBAL_LOAD_SHORT_D16_SADDR
+# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR
+
+# GCN: GLOBAL_ATOMIC_XOR_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_XOR_SADDR %
+# GCN: GLOBAL_ATOMIC_SMIN_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SMIN_SADDR %
+# GCN: GLOBAL_ATOMIC_AND_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_AND_SADDR %
+# GCN: GLOBAL_ATOMIC_SWAP_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SWAP_SADDR %
+# GCN: GLOBAL_ATOMIC_SMAX_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SMAX_SADDR %
+# GCN: GLOBAL_ATOMIC_UMIN_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_UMIN_SADDR %
+# GCN: GLOBAL_ATOMIC_UMAX_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_UMAX_SADDR %
+# GCN: GLOBAL_ATOMIC_OR_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_OR_SADDR %
+# GCN: GLOBAL_ATOMIC_ADD_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_ADD_SADDR %
+# GCN: GLOBAL_ATOMIC_SUB_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SUB_SADDR %
+# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR %
+# GCN: GLOBAL_ATOMIC_INC_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_INC_SADDR %
+# GCN: GLOBAL_ATOMIC_DEC_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_DEC_SADDR %
+
+# GCN: GLOBAL_ATOMIC_OR_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_OR_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_AND_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_AND_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_INC_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_INC_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR %
+# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR_RTN
+# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR %
+
+name: global_load_store_atomics
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1
+
+ %1:sgpr_64 = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0 :: (dereferenceable invariant load 8 )
+ %5:sreg_32_xm0 = S_MOV_B32 2
+ %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec
+ %7:sreg_32_xm0 = S_MOV_B32 0
+ %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1
+ %21:sgpr_32 = COPY %4.sub0
+ %22:vgpr_32 = COPY %14.sub0
+ %23:sgpr_32 = COPY %4.sub1
+ %24:vgpr_32 = COPY %14.sub1
+ %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21, %22, implicit $exec
+ %25:vgpr_32 = COPY %23
+ %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, implicit $exec
+ %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1
+ %11:vreg_64 = COPY %16
+
+ %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, implicit $exec :: (load 4)
+ GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+ GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+ GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+ GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+ GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+ GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4)
+ GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+
+ %53:vgpr_32 = GLOBAL_ATOMIC_XOR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %53, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_XOR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %54:vgpr_32 = GLOBAL_ATOMIC_SMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %54, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %55:vgpr_32 = GLOBAL_ATOMIC_AND_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %55, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_AND %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %56:vgpr_32 = GLOBAL_ATOMIC_SWAP_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %56, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SWAP %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %57:vgpr_32 = GLOBAL_ATOMIC_SMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %57, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %58:vgpr_32 = GLOBAL_ATOMIC_UMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %58, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_UMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %59:vgpr_32 = GLOBAL_ATOMIC_UMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %59, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_UMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %60:vgpr_32 = GLOBAL_ATOMIC_OR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %60, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_OR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %61:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %61, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_ADD %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %62:vgpr_32 = GLOBAL_ATOMIC_SUB_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %62, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SUB %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %63:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %63, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_CMPSWAP %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %64:vgpr_32 = GLOBAL_ATOMIC_INC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %64, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_INC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %65:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORD %11, %65, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_DEC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %66:vreg_64 = GLOBAL_ATOMIC_OR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %66, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_OR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %67:vreg_64 = GLOBAL_ATOMIC_XOR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %67, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_XOR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %68:vreg_64 = GLOBAL_ATOMIC_AND_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %68, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_AND_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %69:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %69, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_ADD_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %70:vreg_64 = GLOBAL_ATOMIC_SUB_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %70, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SUB_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %71:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %71, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_DEC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %72:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %72, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_INC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %73:vreg_64 = GLOBAL_ATOMIC_SMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %73, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %74:vreg_64 = GLOBAL_ATOMIC_SWAP_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %74, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SWAP_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %75:vreg_64 = GLOBAL_ATOMIC_SMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %75, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_SMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %76:vreg_64 = GLOBAL_ATOMIC_UMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %76, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_UMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %77:vreg_64 = GLOBAL_ATOMIC_UMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %77, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_UMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ %79:sreg_128 = REG_SEQUENCE %4, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
+ %80:vreg_128 = COPY %79
+
+ %78:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+ GLOBAL_STORE_DWORDX2 %11, %78, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+ GLOBAL_ATOMIC_CMPSWAP_X2 %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
+
+ S_ENDPGM
+...
Added: llvm/trunk/test/CodeGen/AMDGPU/global-saddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/global-saddr.ll?rev=347008&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/global-saddr.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/global-saddr.ll Thu Nov 15 17:13:34 2018
@@ -0,0 +1,102 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+
+; Test for a conv2d-like sequence of loads.
+
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
+; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
+
+define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %idx = zext i32 %id to i64
+ %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx
+ %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1
+ %load0 = load i64, i64 addrspace(1)* %ptr0
+ %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2
+ %load1 = load i64, i64 addrspace(1)* %ptr1
+ %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3
+ %load2 = load i64, i64 addrspace(1)* %ptr2
+ %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4
+ %load3 = load i64, i64 addrspace(1)* %ptr3
+ %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4
+ %load4 = load i64, i64 addrspace(1)* %ptr4
+ %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3
+ %load5 = load i64, i64 addrspace(1)* %ptr5
+ %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2
+ %load6 = load i64, i64 addrspace(1)* %ptr6
+ %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1
+ %load7 = load i64, i64 addrspace(1)* %ptr7
+ %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0
+ %load8 = load i64, i64 addrspace(1)* %ptr8
+ %add0 = add i64 %load1, %load0
+ %add1 = add i64 %load3, %load2
+ %add2 = add i64 %load5, %load4
+ %add3 = add i64 %load7, %load6
+ %add4 = add i64 %add0, %load8
+ %add5 = add i64 %add2, %add1
+ %add6 = add i64 %add4, %add3
+ %add7 = add i64 %add6, %add5
+ %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx
+ %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1
+ store volatile i64 %add7, i64 addrspace(1)* %ptr9
+
+; Test various immediate offset boundaries. The global _SADDR forms take a
+; signed 13-bit offset (offset_s13), so accesses outside that range stay in
+; the plain 'off' form.
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
+ %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
+ %load11 = load i64, i64 addrspace(1)* %gep11
+ %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
+ %load12 = load i64, i64 addrspace(1)* %gep12
+ %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
+ %load13 = load i64, i64 addrspace(1)* %gep13
+ %add11 = add i64 %load11, %load12
+ %add12 = add i64 %add11, %load13
+ store volatile i64 %add12, i64 addrspace(1)* undef
+
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
+ %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024
+ %load21 = load i64, i64 addrspace(1)* %gep21
+ %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048
+ %load22 = load i64, i64 addrspace(1)* %gep22
+ %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512
+ %load23 = load i64, i64 addrspace(1)* %gep23
+ %add21 = add i64 %load22, %load21
+ %add22 = add i64 %add21, %load23
+ store volatile i64 %add22, i64 addrspace(1)* undef
+
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
+ %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257
+ %load31 = load i64, i64 addrspace(1)* %gep31
+ %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256
+ %load32 = load i64, i64 addrspace(1)* %gep32
+ %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
+ %load33 = load i64, i64 addrspace(1)* %gep33
+ %add34 = add i64 %load32, %load31
+ %add35 = add i64 %add34, %load33
+ store volatile i64 %add35, i64 addrspace(1)* undef
+ ret void
+}
+
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NOT: global_load_dword
+
+define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
+bb:
+ %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
+ %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
+ store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind readnone speculatable }
Added: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll?rev=347008&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll Thu Nov 15 17:13:34 2018
@@ -0,0 +1,85 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
+
+; Tests for indirect addressing on SI, which is implemented using dynamic
+; indexing of vectors.
+
+; Subtest below was moved from test/CodeGen/AMDGPU/indirect-addressing-si.ll
+; to avoid gfx9 scheduling-induced issues.
+
+
+; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
+; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
+
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:3]], s[[S_ELT0]]
+
+; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
+
+; GCN: s_mov_b64 [[MASK]], exec
+
+; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP1]]
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
+
+; GCN: buffer_store_dword [[INS0]]
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+ %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
+ %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+ store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 %live.out.val, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
Added: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll?rev=347008&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll Thu Nov 15 17:13:34 2018
@@ -0,0 +1,88 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
+
+; Tests for indirect addressing on SI, which is implemented using dynamic
+; indexing of vectors.
+
+; Subtest below was moved from test/CodeGen/AMDGPU/indirect-addressing-si.ll
+; to avoid gfx9 scheduling-induced issues.
+
+
+; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
+; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
+
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
+
+; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
+
+; GCN: s_mov_b64 [[MASK]], exec
+
+; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
+; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
+
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
+; IDXMODE: s_set_gpr_idx_off
+
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN: s_cbranch_execnz [[LOOP1]]
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
+
+; GCN: buffer_store_dword [[INS0]]
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+ %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
+ %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+ store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 %live.out.val, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
Modified: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll Thu Nov 15 17:13:34 2018
@@ -392,76 +392,9 @@ bb2:
ret void
}
-; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
-; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-
-; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
-
-; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN-NEXT: s_xor_b64 exec, exec, vcc
-; GCN: s_cbranch_execnz [[LOOP0]]
-
-; FIXME: Redundant copy
-; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
-
-; GCN: s_mov_b64 [[MASK]], exec
-
-; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
-; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-
-; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
-
-; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN-NEXT: s_xor_b64 exec, exec, vcc
-; GCN: s_cbranch_execnz [[LOOP1]]
-
-; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
-
-; GCN: buffer_store_dword [[INS0]]
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
- %id.ext = zext i32 %id to i64
- %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
- %idx0 = load volatile i32, i32 addrspace(1)* %gep
- %idx1 = add i32 %idx0, 1
- %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
- %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
- %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
- store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
- %cmp = icmp eq i32 %id, 0
- br i1 %cmp, label %bb1, label %bb2
-
-bb1:
- store volatile i32 %live.out.val, i32 addrspace(1)* undef
- br label %bb2
-
-bb2:
- ret void
-}
+; Moved the insert_vgpr_offset_multiple_in_block subtest to separate files to
+; avoid very different gfx9 schedule-induced issues:
+; test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
+; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
; GCN-LABEL: {{^}}insert_adjacent_blocks:
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll Thu Nov 15 17:13:34 2018
@@ -444,40 +444,12 @@ define amdgpu_kernel void @v_inserteleme
ret void
}
-; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
-; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
-
-; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
-; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-
-; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
-; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
-
-; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
-; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
-
-; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
- %tid.ext = sext i32 %tid to i64
- %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
- %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
- %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
- %idx = load i32, i32 addrspace(1)* %idx.gep
- %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
- %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
- store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
- ret void
-}
-
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
-; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
-; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
+; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
Added: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll?rev=347008&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll Thu Nov 15 17:13:34 2018
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+
+; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
+; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %idx = load i32, i32 addrspace(1)* %idx.gep
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Added: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll?rev=347008&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll Thu Nov 15 17:13:34 2018
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+
+; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+
+; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
+; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
+
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %idx = load i32, i32 addrspace(1)* %idx.gep
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
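
The two subtest files above differ only in their RUN lines and check
ordering: the nosaddr copy runs on fiji and hawaii, the saddr copy on
gfx900. A rough sketch of the addressing difference they pin down
(register numbers are assumed here, for illustration only):

    ; pre-gfx9: the full 64-bit address lives in a VGPR pair
    flat_load_dword   v2, v[0:1]
    ; gfx9: the uniform base can stay in an SGPR pair, so the checks
    ; match a trailing s[...:...] operand instead of "off"
    global_load_dword v2, v[0:1], s[0:1]
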
Modified: llvm/trunk/test/CodeGen/AMDGPU/madak.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/madak.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/madak.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/madak.ll Thu Nov 15 17:13:34 2018
@@ -8,8 +8,10 @@ declare float @llvm.fabs.f32(float) noun
; GCN-LABEL: {{^}}madak_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -88,8 +90,10 @@ define amdgpu_kernel void @madak_m_inlin
; GCN-LABEL: {{^}}madak_inline_imm_f32:
; GFX6: buffer_load_dword [[VA:v[0-9]+]]
; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
Modified: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll Thu Nov 15 17:13:34 2018
@@ -319,7 +319,7 @@ entry:
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
+; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_global_1(
i32 addrspace(1)* %in, i32* %out) {
entry:
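
The nontemporal_global_1 check now expects an SGPR pair as the final
address operand of the global_load. A minimal IR sketch of the pattern
that produces this form, modeled on the tests above (the function name
is hypothetical and not part of this patch): a uniform pointer argument
combined with a per-lane index.

    ; %base arrives as a kernel argument, i.e. uniform across the wave;
    ; only the workitem-id contribution to the address is per-lane, so
    ; on gfx9 the load can keep the base in s[...:...] rather than "off"
    define amdgpu_kernel void @uniform_base_load(i32 addrspace(1)* %base,
                                                 i32 addrspace(1)* %out) {
      %tid = call i32 @llvm.amdgcn.workitem.id.x()
      %tid.ext = zext i32 %tid to i64
      %gep = getelementptr inbounds i32, i32 addrspace(1)* %base, i64 %tid.ext
      %val = load i32, i32 addrspace(1)* %gep
      store i32 %val, i32 addrspace(1)* %out
      ret void
    }
    declare i32 @llvm.amdgcn.workitem.id.x()
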
Modified: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-store.ll Thu Nov 15 17:13:34 2018
@@ -240,7 +240,7 @@ entry:
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
+; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_global_1(
i32* %in, i32 addrspace(1)* %out) {
entry:
Modified: llvm/trunk/test/CodeGen/AMDGPU/memory_clause.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory_clause.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory_clause.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory_clause.ll Thu Nov 15 17:13:34 2018
@@ -105,7 +105,7 @@ bb:
}
; GCN-LABEL: {{^}}vector_clause_indirect:
-; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], off
+; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll?rev=347008&r1=347007&r2=347008&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll Thu Nov 15 17:13:34 2018
@@ -265,16 +265,15 @@ define amdgpu_kernel void @reorder_globa
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20
-
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52
define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
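
For reference, the immediate offsets in the reordered GFX9 checks are
byte offsets; for i32 elements (4 bytes each) they would correspond to
GEP indices as follows (indices assumed for illustration, since the full
IR body is not quoted in this hunk):

    offset:12 -> index 3    offset:28 -> index 7    offset:44 -> index 11
    offset:20 -> index 5    offset:36 -> index 9    offset:52 -> index 13
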