[llvm] r284031 - AMDGPU: Initial implementation of VGPR indexing mode
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 12 11:49:06 PDT 2016
Author: arsenm
Date: Wed Oct 12 13:49:05 2016
New Revision: 284031
URL: http://llvm.org/viewvc/llvm-project?rev=284031&view=rev
Log:
AMDGPU: Initial implementation of VGPR indexing mode
This is the most basic handling of the indirect access
pseudos using GPR indexing mode. This currently only enables
the mode for a single v_mov_b32 and then immediately disables it.
This mode is much more complicated to use than the movrel
instructions, so a new optimization pass is probably needed
to fold the accesses into their uses and keep the mode
enabled across them.
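For reference, the two lowerings for a dynamically indexed vector
read look roughly like this (taken loosely from the checks in
indirect-addressing-si.ll below; register numbers are illustrative):

    ; movrel path (default):
    s_mov_b32 m0, s2              ; index goes through m0
    v_movrels_b32_e32 v0, v1      ; reads v[1 + m0]

    ; GPR indexing mode path (-amdgpu-vgpr-index-mode):
    s_set_gpr_idx_on s2, src0     ; enable indexing of VGPR src0 operands
    v_mov_b32_e32 v0, v1          ; effectively reads v[1 + index]
    s_set_gpr_idx_off             ; disable again immediately

The second sequence shows why a follow-up pass would help: each
indexed access currently pays for its own s_set_gpr_idx_on/off pair.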
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=284031&r1=284030&r2=284031&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Oct 12 13:49:05 2016
@@ -37,6 +37,12 @@
using namespace llvm;
+static cl::opt<bool> EnableVGPRIndexMode(
+ "amdgpu-vgpr-index-mode",
+ cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
+ cl::init(false));
+
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -1129,7 +1135,8 @@ static MachineBasicBlock::iterator emitL
unsigned ResultReg,
unsigned PhiReg,
unsigned InitSaveExecReg,
- int Offset) {
+ int Offset,
+ bool UseGPRIdxMode) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1158,14 +1165,31 @@ static MachineBasicBlock::iterator emitL
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
- // Move index from VCC into M0
- if (Offset == 0) {
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(CurrentIdxReg, RegState::Kill);
+ if (UseGPRIdxMode) {
+ unsigned IdxReg;
+ if (Offset == 0) {
+ IdxReg = CurrentIdxReg;
+ } else {
+ IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
+ }
+
+ MachineInstr *SetIdx =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
+ .addReg(IdxReg, RegState::Kill);
+ SetIdx->getOperand(2).setIsUndef(true);
} else {
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(CurrentIdxReg, RegState::Kill)
- .addImm(Offset);
+ // Move index from VCC into M0
+ if (Offset == 0) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(CurrentIdxReg, RegState::Kill);
+ } else {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
+ }
}
// Update EXEC, save the original EXEC value to VCC.
@@ -1200,7 +1224,8 @@ static MachineBasicBlock::iterator loadM
MachineInstr &MI,
unsigned InitResultReg,
unsigned PhiReg,
- int Offset) {
+ int Offset,
+ bool UseGPRIdxMode) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@@ -1239,7 +1264,7 @@ static MachineBasicBlock::iterator loadM
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset);
+ Offset, UseGPRIdxMode);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -1270,7 +1295,9 @@ computeIndirectRegAndOffset(const SIRegi
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
MachineRegisterInfo &MRI,
MachineInstr &MI,
- int Offset) {
+ int Offset,
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
@@ -1283,6 +1310,32 @@ static bool setM0ToIndexFromSGPR(const S
if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
return false;
+ if (UseGPRIdxMode) {
+ unsigned IdxMode = IsIndirectSrc ?
+ VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ if (Offset == 0) {
+ MachineInstr *SetOn =
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addOperand(*Idx)
+ .addImm(IdxMode);
+
+ SetOn->getOperand(3).setIsUndef(AMDGPU::M0);
+ } else {
+ unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
+ .addOperand(*Idx)
+ .addImm(Offset);
+ MachineInstr *SetOn =
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(Tmp, RegState::Kill)
+ .addImm(IdxMode);
+
+ SetOn->getOperand(3).setIsUndef(AMDGPU::M0);
+ }
+
+ return true;
+ }
+
if (Offset == 0) {
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addOperand(*Idx);
@@ -1314,18 +1367,33 @@ static MachineBasicBlock *emitIndirectSr
std::tie(SubReg, Offset)
= computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
- if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset)) {
+ bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+ if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
- .addReg(SrcVec->getReg(), RegState::Implicit);
+ if (UseGPRIdxMode) {
+ // TODO: Look at the uses to avoid the copy. This may require rescheduling
+ // to avoid interfering with other uses, so probably requires a new
+ // optimization pass.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
+ .addReg(SrcVec->getReg(), RegState::Implicit);
+ }
+
MI.eraseFromParent();
return &MBB;
}
+
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
@@ -1334,15 +1402,32 @@ static MachineBasicBlock *emitIndirectSr
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+ if (UseGPRIdxMode) {
+ MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addImm(0) // Reset inside loop.
+ .addImm(VGPRIndexMode::SRC0_ENABLE);
+ SetOn->getOperand(3).setIsUndef(AMDGPU::M0);
+
+
+ // Disable again after the loop.
+ BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ }
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset);
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
- BuildMI(*InsPt->getParent(), InsPt, DL,
- TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
- .addReg(SrcVec->getReg(), RegState::Implicit);
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ } else {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
+ .addReg(SrcVec->getReg(), RegState::Implicit);
+ }
- return InsPt->getParent();
+ return LoopBB;
}
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
@@ -1367,6 +1452,8 @@ static MachineBasicBlock *emitIndirectDs
std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
SrcVec->getReg(),
Offset);
+ bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
if (Idx->getReg() == AMDGPU::NoRegister) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
@@ -1382,23 +1469,36 @@ static MachineBasicBlock *emitIndirectDs
return &MBB;
}
- const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
- if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset)) {
+ if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
- MachineInstr *MovRel =
- BuildMI(MBB, I, DL, MovRelDesc)
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
- .addOperand(*Val)
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(SrcVec->getReg(), RegState::Implicit);
+ if (UseGPRIdxMode) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+ .addOperand(*Val)
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
- const int ImpDefIdx = MovRelDesc.getNumOperands() +
- MovRelDesc.getNumImplicitUses();
- const int ImpUseIdx = ImpDefIdx + 1;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
+
+ MachineInstr *MovRel =
+ BuildMI(MBB, I, DL, MovRelDesc)
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+ .addOperand(*Val)
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(SrcVec->getReg(), RegState::Implicit);
+
+ const int ImpDefIdx = MovRelDesc.getNumOperands() +
+ MovRelDesc.getNumImplicitUses();
+ const int ImpUseIdx = ImpDefIdx + 1;
+
+ MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+ }
- MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
MI.eraseFromParent();
return &MBB;
}
@@ -1407,25 +1507,50 @@ static MachineBasicBlock *emitIndirectDs
MRI.clearKillFlags(Val->getReg());
const DebugLoc &DL = MI.getDebugLoc();
- unsigned PhiReg = MRI.createVirtualRegister(VecRC);
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset);
+ if (UseGPRIdxMode) {
+ MachineBasicBlock::iterator I(&MI);
+
+ MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addImm(0) // Reset inside loop.
+ .addImm(VGPRIndexMode::DST_ENABLE);
+ SetOn->getOperand(3).setIsUndef(AMDGPU::M0);
+
+ // Disable again after the loop.
+ BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ }
+
+ unsigned PhiReg = MRI.createVirtualRegister(VecRC);
- // vdst is not actually read and just provides the base register index.
- MachineInstr *MovRel =
- BuildMI(*InsPt->getParent(), InsPt, DL, MovRelDesc)
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
+ Offset, UseGPRIdxMode);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+ .addReg(PhiReg, RegState::Undef, SubReg) // vdst
+ .addOperand(*Val) // src0
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(PhiReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
+ // vdst is not actually read and just provides the base register index.
+ MachineInstr *MovRel =
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
.addReg(PhiReg, RegState::Undef, SubReg) // vdst
.addOperand(*Val)
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit);
- const int ImpDefIdx = MovRelDesc.getNumOperands() +
- MovRelDesc.getNumImplicitUses();
- const int ImpUseIdx = ImpDefIdx + 1;
+ const int ImpDefIdx = MovRelDesc.getNumOperands() +
+ MovRelDesc.getNumImplicitUses();
+ const int ImpUseIdx = ImpDefIdx + 1;
- MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+ MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+ }
- return InsPt->getParent();
+ return LoopBB;
}
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=284031&r1=284030&r2=284031&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Oct 12 13:49:05 2016
@@ -1645,6 +1645,20 @@ MachineInstr *SIInstrInfo::convertToThre
.addImm(0); // omod
}
+// It's not generally safe to move VALU instructions across these since it will
+// start using the register as a base index rather than directly.
+// XXX - Why isn't hasSideEffects sufficient for these?
+static bool changesVGPRIndexingMode(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_SET_GPR_IDX_ON:
+ case AMDGPU::S_SET_GPR_IDX_MODE:
+ case AMDGPU::S_SET_GPR_IDX_OFF:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
@@ -1654,7 +1668,8 @@ bool SIInstrInfo::isSchedulingBoundary(c
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
- MI.modifiesRegister(AMDGPU::EXEC, &RI);
+ MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+ changesVGPRIndexingMode(MI);
}
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
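The comment in changesVGPRIndexingMode above is the key safety point.
A small illustrative sequence (registers are hypothetical) of what
could go wrong without the scheduling boundary:

    s_set_gpr_idx_on s2, src0   ; VGPR src0 operands are now indexed by s2
    v_mov_b32_e32 v0, v1        ; reads v[1 + s2], not v1
    s_set_gpr_idx_off
    v_mov_b32_e32 v3, v1        ; reads v1 directly

    ; If the scheduler were free to move the second v_mov above the
    ; s_set_gpr_idx_off, it would silently start reading v[1 + s2].

Treating the s_set_gpr_idx_* instructions as scheduling boundaries
sidesteps this without modeling the mode as a register dependency.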
Modified: llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td?rev=284031&r1=284030&r2=284031&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td Wed Oct 12 13:49:05 2016
@@ -527,6 +527,17 @@ defm V_FRACT_F16 : VOP1_Real_vi
defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
+
+// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
+// indexing mode. vdst can't be treated as a def for codegen purposes,
+// and an implicit use and def of the super register should be added.
+def V_MOV_B32_indirect : VPseudoInstSI<(outs),
+ (ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
+ PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
+ getVOPSrc0ForVT<i32>.ret:$src0)> {
+ let VOP1 = 1;
+}
+
let Predicates = [isVI] in {
def : Pat <
Modified: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll?rev=284031&r1=284030&r2=284031&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll Wed Oct 12 13:49:05 2016
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
@@ -10,8 +11,13 @@
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
-; GCN-DAG: s_mov_b32 m0, [[IN]]
-; GCN: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
+
+; MOVREL-DAG: s_mov_b32 m0, [[IN]]
+; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
+
+; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
entry:
%idx = add i32 %in, 1
@@ -22,7 +28,7 @@ entry:
; XXX: Could do v_or_b32 directly
; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
-; GCN: s_mov_b32 m0
+; MOVREL: s_mov_b32 m0
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
; GCN-DAG: s_or_b32
@@ -31,7 +37,12 @@ entry:
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_movrels_b32_e32
+
+; MOVREL: v_movrels_b32_e32
+
+; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
entry:
%idx = add i32 %in, 1
@@ -47,8 +58,13 @@ entry:
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
-; GCN-DAG: s_mov_b32 m0, [[IN]]
-; GCN: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
+
+; MOVREL-DAG: s_mov_b32 m0, [[IN]]
+; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
+
+; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
entry:
%elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
@@ -58,8 +74,13 @@ entry:
; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
-; GCN: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; GCN: v_movrels_b32_e32 v{{[0-9]}}, v0
+; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
+
+; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
entry:
%index = add i32 %offset, -512
@@ -70,8 +91,13 @@ entry:
; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
; The offset depends on the register that holds the first element of the vector.
-; GCN: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; GCN: v_movrels_b32_e32 v{{[0-9]}}, v0
+; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
+
+; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
entry:
%index = add i32 %offset, -512
@@ -85,14 +111,24 @@ entry:
; The offset depends on the register that holds the first element of the vector.
; FIXME: The waitcnt for the argument load can go after the loop
+; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
-; GCN: s_add_i32 m0, [[READLANE]], 0xfffffe0
-; GCN: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
+
+; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
+; MOVREL: s_and_saveexec_b64 vcc, vcc
+; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
+
+; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
+; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
+; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
+
; GCN: s_cbranch_execnz
+; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[RESULT]]
define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
@@ -114,8 +150,8 @@ entry:
; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
; GCN-DAG: buffer_load_dwordx4
-; GCN-DAG: s_mov_b32 m0,
-; GCN: v_movreld_b32
+; MOVREL-DAG: s_mov_b32 m0,
+; MOVREL: v_movreld_b32
define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
entry:
%ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -126,14 +162,15 @@ entry:
; GCN-LABEL: {{^}}insert_w_offset:
; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
-; GCN-DAG: s_mov_b32 m0, [[IN]]
+; MOVREL-DAG: s_mov_b32 m0, [[IN]]
; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
-; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
+
+; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
+; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
entry:
%0 = add i32 %in, 1
@@ -144,8 +181,14 @@ entry:
; GCN-LABEL: {{^}}insert_wo_offset:
; GCN: s_load_dword [[IN:s[0-9]+]]
-; GCN: s_mov_b32 m0, [[IN]]
-; GCN: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
+
+; MOVREL: s_mov_b32 m0, [[IN]]
+; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
+
+; IDXMODE: s_set_gpr_idx_on [[IN]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
+; IDXMODE-NEXT: s_set_gpr_idx_off
+
; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
entry:
@@ -156,8 +199,13 @@ entry:
; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
-; GCN: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; GCN: v_movreld_b32_e32 v0, 5
+; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; MOVREL: v_movreld_b32_e32 v0, 5
+
+; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
entry:
%index = add i32 %offset, -512
@@ -171,8 +219,13 @@ entry:
; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
; The offset depends on the register that holds the first element of the vector.
-; GCN: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; GCN: v_movreld_b32_e32 v0, 5
+; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; MOVREL: v_movreld_b32_e32 v0, 5
+
+; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
+; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
+; IDXMODE-NEXT: s_set_gpr_idx_off
define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
entry:
%index = add i32 %offset, -512
@@ -194,11 +247,21 @@ entry:
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
-; GCN: s_add_i32 m0, [[READLANE]], 0xfffffe00
-; GCN: v_movreld_b32_e32 [[VEC_ELT0]], 5
-; GCN: s_cbranch_execnz [[LOOPBB]]
+; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
+; MOVREL: s_and_saveexec_b64 vcc, vcc
+; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
+
+; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
+; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
+
+; GCN: s_cbranch_execnz [[LOOPBB]]
; GCN: s_mov_b64 exec, [[SAVEEXEC]]
+
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: buffer_store_dword
define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
@@ -217,14 +280,24 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
+; IDXMODE: s_set_gpr_idx_on 0, dst
+
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_waitcnt lgkmcnt(0)
; The offset depends on the register that holds the first element of the vector.
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
-; GCN: s_add_i32 m0, [[READLANE]], -16
-; GCN: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
+
+; MOVREL: s_add_i32 m0, [[READLANE]], -16
+; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
+
+; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
+; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
+; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
+
; GCN: s_cbranch_execnz
+
+; IDXMODE: s_set_gpr_idx_off
define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -247,32 +320,52 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; IDXMODE: s_set_gpr_idx_on 0, src0
+
; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
; GCN: s_waitcnt vmcnt(0)
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_mov_b32 m0, [[READLANE]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-; GCN: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL: s_and_saveexec_b64 vcc, vcc
+; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+
+; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
+; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK]]
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
+
+; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_mov_b32 m0, [[READLANE]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL: s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
+
+; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
+; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
+
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP1]]
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: buffer_store_dword [[MOVREL0]]
; GCN: buffer_store_dword [[MOVREL1]]
define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
@@ -308,25 +401,42 @@ bb2:
; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
+; IDXMODE: s_set_gpr_idx_on 0, dst
+
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_mov_b32 m0, [[READLANE]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL: s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
+; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
+; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
+
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP0]]
; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
+; IDXMODE: s_set_gpr_idx_off
+
+; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: s_mov_b64 [[MASK]], exec
; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
-; GCN: s_mov_b32 m0, [[READLANE]]
-; GCN: s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
+
+; MOVREL: s_mov_b32 m0, [[READLANE]]
+; MOVREL: s_and_saveexec_b64 vcc, vcc
+; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
+
+; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
+; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
+
; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP1]]
@@ -361,14 +471,23 @@ bb2:
; GCN: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
; GCN: buffer_load_dwordx4
-; GCN: s_mov_b32 m0,
-; GCN: v_movrels_b32_e32
+; MOVREL: s_mov_b32 m0,
+; MOVREL: v_movrels_b32_e32
+
+; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0
+; IDXMODE: v_mov_b32_e32
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
; GCN: [[BB4]]:
; GCN: buffer_load_dwordx4
-; GCN: s_mov_b32 m0,
-; GCN: v_movrels_b32_e32
+; MOVREL: s_mov_b32 m0,
+; MOVREL: v_movrels_b32_e32
+
+; IDXMODE: s_set_gpr_idx_on
+; IDXMODE: v_mov_b32_e32
+; IDXMODE: s_set_gpr_idx_off
; GCN: [[ENDBB]]:
; GCN: buffer_store_dword
@@ -400,14 +519,23 @@ bb7:
; GCN: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
; GCN: buffer_load_dwordx4
-; GCN: s_mov_b32 m0,
-; GCN: v_movreld_b32_e32
+; MOVREL: s_mov_b32 m0,
+; MOVREL: v_movreld_b32_e32
+
+; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, dst
+; IDXMODE: v_mov_b32_e32
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
; GCN: [[BB4]]:
; GCN: buffer_load_dwordx4
-; GCN: s_mov_b32 m0,
-; GCN: v_movreld_b32_e32
+; MOVREL: s_mov_b32 m0,
+; MOVREL: v_movreld_b32_e32
+
+; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, dst
+; IDXMODE: v_mov_b32_e32
+; IDXMODE: s_set_gpr_idx_off
; GCN: [[ENDBB]]:
; GCN: buffer_store_dword
@@ -445,17 +573,27 @@ bb7:
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
; GCN-DAG: s_load_dword [[ARG:s[0-9]+]]
-; GCN-DAG: s_add_i32 m0, [[ARG]], -16
-; GCN: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
+; MOVREL-DAG: s_add_i32 m0, [[ARG]], -16
+; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
; GCN-NOT: m0
+; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
+; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
+; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT0]], 4.0
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: v_mov_b32_e32 v[[VEC0_ELT2]], 0x4188cccd
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x4190cccd
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x4198cccd
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a0cccd
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a8cccd
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
-; GCN: v_movreld_b32_e32 v[[VEC0_ELT2]], -4.0
+
+; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT2]], -4.0
+
+; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
+; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT2]], -4.0
+; IDXMODE: s_set_gpr_idx_off
; GCN: s_mov_b32 m0, -1
; GCN: ds_write_b32
@@ -480,8 +618,13 @@ bb:
; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
-; GCN: s_mov_b32 m0, [[IDX]]
-; GCN: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
+; MOVREL: s_mov_b32 m0, [[IDX]]
+; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
+
+; IDXMODE: s_set_gpr_idx_on [[IDX]], src0
+; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: buffer_store_dword [[EXTRACT]]
define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
@@ -495,8 +638,14 @@ entry:
; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
-; GCN: s_add_i32 m0, [[IDX]], 4
-; GCN: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
+; MOVREL: s_add_i32 m0, [[IDX]], 4
+; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
+
+; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 4
+; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
+; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
+; IDXMODE: s_set_gpr_idx_off
+
; GCN: buffer_store_dword [[EXTRACT]]
define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
@@ -514,8 +663,13 @@ entry:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]
-; GCN: s_mov_b32 m0, [[IDX_SHL]]
-; GCN: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
+; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; IDXMODE: s_set_gpr_idx_off
define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
entry:
%ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -530,8 +684,13 @@ entry:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]
-; GCN: s_mov_b32 m0, [[IDX_SHL]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
+; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; IDXMODE: s_set_gpr_idx_off
define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
%idx.shl = shl i32 %idx.in, 2
%idx = or i32 %idx.shl, 1
@@ -547,13 +706,17 @@ define void @insertelement_v4f32_or_inde
; GCN: {{^BB[0-9]+_[0-9]+}}:
; GCN: s_mov_b64 exec,
+; IDXMODE: s_set_gpr_idx_off
; GCN: [[BB2]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
; GCN: buffer_load_dword
; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
-; GCN: v_movreld_b32_e32
+; MOVREL: v_movreld_b32_e32
+
+; IDXMODE: s_set_gpr_idx_idx
+; IDXMODE: v_mov_b32_e32
; GCN: s_cbranch_execnz [[REGLOOP]]
define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb: