[llvm] r325355 - AMDGPU/SI: Turn off GPR Indexing Mode immediately after the interested instruction.

Changpeng Fang via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 16 08:31:31 PST 2018


Author: chfang
Date: Fri Feb 16 08:31:30 2018
New Revision: 325355

URL: http://llvm.org/viewvc/llvm-project?rev=325355&view=rev
Log:
AMDGPU/SI: Turn off GPR Indexing Mode immediately after the interested instruction.

Summary:
  In the current implementation of GPR Indexing Mode, when the index is non-uniform, the s_set_gpr_idx_off instruction
is incorrectly inserted after the loop. This causes instructions with vgpr operands (v_readfirstlane, for example) to read an
incorrect vgpr.
 In this patch, we fix the issue by inserting s_set_gpr_idx_on/off immediately around the instruction of interest.

Reviewers:
  rampitec

Differential Revision:
  https://reviews.llvm.org/D43297

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=325355&r1=325354&r2=325355&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Feb 16 08:31:30 2018
@@ -2744,7 +2744,8 @@ static MachineBasicBlock::iterator emitL
   unsigned PhiReg,
   unsigned InitSaveExecReg,
   int Offset,
-  bool UseGPRIdxMode) {
+  bool UseGPRIdxMode,
+  bool IsIndirectSrc) {
   MachineBasicBlock::iterator I = LoopBB.begin();
 
   unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -2773,6 +2774,12 @@ static MachineBasicBlock::iterator emitL
     .addReg(CurrentIdxReg)
     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
 
+  // Update EXEC, save the original EXEC value to VCC.
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+    .addReg(CondReg, RegState::Kill);
+
+  MRI.setSimpleHint(NewExec, CondReg);
+
   if (UseGPRIdxMode) {
     unsigned IdxReg;
     if (Offset == 0) {
@@ -2783,11 +2790,13 @@ static MachineBasicBlock::iterator emitL
         .addReg(CurrentIdxReg, RegState::Kill)
         .addImm(Offset);
     }
-
-    MachineInstr *SetIdx =
-      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
-      .addReg(IdxReg, RegState::Kill);
-    SetIdx->getOperand(2).setIsUndef();
+    unsigned IdxMode = IsIndirectSrc ?
+      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+    MachineInstr *SetOn =
+      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+      .addReg(IdxReg, RegState::Kill)
+      .addImm(IdxMode);
+    SetOn->getOperand(3).setIsUndef();
   } else {
     // Move index from VCC into M0
     if (Offset == 0) {
@@ -2800,12 +2809,6 @@ static MachineBasicBlock::iterator emitL
     }
   }
 
-  // Update EXEC, save the original EXEC value to VCC.
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
-    .addReg(CondReg, RegState::Kill);
-
-  MRI.setSimpleHint(NewExec, CondReg);
-
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
   MachineInstr *InsertPt =
     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@@ -2833,7 +2836,8 @@ static MachineBasicBlock::iterator loadM
                                                   unsigned InitResultReg,
                                                   unsigned PhiReg,
                                                   int Offset,
-                                                  bool UseGPRIdxMode) {
+                                                  bool UseGPRIdxMode,
+                                                  bool IsIndirectSrc) {
   MachineFunction *MF = MBB.getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -2872,7 +2876,7 @@ static MachineBasicBlock::iterator loadM
 
   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                       InitResultReg, DstReg, PhiReg, TmpExec,
-                                      Offset, UseGPRIdxMode);
+                                      Offset, UseGPRIdxMode, IsIndirectSrc);
 
   MachineBasicBlock::iterator First = RemainderBB->begin();
   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -3007,17 +3011,8 @@ static MachineBasicBlock *emitIndirectSr
 
   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
 
-  if (UseGPRIdxMode) {
-    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
-      .addImm(0) // Reset inside loop.
-      .addImm(VGPRIndexMode::SRC0_ENABLE);
-    SetOn->getOperand(3).setIsUndef();
-
-    // Disable again after the loop.
-    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
-  }
-
-  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
+                              Offset, UseGPRIdxMode, true);
   MachineBasicBlock *LoopBB = InsPt->getParent();
 
   if (UseGPRIdxMode) {
@@ -3025,6 +3020,7 @@ static MachineBasicBlock *emitIndirectSr
       .addReg(SrcReg, RegState::Undef, SubReg)
       .addReg(SrcReg, RegState::Implicit)
       .addReg(AMDGPU::M0, RegState::Implicit);
+    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
   } else {
     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
       .addReg(SrcReg, RegState::Undef, SubReg)
@@ -3125,22 +3121,10 @@ static MachineBasicBlock *emitIndirectDs
 
   const DebugLoc &DL = MI.getDebugLoc();
 
-  if (UseGPRIdxMode) {
-    MachineBasicBlock::iterator I(&MI);
-
-    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
-      .addImm(0) // Reset inside loop.
-      .addImm(VGPRIndexMode::DST_ENABLE);
-    SetOn->getOperand(3).setIsUndef();
-
-    // Disable again after the loop.
-    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
-  }
-
   unsigned PhiReg = MRI.createVirtualRegister(VecRC);
 
   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
-                              Offset, UseGPRIdxMode);
+                              Offset, UseGPRIdxMode, false);
   MachineBasicBlock *LoopBB = InsPt->getParent();
 
   if (UseGPRIdxMode) {
@@ -3150,6 +3134,7 @@ static MachineBasicBlock *emitIndirectDs
         .addReg(Dst, RegState::ImplicitDefine)
         .addReg(PhiReg, RegState::Implicit)
         .addReg(AMDGPU::M0, RegState::Implicit);
+    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
   } else {
     const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll?rev=325355&r1=325354&r2=325355&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll Fri Feb 16 08:31:30 2018
@@ -118,23 +118,21 @@ entry:
 ; The offset depends on the register that holds the first element of the vector.
 
 ; FIXME: The waitcnt for the argument load can go after the loop
-; IDXMODE: s_set_gpr_idx_on 0, src0
 ; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
+; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
-; MOVREL: s_and_saveexec_b64 vcc, vcc
 ; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
-; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
-; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
 ; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz
 
-; IDXMODE: s_set_gpr_idx_off
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
 entry:
@@ -250,21 +248,19 @@ entry:
 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
+; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
-; MOVREL: s_and_saveexec_b64 vcc, vcc
 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
 
 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
-; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz [[LOOPBB]]
 ; GCN: s_mov_b64 exec, [[SAVEEXEC]]
 
-; IDXMODE: s_set_gpr_idx_off
-
 ; GCN: buffer_store_dword
 define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
@@ -283,8 +279,6 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
 
-; IDXMODE: s_set_gpr_idx_on 0, dst
-
 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
 
 ; The offset depends on the register that holds the first element of the vector.
@@ -294,12 +288,11 @@ entry:
 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
 
 ; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
-; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
+; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz
-
-; IDXMODE: s_set_gpr_idx_off
 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -322,52 +315,46 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
 
-; IDXMODE: s_set_gpr_idx_on 0, src0
-
 ; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
 
 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL: s_and_saveexec_b64 vcc, vcc
 ; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
 
-; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
-; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
 ; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
 ; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
 
 ; FIXME: Redundant copy
 ; GCN: s_mov_b64 exec, [[MASK]]
-; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
 
-; IDXMODE: s_set_gpr_idx_on 0, src0
 ; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
 
 ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL: s_and_saveexec_b64 vcc, vcc
 ; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
 
-; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
-; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
 ; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
 ; GCN: s_cbranch_execnz [[LOOP1]]
 
-; IDXMODE: s_set_gpr_idx_off
-
 ; GCN: buffer_store_dword [[MOVREL0]]
 ; GCN: buffer_store_dword [[MOVREL1]]
 define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
@@ -403,42 +390,38 @@ bb2:
 ; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
 ; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
 
-; IDXMODE: s_set_gpr_idx_on 0, dst
-
 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL: s_and_saveexec_b64 vcc, vcc
 ; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
 
-; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
-; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
 ; GCN: s_cbranch_execnz [[LOOP0]]
 
 ; FIXME: Redundant copy
 ; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
-; IDXMODE: s_set_gpr_idx_off
 
-; IDXMODE: s_set_gpr_idx_on 0, dst
 ; GCN: s_mov_b64 [[MASK]], exec
 
 ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
+; GCN: s_and_saveexec_b64 vcc, vcc
 
 ; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL: s_and_saveexec_b64 vcc, vcc
 ; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
 
-; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
-; IDXMODE: s_and_saveexec_b64 vcc, vcc
+; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
+; IDXMODE: s_set_gpr_idx_off
 
 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
 ; GCN: s_cbranch_execnz [[LOOP1]]
@@ -639,7 +622,6 @@ define amdgpu_kernel void @insertelement
 
 ; GCN: {{^BB[0-9]+_[0-9]+}}:
 ; GCN: s_mov_b64 exec,
-; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: [[BB2]]:
 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
@@ -648,8 +630,10 @@ define amdgpu_kernel void @insertelement
 ; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
 ; MOVREL: v_movreld_b32_e32
 
-; IDXMODE: s_set_gpr_idx_idx
+; IDXMODE: s_set_gpr_idx_on
 ; IDXMODE: v_mov_b32_e32
+; IDXMODE: s_set_gpr_idx_off
+
 ; GCN: s_cbranch_execnz [[REGLOOP]]
 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
 bb:




More information about the llvm-commits mailing list