[llvm] r364185 - AMDGPU: Fold frame index into MUBUF

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 24 07:53:56 PDT 2019


Author: arsenm
Date: Mon Jun 24 07:53:56 2019
New Revision: 364185

URL: http://llvm.org/viewvc/llvm-project?rev=364185&view=rev
Log:
AMDGPU: Fold frame index into MUBUF

This matters for byval uses outside of the entry block, which appear
as copies.

Previously, the only folding done was during selection, which could
not see the underlying frame index. For any uses outside the entry
block, the frame index was materialized in the entry block relative to
the global scratch wave offset.

This may produce worse code in cases where the offset ends up not
fitting in the MUBUF offset field. A better heuristic would be helpful
for extreme frames.
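
As a minimal before/after sketch of the fold (modeled on the added
fold-fi-mubuf.mir test; register numbers and the trailing MUBUF operands
are illustrative):

  ; Before SIFoldOperands: the frame index is first materialized in a VGPR.
  %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
  %1:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec

  ; After: %stack.0 is folded directly into the vaddr operand, leaving the
  ; V_MOV_B32 dead.
  %1:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec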

Added:
    llvm/trunk/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
    llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp?rev=364185&r1=364184&r2=364185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp Mon Jun 24 07:53:56 2019
@@ -87,10 +87,11 @@ public:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
 
@@ -159,6 +160,17 @@ static bool isInlineConstantIfFolded(con
   }
 }
 
+// TODO: Add a heuristic for when the frame index may not fit in the addressing
+// mode immediate offset, to avoid materializing it in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+                              const MachineInstr &UseMI,
+                              int OpNo,
+                              const MachineOperand &OpToFold) {
+  return OpToFold.isFI() &&
+    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
 FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
@@ -290,7 +302,6 @@ static bool tryAddToFoldList(SmallVector
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -403,7 +414,7 @@ static bool isUseSafeToFold(const SIInst
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
   MachineInstr *UseMI,
-  unsigned UseOpIdx,
+  int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -453,10 +464,28 @@ void SIFoldOperands::foldOperand(
     return;
   }
 
+  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+    // Sanity check that this is a stack access.
+    // FIXME: Should probably use stack pseudos before frame lowering.
+    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
+      return;
+
+    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+        MFI->getScratchRSrcReg())
+      return;
 
-  bool FoldingImm = OpToFold.isImm();
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    SOff->setReg(MFI->getStackPtrOffsetReg());
+    return;
+  }
 
-  if (FoldingImm && UseMI->isCopy()) {
+  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+  if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -517,7 +546,7 @@ void SIFoldOperands::foldOperand(
       // %sgpr = V_READFIRSTLANE_B32 %vgpr
       // =>
       // %sgpr = S_MOV_B32 imm
-      if (FoldingImm) {
+      if (FoldingImmLike) {
         if (execMayBeModifiedBeforeUse(*MRI,
                                        UseMI->getOperand(UseOpIdx).getReg(),
                                        *OpToFold.getParent(),
@@ -528,7 +557,10 @@ void SIFoldOperands::foldOperand(
 
         // FIXME: ChangeToImmediate should clear subreg
         UseMI->getOperand(1).setSubReg(0);
-        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        if (OpToFold.isImm())
+          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        else
+          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
         UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
         return;
       }
@@ -560,7 +592,7 @@ void SIFoldOperands::foldOperand(
       return;
   }
 
-  if (!FoldingImm) {
+  if (!FoldingImmLike) {
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 
     // FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -904,6 +936,9 @@ void SIFoldOperands::foldInstOperand(Mac
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+        foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -1170,8 +1205,7 @@ bool SIFoldOperands::runOnMachineFunctio
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.

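The fold is deliberately conservative: it only fires when the MUBUF is
provably a stack access, i.e. soffset is the scratch wave offset or stack
pointer register and srsrc is the function's scratch resource descriptor.
A sketch of a rejected case, modeled on the no_fold_fi_non_stack_rsrc test
added below (register numbers are illustrative):

  ; Not folded: the resource descriptor is an arbitrary incoming SGPR quad
  ; rather than scratchRSrcReg, so this is not a stack access and the
  ; V_MOV_B32 materialization of %stack.0 must stay.
  %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
  %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
  %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
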
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=364185&r1=364184&r2=364185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Mon Jun 24 07:53:56 2019
@@ -496,6 +496,11 @@ public:
     return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
   }
 
+  // FIXME: Make this more precise
+  static bool isFLATScratch(const MachineInstr &MI) {
+    return isSegmentSpecificFLAT(MI);
+  }
+
   // Any FLAT encoded instruction, including global_* and scratch_*.
   bool isFLAT(uint16_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FLAT;

Modified: llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll?rev=364185&r1=364184&r2=364185&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll Mon Jun 24 07:53:56 2019
@@ -27,6 +27,47 @@ entry:
   ret void
 }
 
+; Make sure the offset is folded and the function's frame register is used
+; rather than the global scratch wave offset.
+; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block:
+; GCN-NOT: v_lshrrev_b32
+; GCN-NOT: s_sub_u32
+
+; GCN: s_and_saveexec_b64
+; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+
+; GCN: [[BB1]]
+; GCN: s_or_b64 exec, exec
+define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1, i1 %cond) #1 {
+entry:
+  br i1 %cond, label %bb0, label %bb1
+
+bb0:
+  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
+  %add = add nsw i32 %tmp, 1
+  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
+  %add3 = add nsw i32 %tmp1, 2
+  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
+  store volatile i32 9, i32 addrspace(1)* null, align 4
+  br label %bb1
+
+bb1:
+  ret void
+}
+
 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
 ; GCN: s_mov_b32 s5, s32
 ; GCN: s_add_u32 s32, s32, 0xc00{{$}}

Added: llvm/trunk/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fold-fi-mubuf.mir?rev=364185&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fold-fi-mubuf.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/fold-fi-mubuf.mir Mon Jun 24 07:53:56 2019
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination  %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: no_fold_fi_non_stack_rsrc_soffset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  scratchWaveOffsetReg: '$sgpr6'
+  frameOffsetReg:  '$sgpr6'
+  stackPtrOffsetReg: '$sgpr6'
+body:             |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sreg_32_xm0 = S_MOV_B32 0
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+---
+name: no_fold_fi_non_stack_rsrc
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  scratchWaveOffsetReg: '$sgpr6'
+  frameOffsetReg:  '$sgpr6'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+# Offset is from global scratch wave offset.
+---
+name: fold_fi_mubuf_scratch_scratch_wave_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
+
+---
+name: no_fold_fi_mubuf_scratch_sp_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    4
+  localFrameSize:  4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...

Modified: llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll?rev=364185&r1=364184&r2=364185&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll Mon Jun 24 07:53:56 2019
@@ -144,9 +144,6 @@ define void @void_func_byval_struct_i8_i
   ret void
 }
 
-; FIXME: Should be able to see that this can use vaddr, but the
-; FrameIndex is hidden behind a CopyFromReg in the second block.
-
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
 ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
 
@@ -156,13 +153,13 @@ define void @void_func_byval_struct_i8_i
 
 ; GCN: s_and_saveexec_b64
 
-; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
-; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}}
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
-; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}}
+; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]]
+; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GCN: ds_write_b32
+; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 {
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll?rev=364185&r1=364184&r2=364185&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll Mon Jun 24 07:53:56 2019
@@ -440,6 +440,32 @@ main_body:
   ret float %val
 }
 
+; Make sure frame index folding doesn't crash on a MUBUF not used
+; for stack access.
+
+; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
+define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
+  %alloca = alloca i32, addrspace(5)
+  %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+  %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
+  ret float %ret.val
+}
+
+; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
+; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]]
+define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %alloca = alloca i32, addrspace(5)
+  %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+  %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
+  ret float %ret.val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll?rev=364185&r1=364184&r2=364185&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll Mon Jun 24 07:53:56 2019
@@ -60,8 +60,7 @@ define amdgpu_kernel void @test_readfirs
 
 ; Make sure this doesn't crash.
 ; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]]
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32

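The readfirstlane change follows from treating a frame index as
immediate-like in foldOperand: a V_READFIRSTLANE_B32 of a materialized
frame index is now rewritten to a scalar move. A rough sketch (the MIR
details are illustrative; the updated test only checks the final
s_mov_b32):

  ; Before: frame index materialized into a VGPR, then read back.
  %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
  %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec

  ; After: folded to a scalar move of the frame index.
  %1:sreg_32_xm0 = S_MOV_B32 %stack.0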