[llvm] r269265 - AMDGPU: Fix breaking IR on instructions with multiple pointer operands

Wed May 11 18:58:58 PDT 2016

Author: arsenm
Date: Wed May 11 20:58:58 2016
New Revision: 269265

URL: http://llvm.org/viewvc/llvm-project?rev=269265&view=rev
Log:
AMDGPU: Fix breaking IR on instructions with multiple pointer operands

The promote alloca pass would attempt to promote an alloca with
a select, icmp, or phi user, even though the other operand was
from a non-promotable source, producing a select on two different
pointer types.

Only do this if we know that both operands derive from the same
alloca. In the future we should be able to relax this to an alloca
which will also be promoted.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp?rev=269265&r1=269264&r2=269265&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp Wed May 11 20:58:58 2016
@@ -32,6 +32,7 @@ class AMDGPUPromoteAlloca : public Funct
 private:
   const TargetMachine *TM;
   Module *Mod;
+  const DataLayout *DL;
   MDNode *MaxWorkGroupSizeRange;
 
   // FIXME: This should be per-kernel.
@@ -43,6 +44,20 @@ private:
   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
 
+  /// BaseAlloca is the alloca root the search started from.
+  /// Val may be that alloca or a recursive user of it.
+  bool collectUsesWithPtrTypes(Value *BaseAlloca,
+                               Value *Val,
+                               std::vector<Value*> &WorkList) const;
+
+  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
+  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
+  /// Returns true if both operands are derived from the same alloca. Val should
+  /// be the same value as one of the input operands of UseInst.
+  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
+                                       Instruction *UseInst,
+                                       int OpIdx0, int OpIdx1) const;
+
 public:
   static char ID;
 
@@ -50,6 +65,7 @@ public:
     FunctionPass(ID),
     TM(TM_),
     Mod(nullptr),
+    DL(nullptr),
     MaxWorkGroupSizeRange(nullptr),
     LocalMemAvailable(0),
     IsAMDGCN(false),
@@ -63,6 +79,11 @@ public:
   }
 
   void handleAlloca(AllocaInst &I);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
 };
 
 } // End anonymous namespace
@@ -80,6 +101,7 @@ bool AMDGPUPromoteAlloca::doInitializati
     return false;
 
   Mod = &M;
+  DL = &Mod->getDataLayout();
 
   // The maximum workitem id.
   //
@@ -131,8 +153,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(
         continue;
 
       if (Use->getParent()->getParent() == &F) {
-        LocalMemAvailable -=
-          Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
+        LocalMemAvailable -= DL->getTypeAllocSize(GV.getValueType());
         break;
       }
     }
@@ -428,7 +449,39 @@ static bool isCallPromotable(CallInst *C
   }
 }
 
-static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
+                                                          Value *Val,
+                                                          Instruction *Inst,
+                                                          int OpIdx0,
+                                                          int OpIdx1) const {
+  // Figure out which operand is the one we might not be promoting.
+  Value *OtherOp = Inst->getOperand(OpIdx0);
+  if (Val == OtherOp)
+    OtherOp = Inst->getOperand(OpIdx1);
+
+  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+  if (!isa<AllocaInst>(OtherObj))
+    return false;
+
+  // TODO: We should be able to replace undefs with the right pointer type.
+
+  // TODO: If we know the other base object is another promotable
+  // alloca, not necessarily this alloca, we can do this. The
+  // important part is both must have the same address space at
+  // the end.
+  if (OtherObj != BaseAlloca) {
+    DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
+  Value *BaseAlloca,
+  Value *Val,
+  std::vector<Value*> &WorkList) const {
+
   for (User *User : Val->users()) {
     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
       continue;
@@ -441,11 +494,11 @@ static bool collectUsesWithPtrTypes(Valu
       continue;
     }
 
-    Instruction *UseInst = dyn_cast<Instruction>(User);
-    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
+    Instruction *UseInst = cast<Instruction>(User);
+    if (UseInst->getOpcode() == Instruction::PtrToInt)
       return false;
 
-    if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
+    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
       if (SI->isVolatile())
         return false;
 
@@ -464,6 +517,13 @@ static bool collectUsesWithPtrTypes(Valu
         return false;
     }
 
+    // Only promote an icmp if we know that the other compare operand
+    // is derived from the same alloca, so both sides are promoted together.
+    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
+        return false;
+    }
+
     if (!User->getType()->isPointerTy())
       continue;
 
@@ -474,8 +534,31 @@ static bool collectUsesWithPtrTypes(Valu
         return false;
     }
 
+    // Only promote a select if we know that the other select operand is
+    // derived from the same alloca, and so will also be promoted.
+    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
+        return false;
+    }
+
+    // Repeat for phis.
+    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+      // TODO: Handle more complex cases. We should be able to replace loops
+      // over arrays.
+      switch (Phi->getNumIncomingValues()) {
+      case 1:
+        break;
+      case 2:
+        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
+          return false;
+        break;
+      default:
+        return false;
+      }
+    }
+
     WorkList.push_back(User);
-    if (!collectUsesWithPtrTypes(User, WorkList))
+    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
       return false;
   }
 
@@ -516,7 +599,7 @@ void AMDGPUPromoteAlloca::handleAlloca(A
 
   std::vector<Value*> WorkList;
 
-  if (!collectUsesWithPtrTypes(&I, WorkList)) {
+  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     DEBUG(dbgs() << " Do not know how to convert all uses\n");
     return;
   }

Added: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll?rev=269265&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll Wed May 11 20:58:58 2016
@@ -0,0 +1,38 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Normally instcombine would fold this into a compare of the GEP
+; indices.
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %alloca = alloca [16 x i32], align 4
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+  %cmp = icmp eq i32* %ptr0, %ptr1
+  %zext = zext i1 %cmp to i32
+  store volatile i32 %zext, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
+; CHECK: %alloca = alloca [16 x i32], align 4
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+; CHECK: %ptr1 = call i32* @get_unknown_pointer()
+; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %alloca = alloca [16 x i32], align 4
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+  %ptr1 = call i32* @get_unknown_pointer()
+  %cmp = icmp eq i32* %ptr0, %ptr1
+  %zext = zext i1 %cmp to i32
+  store volatile i32 %zext, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll?rev=269265&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll Wed May 11 20:58:58 2016
@@ -0,0 +1,170 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+
+; CHECK-LABEL: @branch_ptr_var_same_alloca(
+; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  br i1 undef, label %if, label %else
+
+if:
+  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+  br label %endif
+
+else:
+  %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %b
+  br label %endif
+
+endif:
+  %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+  store i32 0, i32* %phi.ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @one_phi_value(
+; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK:  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: br label %exit
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @one_phi_value(i32 %a) #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+  br label %exit
+
+exit:
+  %phi.ptr = phi i32* [ %arrayidx0, %entry ]
+  store i32 0, i32* %phi.ptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @branch_ptr_alloca_unknown_obj(
+; CHECK: %alloca = alloca [64 x i32], align 4
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = call i32* @get_unknown_pointer()
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32* %phi.ptr, align 4
+define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  br i1 undef, label %if, label %else
+
+if:
+  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+  br label %endif
+
+else:
+  %arrayidx1 = call i32* @get_unknown_pointer()
+  br label %endif
+
+endif:
+  %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+  store i32 0, i32* %phi.ptr, align 4
+  ret void
+}
+
+; kernel void ptr_induction_var_same_alloca(void)
+; {
+;     int alloca[64];
+;     int i = 0;
+
+;     #pragma nounroll
+;     for (int* p = &alloca[2], *e = &alloca[48]; p != e; ++p, ++i)
+;     {
+;         *p = i;
+;     }
+; }
+
+; FIXME: This should be promotable. We need to use
+; GetUnderlyingObjects when looking at the icmp user.
+
+; CHECK-LABEL: @ptr_induction_var_same_alloca(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+define void @ptr_induction_var_same_alloca() #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+  %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+  store i32 %i.09, i32* %p.08, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %cmp = icmp eq i32* %incdec.ptr, %arrayidx1
+  br i1 %cmp, label %for.cond.cleanup, label %for.body
+}
+
+
+; extern int* get_unknown_pointer(void);
+
+; kernel void ptr_induction_var_alloca_unknown(void)
+; {
+;     int alloca[64];
+;     int i = 0;
+;
+;     for (int* p = &alloca[2], *e = get_unknown_pointer(); p != e; ++p, ++i)
+;     {
+;         *p = i;
+;     }
+; }
+
+; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
+define void @ptr_induction_var_alloca_unknown() #0 {
+entry:
+  %alloca = alloca [64 x i32], align 4
+  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+  %call = tail call i32* @get_unknown_pointer() #2
+  %cmp.7 = icmp eq i32* %arrayidx, %call
+  br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+  store i32 %i.09, i32* %p.08, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %cmp = icmp eq i32* %incdec.ptr, %call
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll?rev=269265&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll Wed May 11 20:58:58 2016
@@ -0,0 +1,102 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
+; CHECK: %alloca = alloca i32
+; CHECK: select i1 undef, i32* undef, i32* %alloca
+define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+  %alloca = alloca i32, align 4
+  %select = select i1 undef, i32* undef, i32* %alloca
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+  %alloca = alloca [16 x i32], align 4
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+  %select = select i1 undef, i32* %ptr0, i32* %ptr1
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; FIXME: This should be promotable but requires knowing that both will be promoted first.
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
+; CHECK: %alloca0 = alloca i32, i32 16, align 4
+; CHECK: %alloca1 = alloca i32, i32 16, align 4
+; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
+define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+  %alloca0 = alloca i32, i32 16, align 4
+  %alloca1 = alloca i32, i32 16, align 4
+  %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+  %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+  %select = select i1 undef, i32* %ptr0, i32* %ptr1
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 1
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+  %alloca = alloca [16 x i32], align 4
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
+  %select = select i1 undef, i32* %ptr0, i32* %ptr1
+  store i32 0, i32* %select, align 4
+  ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_select_input_select(
+; CHECK: getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_select_input_select.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
+; CHECK: %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %c
+; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
+; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
+define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+  %alloca = alloca [16 x i32], align 4
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
+  %select0 = select i1 undef, i32* %ptr0, i32* %ptr1
+  %select1 = select i1 undef, i32* %select0, i32* %ptr2
+  store i32 0, i32* %select1, align 4
+  ret void
+}
+
+define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+  %alloca = alloca [16 x i32], align 4
+  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+  store i32 0, i32* %ptr0
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
+  %select0 = select i1 undef, i32* undef, i32* %ptr2
+  store i32 0, i32* %ptr1
+  br label %bb2
+
+bb2:
+  %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ]
+  %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1
+  store i32 0, i32* %select1, align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind }