[llvm] [SeparateConstOffsetFromGEP] Support GEP reordering for different types (PR #90802)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue May 7 19:05:22 PDT 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/90802

>From a8c5ae1a45a55a4a9bf5f628da649e7c518cd222 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 29 Apr 2024 15:49:15 -0700
Subject: [PATCH 1/7] [SeparateConstOffsetFromGEP] Support GEP reordering for
 conflicting types.

Change-Id: Ic3ecd2c39f49b50ba7486a4223332771f8d672c0
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     |  54 ++-
 .../AMDGPU/reorder-gep-inbounds.ll            |  34 ++
 .../AMDGPU/reorder-gep.ll                     | 429 ++++++++++++++++++
 .../SeparateConstOffsetFromGEP/reorder-gep.ll |  63 +++
 4 files changed, 575 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index c54a956fc7e24..e9ff227b6a5c9 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -985,9 +985,10 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
   if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
     return false;
 
-  // TODO: support reordering for non-trivial GEP chains
-  if (PtrGEPType != GEPType ||
-      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
+  bool GEPIsPtr = GEPType->getScalarType()->isPointerTy();
+  bool PtrGEPIsPtr = PtrGEPType->getScalarType()->isPointerTy();
+
+  if (GEPIsPtr != PtrGEPIsPtr)
     return false;
 
   bool NestedNeedsExtraction;
@@ -1002,8 +1003,6 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                  /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
     return false;
 
-  IRBuilder<> Builder(GEP);
-  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
   bool GEPInBounds = GEP->isInBounds();
   bool PtrGEPInBounds = PtrGEP->isInBounds();
   bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
@@ -1017,6 +1016,50 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
       IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
     }
   }
+  TypeSize GEPSize = DL->getTypeSizeInBits(GEP->getSourceElementType());
+  TypeSize PtrGEPSize = DL->getTypeSizeInBits(PtrGEP->getSourceElementType());
+  IRBuilder<> Builder(GEP);
+  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
+  if (GEPSize > PtrGEPSize) {
+    if (GEPSize % PtrGEPSize)
+      return false;
+    unsigned Ratio = GEPSize / PtrGEPSize;
+    if (NestedByteOffset % Ratio)
+      return false;
+
+    auto NewGEPOffset = Builder.CreateUDiv(
+        *PtrGEP->indices().begin(),
+        Builder.getIntN(
+            PtrGEP->indices().begin()->get()->getType()->getScalarSizeInBits(),
+            Ratio));
+    auto NewSrc = Builder.CreateGEP(GEPType, PtrGEP->getPointerOperand(),
+                                    SmallVector<Value *, 4>(GEP->indices()));
+    cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
+    auto NewGEP = Builder.CreateGEP(GEPType, NewSrc, NewGEPOffset);
+    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
+    GEP->replaceAllUsesWith(NewGEP);
+    RecursivelyDeleteTriviallyDeadInstructions(GEP);
+    return true;
+  }
+
+  if (GEPSize < PtrGEPSize) {
+    if (PtrGEPSize % GEPSize)
+      return false;
+    unsigned Ratio = PtrGEPSize / GEPSize;
+
+    auto NewGEPOffset = Builder.CreateMul(
+        *PtrGEP->indices().begin(),
+        Builder.getIntN(
+            PtrGEP->indices().begin()->get()->getType()->getScalarSizeInBits(),
+            Ratio));
+    auto NewSrc = Builder.CreateGEP(GEPType, PtrGEP->getPointerOperand(),
+                                    SmallVector<Value *, 4>(GEP->indices()));
+    cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
+    auto NewGEP = Builder.CreateGEP(GEPType, NewSrc, NewGEPOffset);
+    GEP->replaceAllUsesWith(NewGEP);
+    RecursivelyDeleteTriviallyDeadInstructions(GEP);
+    return true;
+  }
 
   // For trivial GEP chains, we can swap the indicies.
   auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
@@ -1025,6 +1068,7 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
   auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                   SmallVector<Value *, 4>(PtrGEP->indices()));
   cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
+  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
   GEP->replaceAllUsesWith(NewGEP);
   RecursivelyDeleteTriviallyDeadInstructions(GEP);
   return true;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index c24bbd5f658f9..d5d89d42bc697 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -49,3 +49,37 @@ entry:
   %idx1 = getelementptr <2 x i8>, ptr %const1, i32 %in.idx1.nneg
   ret void
 }
+
+define void @inboundsNonNegativeTypeShrink(ptr %in.ptr, i32 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegativeTypeShrink(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 2048
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i32 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds i16, ptr %in.ptr, i32 1024
+  %idx1 = getelementptr inbounds i8, ptr %const1, i32 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonNegativeTypeExpand(ptr %in.ptr, i32 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegativeTypeExpand(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 512
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i32 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds i8, ptr %in.ptr, i32 1024
+  %idx1 = getelementptr inbounds i16, ptr %const1, i32 %in.idx1.nneg
+  ret void
+}
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
index 7137f0fb66fdb..fcf48dc415c03 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
@@ -173,3 +173,432 @@ end:
   call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
+
+
+define protected amdgpu_kernel void @reorder_expand(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: reorder_expand:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s0, s0, s1
+; CHECK-NEXT:    s_lshl_b32 s2, s2, 1
+; CHECK-NEXT:    s_add_i32 s0, s0, s2
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB2_2
+; CHECK-NEXT:  ; %bb.1: ; %bb.1
+; CHECK-NEXT:    v_mov_b32_e32 v12, s0
+; CHECK-NEXT:    ds_read_b128 v[0:3], v12
+; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:256
+; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:512
+; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:768
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[4:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[8:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[12:15]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB2_2: ; %end
+; CHECK-NEXT:    s_add_i32 s1, s0, 0x100
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    s_add_i32 s2, s0, 0x200
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_add_i32 s3, s0, 0x300
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_endpgm
+entry:
+  %base = getelementptr i8, ptr addrspace(3) %in.ptr, i32 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
+  %const1 = getelementptr i8, ptr addrspace(3) %base, i32 256
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
+  %const2 = getelementptr i8, ptr addrspace(3) %base, i32 512
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
+  %const3 = getelementptr i8, ptr addrspace(3) %base, i32 768
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
+  %cmp0 = icmp eq i32 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
+  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
+  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
+  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+  ret void
+}
+
+define protected amdgpu_kernel void @reorder_shrink(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: reorder_shrink:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s3, s1, 3
+; CHECK-NEXT:    s_add_i32 s0, s0, s3
+; CHECK-NEXT:    s_lshl_b32 s2, s2, 1
+; CHECK-NEXT:    s_add_i32 s0, s0, s2
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB3_2
+; CHECK-NEXT:  ; %bb.1: ; %bb.1
+; CHECK-NEXT:    v_mov_b32_e32 v12, s0
+; CHECK-NEXT:    ds_read_b128 v[0:3], v12
+; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:2048
+; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:4096
+; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:6144
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[4:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[8:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[12:15]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB3_2: ; %end
+; CHECK-NEXT:    s_add_i32 s1, s0, 0x800
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    s_add_i32 s2, s0, 0x1000
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_add_i32 s3, s0, 0x1800
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_endpgm
+entry:
+  %base = getelementptr i64, ptr addrspace(3) %in.ptr, i32 %in.idx0
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
+  %const1 = getelementptr i64, ptr addrspace(3) %base, i32 256
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
+  %const2 = getelementptr i64, ptr addrspace(3) %base, i32 512
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
+  %const3 = getelementptr i64, ptr addrspace(3) %base, i32 768
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
+  %cmp0 = icmp eq i32 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
+  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
+  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
+  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+  ret void
+}
+
+define protected amdgpu_kernel void @reorder_shrink2(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: reorder_shrink2:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s3, s1, 1
+; CHECK-NEXT:    s_add_i32 s0, s0, s3
+; CHECK-NEXT:    s_add_i32 s0, s0, s2
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB4_2
+; CHECK-NEXT:  ; %bb.1: ; %bb.1
+; CHECK-NEXT:    v_mov_b32_e32 v12, s0
+; CHECK-NEXT:    ds_read_b128 v[0:3], v12
+; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:512
+; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:1024
+; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:1536
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[4:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[8:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[12:15]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB4_2: ; %end
+; CHECK-NEXT:    s_add_i32 s1, s0, 0x200
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    s_add_i32 s2, s0, 0x400
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_add_i32 s3, s0, 0x600
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_endpgm
+entry:
+  %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
+  %idx0 = getelementptr i8, ptr addrspace(3) %base, i32 %in.idx1
+  %const1 = getelementptr half, ptr addrspace(3) %base, i32 256
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i32 %in.idx1
+  %const2 = getelementptr half, ptr addrspace(3) %base, i32 512
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i32 %in.idx1
+  %const3 = getelementptr half, ptr addrspace(3) %base, i32 768
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i32 %in.idx1
+  %cmp0 = icmp eq i32 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
+  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
+  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
+  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+  ret void
+}
+
+
+
+define protected amdgpu_kernel void @bad_index(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: bad_index:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s3, s1, 1
+; CHECK-NEXT:    s_add_i32 s0, s0, s3
+; CHECK-NEXT:    s_add_i32 s0, s0, s2
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB5_2
+; CHECK-NEXT:  ; %bb.1: ; %bb.1
+; CHECK-NEXT:    v_mov_b32_e32 v12, s0
+; CHECK-NEXT:    ds_read_b128 v[0:3], v12
+; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:2
+; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:4
+; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:6
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[4:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[8:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[12:15]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB5_2: ; %end
+; CHECK-NEXT:    s_add_i32 s1, s0, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    s_add_i32 s2, s0, 4
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_add_i32 s3, s0, 6
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_endpgm
+entry:
+  %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
+  %idx0 = getelementptr i8, ptr addrspace(3) %base, i32 %in.idx1
+  %const1 = getelementptr half, ptr addrspace(3) %base, i32 1
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i32 %in.idx1
+  %const2 = getelementptr half, ptr addrspace(3) %base, i32 2
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i32 %in.idx1
+  %const3 = getelementptr half, ptr addrspace(3) %base, i32 3
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i32 %in.idx1
+  %cmp0 = icmp eq i32 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
+  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
+  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
+  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+  ret void
+}
+
+
+%struct.Packed = type <{ [8 x i8], [4 x half] }>
+define protected amdgpu_kernel void @struct_type(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: struct_type:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s0, s5, 14
+; CHECK-NEXT:    s_add_i32 s3, s4, s0
+; CHECK-NEXT:    s_add_i32 s3, s3, s6
+; CHECK-NEXT:    s_add_i32 s2, s3, 0x400000
+; CHECK-NEXT:    s_add_i32 s1, s3, 0x800000
+; CHECK-NEXT:    s_add_i32 s0, s3, 0xc00000
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB6_2
+; CHECK-NEXT:  ; %bb.1: ; %bb.1
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    v_mov_b32_e32 v4, s2
+; CHECK-NEXT:    v_mov_b32_e32 v8, s1
+; CHECK-NEXT:    v_mov_b32_e32 v12, s0
+; CHECK-NEXT:    ds_read_b128 v[0:3], v0
+; CHECK-NEXT:    ds_read_b128 v[4:7], v4
+; CHECK-NEXT:    ds_read_b128 v[8:11], v8
+; CHECK-NEXT:    ds_read_b128 v[12:15], v12
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[4:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[8:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[12:15]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB6_2: ; %end
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_endpgm
+entry:
+  %base = getelementptr [1024 x %struct.Packed], ptr addrspace(3) %in.ptr, i32 %in.idx0
+  %idx0 = getelementptr i8, ptr addrspace(3) %base, i32 %in.idx1
+  %const1 = getelementptr [1024 x %struct.Packed], ptr addrspace(3) %base, i32 256
+  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i32 %in.idx1
+  %const2 = getelementptr [1024 x %struct.Packed], ptr addrspace(3) %base, i32 512
+  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i32 %in.idx1
+  %const3 = getelementptr [1024 x %struct.Packed], ptr addrspace(3) %base, i32 768
+  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i32 %in.idx1
+  %cmp0 = icmp eq i32 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
+  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
+  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
+  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+  ret void
+}
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll
index a15f11a634db5..2e3b6ca3653fc 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/reorder-gep.ll
@@ -186,3 +186,66 @@ end:
   call void asm sideeffect "; use $0", "v"(ptr %idx3)
   ret void
 }
+
+
+define void @different_type_reorder2(ptr %in.ptr, i64 %in.idx0, i64 %in.idx1) {
+; CHECK-LABEL: define void @different_type_reorder2(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX0:%.*]], i64 [[IN_IDX1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i8, ptr [[IN_PTR]], i64 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr i64, ptr [[BASE]], i64 256
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i8, ptr [[CONST1]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST2:%.*]] = getelementptr i64, ptr [[BASE]], i64 512
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i8, ptr [[CONST2]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr i64, ptr [[BASE]], i64 768
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i8, ptr [[CONST3]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
+; CHECK-NEXT:    br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
+; CHECK:       bb.1:
+; CHECK-NEXT:    [[VAL0:%.*]] = load <8 x i64>, ptr [[IDX0]], align 16
+; CHECK-NEXT:    [[VAL1:%.*]] = load <8 x i64>, ptr [[IDX1]], align 16
+; CHECK-NEXT:    [[VAL2:%.*]] = load <8 x i64>, ptr [[IDX2]], align 16
+; CHECK-NEXT:    [[VAL3:%.*]] = load <8 x i64>, ptr [[IDX3]], align 16
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+entry:
+  %base = getelementptr i8, ptr %in.ptr, i64 %in.idx0
+  %idx0 = getelementptr i8, ptr %base, i64 %in.idx1
+  %const1 = getelementptr i64, ptr %base, i64 256
+  %idx1 = getelementptr i8, ptr %const1, i64 %in.idx1
+  %const2 = getelementptr i64, ptr %base, i64 512
+  %idx2 = getelementptr i8, ptr %const2, i64 %in.idx1
+  %const3 = getelementptr i64, ptr %base, i64 768
+  %idx3 = getelementptr i8, ptr %const3, i64 %in.idx1
+  %cmp0 = icmp eq i64 %in.idx0, 0
+  br i1 %cmp0, label %bb.1, label %end
+
+bb.1:
+  %val0 = load <8 x i64>, ptr %idx0, align 16
+  %val1 = load <8 x i64>, ptr %idx1, align 16
+  %val2 = load <8 x i64>, ptr %idx2, align 16
+  %val3 = load <8 x i64>, ptr %idx3, align 16
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val0)
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val1)
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val2)
+  call void asm sideeffect "; use $0", "v"(<8 x i64> %val3)
+  br label %end
+
+end:
+  call void asm sideeffect "; use $0", "v"(ptr %idx0)
+  call void asm sideeffect "; use $0", "v"(ptr %idx1)
+  call void asm sideeffect "; use $0", "v"(ptr %idx2)
+  call void asm sideeffect "; use $0", "v"(ptr %idx3)
+  ret void
+}

>From 88c84fcba708ddad4f6de6da2fc0cb0f81bd0df5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 1 May 2024 17:12:27 -0700
Subject: [PATCH 2/7] actually set inbounds

Change-Id: I3bd435e1baa27a36402cb06977c60662bda5059b
---
 llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp       | 2 +-
 .../SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index e9ff227b6a5c9..56e5ac8a0cf95 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1056,6 +1056,7 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                     SmallVector<Value *, 4>(GEP->indices()));
     cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
     auto NewGEP = Builder.CreateGEP(GEPType, NewSrc, NewGEPOffset);
+    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
     GEP->replaceAllUsesWith(NewGEP);
     RecursivelyDeleteTriviallyDeadInstructions(GEP);
     return true;
@@ -1068,7 +1069,6 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
   auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                   SmallVector<Value *, 4>(PtrGEP->indices()));
   cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
-  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
   GEP->replaceAllUsesWith(NewGEP);
   RecursivelyDeleteTriviallyDeadInstructions(GEP);
   return true;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index d5d89d42bc697..9a601a6bfc992 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -57,7 +57,7 @@ define void @inboundsNonNegativeTypeShrink(ptr %in.ptr, i32 %in.idx1) {
 ; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 2048
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 2048
 ; CHECK-NEXT:    ret void
 ;
 entry:

>From 664c92fbfa6946f8bbe4c2fc3dd0a420af3b6289 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 2 May 2024 08:54:28 -0700
Subject: [PATCH 3/7] Review comments

Change-Id: If84c0b348407e40dee488145d575497f687c56d3
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     |  15 +-
 .../AMDGPU/reorder-gep.ll                     | 631 +++---------------
 .../NVPTX/lower-gep-reorder.ll                |  12 +-
 3 files changed, 123 insertions(+), 535 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 56e5ac8a0cf95..db3c3f093e508 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -972,7 +972,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
 
 bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                             TargetTransformInfo &TTI) {
-  Type *GEPType = GEP->getResultElementType();
+  Type *GEPType = GEP->getSourceElementType();
   // TODO: support reordering for non-trivial GEP chains
   if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
     return false;
@@ -980,13 +980,13 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
   auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
   if (!PtrGEP)
     return false;
-  Type *PtrGEPType = PtrGEP->getResultElementType();
+  Type *PtrGEPType = PtrGEP->getSourceElementType();
   // TODO: support reordering for non-trivial GEP chains
   if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
     return false;
 
-  bool GEPIsPtr = GEPType->getScalarType()->isPointerTy();
-  bool PtrGEPIsPtr = PtrGEPType->getScalarType()->isPointerTy();
+  bool GEPIsPtr = GEPType->getScalarType()->isPtrOrPtrVectorTy();
+  bool PtrGEPIsPtr = PtrGEPType->getScalarType()->isPtrOrPtrVectorTy();
 
   if (GEPIsPtr != PtrGEPIsPtr)
     return false;
@@ -1016,8 +1016,11 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
       IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
     }
   }
-  TypeSize GEPSize = DL->getTypeSizeInBits(GEP->getSourceElementType());
-  TypeSize PtrGEPSize = DL->getTypeSizeInBits(PtrGEP->getSourceElementType());
+  TypeSize GEPSize = DL->getTypeSizeInBits(GEP->getIndexedType(
+      GEP->getSourceElementType(), GEP->indices().begin()->get()));
+  TypeSize PtrGEPSize = DL->getTypeSizeInBits(PtrGEP->getIndexedType(
+      PtrGEP->getSourceElementType(), PtrGEP->indices().begin()->get()));
+
   IRBuilder<> Builder(GEP);
   Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
   if (GEPSize > PtrGEPSize) {
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
index fcf48dc415c03..72347ddbea2a4 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
@@ -1,60 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --start-before=separate-const-offset-from-gep < %s | FileCheck %s
-
-define protected amdgpu_kernel void @sink_addr(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: sink_addr:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s3, s1, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s3
-; CHECK-NEXT:    s_lshl_b32 s2, s2, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s2
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v12
-; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:512
-; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:1024
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:1536
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB0_2: ; %end
-; CHECK-NEXT:    s_add_i32 s1, s0, 0x200
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_add_i32 s2, s0, 0x400
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    s_add_i32 s3, s0, 0x600
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --passes=separate-const-offset-from-gep < %s | FileCheck %s
+
+define void @sink_addr(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @sink_addr(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 256
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i32 512
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i32 768
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
   %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
@@ -64,85 +24,23 @@ entry:
   %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
   %const3 = getelementptr half, ptr addrspace(3) %base, i32 768
   %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
 
-define protected amdgpu_kernel void @illegal_addr_mode(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: illegal_addr_mode:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s0, s5, 1
-; CHECK-NEXT:    s_lshl_b32 s1, s6, 1
-; CHECK-NEXT:    s_add_i32 s3, s4, s0
-; CHECK-NEXT:    s_add_i32 s3, s3, s1
-; CHECK-NEXT:    s_add_i32 s2, s3, 0x12a60
-; CHECK-NEXT:    s_add_i32 s1, s3, 0x12c60
-; CHECK-NEXT:    s_add_i32 s0, s3, 0x12ed8
-; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB1_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    v_mov_b32_e32 v4, s2
-; CHECK-NEXT:    v_mov_b32_e32 v8, s1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v0
-; CHECK-NEXT:    ds_read_b128 v[4:7], v4
-; CHECK-NEXT:    ds_read_b128 v[8:11], v8
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB1_2: ; %end
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @illegal_addr_mode(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @illegal_addr_mode(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 38192
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr half, ptr addrspace(3) [[CONST1]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 38448
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr half, ptr addrspace(3) [[CONST2]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 38764
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr half, ptr addrspace(3) [[CONST3]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
   %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
@@ -152,82 +50,24 @@ entry:
   %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
   %const3 = getelementptr half, ptr addrspace(3) %base, i32 38764
   %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
 
 
-define protected amdgpu_kernel void @reorder_expand(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: reorder_expand:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_add_i32 s0, s0, s1
-; CHECK-NEXT:    s_lshl_b32 s2, s2, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s2
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB2_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v12
-; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:256
-; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:512
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:768
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB2_2: ; %end
-; CHECK-NEXT:    s_add_i32 s1, s0, 0x100
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_add_i32 s2, s0, 0x200
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    s_add_i32 s3, s0, 0x300
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @reorder_expand(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @reorder_expand(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i8, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 128
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i32 256
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i32 384
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr i8, ptr addrspace(3) %in.ptr, i32 %in.idx0
   %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
@@ -237,82 +77,23 @@ entry:
   %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
   %const3 = getelementptr i8, ptr addrspace(3) %base, i32 768
   %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
 
-define protected amdgpu_kernel void @reorder_shrink(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: reorder_shrink:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s3, s1, 3
-; CHECK-NEXT:    s_add_i32 s0, s0, s3
-; CHECK-NEXT:    s_lshl_b32 s2, s2, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s2
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB3_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v12
-; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:2048
-; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:4096
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:6144
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB3_2: ; %end
-; CHECK-NEXT:    s_add_i32 s1, s0, 0x800
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_add_i32 s2, s0, 0x1000
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    s_add_i32 s3, s0, 0x1800
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @reorder_shrink(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @reorder_shrink(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i64, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 1024
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i32 2048
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i32 3072
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr i64, ptr addrspace(3) %in.ptr, i32 %in.idx0
   %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
@@ -322,81 +103,23 @@ entry:
   %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
   %const3 = getelementptr i64, ptr addrspace(3) %base, i32 768
   %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
 
-define protected amdgpu_kernel void @reorder_shrink2(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: reorder_shrink2:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s3, s1, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s3
-; CHECK-NEXT:    s_add_i32 s0, s0, s2
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB4_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v12
-; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:512
-; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:1024
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:1536
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB4_2: ; %end
-; CHECK-NEXT:    s_add_i32 s1, s0, 0x200
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_add_i32 s2, s0, 0x400
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    s_add_i32 s3, s0, 0x600
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @reorder_shrink2(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @reorder_shrink2(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 512
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1024
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 1536
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
   %idx0 = getelementptr i8, ptr addrspace(3) %base, i32 %in.idx1
@@ -406,172 +129,53 @@ entry:
   %idx2 = getelementptr i8, ptr addrspace(3) %const2, i32 %in.idx1
   %const3 = getelementptr half, ptr addrspace(3) %base, i32 768
   %idx3 = getelementptr i8, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
 
 
 
-define protected amdgpu_kernel void @bad_index(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: bad_index:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s3, s1, 1
-; CHECK-NEXT:    s_add_i32 s0, s0, s3
-; CHECK-NEXT:    s_add_i32 s0, s0, s2
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB5_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v12
-; CHECK-NEXT:    ds_read_b128 v[4:7], v12 offset:2
-; CHECK-NEXT:    ds_read_b128 v[8:11], v12 offset:4
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB5_2: ; %end
-; CHECK-NEXT:    s_add_i32 s1, s0, 2
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_add_i32 s2, s0, 4
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    s_add_i32 s3, s0, 6
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @bad_index(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @bad_index(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 1
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr half, ptr addrspace(3) [[CONST1]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 1
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 3
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr half, ptr addrspace(3) [[CONST3]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr half, ptr addrspace(3) %in.ptr, i32 %in.idx0
-  %idx0 = getelementptr i8, ptr addrspace(3) %base, i32 %in.idx1
-  %const1 = getelementptr half, ptr addrspace(3) %base, i32 1
-  %idx1 = getelementptr i8, ptr addrspace(3) %const1, i32 %in.idx1
-  %const2 = getelementptr half, ptr addrspace(3) %base, i32 2
-  %idx2 = getelementptr i8, ptr addrspace(3) %const2, i32 %in.idx1
-  %const3 = getelementptr half, ptr addrspace(3) %base, i32 3
-  %idx3 = getelementptr i8, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
+  %idx0 = getelementptr half, ptr addrspace(3) %base, i32 %in.idx1
+  %const1 = getelementptr i8, ptr addrspace(3) %base, i32 1
+  %idx1 = getelementptr half, ptr addrspace(3) %const1, i32 %in.idx1
+  %const2 = getelementptr i8, ptr addrspace(3) %base, i32 2
+  %idx2 = getelementptr half, ptr addrspace(3) %const2, i32 %in.idx1
+  %const3 = getelementptr i8, ptr addrspace(3) %base, i32 3
+  %idx3 = getelementptr half, ptr addrspace(3) %const3, i32 %in.idx1
   ret void
 }
 
 
 %struct.Packed = type <{ [8 x i8], [4 x half] }>
-define protected amdgpu_kernel void @struct_type(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: struct_type:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshl_b32 s0, s5, 14
-; CHECK-NEXT:    s_add_i32 s3, s4, s0
-; CHECK-NEXT:    s_add_i32 s3, s3, s6
-; CHECK-NEXT:    s_add_i32 s2, s3, 0x400000
-; CHECK-NEXT:    s_add_i32 s1, s3, 0x800000
-; CHECK-NEXT:    s_add_i32 s0, s3, 0xc00000
-; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB6_2
-; CHECK-NEXT:  ; %bb.1: ; %bb.1
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    v_mov_b32_e32 v4, s2
-; CHECK-NEXT:    v_mov_b32_e32 v8, s1
-; CHECK-NEXT:    v_mov_b32_e32 v12, s0
-; CHECK-NEXT:    ds_read_b128 v[0:3], v0
-; CHECK-NEXT:    ds_read_b128 v[4:7], v4
-; CHECK-NEXT:    ds_read_b128 v[8:11], v8
-; CHECK-NEXT:    ds_read_b128 v[12:15], v12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[4:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[8:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v[12:15]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:  .LBB6_2: ; %end
-; CHECK-NEXT:    v_mov_b32_e32 v0, s3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use v0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_endpgm
+define void @struct_type(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @struct_type(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) [[BASE]], i32 256
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i8, ptr addrspace(3) [[CONST1]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST2:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) [[BASE]], i32 512
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i8, ptr addrspace(3) [[CONST2]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) [[BASE]], i32 768
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i8, ptr addrspace(3) [[CONST3]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %base = getelementptr [1024 x %struct.Packed], ptr addrspace(3) %in.ptr, i32 %in.idx0
   %idx0 = getelementptr i8, ptr addrspace(3) %base, i32 %in.idx1
@@ -581,24 +185,5 @@ entry:
   %idx2 = getelementptr i8, ptr addrspace(3) %const2, i32 %in.idx1
   %const3 = getelementptr [1024 x %struct.Packed], ptr addrspace(3) %base, i32 768
   %idx3 = getelementptr i8, ptr addrspace(3) %const3, i32 %in.idx1
-  %cmp0 = icmp eq i32 %in.idx0, 0
-  br i1 %cmp0, label %bb.1, label %end
-
-bb.1:
-  %val0 = load <8 x half>, ptr addrspace(3) %idx0, align 16
-  %val1 = load <8 x half>, ptr addrspace(3) %idx1, align 16
-  %val2 = load <8 x half>, ptr addrspace(3) %idx2, align 16
-  %val3 = load <8 x half>, ptr addrspace(3) %idx3, align 16
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val0)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val1)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val2)
-  call void asm sideeffect "; use $0", "v"(<8 x half> %val3)
-  br label %end
-
-end:
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx0)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx1)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx2)
-  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) %idx3)
   ret void
 }
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
index a91c8172177f9..d43987accad78 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
@@ -7,14 +7,14 @@ define protected amdgpu_kernel void @sink_addr(ptr %in.ptr, i64 %in.idx0, i64 %i
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
-; CHECK-NEXT:    [[CONST11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 2048
-; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i64, ptr [[CONST11]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i64, ptr [[TMP3]], i64 256
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
-; CHECK-NEXT:    [[CONST22:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4096
-; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i64, ptr [[CONST22]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i64, ptr [[TMP4]], i64 512
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
-; CHECK-NEXT:    [[CONST33:%.*]] = getelementptr i8, ptr [[TMP2]], i64 6144
-; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i64, ptr [[CONST33]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP2]], i64 [[IN_IDX1]]
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i64, ptr [[TMP7]], i64 768
 ; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
 ; CHECK-NEXT:    br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
 ; CHECK:       bb.1:

>From a4c1767f51a9394c59b7b528dbbc4fb0ccb65b3c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 May 2024 08:46:38 -0700
Subject: [PATCH 4/7] Review Comments 2

Change-Id: Ic94d65538a02cb73d12d461ec513b915dafe711d
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 58 ++-----------------
 .../AMDGPU/reorder-gep-inbounds.ll            | 12 ++--
 .../AMDGPU/reorder-gep.ll                     | 40 ++++++-------
 .../NVPTX/lower-gep-reorder.ll                |  6 +-
 4 files changed, 35 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index db3c3f093e508..af0c41df18d2a 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1016,62 +1016,16 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
       IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
     }
   }
-  TypeSize GEPSize = DL->getTypeSizeInBits(GEP->getIndexedType(
-      GEP->getSourceElementType(), GEP->indices().begin()->get()));
-  TypeSize PtrGEPSize = DL->getTypeSizeInBits(PtrGEP->getIndexedType(
-      PtrGEP->getSourceElementType(), PtrGEP->indices().begin()->get()));
 
   IRBuilder<> Builder(GEP);
   Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
-  if (GEPSize > PtrGEPSize) {
-    if (GEPSize % PtrGEPSize)
-      return false;
-    unsigned Ratio = GEPSize / PtrGEPSize;
-    if (NestedByteOffset % Ratio)
-      return false;
-
-    auto NewGEPOffset = Builder.CreateUDiv(
-        *PtrGEP->indices().begin(),
-        Builder.getIntN(
-            PtrGEP->indices().begin()->get()->getType()->getScalarSizeInBits(),
-            Ratio));
-    auto NewSrc = Builder.CreateGEP(GEPType, PtrGEP->getPointerOperand(),
-                                    SmallVector<Value *, 4>(GEP->indices()));
-    cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
-    auto NewGEP = Builder.CreateGEP(GEPType, NewSrc, NewGEPOffset);
-    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
-    GEP->replaceAllUsesWith(NewGEP);
-    RecursivelyDeleteTriviallyDeadInstructions(GEP);
-    return true;
-  }
-
-  if (GEPSize < PtrGEPSize) {
-    if (PtrGEPSize % GEPSize)
-      return false;
-    unsigned Ratio = PtrGEPSize / GEPSize;
-
-    auto NewGEPOffset = Builder.CreateMul(
-        *PtrGEP->indices().begin(),
-        Builder.getIntN(
-            PtrGEP->indices().begin()->get()->getType()->getScalarSizeInBits(),
-            Ratio));
-    auto NewSrc = Builder.CreateGEP(GEPType, PtrGEP->getPointerOperand(),
-                                    SmallVector<Value *, 4>(GEP->indices()));
-    cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
-    auto NewGEP = Builder.CreateGEP(GEPType, NewSrc, NewGEPOffset);
-    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
-    GEP->replaceAllUsesWith(NewGEP);
-    RecursivelyDeleteTriviallyDeadInstructions(GEP);
-    return true;
-  }
-
   // For trivial GEP chains, we can swap the indices.
-  auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
-                                  SmallVector<Value *, 4>(GEP->indices()));
-  cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
-  auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
-                                  SmallVector<Value *, 4>(PtrGEP->indices()));
-  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
+  Value *NewSrc = Builder.CreateGEP(GEPType, PtrGEP->getPointerOperand(),
+                                    SmallVector<Value *, 4>(GEP->indices()), "",
+                                    IsChainInBounds);
+  Value *NewGEP = Builder.CreateGEP(PtrGEPType, NewSrc,
+                                    SmallVector<Value *, 4>(PtrGEP->indices()),
+                                    "", IsChainInBounds);
   GEP->replaceAllUsesWith(NewGEP);
   RecursivelyDeleteTriviallyDeadInstructions(GEP);
   return true;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index 9a601a6bfc992..e3511fc81f5a1 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -50,14 +50,14 @@ entry:
   ret void
 }
 
-define void @inboundsNonNegativeTypeShrink(ptr %in.ptr, i32 %in.idx1) {
-; CHECK-LABEL: define void @inboundsNonNegativeTypeShrink(
+define void @inboundsNonNegativeType_i16i8(ptr %in.ptr, i32 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegativeType_i16i8(
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 2048
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 1024
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -67,14 +67,14 @@ entry:
   ret void
 }
 
-define void @inboundsNonNegativeTypeExpand(ptr %in.ptr, i32 %in.idx1) {
-; CHECK-LABEL: define void @inboundsNonNegativeTypeExpand(
+define void @inboundsNonNegative_i8i16(ptr %in.ptr, i32 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegative_i8i16(
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 512
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 1024
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
index 72347ddbea2a4..c8465f4cf975d 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep.ll
@@ -54,18 +54,18 @@ entry:
 }
 
 
-define void @reorder_expand(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: define void @reorder_expand(
+define void @reorder_i8half(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @reorder_i8half(
 ; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i8, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 128
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 256
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i32 256
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 512
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i32 384
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 768
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -80,18 +80,18 @@ entry:
   ret void
 }
 
-define void @reorder_shrink(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: define void @reorder_shrink(
+define void @reorder_i64half(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @reorder_i64half(
 ; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE:%.*]] = getelementptr i64, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 1024
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr addrspace(3) [[TMP0]], i32 256
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i32 2048
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr addrspace(3) [[TMP2]], i32 512
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i32 3072
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr addrspace(3) [[TMP4]], i32 768
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -106,18 +106,18 @@ entry:
   ret void
 }
 
-define void @reorder_shrink2(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
-; CHECK-LABEL: define void @reorder_shrink2(
+define void @reorder_halfi8(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
+; CHECK-LABEL: define void @reorder_halfi8(
 ; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i32 [[IN_IDX0:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 512
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 256
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1024
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr half, ptr addrspace(3) [[TMP2]], i32 512
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 1536
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP4]], i32 768
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -140,12 +140,12 @@ define void @bad_index(ptr addrspace(3) %in.ptr, i32 %in.idx0, i32 %in.idx1) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BASE:%.*]] = getelementptr half, ptr addrspace(3) [[IN_PTR]], i32 [[IN_IDX0]]
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[CONST1:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 1
-; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr half, ptr addrspace(3) [[CONST1]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr half, ptr addrspace(3) [[BASE]], i32 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr half, ptr addrspace(3) [[TMP0]], i32 1
-; CHECK-NEXT:    [[CONST3:%.*]] = getelementptr i8, ptr addrspace(3) [[BASE]], i32 3
-; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr half, ptr addrspace(3) [[CONST3]], i32 [[IN_IDX1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 3
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
index d43987accad78..43dda1ae15176 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lower-gep-reorder.ll
@@ -8,13 +8,13 @@ define protected amdgpu_kernel void @sink_addr(ptr %in.ptr, i64 %in.idx0, i64 %i
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[IN_IDX1]]
-; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i64, ptr [[TMP3]], i64 256
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr i8, ptr [[TMP3]], i64 2048
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[IN_IDX1]]
-; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i64, ptr [[TMP4]], i64 512
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr i8, ptr [[TMP4]], i64 4096
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [8192 x i64], ptr [[IN_PTR]], i64 [[IN_IDX0]], i64 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP2]], i64 [[IN_IDX1]]
-; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i64, ptr [[TMP7]], i64 768
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr i8, ptr [[TMP7]], i64 6144
 ; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 [[IN_IDX0]], 0
 ; CHECK-NEXT:    br i1 [[CMP0]], label [[BB_1:%.*]], label [[END:%.*]]
 ; CHECK:       bb.1:

>From a8ee263615444eb0f6d458f10556a05a1aa6dcee Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 May 2024 11:40:26 -0700
Subject: [PATCH 5/7] Add Tests

Change-Id: I301582a8a06dbef0184b9fcfa22cb7992e5bb9e2
---
 .../AMDGPU/reorder-gep-inbounds.ll            | 212 +++++++++++++++---
 1 file changed, 177 insertions(+), 35 deletions(-)

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index e3511fc81f5a1..ff38f0a70ef4e 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -1,28 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=separate-const-offset-from-gep < %s | FileCheck %s
 
-define void @inboundsPossiblyNegative(ptr %in.ptr, i32 %in.idx1) {
+define void @inboundsPossiblyNegative(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @inboundsPossiblyNegative(
-; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i32 1
-  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i32 %in.idx1
+  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i64 1
+  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i64 %in.idx1
   ret void
 }
 
-define void @inboundsNonNegative(ptr %in.ptr, i32 %in.idx1) {
-; CHECK-LABEL: define void @inboundsNonNegative(
+define void @inboundsNonNegative_nonCanonical(ptr %in.ptr, i32 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegative_nonCanonical(
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[IN_IDX1_NNEG1:%.*]] = and i32 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = sext i32 [[IN_IDX1_NNEG1]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i8>, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    ret void
 ;
@@ -33,53 +32,196 @@ entry:
   ret void
 }
 
-define void @inboundsNonchained(ptr %in.ptr, i32 %in.idx1) {
+define void @inboundsNonNegative(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonNegative(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i8>, ptr [[TMP0]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i64 1
+  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonchained(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @inboundsNonchained(
-; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %in.idx1.nneg = and i32 %in.idx1, 2147483647
-  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i32 1
-  %idx1 = getelementptr <2 x i8>, ptr %const1, i32 %in.idx1.nneg
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds <2 x i8>, ptr %in.ptr, i64 1
+  %idx1 = getelementptr <2 x i8>, ptr %const1, i64 %in.idx1.nneg
   ret void
 }
 
-define void @inboundsNonNegativeType_i16i8(ptr %in.ptr, i32 %in.idx1) {
+define void @inboundsNonNegativeType_i16i8(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @inboundsNonNegativeType_i16i8(
-; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 1024
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 1024
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %in.idx1.nneg = and i32 %in.idx1, 2147483647
-  %const1 = getelementptr inbounds i16, ptr %in.ptr, i32 1024
-  %idx1 = getelementptr inbounds i8, ptr %const1, i32 %in.idx1.nneg
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i16, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i8, ptr %const1, i64 %in.idx1.nneg
   ret void
 }
 
-define void @inboundsNonNegative_i8i16(ptr %in.ptr, i32 %in.idx1) {
+define void @inboundsNonNegative_i8i16(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @inboundsNonNegative_i8i16(
-; CHECK-SAME: ptr [[IN_PTR:%.*]], i32 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i32 [[IN_IDX1]], 2147483647
-; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IN_IDX1_NNEG]] to i64
+; CHECK-NEXT:    [[IDXPROM:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[IN_PTR]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 1024
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1024
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %in.idx1.nneg = and i32 %in.idx1, 2147483647
-  %const1 = getelementptr inbounds i8, ptr %in.ptr, i32 1024
-  %idx1 = getelementptr inbounds i16, ptr %const1, i32 %in.idx1.nneg
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i16, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonchained_first(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonchained_first(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr i32, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @inboundsNonchained_second(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @inboundsNonchained_second(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i64, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i64, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @notInbounds(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @notInbounds(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr i8, ptr %in.ptr, i64 1024
+  %idx1 = getelementptr i128, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType1(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @badVectorType(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i8>, ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x i8>, ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds <2 x i8>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType2(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @badVectorType2(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <4 x half>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i8>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x i8>, ptr %in.ptr, i32 1
+  %idx1 = getelementptr inbounds <4 x half>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @addrspace1(ptr addrspace(1) %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @addrspace1(
+; CHECK-SAME: ptr addrspace(1) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr addrspace(1) %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i128, ptr addrspace(1) %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @addrspace3(ptr addrspace(3) %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @addrspace3(
+; CHECK-SAME: ptr addrspace(3) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX1_NNEG]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr addrspace(3) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr addrspace(3) %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i128, ptr addrspace(3) %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @addrspace7(ptr addrspace(7) %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @addrspace7(
+; CHECK-SAME: ptr addrspace(7) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 9223372036854775807
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 [[IN_IDX1_NNEG]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i128, ptr addrspace(7) [[IN_PTR]], i32 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr addrspace(7) [[TMP0]], i64 1024
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 9223372036854775807
+  %const1 = getelementptr inbounds i8, ptr addrspace(7) %in.ptr, i64 1024
+  %idx1 = getelementptr inbounds i128, ptr addrspace(7) %const1, i64 %in.idx1.nneg
   ret void
 }

>From b371d0bf58e935ca0244a4f98cf03c5c144f4400 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 May 2024 19:01:15 -0700
Subject: [PATCH 6/7] Allow ptr source type

Change-Id: I66bdfcfd7ce3d53c28b4439bd8ebd65905574560
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     |  6 --
 .../AMDGPU/reorder-gep-inbounds.ll            | 87 ++++++++++++++++++-
 2 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index af0c41df18d2a..93752a345daab 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -985,12 +985,6 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
   if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
     return false;
 
-  bool GEPIsPtr = GEPType->getScalarType()->isPtrOrPtrVectorTy();
-  bool PtrGEPIsPtr = PtrGEPType->getScalarType()->isPtrOrPtrVectorTy();
-
-  if (GEPIsPtr != PtrGEPIsPtr)
-    return false;
-
   bool NestedNeedsExtraction;
   int64_t NestedByteOffset =
       accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index ff38f0a70ef4e..5df016e2eea70 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -6,7 +6,7 @@ define void @inboundsPossiblyNegative(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <2 x i8>, ptr [[IN_PTR]], i64 [[IN_IDX1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <2 x i8>, ptr [[TMP0]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -145,7 +145,7 @@ entry:
 }
 
 define void @vectorType1(ptr %in.ptr, i64 %in.idx1) {
-; CHECK-LABEL: define void @badVectorType(
+; CHECK-LABEL: define void @vectorType1(
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
@@ -161,7 +161,7 @@ entry:
 }
 
 define void @vectorType2(ptr %in.ptr, i64 %in.idx1) {
-; CHECK-LABEL: define void @badVectorType2(
+; CHECK-LABEL: define void @vectorType2(
 ; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
@@ -176,6 +176,87 @@ entry:
   ret void
 }
 
+define void @vectorType3(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @vectorType3(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x ptr>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x ptr>, ptr %in.ptr, i32 1
+  %idx1 = getelementptr inbounds ptr, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @vectorType4(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @vectorType4(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x ptr addrspace(1)>, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x ptr>, ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x ptr>, ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds <8 x ptr addrspace(1)>, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+
+define void @ptrType(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @ptrType(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x ptr>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds <4 x ptr>, ptr %in.ptr, i32 1
+  %idx1 = getelementptr inbounds ptr, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @ptrType2(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @ptrType2(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr addrspace(3), ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds ptr addrspace(3), ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds i64, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
+define void @ptrType3(ptr %in.ptr, i64 %in.idx1) {
+; CHECK-LABEL: define void @ptrType3(
+; CHECK-SAME: ptr [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr addrspace(7), ptr [[TMP0]], i32 3
+; CHECK-NEXT:    ret void
+;
+entry:
+  %in.idx1.nneg = and i64 %in.idx1, 2147483647
+  %const1 = getelementptr inbounds ptr addrspace(7), ptr %in.ptr, i32 3
+  %idx1 = getelementptr inbounds i16, ptr %const1, i64 %in.idx1.nneg
+  ret void
+}
+
 define void @addrspace1(ptr addrspace(1) %in.ptr, i64 %in.idx1) {
 ; CHECK-LABEL: define void @addrspace1(
 ; CHECK-SAME: ptr addrspace(1) [[IN_PTR:%.*]], i64 [[IN_IDX1:%.*]]) #[[ATTR0]] {

>From fe698dde150d7dd4a733be6ada055656ebafbced Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 7 May 2024 19:04:26 -0700
Subject: [PATCH 7/7] remove redundant test

Change-Id: I8fdfcb81082fa2e868bae101eef40237a21d8e37
---
 .../SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
index 5df016e2eea70..16e47f057babc 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/reorder-gep-inbounds.ll
@@ -215,12 +215,12 @@ define void @ptrType(ptr %in.ptr, i64 %in.idx1) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IN_IDX1_NNEG:%.*]] = and i64 [[IN_IDX1]], 2147483647
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[IN_PTR]], i64 [[IN_IDX1_NNEG]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x ptr>, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr addrspace(2), ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %in.idx1.nneg = and i64 %in.idx1, 2147483647
-  %const1 = getelementptr inbounds <4 x ptr>, ptr %in.ptr, i32 1
+  %const1 = getelementptr inbounds ptr addrspace(2), ptr %in.ptr, i32 1
   %idx1 = getelementptr inbounds ptr, ptr %const1, i64 %in.idx1.nneg
   ret void
 }



More information about the llvm-commits mailing list