[llvm] c2654be - [SeparateConstOffsetFromGEP] Regenerate test checks (NFC)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 10 02:44:13 PST 2024
Author: Nikita Popov
Date: 2024-01-10T11:43:50+01:00
New Revision: c2654befcaecba121ca40415d157925e0da05b5e
URL: https://github.com/llvm/llvm-project/commit/c2654befcaecba121ca40415d157925e0da05b5e
DIFF: https://github.com/llvm/llvm-project/commit/c2654befcaecba121ca40415d157925e0da05b5e.diff
LOG: [SeparateConstOffsetFromGEP] Regenerate test checks (NFC)
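
These tests previously used hand-written "; IR:" patterns; they are now
regenerated with the usual update script, which replaces them with exact
"; IR-NEXT:" lines. A sketch of the invocation (the UTC_ARGS note embedded
in each test records "--version 4", so later reruns keep the same output
style):

    llvm/utils/update_test_checks.py --version 4 \
        llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll

That is why the diff below is large even though the change is NFC.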
Added:
Modified:
llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 5cb8cbd05a7aef..2cd7fdfce35eb3 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -1,15 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -mtriple=amdgcn-- -S -passes=separate-const-offset-from-gep,gvn -reassociate-geps-verify-no-dead-code < %s | FileCheck -check-prefix=IR %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
@array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4
-; IR-LABEL: @sum_of_array(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 33
define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
+; IR-LABEL: define amdgpu_kernel void @sum_of_array(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
+; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]]
+; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 1
+; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 32
+; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 33
+; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
%tmp = sext i32 %y to i64
%tmp1 = sext i32 %x to i64
%tmp2 = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %tmp1, i64 %tmp
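
For readers new to this pass: separate-const-offset-from-gep peels the
constant part of an index off into a trailing flat GEP, so that gvn can then
CSE the shared base across the four loads. A minimal sketch of the shape the
checks above pin down, with hypothetical value names (%i.s and %j.s stand for
the sign-extended indices):

    ; before: the +1 is folded into the sign-extended index
    %j1   = add i32 %y, 1
    %j1.s = sext i32 %j1 to i64
    %p    = getelementptr inbounds [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %i.s, i64 %j1.s

    ; after: shared base plus a constant offset in float units
    %base = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 %i.s, i64 %j.s
    %p    = getelementptr inbounds float, ptr addrspace(4) %base, i64 1

A +1 on the outer index becomes "i64 32" because each row holds 32 floats.
Note the base GEP loses its "inbounds"; the TODO in the NVPTX test below
calls out that GVN cannot yet preserve it.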
@@ -36,13 +43,22 @@ define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocaptu
; Some of the indices go over the maximum mubuf offset, so don't split them.
-; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(4) [[BASE_PTR]], i64 255
-; IR: add i32 %x, 256
-; IR: getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
+; IR-LABEL: define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
+; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]]
+; IR-NEXT: [[TMP6:%.*]] = add i32 [[Y]], 255
+; IR-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 255
+; IR-NEXT: [[TMP12:%.*]] = add i32 [[X]], 256
+; IR-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]]
+; IR-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP7]]
+; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
%tmp = sext i32 %y to i64
%tmp1 = sext i32 %x to i64
%tmp2 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 %tmp1, i64 %tmp
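
The asymmetry in this function comes from the MUBUF addressing mode's 12-bit
immediate offset (at most 4095 bytes). The "+255 floats" access is 1020 bytes
past the base, so it is split into the "i64 255" GEP above, while "+256 rows"
of [4 x float] is 256 * 16 = 4096 bytes, one byte too far, so those GEPs are
left whole.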
@@ -69,12 +85,24 @@ define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, pt
@lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4
; DS instructions have a larger immediate offset, so make sure these are OK.
-; IR-LABEL: @sum_of_lds_array_over_max_mubuf_offset(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %{{[a-zA-Z0-9]+}}, i32 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i32 255
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i32 16128
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i32 16383
define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, ptr addrspace(1) nocapture %output) {
+; IR-LABEL: define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr addrspace(1) nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]]
+; IR-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
+; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
+; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i32 255
+; IR-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4
+; IR-NEXT: [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]]
+; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i32 16128
+; IR-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4
+; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]]
+; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i32 16383
+; IR-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4
+; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]]
+; IR-NEXT: store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
%tmp2 = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 %x, i32 %y
%tmp4 = load float, ptr addrspace(3) %tmp2, align 4
%tmp5 = fadd float %tmp4, 0.000000e+00
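
In contrast to the MUBUF case, DS instructions take a 16-bit byte offset, so
splitting stays profitable much further out: even the "+16383 floats" access
is 16383 * 4 = 65532 bytes, still in range, and all three constant offsets
are peeled off the shared base.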
@@ -93,11 +121,35 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y
ret void
}
-; IR-LABEL: @keep_metadata(
-; IR: getelementptr {{.*}} !amdgpu.uniform
-; IR: getelementptr {{.*}} !amdgpu.uniform
-; IR: getelementptr {{.*}} !amdgpu.uniform
define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+; IR-LABEL: define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata(
+; IR-SAME: ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP0:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP1:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP2:%.*]], ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615) [[TMP3:%.*]], float inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], <2 x i32> [[TMP6:%.*]], <2 x i32> [[TMP7:%.*]], <2 x i32> [[TMP8:%.*]], <3 x i32> [[TMP9:%.*]], <2 x i32> [[TMP10:%.*]], <2 x i32> [[TMP11:%.*]], <2 x i32> [[TMP12:%.*]], float [[TMP13:%.*]], float [[TMP14:%.*]], float [[TMP15:%.*]], float [[TMP16:%.*]], float [[TMP17:%.*]], i32 [[TMP18:%.*]], i32 [[TMP19:%.*]], float [[TMP20:%.*]], i32 [[TMP21:%.*]]) #[[ATTR0:[0-9]+]] {
+; IR-NEXT: main_body:
+; IR-NEXT: [[TMP22:%.*]] = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 [[TMP5]]) #[[ATTR3:[0-9]+]]
+; IR-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; IR-NEXT: [[TMP24:%.*]] = shl i32 [[TMP23]], 1
+; IR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP24]] to i64
+; IR-NEXT: [[TMP25:%.*]] = getelementptr [0 x <8 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[IDXPROM1]], !amdgpu.uniform [[META0:![0-9]+]]
+; IR-NEXT: [[TMP26:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP25]], align 32, !invariant.load [[META0]]
+; IR-NEXT: [[TMP27:%.*]] = shl i32 [[TMP23]], 2
+; IR-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; IR-NEXT: [[TMP29:%.*]] = getelementptr [0 x <4 x i32>], ptr addrspace(4) [[TMP1]], i64 0, i64 [[TMP28]], !amdgpu.uniform [[META0]]
+; IR-NEXT: [[TMP30:%.*]] = getelementptr <4 x i32>, ptr addrspace(4) [[TMP29]], i64 3, !amdgpu.uniform [[META0]]
+; IR-NEXT: [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP30]], align 16, !invariant.load [[META0]]
+; IR-NEXT: [[TMP32:%.*]] = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> [[TMP26]], <4 x i32> [[TMP31]], i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #[[ATTR3]]
+; IR-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP32]], i32 0
+; IR-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP32]], i32 1
+; IR-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP32]], i32 2
+; IR-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP32]], i32 3
+; IR-NEXT: [[TMP37:%.*]] = bitcast float [[TMP4]] to i32
+; IR-NEXT: [[TMP38:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 [[TMP37]], 4
+; IR-NEXT: [[TMP39:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP38]], float [[TMP33]], 5
+; IR-NEXT: [[TMP40:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP39]], float [[TMP34]], 6
+; IR-NEXT: [[TMP41:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP40]], float [[TMP35]], 7
+; IR-NEXT: [[TMP42:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP41]], float [[TMP36]], 8
+; IR-NEXT: [[TMP43:%.*]] = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP42]], float [[TMP20]], 19
+; IR-NEXT: ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> [[TMP43]]
+;
main_body:
%22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
%23 = bitcast float %22 to i32
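
The point of @keep_metadata is that rewritten GEPs keep their metadata: note
!amdgpu.uniform surviving on both the variable part and the split constant
part, roughly

    %p = getelementptr <4 x i32>, ptr addrspace(4) %base, i64 3, !amdgpu.uniform !0

(hypothetical names). The autogenerated checks now also pin down the metadata
node itself via the "[[META0]] = !{}" line at the end of the file.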
@@ -136,3 +188,6 @@ attributes #5 = { "InitialPSInputAddr"="45175" }
attributes #6 = { nounwind readnone speculatable }
attributes #7 = { nounwind readonly }
attributes #8 = { nounwind readnone }
+;.
+; IR: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
index 5652b6657b5361..1391cb4e7b4971 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -mtriple=nvptx64-nvidia-cuda -S -passes=separate-const-offset-from-gep,gvn \
; RUN: -reassociate-geps-verify-no-dead-code \
; RUN: | FileCheck %s --check-prefix=IR
@@ -18,6 +19,30 @@
@array = internal addrspace(3) global [32 x [32 x float]] zeroinitializer, align 4
define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) {
+; IR-LABEL: define void @sum_of_array(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: .preheader:
+; IR-NEXT: [[TMP0:%.*]] = sext i32 [[Y]] to i64
+; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]]
+; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr
+; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
+; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
+; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 1
+; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr
+; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
+; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]]
+; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 32
+; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr
+; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4
+; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 33
+; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr
+; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
+; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]]
+; IR-NEXT: store float [[TMP17]], ptr [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
.preheader:
%0 = sext i32 %y to i64
%1 = sext i32 %x to i64
@@ -50,13 +75,8 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) {
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
-; IR-LABEL: @sum_of_array(
; TODO: GVN is unable to preserve the "inbounds" keyword on the first GEP. Need
; some infrastructure changes to enable such optimizations.
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 33
; @sum_of_array2 is very similar to @sum_of_array. The only difference is in
; the order of "sext" and "add" when computing the array indices. @sum_of_array
@@ -65,6 +85,30 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) {
; e.g., array[sext(x) + 1][sext(y) + 1]. SeparateConstOffsetFromGEP should be
; able to extract constant offsets from both forms.
define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) {
+; IR-LABEL: define void @sum_of_array2(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: .preheader:
+; IR-NEXT: [[TMP0:%.*]] = sext i32 [[Y]] to i64
+; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]]
+; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr
+; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
+; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
+; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 1
+; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr
+; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
+; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]]
+; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 32
+; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr
+; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4
+; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 33
+; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr
+; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
+; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]]
+; IR-NEXT: store float [[TMP17]], ptr [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
.preheader:
%0 = sext i32 %y to i64
%1 = sext i32 %x to i64
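
The two index forms the comment above refers to, with hypothetical names:

    ; form 1: add in i32, then sext
    %i   = add i32 %x, 1
    %i.s = sext i32 %i to i64

    ; form 2: sext, then add in i64
    %x.s = sext i32 %x to i64
    %i.s = add i64 %x.s, 1

As the identical autogenerated blocks for @sum_of_array and @sum_of_array2
show, the pass extracts the constant from either form, so both functions end
up with the same shared base and the same "i64 1"/"i64 32"/"i64 33" offsets.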
@@ -95,11 +139,6 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) {
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
-; IR-LABEL: @sum_of_array2(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 33
; This function loads
@@ -113,6 +152,30 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) {
; 2) annotates the addition with "nuw"; otherwise, zext(x + 1) => zext(x) + 1
; may be invalid.
define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) {
+; IR-LABEL: define void @sum_of_array3(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: .preheader:
+; IR-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64
+; IR-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]]
+; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr
+; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
+; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
+; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 1
+; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr
+; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
+; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]]
+; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 32
+; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr
+; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4
+; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 33
+; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr
+; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
+; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]]
+; IR-NEXT: store float [[TMP17]], ptr [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
.preheader:
%0 = zext i32 %y to i64
%1 = zext i32 %x to i64
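
The "nuw" requirement mentioned in the comment above is what makes this
rewrite sound. A concrete counterexample without it, taking %x = -1
(0xffffffff):

    %i   = add i32 %x, 1        ; wraps to 0
    %z1  = zext i32 %i to i64   ; 0
    %x.z = zext i32 %x to i64   ; 0xffffffff
    %z2  = add i64 %x.z, 1      ; 0x100000000, not equal to %z1

With nuw on the i32 add, wrapping would be poison, so zext(x + 1) == zext(x) + 1
may be assumed and the constant can be hoisted past the zext.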
@@ -145,11 +208,6 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) {
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
-; IR-LABEL: @sum_of_array3(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 33
; This function loads
@@ -161,6 +219,30 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) {
; We expect the generated code to reuse the computation of
; &array[zext(x)][zext(y)]. See the expected IR and PTX for details.
define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) {
+; IR-LABEL: define void @sum_of_array4(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture [[OUTPUT:%.*]]) {
+; IR-NEXT: .preheader:
+; IR-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64
+; IR-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; IR-NEXT: [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]]
+; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr
+; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
+; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
+; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 1
+; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr
+; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
+; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]]
+; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 32
+; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr
+; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4
+; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[TMP2]], i64 33
+; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr
+; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
+; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]]
+; IR-NEXT: store float [[TMP17]], ptr [[OUTPUT]], align 4
+; IR-NEXT: ret void
+;
.preheader:
%0 = zext i32 %y to i64
%1 = zext i32 %x to i64
@@ -191,11 +273,6 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) {
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
-; IR-LABEL: @sum_of_array4(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, ptr addrspace(3) [[BASE_PTR]], i64 33
; The source code is:
@@ -211,7 +288,19 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) {
; p0 = &input[sext(x + y)];
; p1 = &p0[5];
define void @reunion(i32 %x, i32 %y, ptr %input) {
-; IR-LABEL: @reunion(
+; IR-LABEL: define void @reunion(
+; IR-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[INPUT:%.*]]) {
+; IR-NEXT: entry:
+; IR-NEXT: [[XY:%.*]] = add nsw i32 [[X]], [[Y]]
+; IR-NEXT: [[TMP0:%.*]] = sext i32 [[XY]] to i64
+; IR-NEXT: [[P0:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP0]]
+; IR-NEXT: [[V0:%.*]] = load float, ptr [[P0]], align 4
+; IR-NEXT: call void @use(float [[V0]])
+; IR-NEXT: [[P13:%.*]] = getelementptr inbounds float, ptr [[P0]], i64 5
+; IR-NEXT: [[V1:%.*]] = load float, ptr [[P13]], align 4
+; IR-NEXT: call void @use(float [[V1]])
+; IR-NEXT: ret void
+;
; PTX-LABEL: reunion(
entry:
%xy = add nsw i32 %x, %y
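
The check worth noticing here is [[P13]]: %p1 was written as
&input[sext(x + y + 5)], and the pass re-expresses it as %p0 plus a constant
five floats, which the PTX check below confirms by folding the offset into
the load as a +20 byte immediate.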
@@ -225,7 +314,6 @@ entry:
%xy5 = add nsw i32 %x, %y5
%1 = sext i32 %xy5 to i64
%p1 = getelementptr inbounds float, ptr %input, i64 %1
-; IR: getelementptr inbounds float, ptr %p0, i64 5
%v1 = load float, ptr %p1, align 4
; PTX: ld.f32 %f{{[0-9]+}}, [[[p0]]+20]
call void @use(float %v1)