[PATCH] D147079: InlineSpiller: Consider if all subranges are the same when avoiding redundant spills
Jay Foad via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 2 03:35:47 PDT 2023
foad added a comment.
This patch is causing compilation to never terminate (infinite compile time) on a number of graphics shaders. Can you please fix or revert it?
Test case with `llc -march=amdgcn -mcpu=gfx900`:
; Reduced reproducer for the compile-time hang reported against D147079.
; Run with: llc -march=amdgcn -mcpu=gfx900
;
; Shape of the function:
;   bb     - defines many vector and scalar values, then branches on %i != 0.
;   bb84   - branches on (%i72 one 0.0) to bb85 or bb102.
;   bb85   - self-loop with no exit edge.
;   bb102  - the only block that returns.
;   bb111  - preheader for bb122.
;   bb122  - self-loop with no exit edge.
; Most values defined in bb are only consumed in bb85, bb102 or bb122, so they
; stay live across the branches. NOTE(review): presumably this is what creates
; enough register pressure to trigger the spilling path touched by the patch;
; confirm against the register allocator debug output.
define amdgpu_cs void @main(i32 %i, float %i72) {
bb:
; Build a constant-address-space (addrspace(4)) base pointer: take bits 63:32
; of the program counter, place them in lane 1 of a <2 x i32> whose lane 0 is
; zero, and reinterpret the pair as a 64-bit pointer.
%i12 = call i64 @llvm.amdgcn.s.getpc()
%i2 = lshr i64 %i12, 32
%i3 = trunc i64 %i2 to i32
%i4 = insertelement <2 x i32> zeroinitializer, i32 %i3, i64 1
%i5 = bitcast <2 x i32> %i4 to i64
%i6 = inttoptr i64 %i5 to ptr addrspace(4)
; Load <4 x i32> and <8 x i32> values at fixed byte offsets from %i6. They are
; later passed as the <4 x i32> / <8 x i32> operands of the image.sample calls.
%i7 = getelementptr i8, ptr addrspace(4) %i6, i64 48
%i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
%i9 = getelementptr i8, ptr addrspace(4) %i6, i64 64
%i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16
%i11 = getelementptr i8, ptr addrspace(4) %i6, i64 240
%i123 = load <8 x i32>, ptr addrspace(4) %i11, align 32
%i13 = getelementptr i8, ptr addrspace(4) %i6, i64 272
%i14 = load <8 x i32>, ptr addrspace(4) %i13, align 32
%i15 = getelementptr i8, ptr addrspace(4) %i6, i64 304
%i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
%i17 = getelementptr i8, ptr addrspace(4) %i6, i64 336
%i18 = load <8 x i32>, ptr addrspace(4) %i17, align 32
; Five s.buffer.load calls at different offsets; lane 0 of each result is kept
; as a float (%i21, %i29, %i32, %i41, %i45). All five are used in bb122, and
; %i32 is additionally used below in this block.
%i19 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> zeroinitializer, i32 0, i32 0)
%i20 = bitcast <4 x i32> %i19 to <4 x float>
%i21 = extractelement <4 x float> %i20, i64 0
%i27 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> zeroinitializer, i32 128, i32 0)
%i28 = bitcast <4 x i32> %i27 to <4 x float>
%i29 = extractelement <4 x float> %i28, i64 0
%i30 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> zeroinitializer, i32 144, i32 0)
%i31 = bitcast <4 x i32> %i30 to <4 x float>
%i32 = extractelement <4 x float> %i31, i64 0
%i39 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> zeroinitializer, i32 1, i32 0)
%i40 = bitcast <4 x i32> %i39 to <4 x float>
%i41 = extractelement <4 x float> %i40, i64 0
%i42 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> zeroinitializer, i32 208, i32 0)
%i43 = bitcast <4 x i32> %i42 to <4 x float>
%i45 = extractelement <4 x float> %i43, i64 0
; More <8 x i32> loads from %i6; %i53 and %i57 are consumed only in bb85,
; %i55 and %i59 only in bb122.
%i52 = getelementptr i8, ptr addrspace(4) %i6, i64 496
%i53 = load <8 x i32>, ptr addrspace(4) %i52, align 32
%i54 = getelementptr i8, ptr addrspace(4) %i6, i64 528
%i55 = load <8 x i32>, ptr addrspace(4) %i54, align 32
%i56 = getelementptr i8, ptr addrspace(4) %i6, i64 752
%i57 = load <8 x i32>, ptr addrspace(4) %i56, align 32
%i58 = getelementptr i8, ptr addrspace(4) %i6, i64 784
%i59 = load <8 x i32>, ptr addrspace(4) %i58, align 32
; This <4 x i32> load is used as data, not as a sample operand: lane 0 is
; reinterpreted as a float and scaled by %i32 to form a sample coordinate.
%i60 = getelementptr i8, ptr addrspace(4) %i6, i64 944
%i61 = load <4 x i32>, ptr addrspace(4) %i60, align 16
%i67 = bitcast <4 x i32> %i61 to <4 x float>
%i68 = extractelement <4 x float> %i67, i64 0
%i69 = fmul float %i68, %i32
; %i78 = fmed3 of two sampled values and 0.0; it is consumed in both loops
; (bb85 and bb122).
%i726 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float %i69, float 0.000000e+00, <8 x i32> %i14, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 false, i32 0, i32 0)
%i76 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i16, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i77 = extractelement <4 x float> %i76, i64 0
%i78 = call float @llvm.amdgcn.fmed3.f32(float %i77, float %i726, float 0.000000e+00)
; %i80 is consumed only in bb102.
%i79 = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i18, <4 x i32> %i8, i1 false, i32 0, i32 0)
%i80 = extractelement <4 x float> %i79, i64 0
; Branch conditions: %i81 selects bb85 vs bb102 (inside bb84); %i83 selects
; bb84 vs bb111.
%i81 = fcmp one float %i72, 0.000000e+00
%i83 = icmp ne i32 %i, 0
br i1 %i83, label %bb84, label %bb111
bb84: ; preds = %bb
br i1 %i81, label %bb85, label %bb102
bb85: ; preds = %bb85, %bb84
; Self-loop with no exit. Each iteration samples with %i53 and %i57, scales
; the difference by %i78, and feeds the result back through the %i86 phi.
%i86 = phi float [ %i101, %bb85 ], [ 0.000000e+00, %bb84 ]
%i87 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i53, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i88 = extractelement <2 x float> %i87, i64 0
%i89 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i57, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i90 = extractelement <2 x float> %i89, i64 0
%i91 = fsub float %i90, %i88
%i95 = fmul float %i91, %i78
%i99 = fadd float %i86, %i95
%i101 = fsub float %i86, %i99
br label %bb85
bb102: ; preds = %bb84
; The only returning path: sample with %i123 and store it in lane 0 of a
; <3 x i32>, then store %i80 (computed in bb) in lane 0 of a <4 x i32>.
%i103 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i123, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i105 = bitcast float %i103 to i32
%i106 = insertelement <3 x i32> zeroinitializer, i32 %i105, i64 0
call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %i106, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
%i109 = bitcast float %i80 to i32
%i110 = insertelement <4 x i32> zeroinitializer, i32 %i109, i64 0
call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i110, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
ret void
bb111: ; preds = %bb
; Preheader of bb122: one sample (using %i14) whose result is used as a
; multiplier inside the loop.
%i112 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %i14, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
br label %bb122
bb122: ; preds = %bb122, %bb111
; Self-loop with no exit. Uses the scalars %i21, %i45, %i29, %i41, %i32 and
; the vectors %i55, %i10, %i59 from bb, plus %i78 and %i112, and feeds the
; result back through the %i1237 phi.
%i1237 = phi float [ 0.000000e+00, %bb111 ], [ %i162, %bb122 ]
%i125 = fmul float %i1237, %i21
%i133 = fmul float %i45, %i125
%i135 = fmul float %i133, %i29
%i136 = fadd float %i135, %i41
%i137 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float %i136, float 0.000000e+00, <8 x i32> %i55, <4 x i32> %i10, i1 false, i32 0, i32 0)
%i138 = extractelement <2 x float> %i137, i64 0
%i140 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float %i1237, float 0.000000e+00, <8 x i32> %i59, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
%i141 = extractelement <2 x float> %i140, i64 0
%i142 = fsub float %i141, %i138
%i145 = fmul float %i142, %i32
%i147 = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> zeroinitializer, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 false, i32 0, i32 0)
%i148 = extractelement <2 x float> %i147, i64 0
%i157 = fadd float %i148, %i145
%i158 = fmul float %i157, %i78
%i162 = fmul float %i158, %i112
br label %bb122
}
; Declarations of the AMDGPU intrinsics called from @main.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.fmed3.f32(float, float, float)
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.amdgcn.s.getpc()
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
declare float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
declare <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg)
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D147079/new/
https://reviews.llvm.org/D147079
More information about the llvm-commits mailing list