[llvm] [VPlan] Introduce CSE pass (PR #151872)
Adrian Kuegel via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 02:44:01 PDT 2025
akuegel wrote:
@artagnon it looks like there might be a problem with the new CSE pass. We tracked down a test failure to this revision, and I dumped the function after the pass, both with CSE enabled and with CSE disabled. Here are the two dumps:
With CSE enabled:
```
define noalias noundef ptr @transpose_copy_fusion(ptr readonly captures(none) %0) local_unnamed_addr #0 {
%2 = getelementptr inbounds nuw i8, ptr %0, i64 24
%3 = load ptr, ptr %2, align 8, !invariant.load !3
%4 = load ptr, ptr %3, align 8, !invariant.load !3, !dereferenceable !4
%5 = getelementptr inbounds nuw i8, ptr %3, i64 16
%6 = load ptr, ptr %5, align 8, !invariant.load !3, !dereferenceable !4
tail call void @llvm.experimental.noalias.scope.decl(metadata !5)
tail call void @llvm.experimental.noalias.scope.decl(metadata !8)
br i1 false, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %1
br label %vector.body
vector.body: ; preds = %vector.ph
%7 = getelementptr i8, ptr %4, i64 12
%8 = getelementptr float, ptr %7, i32 0
%9 = getelementptr float, ptr %8, i32 -3
%wide.load = load <4 x float>, ptr %9, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse = shufflevector <4 x float> %wide.load, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%10 = getelementptr float, ptr %4, i64 4
%11 = getelementptr i8, ptr %10, i64 12
%12 = getelementptr float, ptr %11, i32 0
%13 = getelementptr float, ptr %12, i32 -3
%wide.load1 = load <4 x float>, ptr %13, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse2 = shufflevector <4 x float> %wide.load1, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%14 = getelementptr float, ptr %4, i64 8
%15 = getelementptr i8, ptr %14, i64 12
%16 = getelementptr float, ptr %15, i32 0
%17 = getelementptr float, ptr %16, i32 -3
%wide.load3 = load <4 x float>, ptr %17, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse4 = shufflevector <4 x float> %wide.load3, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%18 = getelementptr float, ptr %4, i64 12
%19 = getelementptr i8, ptr %18, i64 12
%20 = getelementptr float, ptr %19, i32 0
%21 = getelementptr float, ptr %20, i32 -3
%wide.load5 = load <4 x float>, ptr %21, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse6 = shufflevector <4 x float> %wide.load5, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%22 = shufflevector <4 x float> %reverse, <4 x float> %reverse2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%23 = shufflevector <4 x float> %reverse4, <4 x float> %reverse6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%24 = shufflevector <8 x float> %22, <8 x float> %23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x float> %24, <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
store <16 x float> %interleaved.vec, ptr %6, align 4, !alias.scope !8, !noalias !5
br label %middle.block
middle.block: ; preds = %vector.body
br label %transpose_copy_fusion_wrapped.exit
scalar.ph: ; preds = %1
br label %.preheader
.preheader: ; preds = %scalar.ph, %.preheader
%25 = phi i64 [ 0, %scalar.ph ], [ %46, %.preheader ]
%.idx = shl i64 %25, 4
%26 = getelementptr i8, ptr %6, i64 %.idx
%27 = sub nsw i64 0, %25
%28 = getelementptr float, ptr %4, i64 %27
%29 = getelementptr i8, ptr %28, i64 12
%30 = load float, ptr %29, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
store float %30, ptr %26, align 4, !alias.scope !8, !noalias !5
%31 = sub nuw nsw i64 4, %25
%32 = getelementptr float, ptr %4, i64 %31
%33 = getelementptr i8, ptr %32, i64 12
%34 = load float, ptr %33, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%35 = getelementptr i8, ptr %26, i64 4
store float %34, ptr %35, align 4, !alias.scope !8, !noalias !5
%36 = sub nuw nsw i64 8, %25
%37 = getelementptr float, ptr %4, i64 %36
%38 = getelementptr i8, ptr %37, i64 12
%39 = load float, ptr %38, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%40 = getelementptr i8, ptr %26, i64 8
store float %39, ptr %40, align 4, !alias.scope !8, !noalias !5
%41 = sub nuw nsw i64 12, %25
%42 = getelementptr float, ptr %4, i64 %41
%43 = getelementptr i8, ptr %42, i64 12
%44 = load float, ptr %43, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%45 = getelementptr i8, ptr %26, i64 12
store float %44, ptr %45, align 4, !alias.scope !8, !noalias !5
%46 = add nuw nsw i64 %25, 1
%exitcond.not = icmp eq i64 %46, 4
br i1 %exitcond.not, label %transpose_copy_fusion_wrapped.exit, label %.preheader, !llvm.loop !10
transpose_copy_fusion_wrapped.exit: ; preds = %middle.block, %.preheader
ret ptr null
}
; Function Attrs: uwtable
define noalias noundef ptr @convert_element_type.1_kernel(ptr readonly captures(none) %0) local_unnamed_addr #0 {
%args_gep = getelementptr inbounds nuw i8, ptr %0, i64 24
%args = load ptr, ptr %args_gep, align 8
%arg0 = load ptr, ptr %args, align 8, !invariant.load !3, !dereferenceable !4, !align !5
%arg1_gep = getelementptr i8, ptr %args, i64 16
%arg1 = load ptr, ptr %arg1_gep, align 8, !invariant.load !3, !dereferenceable !5, !align !5
br label %convert_element_type.1.loop_header.dim.1.preheader
convert_element_type.1.loop_header.dim.1.preheader: ; preds = %1, %convert_element_type.1.loop_header.dim.1.preheader
%convert_element_type.1.invar_address.dim.0.06 = phi i64 [ 0, %1 ], [ %invar.inc, %convert_element_type.1.loop_header.dim.1.preheader ]
%.split = getelementptr inbounds nuw [4 x float], ptr %arg0, i64 %convert_element_type.1.invar_address.dim.0.06
%.split4 = getelementptr inbounds nuw [4 x bfloat], ptr %arg1, i64 %convert_element_type.1.invar_address.dim.0.06
%2 = load float, ptr %.split, align 4, !invariant.load !3, !noalias !6
%3 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %2) #1
store bfloat %3, ptr %.split4, align 2, !alias.scope !6
%4 = getelementptr inbounds nuw i8, ptr %.split, i64 4
%5 = load float, ptr %4, align 4, !invariant.load !3, !noalias !6
%6 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %5) #1
%7 = getelementptr inbounds nuw i8, ptr %.split4, i64 2
store bfloat %6, ptr %7, align 2, !alias.scope !6
%8 = getelementptr inbounds nuw i8, ptr %.split, i64 8
%9 = load float, ptr %8, align 4, !invariant.load !3, !noalias !6
%10 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %9) #1
%11 = getelementptr inbounds nuw i8, ptr %.split4, i64 4
store bfloat %10, ptr %11, align 2, !alias.scope !6
%12 = getelementptr inbounds nuw i8, ptr %.split, i64 12
%13 = load float, ptr %12, align 4, !invariant.load !3, !noalias !6
%14 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %13) #1
%15 = getelementptr inbounds nuw i8, ptr %.split4, i64 6
store bfloat %14, ptr %15, align 2, !alias.scope !6
%invar.inc = add nuw nsw i64 %convert_element_type.1.invar_address.dim.0.06, 1
%exitcond = icmp eq i64 %invar.inc, 4
br i1 %exitcond, label %return, label %convert_element_type.1.loop_header.dim.1.preheader, !llvm.loop !9
return: ; preds = %convert_element_type.1.loop_header.dim.1.preheader
ret ptr null
}
```
With CSE disabled:
```
define noalias noundef ptr @transpose_copy_fusion(ptr readonly captures(none) %0) local_unnamed_addr #0 {
%2 = getelementptr inbounds nuw i8, ptr %0, i64 24
%3 = load ptr, ptr %2, align 8, !invariant.load !3
%4 = load ptr, ptr %3, align 8, !invariant.load !3, !dereferenceable !4
%5 = getelementptr inbounds nuw i8, ptr %3, i64 16
%6 = load ptr, ptr %5, align 8, !invariant.load !3, !dereferenceable !4
tail call void @llvm.experimental.noalias.scope.decl(metadata !5)
tail call void @llvm.experimental.noalias.scope.decl(metadata !8)
br i1 false, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %1
br label %vector.body
vector.body: ; preds = %vector.ph
%7 = getelementptr i8, ptr %4, i64 12
%8 = getelementptr float, ptr %7, i32 0
%9 = getelementptr float, ptr %8, i32 -3
%wide.load = load <4 x float>, ptr %9, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse = shufflevector <4 x float> %wide.load, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%10 = getelementptr float, ptr %4, i64 4
%11 = getelementptr i8, ptr %10, i64 12
%12 = getelementptr float, ptr %11, i32 0
%13 = getelementptr float, ptr %12, i32 -3
%wide.load1 = load <4 x float>, ptr %13, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse2 = shufflevector <4 x float> %wide.load1, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%14 = getelementptr float, ptr %4, i64 8
%15 = getelementptr i8, ptr %14, i64 12
%16 = getelementptr float, ptr %15, i32 0
%17 = getelementptr float, ptr %16, i32 -3
%wide.load3 = load <4 x float>, ptr %17, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse4 = shufflevector <4 x float> %wide.load3, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%18 = getelementptr i8, ptr %7, i64 12
%19 = getelementptr float, ptr %18, i32 0
%20 = getelementptr float, ptr %19, i32 -3
%wide.load5 = load <4 x float>, ptr %20, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%reverse6 = shufflevector <4 x float> %wide.load5, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%21 = shufflevector <4 x float> %reverse, <4 x float> %reverse2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%22 = shufflevector <4 x float> %reverse4, <4 x float> %reverse6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%23 = shufflevector <8 x float> %21, <8 x float> %22, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x float> %23, <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
store <16 x float> %interleaved.vec, ptr %6, align 4, !alias.scope !8, !noalias !5
br label %middle.block
middle.block: ; preds = %vector.body
br label %transpose_copy_fusion_wrapped.exit
scalar.ph: ; preds = %1
br label %.preheader
.preheader: ; preds = %scalar.ph, %.preheader
%24 = phi i64 [ 0, %scalar.ph ], [ %45, %.preheader ]
%.idx = shl i64 %24, 4
%25 = getelementptr i8, ptr %6, i64 %.idx
%26 = sub nsw i64 0, %24
%27 = getelementptr float, ptr %4, i64 %26
%28 = getelementptr i8, ptr %27, i64 12
%29 = load float, ptr %28, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
store float %29, ptr %25, align 4, !alias.scope !8, !noalias !5
%30 = sub nuw nsw i64 4, %24
%31 = getelementptr float, ptr %4, i64 %30
%32 = getelementptr i8, ptr %31, i64 12
%33 = load float, ptr %32, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%34 = getelementptr i8, ptr %25, i64 4
store float %33, ptr %34, align 4, !alias.scope !8, !noalias !5
%35 = sub nuw nsw i64 8, %24
%36 = getelementptr float, ptr %4, i64 %35
%37 = getelementptr i8, ptr %36, i64 12
%38 = load float, ptr %37, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%39 = getelementptr i8, ptr %25, i64 8
store float %38, ptr %39, align 4, !alias.scope !8, !noalias !5
%40 = sub nuw nsw i64 12, %24
%41 = getelementptr float, ptr %4, i64 %40
%42 = getelementptr i8, ptr %41, i64 12
%43 = load float, ptr %42, align 4, !invariant.load !3, !alias.scope !5, !noalias !8
%44 = getelementptr i8, ptr %25, i64 12
store float %43, ptr %44, align 4, !alias.scope !8, !noalias !5
%45 = add nuw nsw i64 %24, 1
%exitcond.not = icmp eq i64 %45, 4
br i1 %exitcond.not, label %transpose_copy_fusion_wrapped.exit, label %.preheader, !llvm.loop !10
transpose_copy_fusion_wrapped.exit: ; preds = %middle.block, %.preheader
ret ptr null
}
; Function Attrs: uwtable
define noalias noundef ptr @convert_element_type.1_kernel(ptr readonly captures(none) %0) local_unnamed_addr #0 {
%args_gep = getelementptr inbounds nuw i8, ptr %0, i64 24
%args = load ptr, ptr %args_gep, align 8
%arg0 = load ptr, ptr %args, align 8, !invariant.load !3, !dereferenceable !4, !align !5
%arg1_gep = getelementptr i8, ptr %args, i64 16
%arg1 = load ptr, ptr %arg1_gep, align 8, !invariant.load !3, !dereferenceable !5, !align !5
br label %convert_element_type.1.loop_header.dim.1.preheader
convert_element_type.1.loop_header.dim.1.preheader: ; preds = %1, %convert_element_type.1.loop_header.dim.1.preheader
%convert_element_type.1.invar_address.dim.0.06 = phi i64 [ 0, %1 ], [ %invar.inc, %convert_element_type.1.loop_header.dim.1.preheader ]
%.split = getelementptr inbounds nuw [4 x float], ptr %arg0, i64 %convert_element_type.1.invar_address.dim.0.06
%.split4 = getelementptr inbounds nuw [4 x bfloat], ptr %arg1, i64 %convert_element_type.1.invar_address.dim.0.06
%2 = load float, ptr %.split, align 4, !invariant.load !3, !noalias !6
%3 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %2) #1
store bfloat %3, ptr %.split4, align 2, !alias.scope !6
%4 = getelementptr inbounds nuw i8, ptr %.split, i64 4
%5 = load float, ptr %4, align 4, !invariant.load !3, !noalias !6
%6 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %5) #1
%7 = getelementptr inbounds nuw i8, ptr %.split4, i64 2
store bfloat %6, ptr %7, align 2, !alias.scope !6
%8 = getelementptr inbounds nuw i8, ptr %.split, i64 8
%9 = load float, ptr %8, align 4, !invariant.load !3, !noalias !6
%10 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %9) #1
%11 = getelementptr inbounds nuw i8, ptr %.split4, i64 4
store bfloat %10, ptr %11, align 2, !alias.scope !6
%12 = getelementptr inbounds nuw i8, ptr %.split, i64 12
%13 = load float, ptr %12, align 4, !invariant.load !3, !noalias !6
%14 = tail call bfloat @xla.fptrunc.f32.to.bf16(float %13) #1
%15 = getelementptr inbounds nuw i8, ptr %.split4, i64 6
store bfloat %14, ptr %15, align 2, !alias.scope !6
%invar.inc = add nuw nsw i64 %convert_element_type.1.invar_address.dim.0.06, 1
%exitcond = icmp eq i64 %invar.inc, 4
br i1 %exitcond, label %return, label %convert_element_type.1.loop_header.dim.1.preheader, !llvm.loop !9
return: ; preds = %convert_element_type.1.loop_header.dim.1.preheader
ret ptr null
}
```
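You can see that with CSE enabled, the line `%18 = getelementptr float, ptr %4, i64 12` is removed, and the line after it becomes `%18 = getelementptr i8, ptr %7, i64 12` instead of `%19 = getelementptr i8, ptr %18, i64 12`. (Note: the dump that contains the changed line is the second one above, while the first one still has the original `%18`/`%19` pair, so the two headings appear to be swapped.)
I believe this is a bug: it seems that the pass CSEs `%18` with the line `%7 = getelementptr i8, ptr %4, i64 12`, but that GEP has a different source element type (`i8`) than `%18` (`float`). The two therefore compute different addresses (`%4` + 12 bytes versus `%4` + 48 bytes) and must not be merged.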
https://github.com/llvm/llvm-project/pull/151872
More information about the llvm-commits mailing list