[llvm-dev] [PATCH] D70246: [InstCombine] remove identity shuffle simplification for mask with undefs

Mon Dec 9 07:36:43 PST 2019

Sanjay,

I'm looking at some missed optimizations caused by D70246. Here's a test case:

define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
  %t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
  %t44 = bitcast <3 x i32> %t43 to <3 x float>
  %t45 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 0, i32 undef, i32 undef, i32 undef>
  %t46 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 undef, i32 1, i32 undef, i32 undef>
  %t47 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 undef, i32 undef, i32 2, i32 undef>
  %t48 = insertelement <4 x float> %t45, float 1.000000e+00, i32 3
  %t49 = shufflevector <4 x float> %t48, <4 x float> %t46, <4 x i32>
<i32 0, i32 5, i32 undef, i32 3>
  %t50 = shufflevector <4 x float> %t49, <4 x float> %t47, <4 x i32>
<i32 0, i32 1, i32 6, i32 3>
  %t55 = load <4 x float>, <4 x float>* %t24, align 16
  %t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
  %t59 = load <4 x float>, <4 x float>* %t58, align 16
  %t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
  %t63 = load <4 x float>, <4 x float>* %t62, align 16
  %t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
  %t67 = load <4 x float>, <4 x float>* %t66, align 16
  %t69 = shufflevector <4 x float> %t50, <4 x float> undef, <4 x i32>
zeroinitializer
  %t71 = fmul <4 x float> %t55, %t69
  %t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
  %t74 = fmul <4 x float> %t59, %t72
  %t75 = fadd <4 x float> %t71, %t74
  %t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
  %t78 = fmul <4 x float> %t63, %t76
  %t79 = fadd <4 x float> %t75, %t78
  %t80 = shufflevector <4 x float> %t50, <4 x float> undef, <4 x i32>
<i32 3, i32 3, i32 3, i32 3>
  %t82 = fmul <4 x float> %t67, %t80
  %t83 = fadd <4 x float> %t79, %t82
  ret <4 x float> %t83
}

Before D70246, opt -instcombine gives this:

define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
  %t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
  %t44 = bitcast <3 x i32> %t43 to <3 x float>
  %t55 = load <4 x float>, <4 x float>* %t24, align 16
  %t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
  %t59 = load <4 x float>, <4 x float>* %t58, align 16
  %t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
  %t63 = load <4 x float>, <4 x float>* %t62, align 16
  %t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
  %t67 = load <4 x float>, <4 x float>* %t66, align 16
  %t69 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
zeroinitializer
  %t71 = fmul <4 x float> %t55, %t69
  %t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
  %t74 = fmul <4 x float> %t59, %t72
  %t75 = fadd <4 x float> %t71, %t74
  %t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
  %t78 = fmul <4 x float> %t63, %t76
  %t79 = fadd <4 x float> %t75, %t78
  %t83 = fadd <4 x float> %t79, %t67
  ret <4 x float> %t83
}

After D70246, opt -instcombine gives this:

define <4 x float> @f(i32 %t32, <4 x float>* %t24) {
.entry:
  %t43 = insertelement <3 x i32> undef, i32 %t32, i32 2
  %t44 = bitcast <3 x i32> %t43 to <3 x float>
  %t45 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 0, i32 undef, i32 undef, i32 undef>
  %t48 = insertelement <4 x float> %t45, float 1.000000e+00, i32 3
  %t55 = load <4 x float>, <4 x float>* %t24, align 16
  %t58 = getelementptr <4 x float>, <4 x float>* %t24, i64 1
  %t59 = load <4 x float>, <4 x float>* %t58, align 16
  %t62 = getelementptr <4 x float>, <4 x float>* %t24, i64 2
  %t63 = load <4 x float>, <4 x float>* %t62, align 16
  %t66 = getelementptr <4 x float>, <4 x float>* %t24, i64 3
  %t67 = load <4 x float>, <4 x float>* %t66, align 16
  %t69 = shufflevector <4 x float> %t48, <4 x float> undef, <4 x i32>
zeroinitializer
  %t71 = fmul <4 x float> %t55, %t69
  %t72 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 1, i32 1, i32 1, i32 1>
  %t74 = fmul <4 x float> %t59, %t72
  %t75 = fadd <4 x float> %t71, %t74
  %t76 = shufflevector <3 x float> %t44, <3 x float> undef, <4 x i32>
<i32 2, i32 2, i32 2, i32 2>
  %t78 = fmul <4 x float> %t63, %t76
  %t79 = fadd <4 x float> %t75, %t78
  %t80 = shufflevector <4 x float> %t48, <4 x float> undef, <4 x i32>
<i32 3, i32 3, i32 3, i32 3>
  %t82 = fmul <4 x float> %t67, %t80
  %t83 = fadd <4 x float> %t79, %t82
  ret <4 x float> %t83
}

Notice that it has failed to simplify %t80, which extracts element 3
from %t48, which is obviously just the constant 1.0.

Would you expect the change you committed to have this kind of effect?
Do you agree that simplifying %t80 is still valid, even with the new
interpretation of the rules about shufflevector and undef and poison?

Thanks,
Jay.

On Thu, 14 Nov 2019 at 16:25, Sanjay Patel via Phabricator via
llvm-commits <llvm-commits at lists.llvm.org> wrote:
>
> spatel created this revision.
> spatel added reviewers: efriedma, regehr, aqjune, nlopes, RKSimon, lebedev.ri, liuz.
> Herald added subscribers: hiraditya, mcrosier.
> Herald added a project: LLVM.
>
> Given a shuffle that includes undef elements in an otherwise identity mask like:
>
>   define <4 x float> @shuffle(<4 x float> %arg) {
>     %shuf = shufflevector <4 x float> %arg, <4 x float> undef, <4 x i32> <i32 undef, i32 1, i32 2, i32 3>
>     ret <4 x float> %shuf
>   }
>
> We were simplifying that to the input operand.
>
> But as discussed in PR43958:
> https://bugs.llvm.org/show_bug.cgi?id=43958
> ...that means that per-vector-element poison that would be stopped by the shuffle can now leak to the result.
>
> For reference, this transform was added long ago with:
> rL142671 <https://reviews.llvm.org/rL142671>
>
> Also note that we still have (and there are tests for) the same transform with no undef elements in the mask (a fully-defined identity mask). I don't think there's any controversy about that case - it's a valid transform under any interpretation of shufflevector/undef/poison.
>
> Looking at a few of the diffs into codegen, I don't see any difference in final asm. So depending on your perspective, that's good (no real loss of optimization power) or bad (poison exists in the DAG, so we only partially fixed the bug).
>
>
> https://reviews.llvm.org/D70246
>
> Files:
>   llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
>   llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
>   llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
>   llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
>   llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
>   llvm/test/Transforms/InstCombine/X86/x86-pack.ll
>   llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
>   llvm/test/Transforms/InstCombine/X86/x86-sse.ll
>   llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
>   llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
>   llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
>   llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
>   llvm/test/Transforms/InstCombine/X86/x86-xop.ll
>   llvm/test/Transforms/InstCombine/shuffle_select.ll
>   llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
>   llvm/test/Transforms/InstCombine/vec_shuffle.ll
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits