[llvm] 05a6e2e - [InstCombine] Add a combine for a shuffle of similar bitcasts

Sanne Wouda via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 8 08:37:13 PST 2021


Author: Sanne Wouda
Date: 2021-03-08T16:32:30Z
New Revision: 05a6e2eb9a412db22309031d2326bf6a3aa0c90c

URL: https://github.com/llvm/llvm-project/commit/05a6e2eb9a412db22309031d2326bf6a3aa0c90c
DIFF: https://github.com/llvm/llvm-project/commit/05a6e2eb9a412db22309031d2326bf6a3aa0c90c.diff

LOG: [InstCombine] Add a combine for a shuffle of similar bitcasts

Some intrinsics wrapper code has the habit of ignoring the type of the
elements in vectors, thinking of vector registers as a "bag of bits". As
a consequence, some operations are shared between vectors of different
types. For example, functions that rearrange elements in a
vector can be shared between vectors of int32 and float.
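
A sketch of the kind of IR such wrapper code ends up producing (the
function name is hypothetical; the shape matches the tests added below):
both i32 inputs are reinterpreted as float purely so that a float-typed
element rearrangement can be reused, leaving a bitcast on each shuffle
operand:

    define <2 x float> @my_trn(<2 x i32> %x, <2 x i32> %y) {
      ; reinterpret the integer data as float to reuse a float-typed helper;
      ; the element size (32 bits) is unchanged by the casts
      %xf = bitcast <2 x i32> %x to <2 x float>
      %yf = bitcast <2 x i32> %y to <2 x float>
      %r = shufflevector <2 x float> %xf, <2 x float> %yf, <2 x i32> <i32 0, i32 2>
      ret <2 x float> %r
    }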

This can result in bitcasts in awkward places that prevent the backend
from recognizing some instructions. For AArch64 in particular, it
inhibits the selection of dup from a general purpose register (GPR), and
mov from GPR to a vector lane.

This patch adds a pattern in InstCombine to move the bitcasts past the
shufflevector where possible. Sometimes this even allows
InstCombine to remove the bitcast entirely, as in the included tests.
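
Concretely, on the sketch above the new fold rewrites the IR so that the
shuffle operates on the original integer vectors and a single bitcast is
applied to its result (the ".uncasted" suffix comes from the new code;
compare the CHECK lines of the vtrn tests below):

    ; before
    %xf = bitcast <2 x i32> %x to <2 x float>
    %yf = bitcast <2 x i32> %y to <2 x float>
    %r = shufflevector <2 x float> %xf, <2 x float> %yf, <2 x i32> <i32 0, i32 2>

    ; after
    %r.uncasted = shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 0, i32 2>
    %r = bitcast <2 x i32> %r.uncasted to <2 x float>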

Alternatively, this could be done with a few extra patterns in the
AArch64 backend, but InstCombine seems like a better place for it.

Differential Revision: https://reviews.llvm.org/D97397

Added: 
    llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index d7474e3072f0..fb863ef133b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2289,6 +2289,25 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
 
   unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements();
   unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements();
+
+  // shuffle (bitcast X), (bitcast Y), Mask --> bitcast (shuffle X, Y, Mask)
+  //
+  // if X and Y are of the same (vector) type, and the element size is not
+  // changed by the bitcasts, we can distribute the bitcasts through the
+  // shuffle, hopefully reducing the number of instructions. We make sure that
+  // at least one bitcast only has one use, so we don't *increase* the number of
+  // instructions here.
+  Value *X, *Y;
+  if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_BitCast(m_Value(Y))) &&
+      X->getType()->isVectorTy() && X->getType() == Y->getType() &&
+      X->getType()->getScalarSizeInBits() ==
+          SVI.getType()->getScalarSizeInBits() &&
+      (LHS->hasOneUse() || RHS->hasOneUse())) {
+    Value *V = Builder.CreateShuffleVector(X, Y, SVI.getShuffleMask(),
+                                           SVI.getName() + ".uncasted");
+    return new BitCastInst(V, SVI.getType());
+  }
+
   ArrayRef<int> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
 
@@ -2298,7 +2317,6 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   // TODO: This could be extended to allow length-changing shuffles.
   //       The transform might also be obsoleted if we allowed canonicalization
   //       of bitcasted shuffles.
-  Value *X;
   if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
       X->getType()->isVectorTy() && VWidth == LHSWidth) {
     // Try to create a scaled mask constant.

diff --git a/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll b/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
new file mode 100644
index 000000000000..c491b82ef98a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define <2 x float> @vtrn1(<2 x i32> %v)
+; CHECK-LABEL: @vtrn1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[R_UNCASTED:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = bitcast <2 x i32> [[R_UNCASTED]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+{
+entry:
+  %vb1 = bitcast <2 x i32> %v to <2 x float>
+  %vb2 = bitcast <2 x i32> %v to <2 x float>
+  %r = shufflevector <2 x float> %vb1, <2 x float> %vb2, <2 x i32> <i32 0, i32 2>
+  ret <2 x float> %r
+}
+
+define <2 x float> @vtrn2(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @vtrn2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[R_UNCASTED:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <2 x i32> [[R_UNCASTED]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+entry:
+  %xb = bitcast <2 x i32> %x to <2 x float>
+  %yb = bitcast <2 x i32> %y to <2 x float>
+  %r = shufflevector <2 x float> %xb, <2 x float> %yb, <2 x i32> <i32 1, i32 3>
+  ret <2 x float> %r
+}
+
+
+define <4 x float> @bc_shuf_lenchange(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @bc_shuf_lenchange(
+; CHECK-NEXT:    [[R_UNCASTED:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[R_UNCASTED]] to <4 x float>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xb = bitcast <2 x i32> %x to <2 x float>
+  %yb = bitcast <2 x i32> %y to <2 x float>
+  %r = shufflevector <2 x float> %xb, <2 x float> %yb, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %r
+}
+
+
+define <4 x float> @bc_shuf_nonvec(i64 %x, i64 %y) {
+; CHECK-LABEL: @bc_shuf_nonvec(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast i64 [[X:%.*]] to <2 x float>
+; CHECK-NEXT:    [[YB:%.*]] = bitcast i64 [[Y:%.*]] to <2 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[XB]], <2 x float> [[YB]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xb = bitcast i64 %x to <2 x float>
+  %yb = bitcast i64 %y to <2 x float>
+  %r = shufflevector <2 x float> %xb, <2 x float> %yb, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %r
+}
+
+define <4 x double> @bc_shuf_size(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @bc_shuf_size(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <4 x i32> [[X:%.*]] to <2 x double>
+; CHECK-NEXT:    [[YB:%.*]] = bitcast <4 x i32> [[Y:%.*]] to <2 x double>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x double> [[XB]], <2 x double> [[YB]], <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
+  %xb = bitcast <4 x i32> %x to <2 x double>
+  %yb = bitcast <4 x i32> %y to <2 x double>
+  %r = shufflevector <2 x double> %xb, <2 x double> %yb, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  ret <4 x double> %r
+}
+
+define <2 x double> @bc_shuf_mismatch(<4 x i32> %x, <2 x i64> %y) {
+; CHECK-LABEL: @bc_shuf_mismatch(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <4 x i32> [[X:%.*]] to <2 x double>
+; CHECK-NEXT:    [[YB:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <2 x double>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x double> [[XB]], <2 x double> [[YB]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %xb = bitcast <4 x i32> %x to <2 x double>
+  %yb = bitcast <2 x i64> %y to <2 x double>
+  %r = shufflevector <2 x double> %xb, <2 x double> %yb, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %r
+}
+
+define <8 x half> @bc_shuf_i8_float(<8 x i8> %x, <8 x i8> %y) {
+; CHECK-LABEL: @bc_shuf_i8_float(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <8 x i8> [[X:%.*]] to <4 x half>
+; CHECK-NEXT:    [[YB:%.*]] = bitcast <8 x i8> [[Y:%.*]] to <4 x half>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x half> [[XB]], <4 x half> [[YB]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    ret <8 x half> [[R]]
+;
+  %xb = bitcast <8 x i8> %x to <4 x half>
+  %yb = bitcast <8 x i8> %y to <4 x half>
+  %r = shufflevector <4 x half> %xb, <4 x half> %yb, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x half> %r
+}
+
+define <4 x i16> @bc_shuf_elemtype_mismatch(<2 x half> %x, <2 x bfloat> %y) {
+; CHECK-LABEL: @bc_shuf_elemtype_mismatch(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <2 x half> [[X:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[YB:%.*]] = bitcast <2 x bfloat> [[Y:%.*]] to <2 x i16>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i16> [[XB]], <2 x i16> [[YB]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i16> [[R]]
+;
+  %xb = bitcast <2 x half> %x to <2 x i16>
+  %yb = bitcast <2 x bfloat> %y to <2 x i16>
+  %r = shufflevector <2 x i16> %xb, <2 x i16> %yb, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i16> %r
+}
+
+define <2 x float> @bc_shuf_reuse(<4 x i32> %x){
+; CHECK-LABEL: @bc_shuf_reuse(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <4 x i32> [[X:%.*]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XB]], <4 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %xb = bitcast <4 x i32> %x to <4 x float>
+  %r = shufflevector <4 x float> %xb, <4 x float> %xb, <2 x i32> <i32 0, i32 4>
+  ret <2 x float> %r
+}
+
+define <4 x float> @bc_shuf_y_hasoneuse(<4 x i32> %x, <4 x i32> %y){
+; CHECK-LABEL: @bc_shuf_y_hasoneuse(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <4 x i32> [[X:%.*]] to <4 x float>
+; CHECK-NEXT:    [[SHUF_UNCASTED:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[SHUF:%.*]] = bitcast <4 x i32> [[SHUF_UNCASTED]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[XB]], [[SHUF]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xb = bitcast <4 x i32> %x to <4 x float>
+  %yb = bitcast <4 x i32> %y to <4 x float>
+  %shuf = shufflevector <4 x float> %xb, <4 x float> %yb, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %r = fadd <4 x float> %xb, %shuf
+  ret <4 x float> %r
+}
+
+define <4 x float> @bc_shuf_neither_hasoneuse(<4 x i32> %x, <4 x i32> %y){
+; CHECK-LABEL: @bc_shuf_neither_hasoneuse(
+; CHECK-NEXT:    [[XB:%.*]] = bitcast <4 x i32> [[X:%.*]] to <4 x float>
+; CHECK-NEXT:    [[YB:%.*]] = bitcast <4 x i32> [[Y:%.*]] to <4 x float>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[XB]], <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[SUM:%.*]] = fadd <4 x float> [[XB]], [[YB]]
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[SUM]], [[SHUF]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xb = bitcast <4 x i32> %x to <4 x float>
+  %yb = bitcast <4 x i32> %y to <4 x float>
+  %shuf = shufflevector <4 x float> %xb, <4 x float> %xb, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %sum = fadd <4 x float> %xb, %yb
+  %r = fadd <4 x float> %sum, %shuf
+  ret <4 x float> %r
+}

More information about the llvm-commits mailing list