[llvm] 0bab0f6 - [InstCombine] canonicalize cast before unary shuffle

Tue May 25 05:44:18 PDT 2021

Author: Sanjay Patel
Date: 2021-05-25T08:43:09-04:00
New Revision: 0bab0f6161193cd0cd24b7b0fc51590a60e810d2

URL: https://github.com/llvm/llvm-project/commit/0bab0f6161193cd0cd24b7b0fc51590a60e810d2
DIFF: https://github.com/llvm/llvm-project/commit/0bab0f6161193cd0cd24b7b0fc51590a60e810d2.diff

LOG: [InstCombine] canonicalize cast before unary shuffle

We could go either direction on this transform. VectorCombine already goes this
way for bitcasts (and handles more complicated cases using the cost model), so
let's try cast-first.

Deferring completely to VectorCombine is another possibility. But the backend
should be able to invert this easily when the vectors have the same shape, so
it doesn't seem like a transform that we need to avoid.

The motivating example from https://llvm.org/PR49081 has an int-to-float
sandwiched between 2 shuffles, and the backend currently does not reduce that,
so on x86, we get something like:

  pshufd	$249, %xmm0, %xmm0]
  cvtdq2ps	%xmm0, %xmm0
  shufps	$144, %xmm0, %xmm0

...instead of just a single conversion instruction.

Differential Revision: https://reviews.llvm.org/D103038

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
    llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
    llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
    llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
    llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
    llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
    llvm/test/Transforms/InstCombine/vector-casts.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 785ee240d15b..49c6057d8c93 100644

--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -277,13 +277,13 @@ InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
 /// Implement the transforms common to all CastInst visitors.
 Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
   Value *Src = CI.getOperand(0);
+  Type *Ty = CI.getType();
 
   // Try to eliminate a cast of a cast.
   if (auto *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
     if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
       // The first cast (CSrc) is eliminable so we need to fix up or replace
       // the second cast (CI). CSrc will then have a good chance of being dead.
-      auto *Ty = CI.getType();
       auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
       // Point debug users of the dying cast to the new one.
       if (CSrc->hasOneUse())
@@ -319,6 +319,24 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
         return NV;
   }
 
+  // Canonicalize a unary shuffle after the cast if neither operation changes
+  // the size or element size of the input vector.
+  // TODO: We could allow size-changing ops if that doesn't harm codegen.
+  // cast (shuffle X, Mask) --> shuffle (cast X), Mask
+  Value *X;
+  ArrayRef<int> Mask;
+  if (match(Src, m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(Mask))))) {
+    // TODO: Allow scalable vectors?
+    auto *SrcTy = dyn_cast<FixedVectorType>(X->getType());
+    auto *DestTy = dyn_cast<FixedVectorType>(Ty);
+    if (SrcTy && DestTy &&
+        SrcTy->getNumElements() == DestTy->getNumElements() &&
+        SrcTy->getPrimitiveSizeInBits() == DestTy->getPrimitiveSizeInBits()) {
+      Value *CastX = Builder.CreateCast(CI.getOpcode(), X, DestTy);
+      return new ShuffleVectorInst(CastX, UndefValue::get(DestTy), Mask);
+    }
+  }
+
   return nullptr;
 }
 

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
index bc0b67921784..041d55e65af4 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
@@ -24,8 +24,8 @@ define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
 ; All 8 elements required.
 define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
 ; CHECK-LABEL: @demand_vcvtph2ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
 ;

diff  --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
index 19a850c04212..bc7671e66732 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
@@ -24,8 +24,8 @@ define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
 ; All 8 elements required.
 define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
 ; CHECK-LABEL: @demand_vcvtph2ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
 ;

diff  --git a/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll b/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
index c491b82ef98a..df0cb49f8110 100644
--- a/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
@@ -4,8 +4,8 @@
 define <2 x float> @vtrn1(<2 x i32> %v)
 ; CHECK-LABEL: @vtrn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R_UNCASTED:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[R:%.*]] = bitcast <2 x i32> [[R_UNCASTED]] to <2 x float>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <2 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
 {

diff  --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
index 0a88370df05a..e1cf1cf41ac7 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
@@ -56,9 +56,9 @@ define <4 x i16> @splat_bitcast_operand_uses(<8 x i8> %x) {
 
 define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
 ; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[S2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[BC]]
 ;
   %s1 = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %bc = bitcast <4 x float> %s1 to <4 x i32>

diff  --git a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
index 5f0ca891fdee..ba57de358aa4 100644
--- a/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -56,9 +56,9 @@ define <4 x i16> @splat_bitcast_operand_uses(<8 x i8> %x) {
 
 define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
 ; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[S2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[BC]]
 ;
   %s1 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %bc = bitcast <4 x float> %s1 to <4 x i32>

diff  --git a/llvm/test/Transforms/InstCombine/vector-casts.ll b/llvm/test/Transforms/InstCombine/vector-casts.ll
index 6bb4f4566bfb..0cabb620b17a 100644
--- a/llvm/test/Transforms/InstCombine/vector-casts.ll
+++ b/llvm/test/Transforms/InstCombine/vector-casts.ll
@@ -413,8 +413,8 @@ define <2 x i64> @zext_less_casting_with_wideop(<2 x i64> %x, <2 x i64> %y) {
 
 define <4 x float> @sitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @sitofp_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
-; CHECK-NEXT:    [[R:%.*]] = sitofp <4 x i32> [[S]] to <4 x float>
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[X:%.*]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -424,8 +424,8 @@ define <4 x float> @sitofp_shuf(<4 x i32> %x) {
 
 define <3 x half> @uitofp_shuf(<3 x i16> %x) {
 ; CHECK-LABEL: @uitofp_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x i16> [[X:%.*]], <3 x i16> poison, <3 x i32> <i32 2, i32 undef, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = uitofp <3 x i16> [[S]] to <3 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp <3 x i16> [[X:%.*]] to <3 x half>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x half> [[TMP1]], <3 x half> undef, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x half> [[R]]
 ;
   %s = shufflevector <3 x i16> %x, <3 x i16> poison, <3 x i32> <i32 2, i32 undef, i32 0>
@@ -435,8 +435,8 @@ define <3 x half> @uitofp_shuf(<3 x i16> %x) {
 
 define <4 x i64> @fptosi_shuf(<4 x double> %x) {
 ; CHECK-LABEL: @fptosi_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[X:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
-; CHECK-NEXT:    [[R:%.*]] = fptosi <4 x double> [[S]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x double> [[X:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
 ;
   %s = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
@@ -446,8 +446,8 @@ define <4 x i64> @fptosi_shuf(<4 x double> %x) {
 
 define <2 x i32> @fptoui_shuf(<2 x float> %x) {
 ; CHECK-LABEL: @fptoui_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[R:%.*]] = fptoui <2 x float> [[S]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptoui <2 x float> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %s = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 1>
@@ -455,6 +455,9 @@ define <2 x i32> @fptoui_shuf(<2 x float> %x) {
   ret <2 x i32> %r
 }
 
+; negative test
+; TODO: Should we reduce the width of the shuffle?
+
 define <4 x half> @narrowing_sitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @narrowing_sitofp_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -466,6 +469,8 @@ define <4 x half> @narrowing_sitofp_shuf(<4 x i32> %x) {
   ret <4 x half> %r
 }
 
+; negative test
+
 define <4 x double> @widening_uitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @widening_uitofp_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -477,6 +482,8 @@ define <4 x double> @widening_uitofp_shuf(<4 x i32> %x) {
   ret <4 x double> %r
 }
 
+; negative test
+
 define <3 x i64> @fptosi_narrowing_shuf(<4 x double> %x) {
 ; CHECK-LABEL: @fptosi_narrowing_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[X:%.*]], <4 x double> poison, <3 x i32> <i32 undef, i32 2, i32 3>
@@ -488,6 +495,9 @@ define <3 x i64> @fptosi_narrowing_shuf(<4 x double> %x) {
   ret <3 x i64> %r
 }
 
+; negative test
+; TODO: Should we reduce the width of the cast?
+
 define <3 x i32> @fptoui_widening_shuf(<2 x float> %x) {
 ; CHECK-LABEL: @fptoui_widening_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <3 x i32> <i32 1, i32 1, i32 0>
@@ -499,6 +509,9 @@ define <3 x i32> @fptoui_widening_shuf(<2 x float> %x) {
   ret <3 x i32> %r
 }
 
+; negative test
+; TODO: Should we reduce the width of the cast?
+
 define <4 x half> @narrowing_sitofp_widening_shuf(<2 x i32> %x) {
 ; CHECK-LABEL: @narrowing_sitofp_widening_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 undef>
@@ -512,6 +525,8 @@ define <4 x half> @narrowing_sitofp_widening_shuf(<2 x i32> %x) {
 
 declare void @use(<4 x i32>)
 
+; negative test
+
 define <4 x float> @sitofp_shuf_extra_use(<4 x i32> %x) {
 ; CHECK-LABEL: @sitofp_shuf_extra_use(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -524,3 +539,17 @@ define <4 x float> @sitofp_shuf_extra_use(<4 x i32> %x) {
   %r = sitofp <4 x i32> %s to <4 x float>
   ret <4 x float> %r
 }
+
+; negative test
+; TODO: Allow scalable vectors?
+
+define <vscale x 4 x float> @sitofp_shuf_scalable(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: @sitofp_shuf_scalable(
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <vscale x 4 x i32> [[X:%.*]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = sitofp <vscale x 4 x i32> [[S]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[R]]
+;
+  %s = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %r = sitofp <vscale x 4 x i32> %s to <vscale x 4 x float>
+  ret <vscale x 4 x float> %r
+}