[PATCH] D21190: [InstCombine] allow more than one use for vector cast folding with selects

Thu Jun 9 10:50:57 PDT 2016

spatel created this revision.
spatel added reviewers: majnemer, eli.friedman, RKSimon.
spatel added a subscriber: llvm-commits.
Herald added a subscriber: mcrosier.

The motivating example for this transform is similar to D20774, where bitcasts interfere with a single cmp/select sequence, but in this case we have two uses of each bitcast to produce min and max ops:

  define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) {
    %cmp = fcmp olt <4 x float> %a, %b
    %bc1 = bitcast <4 x float> %a to <4 x i32>
    %bc2 = bitcast <4 x float> %b to <4 x i32>
    %sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2
    %sel2 = select <4 x i1> %cmp, <4 x i32> %bc2, <4 x i32> %bc1
    %bc3 = bitcast <4 x float>* %ptr1 to <4 x i32>*
    store <4 x i32> %sel1, <4 x i32>* %bc3
    %bc4 = bitcast <4 x float>* %ptr2 to <4 x i32>*
    store <4 x i32> %sel2, <4 x i32>* %bc4
    ret void
  }

With this patch, we move the selects up to use the input args, which allows getting rid of all of the bitcasts:
  define void @minmax_bc_store(<4 x float> %a, <4 x float> %b, <4 x float>* %ptr1, <4 x float>* %ptr2) {
    %cmp = fcmp olt <4 x float> %a, %b
    %sel1.v = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
    %sel2.v = select <4 x i1> %cmp, <4 x float> %b, <4 x float> %a
    store <4 x float> %sel1.v, <4 x float>* %ptr1, align 16
    store <4 x float> %sel2.v, <4 x float>* %ptr2, align 16
    ret void
  }

The asm for x86 SSE then improves from:
  movaps	%xmm0, %xmm2
  cmpltps	%xmm1, %xmm2
  movaps	%xmm2, %xmm3
  andnps	%xmm1, %xmm3
  movaps	%xmm2, %xmm4
  andnps	%xmm0, %xmm4
  andps	%xmm2, %xmm0
  orps	%xmm3, %xmm0
  andps	%xmm1, %xmm2
  orps	%xmm4, %xmm2
  movaps	%xmm0, (%rdi)
  movaps	%xmm2, (%rsi)

To:
  movaps	%xmm0, %xmm2
  minps	%xmm1, %xmm2
  maxps	%xmm0, %xmm1
  movaps	%xmm2, (%rdi)
  movaps	%xmm1, (%rsi)


http://reviews.llvm.org/D21190

Files:
  lib/Transforms/InstCombine/InstCombineSelect.cpp
  test/Transforms/InstCombine/select.ll

Index: test/Transforms/InstCombine/select.ll
===================================================================

--- test/Transforms/InstCombine/select.ll
+++ test/Transforms/InstCombine/select.ll
@@ -723,6 +723,30 @@
   ret i48 %tmp2
 }
 
+; Allow select promotion even if there are multiple uses of casted ops.
+; Hoisting the selects allows later pattern matching to see that these are min/max ops.
+
+define void @min_max_bitcast(<4 x float> %a, <4 x float> %b, <4 x i32>* %ptr1, <4 x i32>* %ptr2) {
+; CHECK-LABEL: @min_max_bitcast(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x float> %a, %b
+; CHECK-NEXT:    [[SEL1_V:%.*]] = select <4 x i1> [[CMP]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT:    [[SEL2_V:%.*]] = select <4 x i1> [[CMP]], <4 x float> %b, <4 x float> %a
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32>* %ptr1 to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[SEL1_V]], <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32>* %ptr2 to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[SEL2_V]], <4 x float>* [[TMP2]], align 16
+; CHECK-NEXT:    ret void
+;
+  %cmp = fcmp olt <4 x float> %a, %b
+  %bc1 = bitcast <4 x float> %a to <4 x i32>
+  %bc2 = bitcast <4 x float> %b to <4 x i32>
+  %sel1 = select <4 x i1> %cmp, <4 x i32> %bc1, <4 x i32> %bc2
+  %sel2 = select <4 x i1> %cmp, <4 x i32> %bc2, <4 x i32> %bc1
+  store <4 x i32> %sel1, <4 x i32>* %ptr1
+  store <4 x i32> %sel2, <4 x i32>* %ptr2
+  ret void
+}
+
 ; PR8575
 
 define i32 @test52(i32 %n, i32 %m) nounwind {
Index: lib/Transforms/InstCombine/InstCombineSelect.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -116,8 +116,7 @@
   }
 }
 
-/// Here we have (select c, TI, FI), and we know that TI and FI
-/// have the same opcode and only one use each.  Try to simplify this.
+/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
 Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                                           Instruction *FI) {
   // If this is a cast from the same type, merge.
@@ -129,10 +128,17 @@
     // The select condition may be a vector. We may only change the operand
     // type if the vector width remains the same (and matches the condition).
     Type *CondTy = SI.getCondition()->getType();
-    if (CondTy->isVectorTy() &&
-        (!FIOpndTy->isVectorTy() ||
-         CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
+    if (CondTy->isVectorTy()) {
+      if (!FIOpndTy->isVectorTy())
+        return nullptr;
+      if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())
+        return nullptr;
+    } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
+      // TODO: The one-use restrictions for a scalar select could be eased if
+      // the fold of a select in visitLoadInst() was enhanced to match a pattern
+      // that includes a cast.
       return nullptr;
+    }
 
     // Fold this by inserting a select from the input values.
     Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
@@ -142,7 +148,7 @@
   }
 
   // Only handle binary operators here.
-  if (!isa<BinaryOperator>(TI))
+  if (!isa<BinaryOperator>(TI) || !TI->hasOneUse() || !FI->hasOneUse())
     return nullptr;
 
   // Figure out if the operations have any operands in common.
@@ -1056,14 +1062,12 @@
   if (Instruction *Add = foldAddSubSelect(SI, *Builder))
     return Add;
 
+  // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
   auto *TI = dyn_cast<Instruction>(TrueVal);
   auto *FI = dyn_cast<Instruction>(FalseVal);
-  if (TI && FI && TI->hasOneUse() && FI->hasOneUse()) {
-    // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
-    if (TI->getOpcode() == FI->getOpcode())
-      if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
-        return IV;
-  }
+  if (TI && FI && TI->getOpcode() == FI->getOpcode())
+    if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+      return IV;
 
   // See if we can fold the select into one of our operands.
   if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) {


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D21190.60197.patch
Type: text/x-patch
Size: 4263 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20160609/cda37952/attachment.bin>