[llvm] r235810 - [x86] instcombine more cases of insertps into a shufflevector

Sanjay Patel spatel at rotateright.com
Sat Apr 25 13:55:25 PDT 2015


Author: spatel
Date: Sat Apr 25 15:55:25 2015
New Revision: 235810

URL: http://llvm.org/viewvc/llvm-project?rev=235810&view=rev
Log:
[x86] instcombine more cases of insertps into a shufflevector

This is a follow-on to D8833 (insertps optimization when the zero mask is not used).

In this patch, we check for the case where the zero mask (zmask) is used, but either
both input vectors to the insertps intrinsic are the same operand or the zmask
overrides the destination lane. In either case the inserted element does not have to
come from the second operand, which lets us replace the 2nd shuffle input operand
with the zero vector.

Differential Revision: http://reviews.llvm.org/D9257


Modified:
    llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
    llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll

Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=235810&r1=235809&r2=235810&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Sat Apr 25 15:55:25 2015
@@ -201,7 +201,7 @@ static Value *SimplifyX86insertps(const
                                   InstCombiner::BuilderTy &Builder) {
   if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
     VectorType *VecTy = cast<VectorType>(II.getType());
-    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
     
     // The immediate permute control byte looks like this:
     //    [3:0] - zero mask for each 32-bit lane
@@ -213,25 +213,42 @@ static Value *SimplifyX86insertps(const
     uint8_t DestLane = (Imm >> 4) & 0x3;
     uint8_t SourceLane = (Imm >> 6) & 0x3;
 
+    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
     // If all zero mask bits are set, this was just a weird way to
     // generate a zero vector.
     if (ZMask == 0xf)
       return ZeroVector;
-    
-    // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
-    if (ZMask)
-      return nullptr;
 
-    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
-
-    // If we're not zeroing anything, this is a single shuffle.
-    // Replace the selected destination lane with the selected source lane.
-    // For all other lanes, pass the first source bits through.
+    // Initialize by passing all of the first source bits through.
     int ShuffleMask[4] = { 0, 1, 2, 3 };
-    ShuffleMask[DestLane] = SourceLane + 4;
-    
-    return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
-                                       ShuffleMask);
+
+    // We may replace the second operand with the zero vector.
+    Value *V1 = II.getArgOperand(1);
+
+    if (ZMask) {
+      // If the zero mask is being used with a single input or the zero mask
+      // overrides the destination lane, this is a shuffle with the zero vector.
+      if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+          (ZMask & (1 << DestLane))) {
+        V1 = ZeroVector;
+        // We may still move 32-bits of the first source vector from one lane
+        // to another.
+        ShuffleMask[DestLane] = SourceLane;
+        // The zero mask may override the previous insert operation.
+        for (unsigned i = 0; i < 4; ++i)
+          if ((ZMask >> i) & 0x1)
+            ShuffleMask[i] = i + 4;
+      } else {
+        // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+        return nullptr;
+      }
+    } else {
+      // Replace the selected destination lane with the selected source lane.
+      ShuffleMask[DestLane] = SourceLane + 4;
+    }
+  
+    return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
   }
   return nullptr;
 }

Modified: llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll?rev=235810&r1=235809&r2=235810&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll Sat Apr 25 15:55:25 2015
@@ -30,14 +30,47 @@ define <4 x float> @insertps_0xff(<4 x f
 ; CHECK-NEXT:  ret <4 x float> zeroinitializer
 }
 
-; If some zero mask bits are set, we do not change anything.
+; If some zero mask bits are set that do not override the insertion, we do not change anything.
 
-define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
   ret <4 x float> %res
 
-; CHECK-LABEL: @insertps_0x03
-; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-LABEL: @insertps_0x0c
+; CHECK-NEXT:  call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; ...unless both input vectors are the same operand.
+
+define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x15_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane.
+
+define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x1a_single_input
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:  ret <4 x float>
+}
+
+; The zero mask overrides the insertion lane, so the second input vector is not used.
+
+define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xc1
+; CHECK-NEXT:  shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:  ret <4 x float>
 }
 





More information about the llvm-commits mailing list