[llvm] 6fb5bed - [SLP]Do not create unused gather nodes for scalar arguments of vector intrinsics.

Mon Nov 15 06:13:45 PST 2021

Author: Alexey Bataev
Date: 2021-11-15T06:11:19-08:00
New Revision: 6fb5bed7d16bffc8549db0523323b182dd43fe4e

URL: https://github.com/llvm/llvm-project/commit/6fb5bed7d16bffc8549db0523323b182dd43fe4e
DIFF: https://github.com/llvm/llvm-project/commit/6fb5bed7d16bffc8549db0523323b182dd43fe4e.diff

LOG: [SLP]Do not create unused gather nodes for scalar arguments of vector intrinsics.

If the vector intrinsic has scalar argument, we currently still create
a tree entry for this argument. This entry is not used, just consumes
resources and increases the cost of the tree.

Differential Revision: https://reviews.llvm.org/D113806

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8fd5adb1ad05..a077b2513915 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4024,6 +4024,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    ReuseShuffleIndicies);
       TE->setOperandsInOrder();
       for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
+        // For scalar operands no need to to create an entry since no need to
+        // vectorize it.
+        if (hasVectorInstrinsicScalarOpd(ID, i))
+          continue;
         ValueList Operands;
         // Prepare the operand vector.
         for (Value *V : VL) {

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
index 2c0ce0977a37..950bbcb7d5dd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
@@ -5,24 +5,17 @@ declare float @llvm.powi.f32.i32(float, i32)
 define void @vec_powi_f32(float* %a, float* %c, i32 %P) {
 ; CHECK-LABEL: @vec_powi_f32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I0:%.*]] = load float, float* [[A:%.*]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @llvm.powi.f32.i32(float [[I0]], i32 [[P:%.*]])
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[CALL2:%.*]] = tail call float @llvm.powi.f32.i32(float [[I2]], i32 [[P]])
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2
-; CHECK-NEXT:    [[I4:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[CALL3:%.*]] = tail call float @llvm.powi.f32.i32(float [[I4]], i32 [[P]])
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3
-; CHECK-NEXT:    [[I6:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[CALL4:%.*]] = tail call float @llvm.powi.f32.i32(float [[I6]], i32 [[P]])
-; CHECK-NEXT:    store float [[CALL1]], float* [[C:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1
-; CHECK-NEXT:    store float [[CALL2]], float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]])
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
-; CHECK-NEXT:    store float [[CALL3]], float* [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
-; CHECK-NEXT:    store float [[CALL4]], float* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: