[llvm] r336095 - [SLPVectorizer] Fix alternate opcode + shuffle cost function to correct handle SK_Select patterns.

Mon Jul 2 04:28:01 PDT 2018

Author: rksimon
Date: Mon Jul  2 04:28:01 2018
New Revision: 336095

URL: http://llvm.org/viewvc/llvm-project?rev=336095&view=rev
Log:
[SLPVectorizer] Fix alternate opcode + shuffle cost function to correct handle SK_Select patterns.

We were always using the opcodes of the first 2 scalars for the costs of the alternate opcode + shuffle. This made sense when we used SK_Alternate and opcodes were guaranteed to be alternating, but this fails for the more general SK_Select case.

This fix exposes an issue demonstrated by the fmul_fdiv_v4f32_const test - the SLM model has v4f32 fdiv costs which are more than twice those of the f32 scalar cost, meaning that the cost model determines that the vectorization is not performant. Unfortunately it completely ignores the fact that the fdiv by a constant will be changed into a fmul by InstCombine for a much lower cost vectorization. But at least we're seeing this now...

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=336095&r1=336094&r2=336095&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Mon Jul  2 04:28:01 2018
@@ -2375,14 +2375,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
         Instruction *I = cast<Instruction>(i);
         if (!I)
           break;
+        assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
         ScalarCost += TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy);
       }
       // VecCost is equal to sum of the cost of creating 2 vectors
       // and the cost of creating shuffle.
-      Instruction *I0 = cast<Instruction>(VL[0]);
-      VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy);
-      Instruction *I1 = cast<Instruction>(VL[1]);
-      VecCost += TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy);
+      VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy);
+      VecCost += TTI->getArithmeticInstrCost(S.AltOpcode, VecTy);
       VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
       return ReuseShuffleCost + VecCost - ScalarCost;
     }

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll?rev=336095&r1=336094&r2=336095&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll Mon Jul  2 04:28:01 2018
@@ -120,9 +120,30 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x
 }
 
 define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
-; CHECK-LABEL: @fmul_fdiv_v4f32_const(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+; SSE-LABEL: @fmul_fdiv_v4f32_const(
+; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    ret <4 x float> [[TMP1]]
+;
+; SLM-LABEL: @fmul_fdiv_v4f32_const(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    ret <4 x float> [[R3]]
+;
+; AVX-LABEL: @fmul_fdiv_v4f32_const(
+; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    ret <4 x float> [[TMP1]]
+;
+; AVX512-LABEL: @fmul_fdiv_v4f32_const(
+; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1