[llvm] 48a23bc - [VectorCombine] limit load+insert transform to one-use

Thu Sep 17 11:29:27 PDT 2020

Author: Sanjay Patel
Date: 2020-09-17T14:29:15-04:00
New Revision: 48a23bccf3732e1480ad169bd4a08a68bb100bfa

URL: https://github.com/llvm/llvm-project/commit/48a23bccf3732e1480ad169bd4a08a68bb100bfa
DIFF: https://github.com/llvm/llvm-project/commit/48a23bccf3732e1480ad169bd4a08a68bb100bfa.diff

LOG: [VectorCombine] limit load+insert transform to one-use

As discussed in:
https://llvm.org/PR47558
...there are several potential fixes/follow-ups visible
in the test case, but this is the quickest and safest
fix of the perf regression.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/load.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index abc706c3eaa4..1bac16b92a9d 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -95,7 +95,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar load.
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
-  if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
+  if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+      !Scalar->hasOneUse())
     return false;
 
   // Do not vectorize scalar load (widening) if atomic/volatile or under

diff  --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index 6a63ebf497ab..5842f1478040 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -472,10 +472,8 @@ define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr
 ; CHECK-LABEL: @PR47558_multiple_use_load(
 ; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
 ; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, <2 x float>* [[OPPTR:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[SCALEPTR]] to <4 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
-; CHECK-NEXT:    [[T1:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[SCALE:%.*]] = load float, float* [[SCALEPTR]], align 16
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
 ; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
 ; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
 ; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0