[llvm] [SLP] no need to generate extract for in-tree uses for original scala… (PR #76077)

Wed Dec 20 09:13:15 PST 2023

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Enna1 (Enna1)

<details>
<summary>Changes</summary>

…r instruction.

Before https://github.com/llvm/llvm-project/commit/77a609b55636dc540090ef9105c60a99cfdbd1dd, we always skip in-tree uses of the vectorized scalars in `buildExternalUses()`, that commit handle the case that if the in-tree use is scalar operand in vectorized instruction, we need to generate extract for these in-tree uses.

in-tree uses remain as scalar in vectorized instructions can be 3 cases:
- The pointer operand of vectorized LoadInst uses an in-tree scalar
- The pointer operand of vectorized StoreInst uses an in-tree scalar
- The scalar argument of vector form intrinsic uses an in-tree scalar

Generating extract for in-tree uses for vectorized instructions are implemented in BoUpSLP::vectorizeTree():
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11497-L11506
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11542-L11551
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11657-L11667

However, https://github.com/llvm/llvm-project/commit/77a609b55636dc540090ef9105c60a99cfdbd1dd not only generates extract for vectorized instructions, but also generate extract for original scalar instructions.

There is no need to generate extract for origin scalar instrutions, as these scalar instructions will be replaced by vector instructions and get erased later. Extract for origin scalar instrutions are also generated because in `BoUpSLP::buildExternalUses()` if `doesInTreeUserNeedToExtract()` return true, <in-tree scalar, scalar instruction use in-tree scalar> will be pushed to `ExternalUses`.

To omit generating extract for original scalar instruction, this patch remove `doesInTreeUserNeedToExtract()` check, and fold the follwing if expression to always true.
```
if (UseScalar != U ||
    UseEntry->State == TreeEntry::ScatterVectorize ||
    UseEntry->State == TreeEntry::PossibleStridedVectorize ||
    !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI))
```

With this change, it is also more likely to be profitable to vectorize since we remove the unneed entries in `ExternalUses` and get less extraction cost. E.g. the llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll testcase is updated as the `test2()` function is successfully vectorized with this change.

---
Full diff: https://github.com/llvm/llvm-project/pull/76077.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+4-40) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll (+8-11) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c325ad8a291a2..4e52421c287250 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -771,33 +771,6 @@ static bool allSameType(ArrayRef<Value *> VL) {
   return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
 }
 
-/// \returns True if in-tree use also needs extract. This refers to
-/// possible scalar operand in vectorized instruction.
-static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
-                                        TargetLibraryInfo *TLI) {
-  unsigned Opcode = UserInst->getOpcode();
-  switch (Opcode) {
-  case Instruction::Load: {
-    LoadInst *LI = cast<LoadInst>(UserInst);
-    return (LI->getPointerOperand() == Scalar);
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(UserInst);
-    return (SI->getPointerOperand() == Scalar);
-  }
-  case Instruction::Call: {
-    CallInst *CI = cast<CallInst>(UserInst);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
-      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
-             Arg.value().get() == Scalar;
-    });
-  }
-  default:
-    return false;
-  }
-}
-
 /// \returns the AA location that is being access by the instruction.
 static MemoryLocation getLocation(Instruction *I) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -4933,19 +4906,10 @@ void BoUpSLP::buildExternalUses(
 
         // Skip in-tree scalars that become vectors
         if (TreeEntry *UseEntry = getTreeEntry(U)) {
-          Value *UseScalar = UseEntry->Scalars[0];
-          // Some in-tree scalars will remain as scalar in vectorized
-          // instructions. If that is the case, the one in Lane 0 will
-          // be used.
-          if (UseScalar != U ||
-              UseEntry->State == TreeEntry::ScatterVectorize ||
-              UseEntry->State == TreeEntry::PossibleStridedVectorize ||
-              !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
-            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
-                              << ".\n");
-            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
-            continue;
-          }
+          LLVM_DEBUG(dbgs()
+                     << "SLP: \tInternal user will be removed:" << *U << ".\n");
+          assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+          continue;
         }
 
         // Ignore users in the user ignore list.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
index a2c2e41d81b426..e2adeb911670a5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
@@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {
 
 define void @test2(i64* %a, i64* %b) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
-; CHECK-NEXT:    [[I1:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3
-; CHECK-NEXT:    [[I2:%.*]] = ptrtoint ptr [[B3]] to i64
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A1]], align 8
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[I1]], [[V1]]
-; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[I2]], [[V2]]
-; CHECK-NEXT:    store i64 [[ADD1]], ptr [[A1]], align 8
-; CHECK-NEXT:    store i64 [[ADD2]], ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a1 = getelementptr inbounds i64, i64* %a, i64 1

``````````

</details>


https://github.com/llvm/llvm-project/pull/76077