[llvm] 9803b0d - [RISCV] Implement getVScaleForTuning and thus prefer scalable vectorization when enabled

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Sat Jun 25 11:25:34 PDT 2022


Author: Philip Reames
Date: 2022-06-25T11:25:23-07:00
New Revision: 9803b0d1e7b3cbcce33c1c91d4e1cd1f20eea3d4

URL: https://github.com/llvm/llvm-project/commit/9803b0d1e7b3cbcce33c1c91d4e1cd1f20eea3d4
DIFF: https://github.com/llvm/llvm-project/commit/9803b0d1e7b3cbcce33c1c91d4e1cd1f20eea3d4.diff

LOG: [RISCV] Implement getVScaleForTuning and thus prefer scalable vectorization when enabled

The LoopVectorizer uses getVScaleForTuning when deciding how to discount the cost of a candidate vectorization factor by the amount of work it performs. Without this callback implemented, the vectorizer defaulted to an estimated vscale of 1, which made fixed-width vectorization look falsely profitable (since fixed-width costing used the command-line VLEN).
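Roughly, the vectorizer ranks candidate VFs by cost per lane, and for scalable VFs the lane count is scaled by the tuning vscale. A minimal sketch of that comparison, using a hypothetical helper rather than the actual LoopVectorize code:

// Hypothetical sketch, not the actual LoopVectorize implementation: cost per
// lane used to rank vectorization factors against each other.
static double perLaneCost(double LoopCost, unsigned KnownMinLanes,
                          bool ScalableVF, unsigned VScaleForTuning) {
  // For <vscale x 2 x i32>, KnownMinLanes is 2; the effective lane count is
  // multiplied by the tuning vscale.  With the hook unimplemented, the
  // fallback of 1 makes the scalable VF look like it covers only 2 lanes,
  // so a fixed <4 x i32> VF wins the comparison.
  unsigned Lanes = ScalableVF ? KnownMinLanes * VScaleForTuning : KnownMinLanes;
  return LoopCost / Lanes;
}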

The test change is pretty limited since a) we don't have much coverage of the vectorizer with scalable vectors at all, and b) what little coverage we have mostly uses i64 element types. There's a separate issue with <vscale x 1 x i64> which prevents us from getting to this stage of costing, so only the one test explicitly written to avoid it shows up in the diff. However, this is actually a very wide-impact change: when both fixed and scalable vectorization are enabled, the practical vectorization result now becomes scalable.

As an aside, I think the vectorizer is a little too strongly biased towards scalable when both are legal, but we can explore that separately. For now, let's just get the cost model working the way it was intended.

Differential Revision: https://reviews.llvm.org/D128547

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
    llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5ffbe5d83e456..f69a87d291be4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -137,6 +137,12 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
   return BaseT::getMaxVScale();
 }
 
+Optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
+  if (ST->hasVInstructions())
+    return ST->getRealMinVLen() / RISCV::RVVBitsPerBlock;
+  return BaseT::getVScaleForTuning();
+}
+
 TypeSize
 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   unsigned LMUL = PowerOf2Floor(
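For concreteness, a rough worked example of what the new hook returns (the 128-bit minimum VLEN below is illustrative): with the V extension guaranteeing VLEN >= 128, getRealMinVLen() is 128 and RISCV::RVVBitsPerBlock is 64, so getVScaleForTuning() returns 128 / 64 = 2. A <vscale x 2 x i32> VF is then costed as covering about 2 * 2 = 4 i32 lanes per iteration rather than the 2 lanes implied by the previous default of vscale = 1.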

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 02047f6274c2a..426d81063d7ae 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -57,6 +57,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   bool shouldExpandReduction(const IntrinsicInst *II) const;
   bool supportsScalableVectors() const { return ST->hasVInstructions(); }
   Optional<unsigned> getMaxVScale() const;
+  Optional<unsigned> getVScaleForTuning() const;
 
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
 

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
index c92b2a251ad34..d581b70222eba 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -157,35 +157,53 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
 ;
 ; VLEN128-LABEL: @vector_add_i32(
 ; VLEN128-NEXT:  entry:
-; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128:       vector.ph:
-; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[V:%.*]], i32 0
-; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[V]], i32 0
-; VLEN128-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i32 0
+; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i32 0
+; VLEN128-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VLEN128:       vector.body:
 ; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
-; VLEN128-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
-; VLEN128-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLEN128-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
-; VLEN128-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP4]], align 4
-; VLEN128-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP5]], align 4
-; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VLEN128-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; VLEN128-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; VLEN128-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; VLEN128-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; VLEN128-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
+; VLEN128-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 4
+; VLEN128-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; VLEN128-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 2
+; VLEN128-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP14]]
+; VLEN128-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP15]], align 4
+; VLEN128-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLEN128-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
+; VLEN128-NEXT:    store <vscale x 2 x i32> [[TMP16]], ptr [[TMP12]], align 4
+; VLEN128-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
+; VLEN128-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], 2
+; VLEN128-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP19]]
+; VLEN128-NEXT:    store <vscale x 2 x i32> [[TMP17]], ptr [[TMP20]], align 4
+; VLEN128-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
+; VLEN128-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VLEN128:       middle.block:
-; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; VLEN128:       scalar.ph:
-; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
 ; VLEN128:       for.body:
 ; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]


        

