[llvm] 042394b - [RISCV] Add a command line option to control the LMUL used by TTI's getRegisterBitWidth.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 7 20:02:16 PST 2022
Author: Craig Topper
Date: 2022-01-07T20:02:10-08:00
New Revision: 042394b69e99c8e9e502209af1c5ff529ac30586
URL: https://github.com/llvm/llvm-project/commit/042394b69e99c8e9e502209af1c5ff529ac30586
DIFF: https://github.com/llvm/llvm-project/commit/042394b69e99c8e9e502209af1c5ff529ac30586.diff
LOG: [RISCV] Add a command line option to control the LMUL used by TTI's getRegisterBitWidth.
By default we return the width of an LMUL=1 register. We can enable
testing with larger LMUL values by returning a larger bit width.
This patch adds a RISCV-specific option to provide an LMUL that will be
multiplied by the LMUL=1 bit width.
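For example (a sketch using a hypothetical input.ll, mirroring the RUN
lines in the updated test below), the vectorizer can be asked to size
vectors as if registers were grouped at LMUL=2:

  opt < input.ll -loop-vectorize -mtriple=riscv64 -mattr=+experimental-v \
      -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=2 -S

With a 128-bit minimum vector length and LMUL=2, getRegisterBitWidth
reports 256 bits for fixed-width vectors, so array_add below is
vectorized with <8 x i32> instead of <4 x i32>. Note that the
implementation clamps the requested value to a power of two between
1 and 8.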
Reviewed By: kito-cheng
Differential Revision: https://reviews.llvm.org/D116339
Added:
Modified:
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index c435430a12883..ad9fe418d11e8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -15,6 +15,13 @@ using namespace llvm;
#define DEBUG_TYPE "riscvtti"
+static cl::opt<unsigned> RVVRegisterWidthLMUL(
+ "riscv-v-register-bit-width-lmul",
+ cl::desc(
+ "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
+ "by autovectorized code. Fractional LMULs are not supported."),
+ cl::init(1), cl::Hidden);
+
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
@@ -137,6 +144,24 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
return BaseT::getMaxVScale();
}
+TypeSize
+RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ unsigned LMUL = PowerOf2Floor(
+ std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(ST->getXLen());
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(
+ ST->hasVInstructions() ? LMUL * ST->getMinRVVVectorSizeInBits() : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(
+ ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
+ }
+
+ llvm_unreachable("Unsupported register kind");
+}
+
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index f9e33f4e328dc..a591be7718bbb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -58,20 +58,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
bool supportsScalableVectors() const { return ST->hasVInstructions(); }
Optional<unsigned> getMaxVScale() const;
- TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
- switch (K) {
- case TargetTransformInfo::RGK_Scalar:
- return TypeSize::getFixed(ST->getXLen());
- case TargetTransformInfo::RGK_FixedWidthVector:
- return TypeSize::getFixed(
- ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0);
- case TargetTransformInfo::RGK_ScalableVector:
- return TypeSize::getScalable(
- ST->hasVInstructions() ? RISCV::RVVBitsPerBlock : 0);
- }
-
- llvm_unreachable("Unsupported register kind");
- }
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
index efbdc3abcfcb9..49007beb5714a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
@@ -1,14 +1,125 @@
-; RUN: opt < %s -loop-vectorize -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s --check-prefix=LMUL1
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s --check-prefix=LMUL1
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2
; Function Attrs: nounwind
define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-;CHECK-LABEL: array_add
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret
+; LMUL1-LABEL: @array_add(
+; LMUL1-NEXT: entry:
+; LMUL1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
+; LMUL1-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; LMUL1: for.body.preheader:
+; LMUL1-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
+; LMUL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; LMUL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; LMUL1: vector.ph:
+; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; LMUL1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; LMUL1-NEXT: br label [[VECTOR_BODY:%.*]]
+; LMUL1: vector.body:
+; LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LMUL1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; LMUL1-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; LMUL1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; LMUL1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; LMUL1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; LMUL1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
+; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; LMUL1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL1: middle.block:
+; LMUL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; LMUL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; LMUL1: scalar.ph:
+; LMUL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; LMUL1-NEXT: br label [[FOR_BODY:%.*]]
+; LMUL1: for.body:
+; LMUL1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; LMUL1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; LMUL1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; LMUL1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; LMUL1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; LMUL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; LMUL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
+; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; LMUL1: for.end.loopexit:
+; LMUL1-NEXT: br label [[FOR_END]]
+; LMUL1: for.end:
+; LMUL1-NEXT: ret i32* [[C]]
+;
+; LMUL2-LABEL: @array_add(
+; LMUL2-NEXT: entry:
+; LMUL2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
+; LMUL2-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; LMUL2: for.body.preheader:
+; LMUL2-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
+; LMUL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; LMUL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; LMUL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; LMUL2: vector.ph:
+; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; LMUL2-NEXT: br label [[VECTOR_BODY:%.*]]
+; LMUL2: vector.body:
+; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LMUL2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; LMUL2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
+; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
+; LMUL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; LMUL2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
+; LMUL2-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; LMUL2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; LMUL2-NEXT: store <8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], align 4
+; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; LMUL2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL2: middle.block:
+; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; LMUL2: scalar.ph:
+; LMUL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; LMUL2-NEXT: br label [[FOR_BODY:%.*]]
+; LMUL2: for.body:
+; LMUL2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; LMUL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; LMUL2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; LMUL2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; LMUL2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; LMUL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; LMUL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
+; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; LMUL2: for.end.loopexit:
+; LMUL2-NEXT: br label [[FOR_END]]
+; LMUL2: for.end:
+; LMUL2-NEXT: ret i32* [[C]]
+;
entry:
%cmp10 = icmp sgt i32 %size, 0
br i1 %cmp10, label %for.body.preheader, label %for.end