[llvm] [RISCV] Account for zvfhmin and zvfbfmin promotion in register usage (PR #108370)

Thu Sep 12 04:38:39 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Luke Lau (lukel97)

<details>
<summary>Changes</summary>

A half with only zvfhmin or bfloat will end up getting promoted to a f32 for most instructions.

Unless the loop consists only of memory ops and permutation instructions which don't need promoted (is this common?), we'll end up using double the LMUL than what's currently being returned by getRegUsageForType.

Since this is used by the loop vectorizer, it seems better to be conservative and assume that any usage of a zvfhmin half/bfloat will end up being widened to a f32


---
Full diff: https://github.com/llvm/llvm-project/pull/108370.diff


3 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+8-1) 
- (added) llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll (+31) 
- (added) llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll (+37) 


``````````diff

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 2b5e7c47279284..3303534ecb4968 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2030,8 +2030,15 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
 }
 
 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
-  TypeSize Size = DL.getTypeSizeInBits(Ty);
   if (Ty->isVectorTy()) {
+    // f16 w/ zvfhmin and bf16 types will be promoted to f32
+    Type *EltTy = cast<VectorType>(Ty)->getElementType();
+    if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
+        EltTy->isBFloatTy())
+      Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
+                           cast<VectorType>(Ty));
+
+    TypeSize Size = DL.getTypeSizeInBits(Ty);
     if (Size.isScalable() && ST->hasVInstructions())
       return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
new file mode 100644
index 00000000000000..89514431278a74
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
@@ -0,0 +1,31 @@
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -debug-only=loop-vectorize -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s
+
+define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
+; CHECK-LABEL: add
+; CHECK:       LV(REG): Found max usage: 2 item
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
+
+entry:
+  %conv = zext i32 %size to i64
+  %cmp10.not = icmp eq i32 %size, 0
+  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.011 = phi i64 [ %add4, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds bfloat, ptr %src1, i64 %i.011
+  %0 = load bfloat, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds bfloat, ptr %src2, i64 %i.011
+  %1 = load bfloat, ptr %arrayidx2, align 4
+  %add = fadd bfloat %0, %1
+  %arrayidx3 = getelementptr inbounds bfloat, ptr %result, i64 %i.011
+  store bfloat %add, ptr %arrayidx3, align 4
+  %add4 = add nuw nsw i64 %i.011, 1
+  %exitcond.not = icmp eq i64 %add4, %conv
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
new file mode 100644
index 00000000000000..ceedcfba4691e1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
@@ -0,0 +1,37 @@
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -debug-only=loop-vectorize -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH
+; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -debug-only=loop-vectorize -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN
+
+define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
+; CHECK-LABEL: add
+; ZVFH:       LV(REG): Found max usage: 2 item
+; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; ZVFH-NEXT:  LV(REG): Found invariant usage: 1 item
+; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
+; ZVFHMIN:       LV(REG): Found max usage: 2 item
+; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; ZVFHMIN-NEXT:  LV(REG): Found invariant usage: 1 item
+; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
+
+entry:
+  %conv = zext i32 %size to i64
+  %cmp10.not = icmp eq i32 %size, 0
+  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.011 = phi i64 [ %add4, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds half, ptr %src1, i64 %i.011
+  %0 = load half, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds half, ptr %src2, i64 %i.011
+  %1 = load half, ptr %arrayidx2, align 4
+  %add = fadd half %0, %1
+  %arrayidx3 = getelementptr inbounds half, ptr %result, i64 %i.011
+  store half %add, ptr %arrayidx3, align 4
+  %add4 = add nuw nsw i64 %i.011, 1
+  %exitcond.not = icmp eq i64 %add4, %conv
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/108370