[llvm] [RISCV] Fix missing scaling by LMUL in cost model (PR #73342)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 04:18:54 PST 2024
================
@@ -80,12 +84,44 @@ entry:
}
define <4 x i64> @ctpop_v4i64(ptr %a) {
-; CHECK-LABEL: define <4 x i64> @ctpop_v4i64
-; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP0]])
-; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+; RV32-LABEL: define <4 x i64> @ctpop_v4i64
+; RV32-SAME: (ptr [[A:%.*]]) #[[ATTR0]] {
+; RV32-NEXT: entry:
+; RV32-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A]], align 32
+; RV32-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP0]])
+; RV32-NEXT: ret <4 x i64> [[TMP1]]
+;
+; RV64-LABEL: define <4 x i64> @ctpop_v4i64
+; RV64-SAME: (ptr [[A:%.*]]) #[[ATTR0]] {
+; RV64-NEXT: entry:
+; RV64-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A]], align 32
+; RV64-NEXT: [[VECEXT:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[VECEXT]])
+; RV64-NEXT: [[VECINS:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i64 0
+; RV64-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[VECEXT_1]])
+; RV64-NEXT: [[VECINS_1:%.*]] = insertelement <4 x i64> [[VECINS]], i64 [[TMP2]], i64 1
+; RV64-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; RV64-NEXT: [[TMP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[VECEXT_2]])
+; RV64-NEXT: [[VECINS_2:%.*]] = insertelement <4 x i64> [[VECINS_1]], i64 [[TMP3]], i64 2
+; RV64-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; RV64-NEXT: [[TMP4:%.*]] = call i64 @llvm.ctpop.i64(i64 [[VECEXT_3]])
+; RV64-NEXT: [[VECINS_3:%.*]] = insertelement <4 x i64> [[VECINS_2]], i64 [[TMP4]], i64 3
+; RV64-NEXT: ret <4 x i64> [[VECINS_3]]
----------------
lukel97 wrote:
Not for this patch, but we're now overcosting the intrinsics in the cost table, since not every instruction in their expansion needs to be scaled by LMUL. E.g. here:
```llvm
define <4 x i64> @ctpop_v4i64(<4 x i64> %v) {
%w = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %v)
ret <4 x i64> %w
}
```
gives us
```asm
vsetivli zero, 4, e64, m2, ta, ma
vsrl.vi v10, v8, 1
lui a0, 349525
addiw a0, a0, 1365
slli a1, a0, 32
add a0, a0, a1
vand.vx v10, v10, a0
vsub.vv v8, v8, v10
lui a0, 209715
addiw a0, a0, 819
slli a1, a0, 32
add a0, a0, a1
vand.vx v10, v8, a0
vsrl.vi v8, v8, 2
vand.vx v8, v8, a0
vadd.vv v8, v10, v8
vsrl.vi v10, v8, 4
vadd.vv v8, v8, v10
lui a0, 61681
addiw a0, a0, -241
slli a1, a0, 32
add a0, a0, a1
vand.vx v8, v8, a0
lui a0, 4112
addiw a0, a0, 257
slli a1, a0, 32
add a0, a0, a1
vmul.vx v8, v8, a0
li a0, 56
vsrl.vx v8, v8, a0
```
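Only 12 of the 30 instructions (the vector arithmetic/logic ops) actually need to be multiplied by LMUL; the scalar instructions that materialize the constants, and the `vsetivli`, do not. We should probably look at using `getRISCVInstructionCost` to model such sequences of mixed scalar+vector instructions better. cc @arcbbb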
https://github.com/llvm/llvm-project/pull/73342
More information about the llvm-commits mailing list