[llvm] [RISCV] Be more aggressive about shrinking constant build_vector etype (PR #67175)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 22 11:31:53 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
If LMUL is greater than m1, we can be more aggressive about narrowing the build_vector via a vsext when that is legal. If the narrowed build_vector is lowered as a load, then while both the load and the extend are linear in LMUL, load uops are generally more expensive than extend uops. If the narrowed build_vector is lowered via dominant values, that work is linear in both the number of unique elements and LMUL. So provided the number of unique values is greater than 2, this is a net win in work performed.
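For a concrete sense of the effect, the `active_lane_mask.ll` diff below shows the pattern: a 16-element e64 index vector that used to be loaded at full width from the constant pool is now loaded as e8 and widened in-register. The snippet is distilled from that test diff; the register numbers are just the ones the test happens to use.

```
; Before: constant-pool load at the full element width (EEW=64, EMUL=8)
vsetivli  zero, 16, e64, m8, ta, ma
vle64.v   v8, (a0)

; After: narrow e8 load (EEW=8, EMUL=1) plus an in-register widening extend
vsetivli  zero, 16, e64, m8, ta, ma
vle8.v    v8, (a0)
vsext.vf8 v16, v8
```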
---
Patch is 49.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67175.diff
11 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+8-8)
- (modified) llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll (+33-22)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll (+16-16)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll (+4-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll (+239-145)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll (+14-12)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll (+4-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll (+20-10)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll (+29-30)
- (modified) llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll (+15-12)
- (modified) llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll (+24-24)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 000c7157fe8e827..0827570e7b32c4c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3511,17 +3511,14 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
}
}
- if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
- return Res;
-
// If the number of signbits allows, see if we can lower as a <N x i8>.
- // We restrict this to N <= 4 to ensure the resulting narrow vector is
- // 32 bits of smaller and can thus be materialized cheaply from scalar.
- // The main motivation for this is the constant index vector required
- // by vrgather.vv. This covers all indice vectors up to size 4.
+ // Our main goal here is to reduce LMUL (and thus work) required to
+ // build the constant, but we will also narrow if the resulting
+ // narrow vector is known to materialize cheaply.
// TODO: We really should be costing the smaller vector. There are
// profitable cases this misses.
- if (EltBitSize > 8 && NumElts <= 4) {
+ if (EltBitSize > 8 &&
+ (NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
unsigned SignBits = DAG.ComputeNumSignBits(Op);
if (EltBitSize - SignBits < 8) {
SDValue Source =
@@ -3533,6 +3530,9 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
}
}
+ if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
+ return Res;
+
// For constant vectors, use generic constant pool lowering. Otherwise,
// we'd have to materialize constants in GPRs just to move them into the
// vector.
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 2437c293644c1d3..87d95d7596d4fa3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -106,11 +106,12 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
-; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vsaddu.vx v16, v16, a1
; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v0, v16, 2
@@ -125,27 +126,30 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vsaddu.vx v16, v16, a1
; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 4
; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v0, v16, 6
@@ -160,59 +164,66 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0)
-; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vsaddu.vx v16, v16, a1
; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 4
; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 6
; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 8
; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 10
; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 12
; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsext.vf8 v16, v8
+; CHECK-NEXT: vsaddu.vx v8, v16, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vi v0, v16, 14
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index 892db75ee671015..5605437443d76bb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -950,16 +950,16 @@ define i32 @extractelt_mul_v4i32(<4 x i32> %x) {
define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
; RV32NOM-LABEL: extractelt_sdiv_v4i32:
; RV32NOM: # %bb.0:
-; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32NOM-NEXT: vmv.v.i v9, 0
; RV32NOM-NEXT: lui a0, %hi(.LCPI42_0)
; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI42_0)
-; RV32NOM-NEXT: vle32.v v10, (a0)
-; RV32NOM-NEXT: li a0, -1
-; RV32NOM-NEXT: vslide1down.vx v9, v9, a0
-; RV32NOM-NEXT: vand.vv v9, v8, v9
-; RV32NOM-NEXT: vmulh.vv v8, v8, v10
-; RV32NOM-NEXT: vadd.vv v8, v8, v9
+; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32NOM-NEXT: vle32.v v9, (a0)
+; RV32NOM-NEXT: vmulh.vv v9, v8, v9
+; RV32NOM-NEXT: lui a0, 1044480
+; RV32NOM-NEXT: vmv.s.x v10, a0
+; RV32NOM-NEXT: vsext.vf4 v11, v10
+; RV32NOM-NEXT: vand.vv v8, v8, v11
+; RV32NOM-NEXT: vadd.vv v8, v9, v8
; RV32NOM-NEXT: lui a0, 12320
; RV32NOM-NEXT: addi a0, a0, 257
; RV32NOM-NEXT: vmv.s.x v9, a0
@@ -986,16 +986,16 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
;
; RV64NOM-LABEL: extractelt_sdiv_v4i32:
; RV64NOM: # %bb.0:
-; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64NOM-NEXT: vmv.v.i v9, 0
; RV64NOM-NEXT: lui a0, %hi(.LCPI42_0)
; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI42_0)
-; RV64NOM-NEXT: vle32.v v10, (a0)
-; RV64NOM-NEXT: li a0, -1
-; RV64NOM-NEXT: vslide1down.vx v9, v9, a0
-; RV64NOM-NEXT: vand.vv v9, v8, v9
-; RV64NOM-NEXT: vmulh.vv v8, v8, v10
-; RV64NOM-NEXT: vadd.vv v8, v8, v9
+; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64NOM-NEXT: vle32.v v9, (a0)
+; RV64NOM-NEXT: vmulh.vv v9, v8, v9
+; RV64NOM-NEXT: lui a0, 1044480
+; RV64NOM-NEXT: vmv.s.x v10, a0
+; RV64NOM-NEXT: vsext.vf4 v11, v10
+; RV64NOM-NEXT: vand.vv v8, v8, v11
+; RV64NOM-NEXT: vadd.vv v8, v9, v8
; RV64NOM-NEXT: lui a0, 12320
; RV64NOM-NEXT: addiw a0, a0, 257
; RV64NOM-NEXT: vmv.s.x v9, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index 79947ca4cdf0696..e95978744c408e9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -292,7 +292,8 @@ define <4 x i64> @buildvec_vid_step1_add0_v4i64() {
; RV32-NEXT: lui a0, %hi(.LCPI25_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI25_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vle8.v v10, (a0)
+; RV32-NEXT: vsext.vf4 v8, v10
; RV32-NEXT: ret
;
; RV64-LABEL: buildvec_vid_step1_add0_v4i64:
@@ -309,7 +310,8 @@ define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
; RV32-NEXT: lui a0, %hi(.LCPI26_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI26_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: vle8.v v10, (a0)
+; RV32-NEXT: vsext.vf4 v8, v10
; RV32-NEXT: ret
;
; RV64-LABEL: buildvec_vid_step2_add0_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index dbf7dfbcab49cb1..b2a9813e50a1868 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1292,29 +1292,55 @@ define void @mulhu_v6i16(ptr %x) {
}
define void @mulhu_v4i32(ptr %x) {
-; CHECK-LABEL: mulhu_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: vmv.s.x v9, a1
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v9, 2
-; CHECK-NEXT: lui a1, %hi(.LCPI68_0)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v9, (a1)
-; CHECK-NEXT: vmulhu.vv v9, v8, v9
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vmulhu.vv v8, v8, v10
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.i v9, 2
-; CHECK-NEXT: li a1, 1
-; CHECK-NEXT: vslide1down.vx v9, v9, a1
-; CHECK-NEXT: vsrl.vv v8, v8, v9
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: ret
+; RV32-LABEL: mulhu_v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: vmv.s.x v9, a1
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; RV32-NEXT: vslideup.vi v10, v9, 2
+; RV32-NEXT: lui a1, %hi(.LCPI68_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0)
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vle32.v v9, (a1)
+; RV32-NEXT: vmulhu.vv v9, v8, v9
+; RV32-NEXT: vsub.vv v8, v8, v9
+; RV32-NEXT: vmulhu.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: lui a1, 4128
+; RV32-NEXT: addi a1, a1, 514
+; RV32-NEXT: vmv.s.x v9, a1
+; RV32-NEXT: vsext.vf4 v10, v9
+; RV32-NEXT: vsrl.vv v8, v8, v10
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mulhu_v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: lui a1, 524288
+; RV64-NEXT: vmv.s.x v9, a1
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vsetivli zero, 3, e32, m1, tu, ma
+; RV64-NEXT: vslideup.vi v10, v9, 2
+; RV64-NEXT: lui a1, %hi(.LCPI68_0)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI68_0)
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vle32.v v9, (a1)
+; RV64-NEXT: vmulhu.vv v9, v8, v9
+; RV64-NEXT: vsub.vv v8, v8, v9
+; RV64-NEXT: vmulhu.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v8, v9
+; RV64-NEXT: lui a1, 4128
+; RV64-NEXT: addiw a1, a1, 514
+; RV64-NEXT: vmv.s.x v9, a1
+; RV64-NEXT: vsext.vf4 v10, v9
+; RV64-NEXT: vsrl.vv v8, v8, v10
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: ret
%a = load <4 x i32>, ptr %x
%b = udiv <4 x i32> %a, <i32 5, i32 6, i32 7, i32 9>
store <4 x i32> %b, ptr %x
@@ -1461,29 +1487,57 @@ define void @mulhs_v8i16(ptr %x) {
}
define void @mulhs_v6i16(ptr %x) {
-; CHECK-LABEL: mulhs_v6i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 7
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: li a1, -14
-; CHECK-NEXT: vmadd.vx v10, a1, v9
-; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vdiv.vv v9, v9, v10
-; CHECK-NEXT: vmv.v.i v0, 6
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, -7
-; CHECK-NEXT: vmerge.vim v10, v10, 7, v0
-; CHECK-NEXT: vdiv.vv v8, v8, v10
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 4
-; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
-; CHECK-NEXT: ret
+; RV32-LABEL: mulhs_v6i16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; RV32-NEXT: vle16.v v8, (a0)
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vmv.v.i v9, 7
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: li a1, -14
+; RV32-NEXT: vmadd.vx v10, a1, v9
+; RV32-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vdiv.vv v9, v9, v10
+; RV32-NEXT: lui a1, 1020016
+; RV32-NEXT: addi a1, a1, 2041
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.s.x v10, a1
+; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT: vsext.vf2 v11, v10
+; RV32-NEXT: vdiv.vv v8, v8, v11
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vslideup.vi v8, v9, 4
+; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mulhs_v6i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; RV64-NEXT: vle16.v v8, (a0)
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vmv.v.i v9, 7
+; RV64-NEXT: vid.v v10
+; RV64-NEXT: li a1, -14
+; RV64-NEXT: vmadd.vx v10, a1, v9
+; RV64-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vdiv.vv v9, v9, v10
+; RV64-NEXT: lui a1, 1020016
+; RV64-NEXT: addiw a1, a1, 2041
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.s.x v10, a1
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vsext.vf2 v11, v10
+; RV64-NEXT: vdiv.vv v8, v8, v11
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vslideup.vi v8, v9, 4
+; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
%a = load <6 x i16>, ptr %x
%b = sdiv <6 x i16> %a, <i16 -7, i16 7, i16 7, i16 -7, i16 7, i16 -7>
store <6 x i16> %b, ptr %x
@@ -1550,16 +1604,15 @@ define void @mulhs_v2i64(ptr %x) {
; RV32-NEXT: vrsub.vi v10, v10, 0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vmadd.vv v10, v8, v9
+; RV32-NEXT: li a1, 63
+; RV32-NEXT: vsrl.vx v8, v10, a1
+; RV32-NEXT: lui a1, 16
+; RV32-NEXT: vmv.s.x v9, a1
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.i v8, 1
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; RV32-NEXT: vslideup.vi v9, v8, 2
+; RV32-NEXT: vsext.vf4 v11, v9
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsra.vv v8, v10, v9
-; RV32-NEXT: li a1, 63
-; RV32-NEXT: vsrl.vx v9, v10, a1
-; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: vsra.vv v9, v10, v11
+; RV32-NEXT: vadd.vv v8, v9, v8
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -5101,63 +5154,70 @@ define void @mulhu_v16i16(ptr %x) {
; LMULMAX2-RV32-NEXT: vle16.v v10, (a0)
; LMULMAX2-RV32-NEXT: li a1, 257
; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1
-; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0
+; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0
+; LMULMAX2-RV32-NEXT: lui a1, 1048568
+; LMULMAX2-RV32-NEXT: vmerge.vxm v12, v8, a1, v0
; LMULMAX2-RV32-NEXT: lui a1, 4
; LMULMAX2-RV32-NEXT: addi a1, a1, 64
; LMULMAX2-RV32-NEXT: vmv.s.x v8, a1
+; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; LMULMAX2-RV32-NEXT: vmv.v.i v9, 0
+; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8
+; LMULMAX2-RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI182_0)
; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI182_0)
; LMULMAX2-RV32-NEXT: vle16.v v14, (a1)
-; LMULMAX2-RV32-NEXT: lui a1, 1048568
-; LMULMAX2-RV32-NEXT: vmerge.vxm v16, v12, a1, v0
-; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8
-; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 1, v0
-; LMULMAX2-RV32-NEXT: vsrl.vv v12, v10, v12
-; LMULMAX2-RV32-NEXT: vmulhu.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v16
-; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v12
+; LMULMAX2-RV32-NEXT: vsext.vf2 v16, v9
+; LMULMAX2-RV32-NEXT: vsrl.vv v16, v10, v16
+; LMULMAX2-RV32-NEXT: vmulhu.vv v14, v16, v14
+; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v14
+; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v12
+; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v14
; LMULMAX2-RV32-NEXT: lui a1, 2
; LMULMAX2-RV32-NEXT: addi a1, a1, 289
; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1
-; LMULMAX2-RV32-NEXT: vmv.v.i v12, 3
-; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 2, v0
+; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; LMULMAX2-RV32-NEXT: vmv.v.i v9, 3
+; LMULMAX2-RV32-NEXT: vmerge.vim v9, v9, 2, v0
; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8
-; LMULMAX2-RV32-NEXT: vmerge.vim v8, v12, 1, v0
-; LMULMAX2-RV32-NEXT: vsrl.vv v8, v10, v8
+; LMULMAX2-RV32-NEXT: vmerge.vim v8, v9, 1, v0
+; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; LMULMAX2-RV32-NEXT: vsext.vf2 v12, v8
+; LMULMAX2-RV32-NEXT: vsrl.vv v8, v10, v12
; LMULMAX2-RV32-NEXT: vse16.v v8, (a0)
; LMULMAX2-RV32-NEXT: ret
;
; LMULMAX2-RV64-LABEL: mulhu_v16i16:
; LMULMAX2-RV64: # %bb.0:
; LMULMAX2-RV64-NEXT: vsetivli ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/67175