[llvm] [RISCV] Use vwsll.vi/vx + vwadd.wv to lower vector.interleave when Zvbb enabled. (PR #67521)
Yeting Kuo via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 27 17:18:38 PDT 2023
https://github.com/yetingk updated https://github.com/llvm/llvm-project/pull/67521
From 3f0aa02bb54b232dcec25f21dd52b735cd33f276 Mon Sep 17 00:00:00 2001
From: Yeting Kuo <yeting.kuo at sifive.com>
Date: Wed, 27 Sep 2023 14:45:15 +0800
Subject: [PATCH 1/2] [RISCV] Use vwsll.vi/vx + vwadd.wv to lower
vector.interleave when Zvbb enabled.
The replacement can avoid an assignment to a GPR when the element type is i8 or i16,
and it avoids vwmaccu.vx, which may have a higher cost than vwsll.vi/vx.
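The lowering relies on the identity Interleaved = (OddV << SEW) + EvenV followed by a
bitcast of the wide result back to the narrow element type. Below is a minimal standalone
C++ sketch of that identity (illustrative only, not LLVM code; it assumes a little-endian
host so that the low byte of each wide element lands at the even index, mirroring RVV
element ordering):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint8_t even[4] = {0x10, 0x11, 0x12, 0x13}; // EvenV
  const uint8_t odd[4]  = {0x20, 0x21, 0x22, 0x23}; // OddV

  // Per element: (odd << SEW) + even in the double-width type,
  // i.e. what vwsll.vi 8 followed by the widening add computes.
  uint16_t wide[4];
  for (int i = 0; i < 4; ++i)
    wide[i] = (uint16_t)((uint16_t)odd[i] << 8) + even[i];

  // Reinterpret the wide vector as narrow elements (the bitcast in the DAG).
  uint8_t interleaved[8];
  std::memcpy(interleaved, wide, sizeof(wide));

  // Prints 0x10 0x20 0x11 0x21 0x12 0x22 0x13 0x23, i.e. even/odd interleaved.
  for (int i = 0; i < 8; ++i)
    std::printf("%#x ", interleaved[i]);
  std::printf("\n");
  return 0;
}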
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 54 ++--
llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 +
.../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 1 +
llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 18 ++
.../RISCV/rvv/vector-interleave-fixed.ll | 78 +++++
.../CodeGen/RISCV/rvv/vector-interleave.ll | 270 ++++++++++++++++++
6 files changed, 402 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f6663c7f435ad70..d06db590f80e009 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4264,24 +4264,37 @@ static SDValue getWideningInterleave(SDValue EvenV, SDValue OddV,
auto [Mask, VL] = getDefaultVLOps(VecVT, VecContainerVT, DL, DAG, Subtarget);
SDValue Passthru = DAG.getUNDEF(WideContainerVT);
- // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV with
- // vwaddu.vv
- SDValue Interleaved = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideContainerVT,
- EvenV, OddV, Passthru, Mask, VL);
-
- // Then get OddV * by 2^(VecVT.getScalarSizeInBits() - 1)
- SDValue AllOnesVec = DAG.getSplatVector(
- VecContainerVT, DL, DAG.getAllOnesConstant(DL, Subtarget.getXLenVT()));
- SDValue OddsMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideContainerVT, OddV,
- AllOnesVec, Passthru, Mask, VL);
-
- // Add the two together so we get
- // (OddV * 0xff...ff) + (OddV + EvenV)
- // = (OddV * 0x100...00) + EvenV
- // = (OddV << VecVT.getScalarSizeInBits()) + EvenV
- // Note the ADD_VL and VLMULU_VL should get selected as vwmaccu.vx
- Interleaved = DAG.getNode(RISCVISD::ADD_VL, DL, WideContainerVT, Interleaved,
- OddsMul, Passthru, Mask, VL);
+ SDValue Interleaved;
+ if (Subtarget.hasStdExtZvbb()) {
+ // Interleaved = (OddV << VecVT.getScalarSizeInBits()) + EvenV.
+ SDValue OffsetVec =
+ DAG.getSplatVector(VecContainerVT, DL,
+ DAG.getConstant(VecVT.getScalarSizeInBits(), DL,
+ Subtarget.getXLenVT()));
+ Interleaved = DAG.getNode(RISCVISD::VWSLL_VL, DL, WideContainerVT, OddV,
+ OffsetVec, Passthru, Mask, VL);
+ Interleaved = DAG.getNode(RISCVISD::VWADD_W_VL, DL, WideContainerVT,
+ Interleaved, EvenV, Passthru, Mask, VL);
+ } else {
+ // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV with
+ // vwaddu.vv
+ Interleaved = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideContainerVT, EvenV,
+ OddV, Passthru, Mask, VL);
+
+ // Then get OddV * by 2^(VecVT.getScalarSizeInBits() - 1)
+ SDValue AllOnesVec = DAG.getSplatVector(
+ VecContainerVT, DL, DAG.getAllOnesConstant(DL, Subtarget.getXLenVT()));
+ SDValue OddsMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideContainerVT,
+ OddV, AllOnesVec, Passthru, Mask, VL);
+
+ // Add the two together so we get
+ // (OddV * 0xff...ff) + (OddV + EvenV)
+ // = (OddV * 0x100...00) + EvenV
+ // = (OddV << VecVT.getScalarSizeInBits()) + EvenV
+ // Note the ADD_VL and VLMULU_VL should get selected as vwmaccu.vx
+ Interleaved = DAG.getNode(RISCVISD::ADD_VL, DL, WideContainerVT,
+ Interleaved, OddsMul, Passthru, Mask, VL);
+ }
// Bitcast from <vscale x n * ty*2> to <vscale x 2*n x ty>
MVT ResultContainerVT = MVT::getVectorVT(
@@ -5315,7 +5328,7 @@ static bool hasMergeOp(unsigned Opcode) {
Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE &&
"not a RISC-V target specific op");
static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP ==
- 124 &&
+ 125 &&
RISCVISD::LAST_RISCV_STRICTFP_OPCODE -
ISD::FIRST_TARGET_STRICTFP_OPCODE ==
21 &&
@@ -5339,7 +5352,7 @@ static bool hasMaskOp(unsigned Opcode) {
Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE &&
"not a RISC-V target specific op");
static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP ==
- 124 &&
+ 125 &&
RISCVISD::LAST_RISCV_STRICTFP_OPCODE -
ISD::FIRST_TARGET_STRICTFP_OPCODE ==
21 &&
@@ -17535,6 +17548,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VWADDU_W_VL)
NODE_NAME_CASE(VWSUB_W_VL)
NODE_NAME_CASE(VWSUBU_W_VL)
+ NODE_NAME_CASE(VWSLL_VL)
NODE_NAME_CASE(VFWMUL_VL)
NODE_NAME_CASE(VFWADD_VL)
NODE_NAME_CASE(VFWSUB_VL)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b8e6994a876ac8..2675b0ce43e439f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -309,6 +309,7 @@ enum NodeType : unsigned {
VWADDU_W_VL,
VWSUB_W_VL,
VWSUBU_W_VL,
+ VWSLL_VL,
VFWMUL_VL,
VFWADD_VL,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 78b67e59082308c..0a8ec34c4de670d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -409,6 +409,7 @@ def riscv_vwadd_vl : SDNode<"RISCVISD::VWADD_VL", SDT_RISCVVWIntBinOp_VL, [S
def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWIntBinOp_VL, [SDNPCommutative]>;
def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWIntBinOp_VL, []>;
def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWIntBinOp_VL, []>;
+def riscv_vwsll_vl : SDNode<"RISCVISD::VWSLL_VL", SDT_RISCVVWIntBinOp_VL, []>;
def SDT_RISCVVWIntTernOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisInt<1>,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 062529c054ecd46..5dd6a39126d9897 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -641,6 +641,24 @@ foreach vtiToWti = AllWidenableIntVectors in {
(!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_vwsll_vl
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_vwsll_vl
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Vector (SplatPat_uimm5 uimm5:$rs1)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index afe6c958f1e64be..564eab2c6cd8e7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck -check-prefixes=CHECK,RV32 %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck -check-prefixes=CHECK,RV64 %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+experimental-zvbb,+zfh,+zvfh | FileCheck %s --check-prefix=ZVBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+experimental-zvbb,+zfh,+zvfh | FileCheck %s --check-prefix=ZVBB
; Integers
@@ -22,6 +24,23 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT: vmsne.vi v0, v12, 0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v32i1_v16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVBB-NEXT: vslideup.vi v0, v8, 2
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; ZVBB-NEXT: vmv.v.i v8, 0
+; ZVBB-NEXT: vmerge.vim v8, v8, 1, v0
+; ZVBB-NEXT: vsetivli zero, 16, e8, m2, ta, ma
+; ZVBB-NEXT: vslidedown.vi v10, v8, 16
+; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; ZVBB-NEXT: vwsll.vi v12, v10, 8
+; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; ZVBB-NEXT: vmsne.vi v0, v12, 0
+; ZVBB-NEXT: ret
%res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
ret <32 x i1> %res
}
@@ -35,6 +54,14 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v16i16_v8i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv2r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
ret <16 x i16> %res
}
@@ -48,6 +75,15 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v8i32_v4i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vwsll.vx v10, v9, a0
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv2r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
ret <8 x i32> %res
}
@@ -102,6 +138,14 @@ define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) {
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v4f16_v2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv1r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
ret <4 x half> %res
}
@@ -115,6 +159,14 @@ define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) {
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v8f16_v4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv1r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
ret <8 x half> %res
}
@@ -128,6 +180,15 @@ define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v4f32_v2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vx v10, v9, a0
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv1r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
ret <4 x float> %res
}
@@ -141,6 +202,14 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b)
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v16f16_v8f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv2r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
ret <16 x half> %res
}
@@ -154,6 +223,15 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_v8f32_v4f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vwsll.vx v10, v9, a0
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv2r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
ret <8 x float> %res
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index f742e182ab1ad6f..aadf97a0b4514c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+experimental-zvbb,+zfh,+zvfh | FileCheck %s --check-prefix=ZVBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+experimental-zvbb,+zfh,+zvfh | FileCheck %s --check-prefix=ZVBB
; Integers
@@ -25,6 +27,24 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v0, v8, a0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv.v.i v10, 0
+; ZVBB-NEXT: vmerge.vim v12, v10, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v8, v10, 1, v0
+; ZVBB-NEXT: vwsll.vi v16, v8, 8
+; ZVBB-NEXT: vwadd.wv v16, v16, v12
+; ZVBB-NEXT: vmsne.vi v8, v18, 0
+; ZVBB-NEXT: vmsne.vi v0, v16, 0
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vx v0, v8, a0
+; ZVBB-NEXT: ret
%res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
ret <vscale x 32 x i1> %res
}
@@ -38,6 +58,14 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
; CHECK-NEXT: vwmaccu.vx v12, a0, v10
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i8_nxv16i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vwsll.vi v12, v10, 8
+; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vmv4r.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 32 x i8> %res
}
@@ -51,6 +79,14 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
; CHECK-NEXT: vwmaccu.vx v12, a0, v10
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16i16_nxv8i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vwsll.vi v12, v10, 16
+; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vmv4r.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 16 x i16> %res
}
@@ -64,6 +100,15 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
; CHECK-NEXT: vwmaccu.vx v12, a0, v10
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vwsll.vx v12, v10, a0
+; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vmv4r.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 8 x i32> %res
}
@@ -83,6 +128,21 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4i64_nxv2i64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; ZVBB-NEXT: vid.v v12
+; ZVBB-NEXT: vand.vi v13, v12, 1
+; ZVBB-NEXT: vmsne.vi v0, v13, 0
+; ZVBB-NEXT: vsrl.vi v16, v12, 1
+; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t
+; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
+; ZVBB-NEXT: vmv.v.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 4 x i64> %res
}
@@ -115,6 +175,25 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: vmsne.vi v8, v24, 0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv128i1_nxv64i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZVBB-NEXT: vmv.v.i v24, 0
+; ZVBB-NEXT: vmerge.vim v16, v24, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v8, v24, 1, v0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v24, v8, 8
+; ZVBB-NEXT: vwadd.wv v24, v24, v16
+; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZVBB-NEXT: vmsne.vi v0, v24, 0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v24, v12, 8
+; ZVBB-NEXT: vwadd.wv v24, v24, v20
+; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZVBB-NEXT: vmsne.vi v8, v24, 0
+; ZVBB-NEXT: ret
%res = call <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
ret <vscale x 128 x i1> %res
}
@@ -131,6 +210,17 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
; CHECK-NEXT: vwmaccu.vx v0, a0, v20
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv128i8_nxv64i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vmv8r.v v24, v8
+; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v8, v16, 8
+; ZVBB-NEXT: vwadd.wv v8, v8, v24
+; ZVBB-NEXT: vwsll.vi v0, v20, 8
+; ZVBB-NEXT: vwadd.wv v0, v0, v28
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
%res = call <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
ret <vscale x 128 x i8> %res
}
@@ -147,6 +237,17 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
; CHECK-NEXT: vwmaccu.vx v0, a0, v20
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i16_nxv32i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vmv8r.v v24, v8
+; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v8, v16, 16
+; ZVBB-NEXT: vwadd.wv v8, v8, v24
+; ZVBB-NEXT: vwsll.vi v0, v20, 16
+; ZVBB-NEXT: vwadd.wv v0, v0, v28
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
%res = call <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
ret <vscale x 64 x i16> %res
}
@@ -163,6 +264,18 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
; CHECK-NEXT: vwmaccu.vx v0, a0, v20
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i32_nxv16i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma
+; ZVBB-NEXT: vwsll.vx v24, v16, a0
+; ZVBB-NEXT: vwadd.wv v24, v24, v8
+; ZVBB-NEXT: vwsll.vx v0, v20, a0
+; ZVBB-NEXT: vwadd.wv v0, v0, v12
+; ZVBB-NEXT: vmv8r.v v8, v24
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
%res = call <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
ret <vscale x 32 x i32> %res
}
@@ -202,6 +315,41 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16i64_nxv8i64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVBB-NEXT: vmv8r.v v0, v8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: srli a0, a0, 1
+; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; ZVBB-NEXT: vid.v v24
+; ZVBB-NEXT: vand.vi v26, v24, 1
+; ZVBB-NEXT: vmsne.vi v10, v26, 0
+; ZVBB-NEXT: vsrl.vi v8, v24, 1
+; ZVBB-NEXT: vmv8r.v v24, v0
+; ZVBB-NEXT: vmv4r.v v12, v4
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vadd.vx v8, v8, a0, v0.t
+; ZVBB-NEXT: vmv4r.v v28, v16
+; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; ZVBB-NEXT: vrgatherei16.vv v0, v24, v8
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVBB-NEXT: vmv4r.v v16, v12
+; ZVBB-NEXT: vrgatherei16.vv v24, v16, v8
+; ZVBB-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVBB-NEXT: vmv.v.v v16, v24
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
%res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
ret <vscale x 16 x i64> %res
}
@@ -230,6 +378,21 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
; CHECK-NEXT: vslideup.vx v10, v8, a0
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vslidedown.vx v8, v10, a0
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a0
+; ZVBB-NEXT: vmv1r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
ret <vscale x 4 x half> %res
}
@@ -243,6 +406,14 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv2r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
ret <vscale x 8 x half> %res
}
@@ -256,6 +427,15 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x flo
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vwsll.vx v10, v9, a0
+; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vmv2r.v v8, v10
+; ZVBB-NEXT: ret
%res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
ret <vscale x 4 x float> %res
}
@@ -269,6 +449,14 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x ha
; CHECK-NEXT: vwmaccu.vx v12, a0, v10
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vwsll.vi v12, v10, 16
+; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vmv4r.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
ret <vscale x 16 x half> %res
}
@@ -282,6 +470,15 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x flo
; CHECK-NEXT: vwmaccu.vx v12, a0, v10
; CHECK-NEXT: vmv4r.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vwsll.vx v12, v10, a0
+; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vmv4r.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
ret <vscale x 8 x float> %res
}
@@ -301,6 +498,21 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; ZVBB-NEXT: vid.v v12
+; ZVBB-NEXT: vand.vi v13, v12, 1
+; ZVBB-NEXT: vmsne.vi v0, v13, 0
+; ZVBB-NEXT: vsrl.vi v16, v12, 1
+; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t
+; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
+; ZVBB-NEXT: vmv.v.v v8, v12
+; ZVBB-NEXT: ret
%res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
ret <vscale x 4 x double> %res
}
@@ -325,6 +537,17 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
; CHECK-NEXT: vwmaccu.vx v0, a0, v20
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vmv8r.v v24, v8
+; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v8, v16, 16
+; ZVBB-NEXT: vwadd.wv v8, v8, v24
+; ZVBB-NEXT: vwsll.vi v0, v20, 16
+; ZVBB-NEXT: vwadd.wv v0, v0, v28
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
%res = call <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
ret <vscale x 64 x half> %res
}
@@ -341,6 +564,18 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
; CHECK-NEXT: vwmaccu.vx v0, a0, v20
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma
+; ZVBB-NEXT: vwsll.vx v24, v16, a0
+; ZVBB-NEXT: vwadd.wv v24, v24, v8
+; ZVBB-NEXT: vwsll.vx v0, v20, a0
+; ZVBB-NEXT: vwadd.wv v0, v0, v12
+; ZVBB-NEXT: vmv8r.v v8, v24
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
%res = call <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
ret <vscale x 32 x float> %res
}
@@ -380,6 +615,41 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVBB-NEXT: vmv8r.v v0, v8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: srli a0, a0, 1
+; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; ZVBB-NEXT: vid.v v24
+; ZVBB-NEXT: vand.vi v26, v24, 1
+; ZVBB-NEXT: vmsne.vi v10, v26, 0
+; ZVBB-NEXT: vsrl.vi v8, v24, 1
+; ZVBB-NEXT: vmv8r.v v24, v0
+; ZVBB-NEXT: vmv4r.v v12, v4
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vadd.vx v8, v8, a0, v0.t
+; ZVBB-NEXT: vmv4r.v v28, v16
+; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; ZVBB-NEXT: vrgatherei16.vv v0, v24, v8
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVBB-NEXT: vmv4r.v v16, v12
+; ZVBB-NEXT: vrgatherei16.vv v24, v16, v8
+; ZVBB-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVBB-NEXT: vmv.v.v v16, v24
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
%res = call <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
ret <vscale x 16 x double> %res
}
From 9f4cd3fe7aaf40950babf44265bdbf0f7badca45 Mon Sep 17 00:00:00 2001
From: Yeting Kuo <yeting.kuo at sifive.com>
Date: Thu, 28 Sep 2023 08:18:07 +0800
Subject: [PATCH 2/2] Use vwaddu.wv instead of vwadd.wv
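The unsigned form matters here because vwadd.wv sign-extends the narrow operand before
the add, while vwaddu.wv zero-extends it; the even elements occupy the low half of each
wide element, so a sign-extended even element with its MSB set would corrupt the odd
half. A small worked illustration (values are hypothetical, not taken from the patch):

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t odd = 0x20, even = 0x80;        // even element with its MSB set
  uint16_t shifted = (uint16_t)odd << 8;  // vwsll.vi ..., 8  -> 0x2000

  // vwadd.wv would sign-extend 0x80 to 0xff80 before adding: wrong result.
  uint16_t with_vwadd  = (uint16_t)(shifted + (uint16_t)(int16_t)(int8_t)even); // 0x1f80
  // vwaddu.wv zero-extends, keeping the odd (high) half intact.
  uint16_t with_vwaddu = (uint16_t)(shifted + (uint16_t)even);                  // 0x2080

  std::printf("vwadd.wv:  %#06x\nvwaddu.wv: %#06x\n", with_vwadd, with_vwaddu);
  return 0;
}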
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +-
llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 9 ++++
.../RISCV/rvv/vector-interleave-fixed.ll | 16 +++----
.../CodeGen/RISCV/rvv/vector-interleave.ll | 42 +++++++++----------
4 files changed, 39 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d06db590f80e009..199de71e4d04d4e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4273,7 +4273,7 @@ static SDValue getWideningInterleave(SDValue EvenV, SDValue OddV,
Subtarget.getXLenVT()));
Interleaved = DAG.getNode(RISCVISD::VWSLL_VL, DL, WideContainerVT, OddV,
OffsetVec, Passthru, Mask, VL);
- Interleaved = DAG.getNode(RISCVISD::VWADD_W_VL, DL, WideContainerVT,
+ Interleaved = DAG.getNode(RISCVISD::VWADDU_W_VL, DL, WideContainerVT,
Interleaved, EvenV, Passthru, Mask, VL);
} else {
// Widen EvenV and OddV with 0s and add one copy of OddV to EvenV with
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 5dd6a39126d9897..b904323a3bfb583 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -642,6 +642,15 @@ foreach vtiToWti = AllWidenableIntVectors in {
wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(riscv_vwsll_vl
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Vector vti.RegClass:$rs1),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 564eab2c6cd8e7b..ff236d7def7d6e1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -37,7 +37,7 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
; ZVBB-NEXT: vslidedown.vi v10, v8, 16
; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; ZVBB-NEXT: vwsll.vi v12, v10, 8
-; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; ZVBB-NEXT: vmsne.vi v0, v12, 0
; ZVBB-NEXT: ret
@@ -59,7 +59,7 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
%res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
@@ -81,7 +81,7 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; ZVBB-NEXT: vwsll.vx v10, v9, a0
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
%res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
@@ -143,7 +143,7 @@ define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) {
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
%res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
@@ -164,7 +164,7 @@ define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) {
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
%res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
@@ -186,7 +186,7 @@ define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; ZVBB-NEXT: vwsll.vx v10, v9, a0
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
%res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
@@ -207,7 +207,7 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b)
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
%res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
@@ -229,7 +229,7 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; ZVBB-NEXT: vwsll.vx v10, v9, a0
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
%res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index aadf97a0b4514c1..4b5edaa1cd3f7d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -36,7 +36,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; ZVBB-NEXT: vmv1r.v v0, v8
; ZVBB-NEXT: vmerge.vim v8, v10, 1, v0
; ZVBB-NEXT: vwsll.vi v16, v8, 8
-; ZVBB-NEXT: vwadd.wv v16, v16, v12
+; ZVBB-NEXT: vwaddu.wv v16, v16, v12
; ZVBB-NEXT: vmsne.vi v8, v18, 0
; ZVBB-NEXT: vmsne.vi v0, v16, 0
; ZVBB-NEXT: csrr a0, vlenb
@@ -63,7 +63,7 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; ZVBB-NEXT: vwsll.vi v12, v10, 8
-; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
%res = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
@@ -84,7 +84,7 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVBB-NEXT: vwsll.vi v12, v10, 16
-; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
%res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
@@ -106,7 +106,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; ZVBB-NEXT: vwsll.vx v12, v10, a0
-; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
%res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -185,12 +185,12 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
; ZVBB-NEXT: vmerge.vim v8, v24, 1, v0
; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; ZVBB-NEXT: vwsll.vi v24, v8, 8
-; ZVBB-NEXT: vwadd.wv v24, v24, v16
+; ZVBB-NEXT: vwaddu.wv v24, v24, v16
; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; ZVBB-NEXT: vmsne.vi v0, v24, 0
; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; ZVBB-NEXT: vwsll.vi v24, v12, 8
-; ZVBB-NEXT: vwadd.wv v24, v24, v20
+; ZVBB-NEXT: vwaddu.wv v24, v24, v20
; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; ZVBB-NEXT: vmsne.vi v8, v24, 0
; ZVBB-NEXT: ret
@@ -216,9 +216,9 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
; ZVBB-NEXT: vmv8r.v v24, v8
; ZVBB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; ZVBB-NEXT: vwsll.vi v8, v16, 8
-; ZVBB-NEXT: vwadd.wv v8, v8, v24
+; ZVBB-NEXT: vwaddu.wv v8, v8, v24
; ZVBB-NEXT: vwsll.vi v0, v20, 8
-; ZVBB-NEXT: vwadd.wv v0, v0, v28
+; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
%res = call <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
@@ -243,9 +243,9 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
; ZVBB-NEXT: vmv8r.v v24, v8
; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVBB-NEXT: vwsll.vi v8, v16, 16
-; ZVBB-NEXT: vwadd.wv v8, v8, v24
+; ZVBB-NEXT: vwaddu.wv v8, v8, v24
; ZVBB-NEXT: vwsll.vi v0, v20, 16
-; ZVBB-NEXT: vwadd.wv v0, v0, v28
+; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
%res = call <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
@@ -270,9 +270,9 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; ZVBB-NEXT: vwsll.vx v24, v16, a0
-; ZVBB-NEXT: vwadd.wv v24, v24, v8
+; ZVBB-NEXT: vwaddu.wv v24, v24, v8
; ZVBB-NEXT: vwsll.vx v0, v20, a0
-; ZVBB-NEXT: vwadd.wv v0, v0, v12
+; ZVBB-NEXT: vwaddu.wv v0, v0, v12
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
@@ -383,7 +383,7 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: srli a0, a0, 2
; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
@@ -411,7 +411,7 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
%res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
@@ -433,7 +433,7 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x flo
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; ZVBB-NEXT: vwsll.vx v10, v9, a0
-; ZVBB-NEXT: vwadd.wv v10, v10, v8
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
%res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
@@ -454,7 +454,7 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x ha
; ZVBB: # %bb.0:
; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; ZVBB-NEXT: vwsll.vi v12, v10, 16
-; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
%res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
@@ -476,7 +476,7 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x flo
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; ZVBB-NEXT: vwsll.vx v12, v10, a0
-; ZVBB-NEXT: vwadd.wv v12, v12, v8
+; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
%res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
@@ -543,9 +543,9 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
; ZVBB-NEXT: vmv8r.v v24, v8
; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVBB-NEXT: vwsll.vi v8, v16, 16
-; ZVBB-NEXT: vwadd.wv v8, v8, v24
+; ZVBB-NEXT: vwaddu.wv v8, v8, v24
; ZVBB-NEXT: vwsll.vi v0, v20, 16
-; ZVBB-NEXT: vwadd.wv v0, v0, v28
+; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
%res = call <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
@@ -570,9 +570,9 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
; ZVBB-NEXT: li a0, 32
; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; ZVBB-NEXT: vwsll.vx v24, v16, a0
-; ZVBB-NEXT: vwadd.wv v24, v24, v8
+; ZVBB-NEXT: vwaddu.wv v24, v24, v8
; ZVBB-NEXT: vwsll.vx v0, v20, a0
-; ZVBB-NEXT: vwadd.wv v0, v0, v12
+; ZVBB-NEXT: vwaddu.wv v0, v0, v12
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret