[llvm] 544a240 - [RISCV] Use v(f)slide1up for shuffle+insert idiom
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Tue May 30 07:37:49 PDT 2023
Author: Philip Reames
Date: 2023-05-30T07:37:41-07:00
New Revision: 544a240ff7ff5bbacd3d50692335a93665ded8d5
URL: https://github.com/llvm/llvm-project/commit/544a240ff7ff5bbacd3d50692335a93665ded8d5
DIFF: https://github.com/llvm/llvm-project/commit/544a240ff7ff5bbacd3d50692335a93665ded8d5.diff
LOG: [RISCV] Use v(f)slide1up for shuffle+insert idiom
This is pretty straightforward in the basic form. I did need to move the slideup matching earlier, but that looks generally profitable on its own.
As follow-ups, I plan to explore the v(f)slide1down variants, and see what I can do to canonicalize the shuffle-then-insert pattern (see the _inverted tests at the end of the vslide1up.ll test).
Differential Revision: https://reviews.llvm.org/D151468
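For reference, here is a minimal IR sketch of the idiom in its basic form, adapted from the vslide1up_4xi8 / vslide1up_4xi8_swapped tests updated below (the function name, exact mask, and operand order here are illustrative, since the full test bodies fall outside the hunks shown): the scalar is inserted into lane 0 of a poison vector, and a shuffle then prepends it to the other source.

  define <4 x i8> @slide1up_idiom(<4 x i8> %v, i8 %b) {
    ; Put the scalar into lane 0 of an otherwise-poison vector.
    %vb = insertelement <4 x i8> poison, i8 %b, i64 0
    ; Take %b as element 0 and %v[0..2] as elements 1..3, i.e. slide %v up by
    ; one lane and shift %b in at the bottom.
    %v1 = shufflevector <4 x i8> %vb, <4 x i8> %v, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
    ret <4 x i8> %v1
  }

With this patch, the shuffle lowers to a single vslide1up.vx (plus a register move) instead of a vmv.v.x splat followed by vslideup.vi, as the updated check lines below show.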
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7010228351d..3dc04d0f29e9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3731,6 +3731,20 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT,
MVT XLenVT = Subtarget.getXLenVT();
MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first;
+ if (Index == 1 && NumSubElts + Index == (int)NumElts &&
+ isa<BuildVectorSDNode>(InPlace)) {
+ if (SDValue Splat = cast<BuildVectorSDNode>(InPlace)->getSplatValue()) {
+ auto OpCode =
+ VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL;
+ auto Vec = DAG.getNode(OpCode, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT),
+ convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget),
+ Splat, TrueMask,
+ DAG.getConstant(NumSubElts + Index, DL, XLenVT));
+ return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+ }
+ }
+
// We slide up by the index that the subvector is being inserted at, and set
// VL to the index + the number of elements being inserted.
unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC;
@@ -3967,6 +3981,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
Subtarget, DAG);
}
+ if (SDValue V =
+ lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Detect an interleave shuffle and lower to
// (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
int EvenSrc, OddSrc;
@@ -3989,10 +4007,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
}
- if (SDValue V =
- lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
- return V;
-
// Detect shuffles which can be re-expressed as vector selects; these are
// shuffles in which each element in the destination is taken from an element
// at the corresponding index in either source vectors.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
index 688e88202106..37f67cad23e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll
@@ -171,11 +171,8 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: trn1.v2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v9
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v10, a0, v9
-; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %tmp0
@@ -256,11 +253,8 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
; CHECK-LABEL: trn1.v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v9
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v10, a0, v9
-; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
ret <2 x float> %tmp0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
index f7b667a36fa6..21fb38643bf2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll
@@ -8,11 +8,7 @@ define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) {
; CHECK-LABEL: vslide1up_2xi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vwaddu.vv v9, v10, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v9, a0, v8
+; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x i8> poison, i8 %b, i64 0
@@ -33,8 +29,7 @@ define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) {
; RV64-LABEL: vslide1up_4xi8:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vslideup.vi v9, v8, 1
+; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <4 x i8> poison, i8 %b, i64 0
@@ -55,8 +50,7 @@ define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) {
; RV64-LABEL: vslide1up_4xi8_swapped:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vslideup.vi v9, v8, 1
+; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <4 x i8> poison, i8 %b, i64 0
@@ -68,22 +62,16 @@ define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) {
; RV32-LABEL: vslide1up_2xi16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vmv.s.x v10, a0
-; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; RV32-NEXT: vwaddu.vv v9, v10, v8
-; RV32-NEXT: li a0, -1
-; RV32-NEXT: vwmaccu.vx v9, a0, v8
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vslideup.vi v9, v8, 1
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: vslide1up_2xi16:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; RV64-NEXT: vmv.v.x v10, a0
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT: vwaddu.vv v9, v10, v8
-; RV64-NEXT: li a0, -1
-; RV64-NEXT: vwmaccu.vx v9, a0, v8
+; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <2 x i16> poison, i16 %b, i64 0
@@ -95,8 +83,7 @@ define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) {
; RV32-LABEL: vslide1up_4xi16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v9, a0
-; RV32-NEXT: vslideup.vi v9, v8, 1
+; RV32-NEXT: vslide1up.vx v9, v8, a0
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
@@ -117,22 +104,16 @@ define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) {
; RV32-LABEL: vslide1up_2xi32:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vwaddu.vv v9, v10, v8
-; RV32-NEXT: li a0, -1
-; RV32-NEXT: vwmaccu.vx v9, a0, v8
+; RV32-NEXT: vslide1up.vx v9, v8, a0
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: vslide1up_2xi32:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV64-NEXT: vwaddu.vv v9, v10, v8
-; RV64-NEXT: li a0, -1
-; RV64-NEXT: vwmaccu.vx v9, a0, v8
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <2 x i32> poison, i32 %b, i64 0
@@ -144,8 +125,7 @@ define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) {
; CHECK-LABEL: vslide1up_4xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a0
-; CHECK-NEXT: vslideup.vi v9, v8, 1
+; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x i32> poison, i32 %b, i64 0
@@ -171,8 +151,7 @@ define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) {
; RV64-LABEL: vslide1up_2xi64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vslideup.vi v9, v8, 1
+; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv.v.v v8, v9
; RV64-NEXT: ret
%vb = insertelement <2 x i64> poison, i64 %b, i64 0
@@ -198,8 +177,7 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) {
; RV64-LABEL: vslide1up_4xi64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vmv.v.x v10, a0
-; RV64-NEXT: vslideup.vi v10, v8, 1
+; RV64-NEXT: vslide1up.vx v10, v8, a0
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%vb = insertelement <4 x i64> poison, i64 %b, i64 0
@@ -211,11 +189,7 @@ define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
; CHECK-LABEL: vslide1up_2xf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfmv.v.f v10, fa0
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vwaddu.vv v9, v10, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v9, a0, v8
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x half> poison, half %b, i64 0
@@ -227,8 +201,7 @@ define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
; CHECK-LABEL: vslide1up_4xf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vfmv.v.f v9, fa0
-; CHECK-NEXT: vslideup.vi v9, v8, 1
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x half> poison, half %b, i64 0
@@ -240,11 +213,7 @@ define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) {
; CHECK-LABEL: vslide1up_2xf32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vfmv.v.f v10, fa0
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vwaddu.vv v9, v10, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v9, a0, v8
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x float> poison, float %b, i64 0
@@ -256,8 +225,7 @@ define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) {
; CHECK-LABEL: vslide1up_4xf32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfmv.v.f v9, fa0
-; CHECK-NEXT: vslideup.vi v9, v8, 1
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <4 x float> poison, float %b, i64 0
@@ -269,8 +237,7 @@ define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_2xf64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vfmv.v.f v9, fa0
-; CHECK-NEXT: vslideup.vi v9, v8, 1
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%vb = insertelement <2 x double> poison, double %b, i64 0
@@ -291,6 +258,24 @@ define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) {
ret <4 x double> %v1
}
+define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) {
+; CHECK-LABEL: vslide1up_4xi8_with_splat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vid.v v9
+; CHECK-NEXT: vadd.vi v10, v9, -1
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t
+; CHECK-NEXT: vmv1r.v v8, v9
+; CHECK-NEXT: ret
+ %vb = insertelement <4 x i8> poison, i8 %b, i64 0
+ %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
+ ret <4 x i8> %v2
+}
+
define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_v2f64_inverted:
; CHECK: # %bb.0:
@@ -320,7 +305,8 @@ define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) {
}
-; The length of the shift is less than the suffix
+; The length of the shift is less than the suffix; since we'd have to
+; materialize the splat, using the vslide1up doesn't help us.
define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
; CHECK-LABEL: vslide1up_4xi32_neg1:
; CHECK: # %bb.0:
@@ -335,3 +321,15 @@ define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
%v1 = shufflevector <4 x i32> %v, <4 x i32> %vb2, <4 x i32> <i32 4, i32 0, i32 1, i32 7>
ret <4 x i32> %v1
}
+
+; We don't know the scalar, so we can't use the vslide1up
+define <4 x i32> @vslide1up_4xi32_neg2(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: vslide1up_4xi32_neg2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 1
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: ret
+ %res = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+ ret <4 x i32> %res
+}
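For context on the follow-up mentioned in the log message, the _inverted tests exercise the reversed order of operations: slide the source up by one lane first (leaving lane 0 undefined), then insert the scalar into the vacated lane. A rough sketch of that shape, illustrative only since the actual test bodies are outside the hunks shown above:

  define <4 x i8> @slide1up_inverted_idiom(<4 x i8> %v, i8 %b) {
    ; Shift %v toward higher lanes by one, leaving lane 0 undefined for now.
    %shift = shufflevector <4 x i8> %v, <4 x i8> poison, <4 x i32> <i32 undef, i32 0, i32 1, i32 2>
    ; Fill the vacated lane 0 with the scalar.
    %res = insertelement <4 x i8> %shift, i8 %b, i64 0
    ret <4 x i8> %res
  }

Canonicalizing this toward the insert-then-shuffle form handled above would let it reuse the same vslide1up lowering.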