[llvm] 6e9c24e - [RISCV] Lower insert subvector shuffles as vslideups

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 24 10:31:19 PDT 2023


Author: Luke Lau
Date: 2023-03-24T17:30:31Z
New Revision: 6e9c24edf052756e1f14d6080bf5af7fb75f17e7

URL: https://github.com/llvm/llvm-project/commit/6e9c24edf052756e1f14d6080bf5af7fb75f17e7
DIFF: https://github.com/llvm/llvm-project/commit/6e9c24edf052756e1f14d6080bf5af7fb75f17e7.diff

LOG: [RISCV] Lower insert subvector shuffles as vslideups

A shuffle with an insert subvector mask is functionally equivalent to:
(insert_subvector v0, (extract_subvector v1, len), index)
We can emulate this by doing a vslideup of v1 into v0 at the right
index, carefully selecting VL so that we don't overwrite any more
destination elements than we have to.
This avoids the need for a select with a mask.
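
For example, here is a sketch of the new lowering, drawn from the
merge_start_into_end and merge_start_into_middle tests updated below:
inserting a subvector that reaches the end of the destination can use
full VL with a tail-agnostic slideup, while inserting into the middle
shrinks VL and keeps the tail undisturbed so the trailing destination
elements are preserved.

  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ->
  vsetivli zero, 8, e8, mf2, ta, ma
  vslideup.vi v8, v9, 4

  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
  ->
  vsetivli zero, 5, e8, mf2, tu, ma
  vslideup.vi v8, v9, 1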

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
    llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9310c8161cd46..7256416520e60 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3315,6 +3315,53 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlidedown(const SDLoc &DL, MVT VT,
       DAG.getConstant(0, DL, XLenVT));
 }
 
+// Because vslideup leaves the destination elements at the start intact, we can
+// use it to perform shuffles that insert subvectors:
+//
+// vector_shuffle v8:v8i8, v9:v8i8, <0, 1, 2, 3, 8, 9, 10, 11>
+// ->
+// vsetvli zero, 8, e8, mf2, ta, ma
+// vslideup.vi v8, v9, 4
+//
+// vector_shuffle v8:v8i8, v9:v8i8, <0, 1, 8, 9, 10, 5, 6, 7>
+// ->
+// vsetvli zero, 5, e8, mf2, tu, ma
+// vslideup.vi v8, v9, 2
+static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             ArrayRef<int> Mask,
+                                             const RISCVSubtarget &Subtarget,
+                                             SelectionDAG &DAG) {
+  unsigned NumElts = VT.getVectorNumElements();
+  int NumSubElts, Index;
+  if (!ShuffleVectorInst::isInsertSubvectorMask(Mask, NumElts, NumSubElts,
+                                                Index))
+    return SDValue();
+
+  bool OpsSwapped = Mask[Index] < (int)NumElts;
+  SDValue InPlace = OpsSwapped ? V2 : V1;
+  SDValue ToInsert = OpsSwapped ? V1 : V2;
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+  auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first;
+  // We slide up by the index that the subvector is being inserted at, and set
+  // VL to the index + the number of elements being inserted.
+  unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC;
+  // If we're adding a suffix to the in-place vector, i.e. inserting right up
+  // to the very end of it, then we don't actually care about the tail.
+  if (NumSubElts + Index >= (int)NumElts)
+    Policy |= RISCVII::TAIL_AGNOSTIC;
+  SDValue Slideup = getVSlideup(
+      DAG, Subtarget, DL, ContainerVT,
+      convertToScalableVector(ContainerVT, InPlace, DAG, Subtarget),
+      convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget),
+      DAG.getConstant(Index, DL, XLenVT), TrueMask,
+      DAG.getConstant(NumSubElts + Index, DL, XLenVT),
+      Policy);
+  return convertFromScalableVector(VT, Slideup, DAG, Subtarget);
+}
+
 // Given two input vectors of <[vscale x ]n x ty>, use vwaddu.vv and vwmaccu.vx
 // to create an interleaved vector of <[vscale x] n*2 x ty>.
 // This requires that the size of ty is less than the subtarget's maximum ELEN.
@@ -3551,6 +3598,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
   }
 
+  if (SDValue V =
+          lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // Detect shuffles which can be re-expressed as vector selects; these are
   // shuffles in which each element in the destination is taken from an element
   // at the corresponding index in either source vectors.

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index f8e5fce58cd34..0d3169d75cb6b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -41,14 +41,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX1-NEXT:    vrgather.vi v12, v8, 0
 ; LMULMAX1-NEXT:    vrgather.vi v12, v9, 3, v0.t
-; LMULMAX1-NEXT:    li a0, 8
-; LMULMAX1-NEXT:    vmv.s.x v0, a0
-; LMULMAX1-NEXT:    vrgather.vi v9, v10, 0
-; LMULMAX1-NEXT:    li a0, 3
-; LMULMAX1-NEXT:    vmv.s.x v8, a0
-; LMULMAX1-NEXT:    vrgather.vi v9, v11, 3, v0.t
-; LMULMAX1-NEXT:    vmv.v.v v0, v8
-; LMULMAX1-NEXT:    vmerge.vvm v8, v9, v12, v0
+; LMULMAX1-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vslideup.vi v11, v10, 2
+; LMULMAX1-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vslideup.vi v11, v12, 0
+; LMULMAX1-NEXT:    vmv1r.v v8, v11
 ; LMULMAX1-NEXT:    ret
 ;
 ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 9a1cfef9bbc83..3aefc3e4dd903 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -648,16 +648,36 @@ entry:
 define <8 x i8> @merge_start_into_end(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_end:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret <8 x i8> %res
+}
+
+define <8 x i8> @merge_start_into_end_undef(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: merge_start_into_end_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 10, i32 11>
+  ret <8 x i8> %res
+}
+
+define <8 x i8> @merge_start_into_end_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
+; CHECK-LABEL: merge_start_into_end_undef_at_start:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v11
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 240
+; CHECK-NEXT:    li a0, 224
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vadd.vi v8, v11, -4
 ; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
   ret <8 x i8> %res
 }
 
@@ -680,10 +700,9 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @merge_end_into_end(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_end_into_end:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 15
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 0
+; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i8> %res
@@ -692,14 +711,8 @@ define <8 x i8> @merge_end_into_end(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @merge_start_into_middle(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_middle:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    li a0, 30
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v11, -1
-; CHECK-NEXT:    vrgather.vv v10, v9, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 5, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
@@ -708,10 +721,8 @@ define <8 x i8> @merge_start_into_middle(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: merge_start_into_start:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 240
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
@@ -758,8 +769,8 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vadd.vi v11, v10, 2
-; CHECK-NEXT:    lui a0, %hi(.LCPI44_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI44_0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    li a0, 234
 ; CHECK-NEXT:    vmv.s.x v0, a0

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 7996e05853a91..3c9eb9d491cf5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -150,13 +150,10 @@ define void @vnsrl_0_i32(ptr %in, ptr %out) {
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
-; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 0
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; ZVE32F-NEXT:    vse32.v v8, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i32>, ptr %in, align 4
@@ -209,13 +206,10 @@ define void @vnsrl_0_float(ptr %in, ptr %out) {
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vle32.v v8, (a0)
-; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT:    li a0, 2
-; ZVE32F-NEXT:    vmv.s.x v0, a0
-; ZVE32F-NEXT:    vrgather.vi v10, v8, 0
-; ZVE32F-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; ZVE32F-NEXT:    vse32.v v10, (a1)
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; ZVE32F-NEXT:    vse32.v v8, (a1)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x float>, ptr %in, align 4
@@ -259,13 +253,10 @@ define void @vnsrl_0_i64(ptr %in, ptr %out) {
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
-; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 0
-; V-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse64.v v8, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i64:
@@ -315,13 +306,10 @@ define void @vnsrl_0_double(ptr %in, ptr %out) {
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; V-NEXT:    vle64.v v8, (a0)
-; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:    vslidedown.vi v9, v8, 2
-; V-NEXT:    li a0, 2
-; V-NEXT:    vmv.s.x v0, a0
-; V-NEXT:    vrgather.vi v10, v8, 0
-; V-NEXT:    vrgather.vi v10, v9, 0, v0.t
-; V-NEXT:    vse64.v v10, (a1)
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse64.v v8, (a1)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_double:

diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index efcfd326f129d..1c8463978cf2f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -292,15 +292,13 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ; CHECK-LABEL: vector_deinterleave_v2i64_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vrgather.vi v10, v12, 0, v0.t
 ; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
 %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
 ret {<2 x i64>, <2 x i64>} %retval
@@ -381,15 +379,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
 ; CHECK-LABEL: vector_deinterleave_v2f64_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    li a0, 2
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vrgather.vi v10, v12, 0, v0.t
 ; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vrgather.vi v9, v12, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
 %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
 ret {<2 x double>, <2 x double>} %retval
