[llvm] d69ccf3 - [RISCV] Combine shuffle of shuffles to a single shuffle (#178095)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 9 14:48:36 PST 2026
Author: Ryan Buchner
Date: 2026-02-09T14:48:31-08:00
New Revision: d69ccf3b34e7ca8fa42e4c078f026b6e898e2c90
URL: https://github.com/llvm/llvm-project/commit/d69ccf3b34e7ca8fa42e4c078f026b6e898e2c90
DIFF: https://github.com/llvm/llvm-project/commit/d69ccf3b34e7ca8fa42e4c078f026b6e898e2c90.diff
LOG: [RISCV] Combine shuffle of shuffles to a single shuffle (#178095)
Compressing to a single shuffle doesn't remove any information and the backend can better apply specific optimizations to a single shuffle.
Addresses #176218.
---------
Co-authored-by: Luke Lau <luke_lau at igalia.com>
Added:
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-merge.ll
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 975baa7e2e504..c30d585bf89dc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5486,6 +5486,56 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT,
return convertFromScalableVector(VT, Res, DAG, Subtarget);
}
+// Fold shuffle(shuffle(A, _), shuffle(B, _)) into one shuffle(A, B) when each
+// single-use inner shuffle reads only its first operand. Returns the combined
+// shuffle, or an empty SDValue when the pattern does not match.
+static SDValue compressShuffleOfShuffles(ShuffleVectorSDNode *SVN,
+                                         const RISCVSubtarget &Subtarget,
+                                         SelectionDAG &DAG) {
+  SDValue V1 = SVN->getOperand(0);
+  SDValue V2 = SVN->getOperand(1);
+  if (V1.getOpcode() != ISD::VECTOR_SHUFFLE ||
+      V2.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  // Only fold when this node is the sole user of each inner shuffle.
+  if (!V1.hasOneUse() || !V2.hasOneUse())
+    return SDValue();
+
+  ArrayRef<int> Mask = SVN->getMask();
+  ArrayRef<int> V1Mask = cast<ShuffleVectorSDNode>(V1.getNode())->getMask();
+  ArrayRef<int> V2Mask = cast<ShuffleVectorSDNode>(V2.getNode())->getMask();
+  unsigned NumElts = Mask.size();
+  SmallVector<int> NewMask(NumElts, -1);
+  for (unsigned Idx : seq<unsigned>(NumElts)) {
+    int Lane = Mask[Idx];
+    // A poison lane in the outer mask stays poison in the new mask.
+    if (Lane == -1)
+      continue;
+    int OrigLane;
+    bool SecondOp = false;
+    if ((unsigned)Lane < NumElts) {
+      OrigLane = V1Mask[Lane];
+    } else {
+      OrigLane = V2Mask[Lane - NumElts];
+      SecondOp = true;
+    }
+    if (OrigLane == -1)
+      continue;
+    // Bail out if the inner shuffle reads from its own second operand.
+    if ((unsigned)OrigLane >= NumElts)
+      return SDValue();
+    if (SecondOp)
+      OrigLane += NumElts;
+    NewMask[Idx] = OrigLane;
+  }
+
+  // The type may not be a simple MVT before type legalization, so use EVT.
+  EVT VT = SVN->getValueType(0);
+  return DAG.getVectorShuffle(VT, SDLoc(SVN), V1->getOperand(0),
+                              V2->getOperand(0), NewMask);
+}
+
/// Match v(f)slide1up/down idioms. These operations involve sliding
/// N-1 elements to make room for an inserted scalar at one end.
static SDValue lowerVECTOR_SHUFFLEAsVSlide1(const SDLoc &DL, MVT VT,
@@ -20435,7 +20485,8 @@ static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG,
const unsigned NumElts = VT.getVectorNumElements();
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ ArrayRef<int> Mask = SVN->getMask();
MVT XLenVT = Subtarget.getXLenVT();
// Recognized a disguised select of add/sub.
@@ -20464,6 +20515,9 @@ static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, VT, A, NewB);
}
+ if (SDValue V = compressShuffleOfShuffles(SVN, Subtarget, DAG))
+ return V;
+
// Custom legalize <N x i128> or <N x i256> to <M x ELEN>. This runs
// during the combine phase before type legalization, and relies on
// DAGCombine not undoing the transform if isShuffleMaskLegal returns false
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
index 5683476852683..e398a858684b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
@@ -295,12 +295,9 @@ define <4 x i8> @interleave_shuffles(<4 x i8> %x) {
; CHECK-LABEL: interleave_shuffles:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 0
-; CHECK-NEXT: vrgather.vi v10, v8, 1
-; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vwaddu.vv v8, v9, v10
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v8, a0, v10
+; CHECK-NEXT: vmv1r.v v9, v8
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%y = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%z = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-merge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-merge.ll
new file mode 100644
index 0000000000000..e571dd6fde84c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-merge.ll
@@ -0,0 +1,441 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+
+; Can be optimized as a merge followed by a shuffle
+define <16 x i16> @shuffle_shuffle_disjoint(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI0_0
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 4 # 0x4
+; CHECK-NEXT: .byte 8 # 0x8
+; CHECK-NEXT: .byte 12 # 0xc
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 5 # 0x5
+; CHECK-NEXT: .byte 9 # 0x9
+; CHECK-NEXT: .byte 13 # 0xd
+; CHECK-NEXT: .byte 3 # 0x3
+; CHECK-NEXT: .byte 7 # 0x7
+; CHECK-NEXT: .byte 11 # 0xb
+; CHECK-NEXT: .byte 15 # 0xf
+; CHECK-NEXT: .byte 2 # 0x2
+; CHECK-NEXT: .byte 6 # 0x6
+; CHECK-NEXT: .byte 10 # 0xa
+; CHECK-NEXT: .byte 14 # 0xe
+; CHECK-LABEL: shuffle_shuffle_disjoint:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle8.v v14, (a0)
+; CHECK-NEXT: lui a0, 3
+; CHECK-NEXT: addi a0, a0, 819
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vsext.vf2 v12, v14
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
+
+; Can be optimized as a merge followed by a shuffle
+define <16 x i16> @shuffle_shuffle_disjoint_unordered(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI1_0
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 15 # 0xf
+; CHECK-NEXT: .byte 14 # 0xe
+; CHECK-NEXT: .byte 12 # 0xc
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 2 # 0x2
+; CHECK-NEXT: .byte 9 # 0x9
+; CHECK-NEXT: .byte 13 # 0xd
+; CHECK-NEXT: .byte 3 # 0x3
+; CHECK-NEXT: .byte 7 # 0x7
+; CHECK-NEXT: .byte 11 # 0xb
+; CHECK-NEXT: .byte 4 # 0x4
+; CHECK-NEXT: .byte 5 # 0x5
+; CHECK-NEXT: .byte 6 # 0x6
+; CHECK-NEXT: .byte 10 # 0xa
+; CHECK-NEXT: .byte 8 # 0x8
+; CHECK-LABEL: shuffle_shuffle_disjoint_unordered:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle8.v v14, (a0)
+; CHECK-NEXT: lui a0, 3
+; CHECK-NEXT: addi a0, a0, 819
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vsext.vf2 v12, v14
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 19, i32 23, i32 3, i32 4, i32 20, i32 6, i32 7, i32 16, i32 17, i32 18, i32 1, i32 5, i32 21, i32 22, i32 2>
+ ret <16 x i16> %merge
+}
+
+; Can be optimized since the lanes are disjoint, but a single lane is used multiple times by one of the vectors
+define <16 x i16> @shuffle_shuffle_duplicated_within_operand(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI2_0
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 8 # 0x8
+; CHECK-NEXT: .byte 12 # 0xc
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 5 # 0x5
+; CHECK-NEXT: .byte 9 # 0x9
+; CHECK-NEXT: .byte 13 # 0xd
+; CHECK-NEXT: .byte 3 # 0x3
+; CHECK-NEXT: .byte 7 # 0x7
+; CHECK-NEXT: .byte 11 # 0xb
+; CHECK-NEXT: .byte 15 # 0xf
+; CHECK-NEXT: .byte 2 # 0x2
+; CHECK-NEXT: .byte 6 # 0x6
+; CHECK-NEXT: .byte 10 # 0xa
+; CHECK-NEXT: .byte 14 # 0xe
+; CHECK-LABEL: shuffle_shuffle_duplicated_within_operand:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle8.v v14, (a0)
+; CHECK-NEXT: lui a0, 3
+; CHECK-NEXT: addi a0, a0, 803
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vsext.vf2 v12, v14
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 0, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
+
+; Can't be optimized as merge-shuffle since the same lane (8) is used from both operands
+define <16 x i16> @shuffle_shuffle_duplicated_lane(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI3_0
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .half 3 # 0x3
+; CHECK-NEXT: .half 7 # 0x7
+; CHECK-NEXT: .half 8 # 0x8
+; CHECK-NEXT: .half 15 # 0xf
+; CHECK-NEXT: .half 2 # 0x2
+; CHECK-NEXT: .half 6 # 0x6
+; CHECK-NEXT: .half 10 # 0xa
+; CHECK-NEXT: .half 14 # 0xe
+; CHECK-LABEL: LCPI3_1
+; CHECK-NEXT: .half 0 # 0x0
+; CHECK-NEXT: .half 0 # 0x0
+; CHECK-NEXT: .half 8 # 0x8
+; CHECK-NEXT: .half 12 # 0xc
+; CHECK-NEXT: .half 1 # 0x1
+; CHECK-NEXT: .half 5 # 0x5
+; CHECK-NEXT: .half 9 # 0x9
+; CHECK-NEXT: .half 13 # 0xd
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-LABEL: shuffle_shuffle_duplicated_lane:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI3_1)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI3_1)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI3_0)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: li a0, -256
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgather.vv v12, v8, v14
+; CHECK-NEXT: vrgather.vv v12, v10, v16, v0.t
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 0, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 8, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
+
+; Can't be optimized since shuff0 is used twice
+define <16 x i16> @shuffle_shuffle_multiple_uses(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI4_0
+; CHECK-NEXT: .half 0 # 0x0
+; CHECK-NEXT: .half 4 # 0x4
+; CHECK-NEXT: .half 8 # 0x8
+; CHECK-NEXT: .half 12 # 0xc
+; CHECK-NEXT: .half 1 # 0x1
+; CHECK-NEXT: .half 5 # 0x5
+; CHECK-NEXT: .half 9 # 0x9
+; CHECK-NEXT: .half 13 # 0xd
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-LABEL: LCPI4_1
+; CHECK-NEXT: .half 3 # 0x3
+; CHECK-NEXT: .half 7 # 0x7
+; CHECK-NEXT: .half 11 # 0xb
+; CHECK-NEXT: .half 15 # 0xf
+; CHECK-NEXT: .half 2 # 0x2
+; CHECK-NEXT: .half 6 # 0x6
+; CHECK-NEXT: .half 10 # 0xa
+; CHECK-NEXT: .half 14 # 0xe
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-LABEL: shuffle_shuffle_multiple_uses:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_1)
+; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: vrgather.vv v16, v8, v12
+; CHECK-NEXT: vrgather.vv v8, v10, v14
+; CHECK-NEXT: vmv.v.v v10, v16
+; CHECK-NEXT: vslideup.vi v10, v8, 8
+; CHECK-NEXT: vadd.vv v8, v16, v10
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %add = add <16 x i16> %shuff0, %merge
+ ret <16 x i16> %add
+}
+
+; Can be optimized as a merge followed by a shuffle
+define <16 x i16> @shuffle_shuffle_unbalanced(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI5_0
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 4 # 0x4
+; CHECK-NEXT: .byte 8 # 0x8
+; CHECK-NEXT: .byte 12 # 0xc
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 5 # 0x5
+; CHECK-NEXT: .byte 9 # 0x9
+; CHECK-NEXT: .byte 13 # 0xd
+; CHECK-NEXT: .byte 3 # 0x3
+; CHECK-NEXT: .byte 7 # 0x7
+; CHECK-NEXT: .byte 11 # 0xb
+; CHECK-NEXT: .byte 15 # 0xf
+; CHECK-NEXT: .byte 2 # 0x2
+; CHECK-NEXT: .byte 6 # 0x6
+; CHECK-NEXT: .byte 10 # 0xa
+; CHECK-NEXT: .byte 14 # 0xe
+; CHECK-LABEL: shuffle_shuffle_unbalanced:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle8.v v14, (a0)
+; CHECK-NEXT: lui a0, 7
+; CHECK-NEXT: addi a0, a0, 1843
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vsext.vf2 v12, v14
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 8, i32 9>
+ ret <16 x i16> %merge
+}
+
+; Can't be optimized since final one of the elements used is shuffled to a poison index
+define <16 x i16> @shuffle_shuffle_poison(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI6_0
+; CHECK-NEXT: .half 0 # 0x0
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .half 8 # 0x8
+; CHECK-NEXT: .half 12 # 0xc
+; CHECK-NEXT: .half 1 # 0x1
+; CHECK-NEXT: .half 5 # 0x5
+; CHECK-NEXT: .half 9 # 0x9
+; CHECK-NEXT: .half 13 # 0xd
+; CHECK-NEXT: .half 3 # 0x3
+; CHECK-NEXT: .half 7 # 0x7
+; CHECK-NEXT: .half 11 # 0xb
+; CHECK-NEXT: .half 15 # 0xf
+; CHECK-NEXT: .half 2 # 0x2
+; CHECK-NEXT: .half 6 # 0x6
+; CHECK-NEXT: .half 10 # 0xa
+; CHECK-NEXT: .half 14 # 0xe
+; CHECK-LABEL: shuffle_shuffle_poison:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: lui a0, 3
+; CHECK-NEXT: addi a0, a0, 803
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 poison, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
+
+; Can't be optimized since final one of the elements used is shuffled to a poison index
+define <16 x i16> @shuffle_shuffle_poison2(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI7_0
+; CHECK-NEXT: .half 0 # 0x0
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .half 8 # 0x8
+; CHECK-NEXT: .half 12 # 0xc
+; CHECK-NEXT: .half 1 # 0x1
+; CHECK-NEXT: .half 5 # 0x5
+; CHECK-NEXT: .half 9 # 0x9
+; CHECK-NEXT: .half 13 # 0xd
+; CHECK-NEXT: .half 3 # 0x3
+; CHECK-NEXT: .half 7 # 0x7
+; CHECK-NEXT: .half 11 # 0xb
+; CHECK-NEXT: .half 15 # 0xf
+; CHECK-NEXT: .half 2 # 0x2
+; CHECK-NEXT: .half 6 # 0x6
+; CHECK-NEXT: .half 10 # 0xa
+; CHECK-NEXT: .half 14 # 0xe
+; CHECK-LABEL: shuffle_shuffle_poison2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI7_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: lui a0, 3
+; CHECK-NEXT: addi a0, a0, 803
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
+
+; Can't be optimized since first shuffle uses from two different operands
+define <16 x i16> @shuffle_shuffle_first_multi_shuffle(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI8_0
+; CHECK-NEXT: .half 0 # 0x0
+; CHECK-NEXT: .half 4 # 0x4
+; CHECK-NEXT: .half 8 # 0x8
+; CHECK-NEXT: .half 12 # 0xc
+; CHECK-NEXT: .half 1 # 0x1
+; CHECK-NEXT: .half 5 # 0x5
+; CHECK-NEXT: .half 9 # 0x9
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-LABEL: LCPI8_1
+; CHECK-NEXT: .half 3 # 0x3
+; CHECK-NEXT: .half 7 # 0x7
+; CHECK-NEXT: .half 11 # 0xb
+; CHECK-NEXT: .half 15 # 0xf
+; CHECK-NEXT: .half 2 # 0x2
+; CHECK-NEXT: .half 6 # 0x6
+; CHECK-NEXT: .half 10 # 0xa
+; CHECK-NEXT: .half 14 # 0xe
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .zero 2
+; CHECK-LABEL: shuffle_shuffle_first_multi_shuffle:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: lui a0, %hi(.LCPI8_1)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgather.vv v12, v8, v14
+; CHECK-NEXT: vrgather.vi v12, v10, 0, v0.t
+; CHECK-NEXT: vrgather.vv v8, v10, v16
+; CHECK-NEXT: vslideup.vi v12, v8, 8
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> %op1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
+
+; Can optimize, the first two shuffles use the same lanes, but only a disjoint
+; set of lanes is used for the final shuffle
+define <16 x i16> @shuffle_shuffle_duplicates_not_used(<16 x i16> %op0, <16 x i16> %op1) {
+; CHECK-LABEL: LCPI9_0
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 4 # 0x4
+; CHECK-NEXT: .byte 8 # 0x8
+; CHECK-NEXT: .byte 12 # 0xc
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 5 # 0x5
+; CHECK-NEXT: .byte 9 # 0x9
+; CHECK-NEXT: .byte 13 # 0xd
+; CHECK-NEXT: .byte 3 # 0x3
+; CHECK-NEXT: .byte 7 # 0x7
+; CHECK-NEXT: .byte 11 # 0xb
+; CHECK-NEXT: .byte 15 # 0xf
+; CHECK-NEXT: .byte 2 # 0x2
+; CHECK-NEXT: .byte 6 # 0x6
+; CHECK-NEXT: .byte 10 # 0xa
+; CHECK-NEXT: .byte 14 # 0xe
+; CHECK-LABEL: shuffle_shuffle_duplicates_not_used:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle8.v v14, (a0)
+; CHECK-NEXT: lui a0, 3
+; CHECK-NEXT: addi a0, a0, 819
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
+; CHECK-NEXT: vsext.vf2 v12, v14
+; CHECK-NEXT: vrgather.vv v8, v10, v12
+; CHECK-NEXT: ret
+ %shuff0 = shufflevector <16 x i16> %op0, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14>
+ %shuff1 = shufflevector <16 x i16> %op1, <16 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 2, i32 6, i32 10, i32 14, i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
+ %merge = shufflevector <16 x i16> %shuff0, <16 x i16> %shuff1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i16> %merge
+}
More information about the llvm-commits
mailing list