[llvm] [RISCV] Custom legalize <N x i128>, <4 x i256>, etc.. shuffles (PR #122352)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 9 11:46:56 PST 2025
https://github.com/preames created https://github.com/llvm/llvm-project/pull/122352
I have a particular user downstream who likes to write shuffles in terms of unions involving _BitInt(128) types. This isn't completely crazy because there's a bunch of code in the wild which was written with SSE in mind, so 128 bits is a common data fragment size.
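For a rough idea of the source pattern, here's a hypothetical sketch (the union and the names Frag128/permute_0032 are made up for illustration, not the user's actual code):

  /* The same 128-bit fragment viewed either as a _BitInt(128) or as two
     64-bit lanes; the shuffle is then written element-wise over the wide
     member. */
  typedef union {
    _BitInt(128) wide;
    unsigned long long lanes[2];
  } Frag128;

  /* Permute four 128-bit fragments with the <0, 0, 3, 2> mask used in the
     tests below. Whether this actually becomes a <4 x i128> shuffle depends
     on the frontend/vectorizer, so treat it as illustrative only. */
  void permute_0032(Frag128 dst[4], const Frag128 src[4]) {
    dst[0] = src[0];
    dst[1] = src[0];
    dst[2] = src[3];
    dst[3] = src[2];
  }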
The problem is that generic lowering scalarizes this to ELEN, and we end up with really terrible extract/insert sequences if the i128 shuffle is between other (non-i128) operations.
I explored trying to do this via generic lowering infrastructure, and frankly got lost. Doing this as a target specific DAG combine is a bit ugly - really, there's nothing hugely target specific here - but oh well. If reviewers prefer, I could probably phrase this as a generic DAG combine instead, but I'm not sure that's hugely better. If reviewers have a strong preference on how to handle this, let me know, but I may need a bit of help.
A couple notes:
* The argument passing weirdness is due to a missing combine to turn a build_vector of adjacent i64 loads back into a vector load. I'm a bit surprised we don't get that, but the isel output clearly has the build_vector at i64.
* The splat case I plan to revisit in another patch. That's a relatively common pattern, and the fact I have to scalarize that to avoid an infinite loop is non-ideal.
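To make the transform concrete: for the <4 x i128> shuffle in the tests, the combine bitcasts both operands to <8 x i64> and scales the mask by two. Here's a small standalone sketch of that mask arithmetic (plain C written for this note; it mirrors what I believe narrowShuffleMaskElts(2, ...) does, and is not the LLVM helper itself):

  #include <stdio.h>

  /* Each wide-element index becomes two adjacent narrow-element indices;
     negative (sentinel/poison) indices are simply replicated. */
  static void narrow_mask_by_2(const int *mask, int n, int *out) {
    for (int i = 0; i < n; ++i) {
      out[2 * i + 0] = mask[i] < 0 ? mask[i] : 2 * mask[i] + 0;
      out[2 * i + 1] = mask[i] < 0 ? mask[i] : 2 * mask[i] + 1;
    }
  }

  int main(void) {
    int mask[4] = {0, 0, 3, 2};   /* the <4 x i128> mask from the tests */
    int out[8];
    narrow_mask_by_2(mask, 4, out);
    for (int i = 0; i < 8; ++i)
      printf("%d ", out[i]);      /* prints: 0 1 0 1 6 7 4 5 */
    printf("\n");
    return 0;
  }

For <4 x i256>, the same combine should simply fire again on the narrowed shuffle until the element type reaches ELEN.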
From ae3c85d432a9af391a601186ec842a4da8a6fff7 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 9 Jan 2025 09:54:47 -0800
Subject: [PATCH] [RISCV] Custom legalize <N x i128>, <4 x i256>, etc..
shuffles
I have a particular user downstream who likes to write shuffles
in terms of unions involving _BitInt(128) types. This isn't completely
crazy because there's a bunch of code in the wild which was written
with SSE in mind, so 128 bits is a common data fragment size.
The problem is that generic lowering scalarizes this to ELEN, and we
end up with really terrible extract/insert sequences if the i128
shuffle is between other (non-i128) operations.
I explored trying to do this via generic lowering infrastructure, and
frankly got lost. Doing this as a target specific DAG combine is a bit ugly -
really, there's nothing hugely target specific here - but oh well.
If reviewers prefer, I could probably phrase this as a generic DAG
combine, but I'm not sure that's hugely better. If reviewers have
a strong preference on how to handle this, let me know, but I may
need a bit of help.
A couple notes:
* The argument passing weirdness is due to a missing combine to turn
a build_vector of adjacent i64 loads back into a vector load. I'm
a bit surprised we don't get that, but the isel output clearly has
the build_vector at i64.
* The splat case I plan to revisit in another patch. That's a relatively
common pattern, and the fact I have to scalarize that to avoid an
infinite loop is non-ideal.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 38 +-
.../RISCV/rvv/fixed-vectors-int-shuffles.ll | 367 ++++++++++--------
.../rvv/fixed-vectors-shuffle-exact-vlen.ll | 189 ++-------
3 files changed, 255 insertions(+), 339 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6c58989b1afb4c..d3fc182319bba1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1530,7 +1530,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL,
ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
- ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP});
+ ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP,
+ ISD::VECTOR_SHUFFLE});
if (Subtarget.hasVendorXTHeadMemPair())
setTargetDAGCombine({ISD::LOAD, ISD::STORE});
if (Subtarget.useRVVForFixedLengthVectors())
@@ -16926,6 +16927,37 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(VT.getSimpleVT(), StridedLoad);
}
+/// Custom legalize <N x i128> or <N x i256> to <M x ELEN>. This runs
+/// during the combine phase before type legalization, and relies on
+/// DAGCombine not undoing the transform if isShuffleMaskLegal returns false
+/// for the source mask.
+static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
+ const RISCVTargetLowering &TLI) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ const unsigned ElementSize = VT.getScalarSizeInBits();
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+
+ if (TLI.isTypeLegal(VT) || ElementSize <= Subtarget.getELen() ||
+ !isPowerOf2_64(ElementSize) || VT.getVectorNumElements() % 2 != 0 ||
+ VT.isFloatingPoint() || TLI.isShuffleMaskLegal(Mask, VT))
+ return SDValue();
+
+ SmallVector<int, 8> NewMask;
+ narrowShuffleMaskElts(2, Mask, NewMask);
+
+ LLVMContext &C = *DAG.getContext();
+ EVT NewEltVT = EVT::getIntegerVT(C, ElementSize / 2);
+ EVT NewVT = EVT::getVectorVT(C, NewEltVT, VT.getVectorNumElements() * 2);
+ SDValue Res = DAG.getVectorShuffle(NewVT, DL, DAG.getBitcast(NewVT, V1),
+ DAG.getBitcast(NewVT, V2), NewMask);
+ return DAG.getBitcast(VT, Res);
+}
+
+
static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
@@ -18155,6 +18187,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
return V;
break;
+ case ISD::VECTOR_SHUFFLE:
+ if (SDValue V = performVECTOR_SHUFFLECombine(N, DAG, Subtarget, *this))
+ return V;
+ break;
case ISD::INSERT_VECTOR_ELT:
if (SDValue V = performINSERT_VECTOR_ELTCombine(N, DAG, Subtarget, *this))
return V;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 8915603471ec7f..f397fdde2cba7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1145,132 +1145,200 @@ define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
define <4 x i128> @shuffle_i128(<4 x i128> %a) {
; RV32-LABEL: shuffle_i128:
; RV32: # %bb.0:
-; RV32-NEXT: lw a2, 0(a1)
-; RV32-NEXT: lw a3, 4(a1)
-; RV32-NEXT: lw a4, 8(a1)
-; RV32-NEXT: lw a5, 12(a1)
-; RV32-NEXT: lw a6, 48(a1)
-; RV32-NEXT: lw a7, 52(a1)
-; RV32-NEXT: lw t0, 56(a1)
-; RV32-NEXT: lw t1, 60(a1)
-; RV32-NEXT: lw t2, 32(a1)
-; RV32-NEXT: lw t3, 36(a1)
-; RV32-NEXT: lw t4, 40(a1)
-; RV32-NEXT: lw a1, 44(a1)
-; RV32-NEXT: sw t2, 48(a0)
-; RV32-NEXT: sw t3, 52(a0)
-; RV32-NEXT: sw t4, 56(a0)
-; RV32-NEXT: sw a1, 60(a0)
-; RV32-NEXT: sw a6, 32(a0)
-; RV32-NEXT: sw a7, 36(a0)
-; RV32-NEXT: sw t0, 40(a0)
-; RV32-NEXT: sw t1, 44(a0)
-; RV32-NEXT: sw a2, 16(a0)
-; RV32-NEXT: sw a3, 20(a0)
-; RV32-NEXT: sw a4, 24(a0)
-; RV32-NEXT: sw a5, 28(a0)
-; RV32-NEXT: sw a2, 0(a0)
-; RV32-NEXT: sw a3, 4(a0)
-; RV32-NEXT: sw a4, 8(a0)
-; RV32-NEXT: sw a5, 12(a0)
+; RV32-NEXT: addi sp, sp, -128
+; RV32-NEXT: .cfi_def_cfa_offset 128
+; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 128
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: lw a2, 60(a1)
+; RV32-NEXT: sw a2, 60(sp)
+; RV32-NEXT: lw a2, 56(a1)
+; RV32-NEXT: sw a2, 56(sp)
+; RV32-NEXT: lw a2, 52(a1)
+; RV32-NEXT: sw a2, 52(sp)
+; RV32-NEXT: lw a2, 48(a1)
+; RV32-NEXT: sw a2, 48(sp)
+; RV32-NEXT: lw a2, 44(a1)
+; RV32-NEXT: sw a2, 44(sp)
+; RV32-NEXT: lw a2, 40(a1)
+; RV32-NEXT: sw a2, 40(sp)
+; RV32-NEXT: lw a2, 36(a1)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: lw a2, 32(a1)
+; RV32-NEXT: sw a2, 32(sp)
+; RV32-NEXT: lw a2, 12(a1)
+; RV32-NEXT: sw a2, 12(sp)
+; RV32-NEXT: lw a2, 8(a1)
+; RV32-NEXT: sw a2, 8(sp)
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: sw a2, 4(sp)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: mv a2, sp
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: lui a1, %hi(.LCPI78_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI78_0)
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v8, (a2)
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vle16.v v12, (a1)
+; RV32-NEXT: vrgatherei16.vv v16, v8, v12
+; RV32-NEXT: vse64.v v16, (a0)
+; RV32-NEXT: addi sp, s0, -128
+; RV32-NEXT: .cfi_def_cfa sp, 128
+; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 128
+; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: shuffle_i128:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -128
+; RV64-NEXT: .cfi_def_cfa_offset 128
+; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 128
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: ld a2, 56(a1)
+; RV64-NEXT: sd a2, 56(sp)
; RV64-NEXT: ld a2, 48(a1)
-; RV64-NEXT: ld a3, 56(a1)
-; RV64-NEXT: ld a4, 0(a1)
-; RV64-NEXT: ld a5, 8(a1)
-; RV64-NEXT: ld a6, 32(a1)
-; RV64-NEXT: ld a1, 40(a1)
-; RV64-NEXT: sd a2, 32(a0)
-; RV64-NEXT: sd a3, 40(a0)
-; RV64-NEXT: sd a6, 48(a0)
-; RV64-NEXT: sd a1, 56(a0)
-; RV64-NEXT: sd a4, 0(a0)
-; RV64-NEXT: sd a5, 8(a0)
-; RV64-NEXT: sd a4, 16(a0)
-; RV64-NEXT: sd a5, 24(a0)
+; RV64-NEXT: sd a2, 48(sp)
+; RV64-NEXT: ld a2, 40(a1)
+; RV64-NEXT: sd a2, 40(sp)
+; RV64-NEXT: ld a2, 32(a1)
+; RV64-NEXT: sd a2, 32(sp)
+; RV64-NEXT: ld a2, 8(a1)
+; RV64-NEXT: sd a2, 8(sp)
+; RV64-NEXT: ld a1, 0(a1)
+; RV64-NEXT: mv a2, sp
+; RV64-NEXT: sd a1, 0(sp)
+; RV64-NEXT: lui a1, %hi(.LCPI78_0)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI78_0)
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vle64.v v8, (a2)
+; RV64-NEXT: vle16.v v12, (a1)
+; RV64-NEXT: vrgatherei16.vv v16, v8, v12
+; RV64-NEXT: vse64.v v16, (a0)
+; RV64-NEXT: addi sp, s0, -128
+; RV64-NEXT: .cfi_def_cfa sp, 128
+; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 128
+; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
ret <4 x i128> %res
}
define void @shuffle_i128_ldst(ptr %p) {
-; RV32-LABEL: shuffle_i128_ldst:
+; CHECK-LABEL: shuffle_i128_ldst:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: lui a1, %hi(.LCPI79_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI79_0)
+; CHECK-NEXT: vle16.v v12, (a1)
+; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
+; CHECK-NEXT: vse64.v v16, (a0)
+; CHECK-NEXT: ret
+ %a = load <4 x i128>, ptr %p
+ %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ store <4 x i128> %res, ptr %p
+ ret void
+}
+
+define void @shuffle_i256_ldst(ptr %p) {
+; CHECK-LABEL: shuffle_i256_ldst:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI80_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI80_0)
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a1)
+; CHECK-NEXT: vle64.v v16, (a0)
+; CHECK-NEXT: vsext.vf2 v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v24, v16, v10
+; CHECK-NEXT: vse64.v v24, (a0)
+; CHECK-NEXT: ret
+ %a = load <4 x i256>, ptr %p
+ %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ store <4 x i256> %res, ptr %p
+ ret void
+}
+
+define void @shuffle_i64_splat(ptr %p) nounwind {
+; RV32-LABEL: shuffle_i64_splat:
; RV32: # %bb.0:
-; RV32-NEXT: lw a1, 48(a0)
-; RV32-NEXT: lw a2, 52(a0)
-; RV32-NEXT: lw a3, 56(a0)
-; RV32-NEXT: lw a4, 60(a0)
-; RV32-NEXT: lw a5, 0(a0)
-; RV32-NEXT: lw a6, 4(a0)
-; RV32-NEXT: lw a7, 8(a0)
-; RV32-NEXT: lw t0, 12(a0)
-; RV32-NEXT: lw t1, 32(a0)
-; RV32-NEXT: lw t2, 36(a0)
-; RV32-NEXT: lw t3, 40(a0)
-; RV32-NEXT: lw t4, 44(a0)
-; RV32-NEXT: sw t1, 48(a0)
-; RV32-NEXT: sw t2, 52(a0)
-; RV32-NEXT: sw t3, 56(a0)
-; RV32-NEXT: sw t4, 60(a0)
-; RV32-NEXT: sw a5, 16(a0)
-; RV32-NEXT: sw a6, 20(a0)
-; RV32-NEXT: sw a7, 24(a0)
-; RV32-NEXT: sw t0, 28(a0)
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shuffle_i64_splat:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a1, 0(a0)
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a1
+; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: ret
+ %a = load <4 x i64>, ptr %p
+ %res = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ store <4 x i64> %res, ptr %p
+ ret void
+}
+
+define void @shuffle_i128_splat(ptr %p) nounwind {
+; RV32-LABEL: shuffle_i128_splat:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a1, 0(a0)
+; RV32-NEXT: lw a2, 4(a0)
+; RV32-NEXT: lw a3, 8(a0)
+; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: sw a1, 48(a0)
+; RV32-NEXT: sw a2, 52(a0)
+; RV32-NEXT: sw a3, 56(a0)
+; RV32-NEXT: sw a4, 60(a0)
+; RV32-NEXT: sw a1, 16(a0)
+; RV32-NEXT: sw a2, 20(a0)
+; RV32-NEXT: sw a3, 24(a0)
+; RV32-NEXT: sw a4, 28(a0)
; RV32-NEXT: sw a1, 32(a0)
; RV32-NEXT: sw a2, 36(a0)
; RV32-NEXT: sw a3, 40(a0)
; RV32-NEXT: sw a4, 44(a0)
; RV32-NEXT: ret
;
-; RV64-LABEL: shuffle_i128_ldst:
+; RV64-LABEL: shuffle_i128_splat:
; RV64: # %bb.0:
; RV64-NEXT: ld a1, 0(a0)
; RV64-NEXT: ld a2, 8(a0)
-; RV64-NEXT: ld a3, 32(a0)
-; RV64-NEXT: ld a4, 40(a0)
-; RV64-NEXT: ld a5, 48(a0)
-; RV64-NEXT: ld a6, 56(a0)
-; RV64-NEXT: sd a3, 48(a0)
-; RV64-NEXT: sd a4, 56(a0)
+; RV64-NEXT: sd a1, 48(a0)
+; RV64-NEXT: sd a2, 56(a0)
; RV64-NEXT: sd a1, 16(a0)
; RV64-NEXT: sd a2, 24(a0)
-; RV64-NEXT: sd a5, 32(a0)
-; RV64-NEXT: sd a6, 40(a0)
+; RV64-NEXT: sd a1, 32(a0)
+; RV64-NEXT: sd a2, 40(a0)
; RV64-NEXT: ret
%a = load <4 x i128>, ptr %p
- %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
store <4 x i128> %res, ptr %p
ret void
}
-define void @shuffle_i256_ldst(ptr %p) {
-; RV32-LABEL: shuffle_i256_ldst:
+define void @shuffle_i256_splat(ptr %p) nounwind {
+; RV32-LABEL: shuffle_i256_splat:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
-; RV32-NEXT: .cfi_offset s2, -12
-; RV32-NEXT: .cfi_offset s3, -16
-; RV32-NEXT: .cfi_offset s4, -20
-; RV32-NEXT: .cfi_offset s5, -24
-; RV32-NEXT: .cfi_offset s6, -28
-; RV32-NEXT: .cfi_offset s7, -32
-; RV32-NEXT: .cfi_offset s8, -36
-; RV32-NEXT: .cfi_offset s9, -40
; RV32-NEXT: lw a1, 0(a0)
; RV32-NEXT: lw a2, 4(a0)
; RV32-NEXT: lw a3, 8(a0)
@@ -1279,38 +1347,22 @@ define void @shuffle_i256_ldst(ptr %p) {
; RV32-NEXT: lw a6, 20(a0)
; RV32-NEXT: lw a7, 24(a0)
; RV32-NEXT: lw t0, 28(a0)
-; RV32-NEXT: lw t1, 96(a0)
-; RV32-NEXT: lw t2, 100(a0)
-; RV32-NEXT: lw t3, 104(a0)
-; RV32-NEXT: lw t4, 108(a0)
-; RV32-NEXT: lw t5, 112(a0)
-; RV32-NEXT: lw t6, 116(a0)
-; RV32-NEXT: lw s0, 120(a0)
-; RV32-NEXT: lw s1, 124(a0)
-; RV32-NEXT: lw s2, 64(a0)
-; RV32-NEXT: lw s3, 68(a0)
-; RV32-NEXT: lw s4, 72(a0)
-; RV32-NEXT: lw s5, 76(a0)
-; RV32-NEXT: lw s6, 80(a0)
-; RV32-NEXT: lw s7, 84(a0)
-; RV32-NEXT: lw s8, 88(a0)
-; RV32-NEXT: lw s9, 92(a0)
-; RV32-NEXT: sw s6, 112(a0)
-; RV32-NEXT: sw s7, 116(a0)
-; RV32-NEXT: sw s8, 120(a0)
-; RV32-NEXT: sw s9, 124(a0)
-; RV32-NEXT: sw s2, 96(a0)
-; RV32-NEXT: sw s3, 100(a0)
-; RV32-NEXT: sw s4, 104(a0)
-; RV32-NEXT: sw s5, 108(a0)
-; RV32-NEXT: sw t5, 80(a0)
-; RV32-NEXT: sw t6, 84(a0)
-; RV32-NEXT: sw s0, 88(a0)
-; RV32-NEXT: sw s1, 92(a0)
-; RV32-NEXT: sw t1, 64(a0)
-; RV32-NEXT: sw t2, 68(a0)
-; RV32-NEXT: sw t3, 72(a0)
-; RV32-NEXT: sw t4, 76(a0)
+; RV32-NEXT: sw a5, 112(a0)
+; RV32-NEXT: sw a6, 116(a0)
+; RV32-NEXT: sw a7, 120(a0)
+; RV32-NEXT: sw t0, 124(a0)
+; RV32-NEXT: sw a1, 96(a0)
+; RV32-NEXT: sw a2, 100(a0)
+; RV32-NEXT: sw a3, 104(a0)
+; RV32-NEXT: sw a4, 108(a0)
+; RV32-NEXT: sw a5, 80(a0)
+; RV32-NEXT: sw a6, 84(a0)
+; RV32-NEXT: sw a7, 88(a0)
+; RV32-NEXT: sw t0, 92(a0)
+; RV32-NEXT: sw a1, 64(a0)
+; RV32-NEXT: sw a2, 68(a0)
+; RV32-NEXT: sw a3, 72(a0)
+; RV32-NEXT: sw a4, 76(a0)
; RV32-NEXT: sw a5, 48(a0)
; RV32-NEXT: sw a6, 52(a0)
; RV32-NEXT: sw a7, 56(a0)
@@ -1319,59 +1371,30 @@ define void @shuffle_i256_ldst(ptr %p) {
; RV32-NEXT: sw a2, 36(a0)
; RV32-NEXT: sw a3, 40(a0)
; RV32-NEXT: sw a4, 44(a0)
-; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
-; RV32-NEXT: .cfi_restore s2
-; RV32-NEXT: .cfi_restore s3
-; RV32-NEXT: .cfi_restore s4
-; RV32-NEXT: .cfi_restore s5
-; RV32-NEXT: .cfi_restore s6
-; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
-; RV64-LABEL: shuffle_i256_ldst:
+; RV64-LABEL: shuffle_i256_splat:
; RV64: # %bb.0:
-; RV64-NEXT: ld a1, 96(a0)
-; RV64-NEXT: ld a2, 104(a0)
-; RV64-NEXT: ld a3, 112(a0)
-; RV64-NEXT: ld a4, 120(a0)
-; RV64-NEXT: ld a5, 0(a0)
-; RV64-NEXT: ld a6, 8(a0)
-; RV64-NEXT: ld a7, 16(a0)
-; RV64-NEXT: ld t0, 24(a0)
-; RV64-NEXT: ld t1, 64(a0)
-; RV64-NEXT: ld t2, 72(a0)
-; RV64-NEXT: ld t3, 80(a0)
-; RV64-NEXT: ld t4, 88(a0)
-; RV64-NEXT: sd t1, 96(a0)
-; RV64-NEXT: sd t2, 104(a0)
-; RV64-NEXT: sd t3, 112(a0)
-; RV64-NEXT: sd t4, 120(a0)
-; RV64-NEXT: sd a5, 32(a0)
-; RV64-NEXT: sd a6, 40(a0)
-; RV64-NEXT: sd a7, 48(a0)
-; RV64-NEXT: sd t0, 56(a0)
+; RV64-NEXT: ld a1, 0(a0)
+; RV64-NEXT: ld a2, 8(a0)
+; RV64-NEXT: ld a3, 16(a0)
+; RV64-NEXT: ld a4, 24(a0)
+; RV64-NEXT: sd a1, 96(a0)
+; RV64-NEXT: sd a2, 104(a0)
+; RV64-NEXT: sd a3, 112(a0)
+; RV64-NEXT: sd a4, 120(a0)
+; RV64-NEXT: sd a1, 32(a0)
+; RV64-NEXT: sd a2, 40(a0)
+; RV64-NEXT: sd a3, 48(a0)
+; RV64-NEXT: sd a4, 56(a0)
; RV64-NEXT: sd a1, 64(a0)
; RV64-NEXT: sd a2, 72(a0)
; RV64-NEXT: sd a3, 80(a0)
; RV64-NEXT: sd a4, 88(a0)
; RV64-NEXT: ret
%a = load <4 x i256>, ptr %p
- %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
store <4 x i256> %res, ptr %p
ret void
}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 4603c0d24f5d79..a42a4b0d3531c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -402,49 +402,16 @@ entry:
}
define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) {
-; RV32-LABEL: shuffle_i128_ldst:
-; RV32: # %bb.0:
-; RV32-NEXT: lw a1, 48(a0)
-; RV32-NEXT: lw a2, 52(a0)
-; RV32-NEXT: lw a3, 56(a0)
-; RV32-NEXT: lw a4, 60(a0)
-; RV32-NEXT: lw a5, 0(a0)
-; RV32-NEXT: lw a6, 4(a0)
-; RV32-NEXT: lw a7, 8(a0)
-; RV32-NEXT: lw t0, 12(a0)
-; RV32-NEXT: lw t1, 32(a0)
-; RV32-NEXT: lw t2, 36(a0)
-; RV32-NEXT: lw t3, 40(a0)
-; RV32-NEXT: lw t4, 44(a0)
-; RV32-NEXT: sw t1, 48(a0)
-; RV32-NEXT: sw t2, 52(a0)
-; RV32-NEXT: sw t3, 56(a0)
-; RV32-NEXT: sw t4, 60(a0)
-; RV32-NEXT: sw a5, 16(a0)
-; RV32-NEXT: sw a6, 20(a0)
-; RV32-NEXT: sw a7, 24(a0)
-; RV32-NEXT: sw t0, 28(a0)
-; RV32-NEXT: sw a1, 32(a0)
-; RV32-NEXT: sw a2, 36(a0)
-; RV32-NEXT: sw a3, 40(a0)
-; RV32-NEXT: sw a4, 44(a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: shuffle_i128_ldst:
-; RV64: # %bb.0:
-; RV64-NEXT: ld a1, 0(a0)
-; RV64-NEXT: ld a2, 8(a0)
-; RV64-NEXT: ld a3, 32(a0)
-; RV64-NEXT: ld a4, 40(a0)
-; RV64-NEXT: ld a5, 48(a0)
-; RV64-NEXT: ld a6, 56(a0)
-; RV64-NEXT: sd a3, 48(a0)
-; RV64-NEXT: sd a4, 56(a0)
-; RV64-NEXT: sd a1, 16(a0)
-; RV64-NEXT: sd a2, 24(a0)
-; RV64-NEXT: sd a5, 32(a0)
-; RV64-NEXT: sd a6, 40(a0)
-; RV64-NEXT: ret
+; CHECK-LABEL: shuffle_i128_ldst:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl4re64.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v8
+; CHECK-NEXT: vmv4r.v v12, v8
+; CHECK-NEXT: vmv1r.v v14, v11
+; CHECK-NEXT: vmv1r.v v15, v10
+; CHECK-NEXT: vs4r.v v12, (a0)
+; CHECK-NEXT: ret
%a = load <4 x i128>, ptr %p
%res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
store <4 x i128> %res, ptr %p
@@ -452,129 +419,19 @@ define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) {
}
define void @shuffle_i256_ldst(ptr %p) vscale_range(2,2) {
-; RV32-LABEL: shuffle_i256_ldst:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
-; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
-; RV32-NEXT: .cfi_offset s2, -12
-; RV32-NEXT: .cfi_offset s3, -16
-; RV32-NEXT: .cfi_offset s4, -20
-; RV32-NEXT: .cfi_offset s5, -24
-; RV32-NEXT: .cfi_offset s6, -28
-; RV32-NEXT: .cfi_offset s7, -32
-; RV32-NEXT: .cfi_offset s8, -36
-; RV32-NEXT: .cfi_offset s9, -40
-; RV32-NEXT: lw a1, 0(a0)
-; RV32-NEXT: lw a2, 4(a0)
-; RV32-NEXT: lw a3, 8(a0)
-; RV32-NEXT: lw a4, 12(a0)
-; RV32-NEXT: lw a5, 16(a0)
-; RV32-NEXT: lw a6, 20(a0)
-; RV32-NEXT: lw a7, 24(a0)
-; RV32-NEXT: lw t0, 28(a0)
-; RV32-NEXT: lw t1, 96(a0)
-; RV32-NEXT: lw t2, 100(a0)
-; RV32-NEXT: lw t3, 104(a0)
-; RV32-NEXT: lw t4, 108(a0)
-; RV32-NEXT: lw t5, 112(a0)
-; RV32-NEXT: lw t6, 116(a0)
-; RV32-NEXT: lw s0, 120(a0)
-; RV32-NEXT: lw s1, 124(a0)
-; RV32-NEXT: lw s2, 64(a0)
-; RV32-NEXT: lw s3, 68(a0)
-; RV32-NEXT: lw s4, 72(a0)
-; RV32-NEXT: lw s5, 76(a0)
-; RV32-NEXT: lw s6, 80(a0)
-; RV32-NEXT: lw s7, 84(a0)
-; RV32-NEXT: lw s8, 88(a0)
-; RV32-NEXT: lw s9, 92(a0)
-; RV32-NEXT: sw s6, 112(a0)
-; RV32-NEXT: sw s7, 116(a0)
-; RV32-NEXT: sw s8, 120(a0)
-; RV32-NEXT: sw s9, 124(a0)
-; RV32-NEXT: sw s2, 96(a0)
-; RV32-NEXT: sw s3, 100(a0)
-; RV32-NEXT: sw s4, 104(a0)
-; RV32-NEXT: sw s5, 108(a0)
-; RV32-NEXT: sw t5, 80(a0)
-; RV32-NEXT: sw t6, 84(a0)
-; RV32-NEXT: sw s0, 88(a0)
-; RV32-NEXT: sw s1, 92(a0)
-; RV32-NEXT: sw t1, 64(a0)
-; RV32-NEXT: sw t2, 68(a0)
-; RV32-NEXT: sw t3, 72(a0)
-; RV32-NEXT: sw t4, 76(a0)
-; RV32-NEXT: sw a5, 48(a0)
-; RV32-NEXT: sw a6, 52(a0)
-; RV32-NEXT: sw a7, 56(a0)
-; RV32-NEXT: sw t0, 60(a0)
-; RV32-NEXT: sw a1, 32(a0)
-; RV32-NEXT: sw a2, 36(a0)
-; RV32-NEXT: sw a3, 40(a0)
-; RV32-NEXT: sw a4, 44(a0)
-; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
-; RV32-NEXT: .cfi_restore s2
-; RV32-NEXT: .cfi_restore s3
-; RV32-NEXT: .cfi_restore s4
-; RV32-NEXT: .cfi_restore s5
-; RV32-NEXT: .cfi_restore s6
-; RV32-NEXT: .cfi_restore s7
-; RV32-NEXT: .cfi_restore s8
-; RV32-NEXT: .cfi_restore s9
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: shuffle_i256_ldst:
-; RV64: # %bb.0:
-; RV64-NEXT: ld a1, 96(a0)
-; RV64-NEXT: ld a2, 104(a0)
-; RV64-NEXT: ld a3, 112(a0)
-; RV64-NEXT: ld a4, 120(a0)
-; RV64-NEXT: ld a5, 0(a0)
-; RV64-NEXT: ld a6, 8(a0)
-; RV64-NEXT: ld a7, 16(a0)
-; RV64-NEXT: ld t0, 24(a0)
-; RV64-NEXT: ld t1, 64(a0)
-; RV64-NEXT: ld t2, 72(a0)
-; RV64-NEXT: ld t3, 80(a0)
-; RV64-NEXT: ld t4, 88(a0)
-; RV64-NEXT: sd t1, 96(a0)
-; RV64-NEXT: sd t2, 104(a0)
-; RV64-NEXT: sd t3, 112(a0)
-; RV64-NEXT: sd t4, 120(a0)
-; RV64-NEXT: sd a5, 32(a0)
-; RV64-NEXT: sd a6, 40(a0)
-; RV64-NEXT: sd a7, 48(a0)
-; RV64-NEXT: sd t0, 56(a0)
-; RV64-NEXT: sd a1, 64(a0)
-; RV64-NEXT: sd a2, 72(a0)
-; RV64-NEXT: sd a3, 80(a0)
-; RV64-NEXT: sd a4, 88(a0)
-; RV64-NEXT: ret
+; CHECK-LABEL: shuffle_i256_ldst:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl8re64.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: vmv1r.v v11, v9
+; CHECK-NEXT: vmv8r.v v16, v8
+; CHECK-NEXT: vmv1r.v v20, v14
+; CHECK-NEXT: vmv1r.v v21, v15
+; CHECK-NEXT: vmv1r.v v22, v12
+; CHECK-NEXT: vmv1r.v v23, v13
+; CHECK-NEXT: vs8r.v v16, (a0)
+; CHECK-NEXT: ret
%a = load <4 x i256>, ptr %p
%res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
store <4 x i256> %res, ptr %p