[llvm] [RISCV] Prefer vrgatherei16 for shuffles (PR #66291)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 13 14:08:04 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
<details>
<summary>Changes</summary>
If the data type is larger than e16 and the vector requires more than an LMUL1 register class, prefer the use of vrgatherei16. This has three major benefits:
1) Less work is needed to evaluate the index constant, e.g. for vid sequences. Remember that arithmetic generally scales linearly with LMUL.
2) Less register pressure. In particular, the source and index registers *can* overlap, so using a smaller index register group can significantly help at m8.
3) Smaller constants. We've got a bunch of tricks for materializing small constants and, if needed, can use an EEW=16 load.
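For concreteness, here is a minimal standalone restatement of the condition added in the RISCVISelLowering.cpp hunk below. The free-function form and parameter names are illustrative only, not the actual LLVM API; the in-tree code operates on MVTs and the RISCVSubtarget directly.
<pre>
// Sketch of the heuristic: narrow the vrgather index vector to i16 elements
// (so vrgatherei16.vv can be used) when the indices are currently wider than
// 16 bits, all possible indices still fit in 16 bits, and the index vector
// would otherwise span more than one register group at the minimum VLEN.
#include &lt;cstdint&gt;

bool shouldUseEI16Indices(unsigned IndexEltBits,  // current index element width in bits
                          uint64_t NumElts,       // number of shuffle elements
                          uint64_t IndexVTBits,   // total size of the index vector type
                          uint64_t RealMinVLen) { // guaranteed minimum VLEN in bits
  // Only worthwhile when the indices are currently wider than 16 bits.
  if (IndexEltBits <= 16)
    return false;
  // A two-source shuffle can index up to 2*NumElts - 1, so all indices must
  // still be representable in 16 bits (mirrors isUInt<16>(NumElts * 2)).
  if (NumElts * 2 > UINT16_MAX)
    return false;
  // Only bother when the index vector occupies more than one vector register
  // group (LMUL > 1) at the minimum VLEN.
  return (IndexVTBits / RealMinVLen) > 1;
}
</pre>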
Reviewers, this is a resurrection of something I started a few months back; I just stumbled across the local branch again. I vaguely remember there being a problem with the constant-materialization interaction, which appears to be resolved, but I'm a bit worried I've forgotten something here. Careful consideration appreciated.
--
Patch is 139.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66291.diff
11 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+9)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll (+6-5)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll (+47-103)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll (+42-68)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll (+47-103)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll (+44-55)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll (+413-621)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll (+108-356)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll (+107-138)
- (modified) llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll (+8-4)
<pre>
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a470ceae90ce591..961c3942ce8976e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4651,6 +4651,15 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
IndexVT = IndexVT.changeVectorElementType(MVT::i16);
}
+ // If the mask allows, we can do all the index computation in 16 bits. This
+ // requires less work and less register pressure at high LMUL, and creates
+ // smaller constants which may be cheaper to materialize.
+ if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts * 2) &&
+ (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) {
+ GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
+ IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+ }
+
MVT IndexContainerVT =
ContainerVT.changeVectorElementType(IndexVT.getScalarType());
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 19d563c0ecbacc0..a2fde2addc14e66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -33,16 +33,17 @@ define void @buildvec_no_vid_v4f32(<4 x float>* %x) {
define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize {
; CHECK-LABEL: hang_when_merging_stores_after_legalization:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: li a0, 7
; CHECK-NEXT: vmul.vx v14, v12, a0
-; CHECK-NEXT: vrgather.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vadd.vi v8, v14, -14
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v12, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v12
; CHECK-NEXT: ret
%z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 6fa9cddde622ce5..c1a4aa4b05d4bec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -36,34 +36,20 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) {
; One vXf64 test case to very that we don't optimize it.
; FIXME: Is there better codegen we can do here?
define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
-; RV32-V128-LABEL: interleave_v2f64:
-; RV32-V128: # %bb.0:
-; RV32-V128-NEXT: vmv1r.v v12, v9
-; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-V128-NEXT: vid.v v9
-; RV32-V128-NEXT: vsrl.vi v14, v9, 1
-; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v14
-; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-V128-NEXT: vmv.v.i v0, 10
-; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
-; RV32-V128-NEXT: vmv.v.v v8, v10
-; RV32-V128-NEXT: ret
-;
-; RV64-V128-LABEL: interleave_v2f64:
-; RV64-V128: # %bb.0:
-; RV64-V128-NEXT: vmv1r.v v12, v9
-; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-V128-NEXT: vid.v v10
-; RV64-V128-NEXT: vsrl.vi v14, v10, 1
-; RV64-V128-NEXT: vrgather.vv v10, v8, v14
-; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-V128-NEXT: vmv.v.i v0, 10
-; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t
-; RV64-V128-NEXT: vmv.v.v v8, v10
-; RV64-V128-NEXT: ret
+; V128-LABEL: interleave_v2f64:
+; V128: # %bb.0:
+; V128-NEXT: vmv1r.v v12, v9
+; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; V128-NEXT: vid.v v9
+; V128-NEXT: vsrl.vi v14, v9, 1
+; V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; V128-NEXT: vrgatherei16.vv v10, v8, v14
+; V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; V128-NEXT: vmv.v.i v0, 10
+; V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
+; V128-NEXT: vmv.v.v v8, v10
+; V128-NEXT: ret
;
; RV32-V512-LABEL: interleave_v2f64:
; RV32-V512: # %bb.0:
@@ -255,56 +241,34 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; RV32-V128-NEXT: addi sp, sp, -16
; RV32-V128-NEXT: .cfi_def_cfa_offset 16
; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: li a1, 24
-; RV32-V128-NEXT: mul a0, a0, a1
+; RV32-V128-NEXT: slli a0, a0, 2
; RV32-V128-NEXT: sub sp, sp, a0
-; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 3
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-V128-NEXT: addi a0, sp, 16
-; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0)
; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
; RV32-V128-NEXT: li a1, 32
; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; RV32-V128-NEXT: vle32.v v24, (a0)
+; RV32-V128-NEXT: vle16.v v4, (a0)
; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1)
; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; RV32-V128-NEXT: vle32.v v16, (a0)
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 4
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT: vle16.v v24, (a0)
+; RV32-V128-NEXT: addi a0, sp, 16
+; RV32-V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
; RV32-V128-NEXT: lui a0, 699051
; RV32-V128-NEXT: addi a0, a0, -1366
; RV32-V128-NEXT: vmv.s.x v0, a0
-; RV32-V128-NEXT: vrgather.vv v16, v8, v24
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 4
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 3
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT: vrgather.vv v16, v8, v24, v0.t
-; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-V128-NEXT: vmv4r.v v24, v8
+; RV32-V128-NEXT: vrgatherei16.vv v24, v8, v4
; RV32-V128-NEXT: addi a0, sp, 16
-; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT: vwaddu.vv v0, v8, v24
+; RV32-V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
+; RV32-V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
+; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-V128-NEXT: vwaddu.vv v0, v8, v16
; RV32-V128-NEXT: li a0, -1
-; RV32-V128-NEXT: vwmaccu.vx v0, a0, v24
+; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16
; RV32-V128-NEXT: vmv8r.v v8, v0
+; RV32-V128-NEXT: vmv8r.v v16, v24
; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: li a1, 24
-; RV32-V128-NEXT: mul a0, a0, a1
+; RV32-V128-NEXT: slli a0, a0, 2
; RV32-V128-NEXT: add sp, sp, a0
; RV32-V128-NEXT: addi sp, sp, 16
; RV32-V128-NEXT: ret
@@ -314,56 +278,34 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; RV64-V128-NEXT: addi sp, sp, -16
; RV64-V128-NEXT: .cfi_def_cfa_offset 16
; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: li a1, 24
-; RV64-V128-NEXT: mul a0, a0, a1
+; RV64-V128-NEXT: slli a0, a0, 2
; RV64-V128-NEXT: sub sp, sp, a0
-; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 3
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV64-V128-NEXT: addi a0, sp, 16
-; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0)
; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
; RV64-V128-NEXT: li a1, 32
; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; RV64-V128-NEXT: vle32.v v24, (a0)
+; RV64-V128-NEXT: vle16.v v4, (a0)
; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1)
; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; RV64-V128-NEXT: vle32.v v16, (a0)
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 4
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT: vle16.v v24, (a0)
+; RV64-V128-NEXT: addi a0, sp, 16
+; RV64-V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
; RV64-V128-NEXT: lui a0, 699051
; RV64-V128-NEXT: addiw a0, a0, -1366
; RV64-V128-NEXT: vmv.s.x v0, a0
-; RV64-V128-NEXT: vrgather.vv v16, v8, v24
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 4
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 3
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT: vrgather.vv v16, v8, v24, v0.t
-; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV64-V128-NEXT: vmv4r.v v24, v8
+; RV64-V128-NEXT: vrgatherei16.vv v24, v8, v4
; RV64-V128-NEXT: addi a0, sp, 16
-; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT: vwaddu.vv v0, v8, v24
+; RV64-V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
+; RV64-V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
+; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-V128-NEXT: vwaddu.vv v0, v8, v16
; RV64-V128-NEXT: li a0, -1
-; RV64-V128-NEXT: vwmaccu.vx v0, a0, v24
+; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16
; RV64-V128-NEXT: vmv8r.v v8, v0
+; RV64-V128-NEXT: vmv8r.v v16, v24
; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: li a1, 24
-; RV64-V128-NEXT: mul a0, a0, a1
+; RV64-V128-NEXT: slli a0, a0, 2
; RV64-V128-NEXT: add sp, sp, a0
; RV64-V128-NEXT: addi sp, sp, 16
; RV64-V128-NEXT: ret
@@ -450,10 +392,12 @@ define <4 x double> @unary_interleave_v4f64(<4 x double> %x) {
; RV64-V128: # %bb.0:
; RV64-V128-NEXT: lui a0, 12304
; RV64-V128-NEXT: addiw a0, a0, 512
-; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-V128-NEXT: vmv.s.x v10, a0
-; RV64-V128-NEXT: vsext.vf8 v12, v10
-; RV64-V128-NEXT: vrgather.vv v10, v8, v12
+; RV64-V128-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-V128-NEXT: vsext.vf2 v12, v10
+; RV64-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-V128-NEXT: vrgatherei16.vv v10, v8, v12
; RV64-V128-NEXT: vmv.v.v v8, v10
; RV64-V128-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 8d66248a1e57df4..a7852ea5843d8a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -72,10 +72,12 @@ define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) {
; RV64: # %bb.0:
; RV64-NEXT: lui a0, 4096
; RV64-NEXT: addiw a0, a0, 513
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsext.vf8 v12, v10
-; RV64-NEXT: vrgather.vv v10, v8, v12
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vsext.vf2 v12, v10
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v8, v12
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
@@ -100,10 +102,12 @@ define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) {
; RV64: # %bb.0:
; RV64-NEXT: lui a0, 4096
; RV64-NEXT: addiw a0, a0, 513
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsext.vf8 v12, v10
-; RV64-NEXT: vrgather.vv v10, v8, v12
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vsext.vf2 v12, v10
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v8, v12
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%s = shufflevector <4 x double> poison, <4 x double> %x, <4 x i32> <i32 5, i32 6, i32 4, i32 5>
@@ -111,66 +115,37 @@ define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) {
}
define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) {
-; RV32-LABEL: vrgather_shuffle_vv_v4f64:
-; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI6_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI6_0)
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vle16.v v14, (a0)
-; RV32-NEXT: vrgatherei16.vv v12, v8, v14
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 8
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV32-NEXT: vrgather.vi v12, v10, 1, v0.t
-; RV32-NEXT: vmv.v.v v8, v12
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vrgather_shuffle_vv_v4f64:
-; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI6_0)
-; RV64-NEXT: addi a0, a0, %lo(.LCPI6_0)
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vle64.v v14, (a0)
-; RV64-NEXT: vrgather.vv v12, v8, v14
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.i v0, 8
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT: vrgather.vi v12, v10, 1, v0.t
-; RV64-NEXT: vmv.v.v v8, v12
-; RV64-NEXT: ret
+; CHECK-LABEL: vrgather_shuffle_vv_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0)
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 8
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT: vrgather.vi v12, v10, 1, v0.t
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
ret <4 x double> %s
}
define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
-; RV32-LABEL: vrgather_shuffle_xv_v4f64:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-NEXT: vid.v v12
-; RV32-NEXT: lui a0, %hi(.LCPI7_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0)
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vrsub.vi v12, v12, 4
-; RV32-NEXT: vmv.v.i v0, 12
-; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
-; RV32-NEXT: vmv.v.v v8, v10
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vrgather_shuffle_xv_v4f64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: vrsub.vi v12, v10, 4
-; RV64-NEXT: lui a0, %hi(.LCPI7_0)
-; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0)
-; RV64-NEXT: vlse64.v v10, (a0), zero
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.i v0, 12
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t
-; RV64-NEXT: vmv.v.v v8, v10
-; RV64-NEXT: ret
+; CHECK-LABEL: vrgather_shuffle_xv_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: lui a0, %hi(.LCPI7_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0)
+; CHECK-NEXT: vlse64.v v10, (a0), zero
+; CHECK-NEXT: vrsub.vi v12, v12, 4
+; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
%s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
ret <4 x double> %s
}
@@ -193,17 +168,16 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
;
; RV64-LABEL: vrgather_shuffle_vx_v4f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: li a0, 3
-; RV64-NEXT: vmul.vx v12, v10, a0
+; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT: vid.v v12
; RV64-NEXT: lui a0, %hi(.LCPI8_0)
; RV64-NEXT: addi a0, a0, %lo(.LCPI8_0)
; RV64-NEXT: vlse64.v v10, (a0), zero
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: vmul.vx v12, v12, a0
; RV64-NEXT: vmv.v.i v0, 3
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV64-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index f9a64498afacc10..83e64651c5c6313 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -49,34 +49,20 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
; One vXi64 test case to very that we don't optimize it.
; FIXME: Is there better codegen we can do here?
de...
<truncated>
</pre>
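As a side note on the register-pressure win (benefit 2 above): the interleave_v32f32 hunks show the spilled index vector shrinking from an m8 group (vs8r.v, with 24*vlenb of stack reserved) to an m4 group (vs4r.v, with 4*vlenb reserved). A back-of-the-envelope sketch of that sizing arithmetic, assuming the 128-bit minimum VLEN implied by the V128 check prefixes:
<pre>
// Illustrative arithmetic only, not compiler code: index register group size
// for a <32 x float> shuffle with e32 vs e16 indices at VLEN=128.
#include &lt;cstdio&gt;

int main() {
  const unsigned VLen = 128;    // bits per vector register (Zvl128b minimum)
  const unsigned NumElts = 32;  // elements in each <32 x float> source

  // e32 indices: 32 * 32 = 1024 bits -> LMUL=8, a full 8-register group.
  unsigned LMulE32 = (NumElts * 32) / VLen;
  // e16 indices: 32 * 16 = 512 bits -> LMUL=4, half the registers.
  unsigned LMulE16 = (NumElts * 16) / VLen;

  std::printf("e32 index LMUL=%u, e16 index LMUL=%u\n", LMulE32, LMulE16);
  return 0;
}
</pre>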
</details>
https://github.com/llvm/llvm-project/pull/66291
More information about the llvm-commits mailing list