[llvm] [RISCV] Use subreg extract for extract_vector_elt when vlen is known (PR #72666)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 27 08:26:31 PST 2023
https://github.com/preames updated https://github.com/llvm/llvm-project/pull/72666
From 799588a8e20f6bcfa0e034e727d03f43b2764dbf Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 16 Nov 2023 15:15:43 -0800
Subject: [PATCH 1/2] [RISCV] Use subreg extract for extract_vector_elt when
vlen is known
This is the first in a planned patch series to teach our vector lowering
how to exploit register boundaries in LMUL>1 types when VLEN is known
to be an exact constant. This corresponds to code compiled by clang
with the -mrvv-vector-bits=zvl option.
For extract_vector_elt, if we have a constant index and a known VLEN,
we can identify which register of the register group is being
accessed. Given this, we can perform a sub-register extract of that
register and then rewrite the index to be relative to it.
This makes all constant-index extracts m1 operations, and thus
eliminates the complexity concern for explode-vector idioms at high
LMUL.
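As a rough sketch of the index split (illustrative only, not part of
the patch), take the new extractelt_v16i32_idx7_exact_vlen test below:
an exact VLEN of 128 and a <16 x i32> extract at index 7.
  unsigned MinVLen = 128;                      // exact VLEN (zvl128b)
  unsigned ElemSize = 32;                      // i32 element width in bits
  unsigned ElemsPerVReg = MinVLen / ElemSize;  // 4 elements per vector register
  unsigned OrigIdx = 7;                        // constant extract index
  unsigned SubRegIdx = OrigIdx / ElemsPerVReg; // 1 -> second register of the m4 group (v9)
  unsigned RemIdx = OrigIdx % ElemsPerVReg;    // 3 -> slide down by 3 within that register
The m4 extract therefore lowers to an m1 vslidedown.vi on v9 followed
by vmv.x.s, as the test checks.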
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 23 ++++
.../CodeGen/RISCV/rvv/extractelt-int-rv64.ll | 29 +++-
.../RISCV/rvv/fixed-vectors-extract.ll | 28 ++++
.../rvv/fixed-vectors-int-explodevector.ll | 130 ++++++++++++++++++
4 files changed, 206 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f89f300a4e9e50c..e9e6e92ea06fbac 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7895,6 +7895,29 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
}
+ // If we're compiling for an exact VLEN value and we have a known
+ // constant index, we can always perform the extract in m1 (or
+ // smaller) as we can determine the register corresponding to
+ // the index in the register group.
+ const unsigned MinVLen = Subtarget.getRealMinVLen();
+ const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
+ IdxC && MinVLen == MaxVLen &&
+ VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ unsigned OrigIdx = IdxC->getZExtValue();
+ EVT ElemVT = VecVT.getVectorElementType();
+ unsigned ElemSize = ElemVT.getSizeInBits().getKnownMinValue();
+ unsigned ElemsPerVReg = MinVLen / ElemSize;
+ unsigned RemIdx = OrigIdx % ElemsPerVReg;
+ unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
+ unsigned ExtractIdx =
+ SubRegIdx * ContainerVT.getVectorElementCount().getKnownMinValue();
+ ContainerVT = getLMUL1VT(ContainerVT);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ Idx = DAG.getVectorIdxConstant(RemIdx, DL);
+ }
+
// Reduce the LMUL of our slidedown and vmv.x.s to the smallest LMUL which
// contains our index.
std::optional<uint64_t> MaxIdx;
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index 34dcce3fe058bc9..9df0871046959ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -697,6 +697,27 @@ define i64 @extractelt_nxv8i64_imm(<vscale x 8 x i64> %v) {
ret i64 %r
}
+define i64 @extractelt_nxv8i64_2_exact_vlen(<vscale x 8 x i64> %v) vscale_range(2,2) {
+; CHECK-LABEL: extractelt_nxv8i64_2_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: ret
+ %r = extractelement <vscale x 8 x i64> %v, i32 2
+ ret i64 %r
+}
+
+define i64 @extractelt_nxv8i64_15_exact_vlen(<vscale x 8 x i64> %v) vscale_range(2,2) {
+; CHECK-LABEL: extractelt_nxv8i64_15_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v15, 1
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %r = extractelement <vscale x 8 x i64> %v, i32 15
+ ret i64 %r
+}
+
define i64 @extractelt_nxv8i64_idx(<vscale x 8 x i64> %v, i32 zeroext %idx) {
; CHECK-LABEL: extractelt_nxv8i64_idx:
; CHECK: # %bb.0:
@@ -860,10 +881,10 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: vs8r.v v16, (a3)
-; CHECK-NEXT: bltu a2, a1, .LBB72_2
+; CHECK-NEXT: bltu a2, a1, .LBB74_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB72_2:
+; CHECK-NEXT: .LBB74_2:
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: ld a0, 0(a0)
@@ -893,10 +914,10 @@ define i64 @extractelt_nxv16i64_idx(<vscale x 16 x i64> %v, i32 zeroext %idx) {
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a2, a1, 1
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: bltu a0, a2, .LBB74_2
+; CHECK-NEXT: bltu a0, a2, .LBB76_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB74_2:
+; CHECK-NEXT: .LBB76_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index 95c1beb284c4003..d3c4b0f5cddd127 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -1137,3 +1137,31 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
%ext = extractelement <4 x float> %bo, i32 2
ret float %ext
}
+
+define i32 @extractelt_v16i32_idx7_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+; CHECK-LABEL: extractelt_v16i32_idx7_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v9, 3
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %a = load <16 x i32>, ptr %x
+ %b = extractelement <16 x i32> %a, i32 7
+ ret i32 %b
+}
+
+define i32 @extractelt_v16i32_idx15_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+; CHECK-LABEL: extractelt_v16i32_idx15_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v11, 3
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %a = load <16 x i32>, ptr %x
+ %b = extractelement <16 x i32> %a, i32 15
+ ret i32 %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index f3570495600f3c3..e5bbbd661e6a1df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -1084,3 +1084,133 @@ define i64 @explode_16xi64(<16 x i64> %v) {
%add14 = add i64 %add13, %e15
ret i64 %add14
}
+
+define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
+; RV32-LABEL: explode_16xi32_exact_vlen:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a0, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v12, v9, 1
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vslidedown.vi v12, v9, 2
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v9, v9, 3
+; RV32-NEXT: vmv.x.s a5, v9
+; RV32-NEXT: vmv.x.s a6, v10
+; RV32-NEXT: vslidedown.vi v9, v10, 1
+; RV32-NEXT: vmv.x.s a7, v9
+; RV32-NEXT: vslidedown.vi v9, v10, 2
+; RV32-NEXT: vmv.x.s t0, v9
+; RV32-NEXT: vslidedown.vi v9, v10, 3
+; RV32-NEXT: vmv.x.s t1, v9
+; RV32-NEXT: vmv.x.s t2, v11
+; RV32-NEXT: vslidedown.vi v9, v11, 1
+; RV32-NEXT: vmv.x.s t3, v9
+; RV32-NEXT: vslidedown.vi v9, v11, 2
+; RV32-NEXT: vmv.x.s t4, v9
+; RV32-NEXT: vslidedown.vi v9, v11, 3
+; RV32-NEXT: vmv.x.s t5, v9
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s t6, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, t6, a0
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, a2, a4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: add a5, a5, t0
+; RV32-NEXT: add a0, a0, a5
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add t1, t1, t3
+; RV32-NEXT: add t1, t1, t4
+; RV32-NEXT: add t1, t1, t5
+; RV32-NEXT: add a0, a0, t1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_16xi32_exact_vlen:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v12, v9, 1
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vslidedown.vi v12, v9, 2
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v9, v9, 3
+; RV64-NEXT: vmv.x.s a5, v9
+; RV64-NEXT: vmv.x.s a6, v10
+; RV64-NEXT: vslidedown.vi v9, v10, 1
+; RV64-NEXT: vmv.x.s a7, v9
+; RV64-NEXT: vslidedown.vi v9, v10, 2
+; RV64-NEXT: vmv.x.s t0, v9
+; RV64-NEXT: vslidedown.vi v9, v10, 3
+; RV64-NEXT: vmv.x.s t1, v9
+; RV64-NEXT: vmv.x.s t2, v11
+; RV64-NEXT: vslidedown.vi v9, v11, 1
+; RV64-NEXT: vmv.x.s t3, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 2
+; RV64-NEXT: vmv.x.s t4, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 3
+; RV64-NEXT: vmv.x.s t5, v9
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s t6, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, t6, a0
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: add a5, a5, a7
+; RV64-NEXT: add a5, a5, t0
+; RV64-NEXT: add a0, a0, a5
+; RV64-NEXT: add t1, t1, t2
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add t1, t1, t4
+; RV64-NEXT: add t1, t1, t5
+; RV64-NEXT: addw a0, a0, t1
+; RV64-NEXT: ret
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %e9 = extractelement <16 x i32> %v, i32 9
+ %e10 = extractelement <16 x i32> %v, i32 10
+ %e11 = extractelement <16 x i32> %v, i32 11
+ %e12 = extractelement <16 x i32> %v, i32 12
+ %e13 = extractelement <16 x i32> %v, i32 13
+ %e14 = extractelement <16 x i32> %v, i32 14
+ %e15 = extractelement <16 x i32> %v, i32 15
+ %add0 = xor i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ %add8 = add i32 %add7, %e9
+ %add9 = add i32 %add8, %e10
+ %add10 = add i32 %add9, %e11
+ %add11 = add i32 %add10, %e12
+ %add12 = add i32 %add11, %e13
+ %add13 = add i32 %add12, %e14
+ %add14 = add i32 %add13, %e15
+ ret i32 %add14
+}
From c0ad734630a13f4b9da1df460db84c7fba5bfe6b Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Mon, 27 Nov 2023 08:17:16 -0800
Subject: [PATCH 2/2] Fix bug in ExtractIdx computation
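The index passed to the EXTRACT_SUBVECTOR needs to be scaled by the
element count of the m1 result type, not by the element count of the
original LMUL>1 container type. As an illustration (working the
extractelt_nxv8i64_2_exact_vlen case by hand; not part of the patch):
  // nxv8i64, exact VLEN of 128, extracting element 2:
  //   ElemsPerVReg = 128 / 64 = 2, SubRegIdx = 2 / 2 = 1, RemIdx = 0
  // Before: ExtractIdx = SubRegIdx * 8 (nxv8i64's min element count) = 8
  // After:  ExtractIdx = SubRegIdx * 1 (nxv1i64's min element count) = 1,
  //         i.e. the extract lands on the second register of the group (v9),
  //         matching the "vmv.x.s a0, v9" the test expects.
---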
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e9e6e92ea06fbac..c5c75ae19daa998 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7904,6 +7904,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
IdxC && MinVLen == MaxVLen &&
VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ MVT M1VT = getLMUL1VT(ContainerVT);
unsigned OrigIdx = IdxC->getZExtValue();
EVT ElemVT = VecVT.getVectorElementType();
unsigned ElemSize = ElemVT.getSizeInBits().getKnownMinValue();
@@ -7911,11 +7912,11 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
unsigned RemIdx = OrigIdx % ElemsPerVReg;
unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
unsigned ExtractIdx =
- SubRegIdx * ContainerVT.getVectorElementCount().getKnownMinValue();
- ContainerVT = getLMUL1VT(ContainerVT);
- Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+ SubRegIdx * M1VT.getVectorElementCount().getKnownMinValue();
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, Vec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
Idx = DAG.getVectorIdxConstant(RemIdx, DL);
+ ContainerVT = M1VT;
}
// Reduce the LMUL of our slidedown and vmv.x.s to the smallest LMUL which