[llvm] cf17a24 - [RISCV] Use subreg extract for extract_vector_elt when vlen is known (#72666)
Author: Philip Reames
Date: 2023-11-27T14:33:16-08:00
New Revision: cf17a24a4b47dde8b0222a3e641d55fe8f36a083
URL: https://github.com/llvm/llvm-project/commit/cf17a24a4b47dde8b0222a3e641d55fe8f36a083
DIFF: https://github.com/llvm/llvm-project/commit/cf17a24a4b47dde8b0222a3e641d55fe8f36a083.diff
LOG: [RISCV] Use subreg extract for extract_vector_elt when vlen is known (#72666)
This is the first in a planned patch series to teach our vector lowering
how to exploit register boundaries in LMUL>1 types when VLEN is known to
be an exact constant. This corresponds to code compiled by clang with
the -mrvv-vector-bits=zvl option.
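For instance (an illustrative invocation, not part of this patch; the file name is hypothetical), combining that flag with a zvl-bearing -march string pins VLEN to an exact value:

  clang --target=riscv64 -march=rv64gcv_zvl128b -mrvv-vector-bits=zvl -O2 -c foo.c

With zvl128b this gives VLEN=128, which corresponds to the vscale_range(2,2) configuration exercised by the tests below.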
For extract_vector_elt, if we have a constant index and a known VLEN,
we can identify which register within the register group is being
accessed. Given this, we can do a sub-register extract for that
register and then apply the remaining index within it.
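As a concrete illustration, here is a standalone sketch (not the in-tree lowering code) of that index split for VLEN=128, <vscale x 8 x i64>, and extract index 15: with two i64 elements per m1 register, the element lands in the 8th register of the m8 group (v15) at offset 1, matching the v15/slide-by-1 sequence in the tests below.

  // Standalone C++ sketch of the index decomposition; the variable names
  // mirror the lowering code, but the concrete numbers are only an example.
  #include <cstdio>

  int main() {
    const unsigned MinVLen = 128;                      // exact VLEN (min == max)
    const unsigned ElemSize = 64;                      // i64 element width in bits
    const unsigned ElemsPerVReg = MinVLen / ElemSize;  // 2 elements per m1 register
    const unsigned OrigIdx = 15;                       // constant extract index
    const unsigned SubRegIdx = OrigIdx / ElemsPerVReg; // 7 -> 8th register of the group
    const unsigned RemIdx = OrigIdx % ElemsPerVReg;    // 1 -> slide down by 1 within it
    std::printf("subregister %u, remaining index %u\n", SubRegIdx, RemIdx);
    return 0;
  }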
This results in all constant-index extracts becoming m1 operations,
eliminating the complexity concern for explode-vector idioms at high
LMUL.
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e23a7fc1c8e946b..a417b6fe05e59df 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7908,6 +7908,30 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
}
+ // If we're compiling for an exact VLEN value and we have a known
+ // constant index, we can always perform the extract in m1 (or
+ // smaller) as we can determine the register corresponding to
+ // the index in the register group.
+ const unsigned MinVLen = Subtarget.getRealMinVLen();
+ const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
+ IdxC && MinVLen == MaxVLen &&
+ VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+ MVT M1VT = getLMUL1VT(ContainerVT);
+ unsigned OrigIdx = IdxC->getZExtValue();
+ EVT ElemVT = VecVT.getVectorElementType();
+ unsigned ElemSize = ElemVT.getSizeInBits().getKnownMinValue();
+ unsigned ElemsPerVReg = MinVLen / ElemSize;
+ unsigned RemIdx = OrigIdx % ElemsPerVReg;
+ unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
+ unsigned ExtractIdx =
+ SubRegIdx * M1VT.getVectorElementCount().getKnownMinValue();
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, Vec,
+ DAG.getVectorIdxConstant(ExtractIdx, DL));
+ Idx = DAG.getVectorIdxConstant(RemIdx, DL);
+ ContainerVT = M1VT;
+ }
+
// Reduce the LMUL of our slidedown and vmv.x.s to the smallest LMUL which
// contains our index.
std::optional<uint64_t> MaxIdx;
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index 34dcce3fe058bc9..9df0871046959ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -697,6 +697,27 @@ define i64 @extractelt_nxv8i64_imm(<vscale x 8 x i64> %v) {
ret i64 %r
}
+define i64 @extractelt_nxv8i64_2_exact_vlen(<vscale x 8 x i64> %v) vscale_range(2,2) {
+; CHECK-LABEL: extractelt_nxv8i64_2_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: ret
+ %r = extractelement <vscale x 8 x i64> %v, i32 2
+ ret i64 %r
+}
+
+define i64 @extractelt_nxv8i64_15_exact_vlen(<vscale x 8 x i64> %v) vscale_range(2,2) {
+; CHECK-LABEL: extractelt_nxv8i64_15_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v15, 1
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %r = extractelement <vscale x 8 x i64> %v, i32 15
+ ret i64 %r
+}
+
define i64 @extractelt_nxv8i64_idx(<vscale x 8 x i64> %v, i32 zeroext %idx) {
; CHECK-LABEL: extractelt_nxv8i64_idx:
; CHECK: # %bb.0:
@@ -860,10 +881,10 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: vs8r.v v16, (a3)
-; CHECK-NEXT: bltu a2, a1, .LBB72_2
+; CHECK-NEXT: bltu a2, a1, .LBB74_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB72_2:
+; CHECK-NEXT: .LBB74_2:
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: ld a0, 0(a0)
@@ -893,10 +914,10 @@ define i64 @extractelt_nxv16i64_idx(<vscale x 16 x i64> %v, i32 zeroext %idx) {
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a2, a1, 1
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: bltu a0, a2, .LBB74_2
+; CHECK-NEXT: bltu a0, a2, .LBB76_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB74_2:
+; CHECK-NEXT: .LBB76_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index 95c1beb284c4003..d3c4b0f5cddd127 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -1137,3 +1137,31 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
%ext = extractelement <4 x float> %bo, i32 2
ret float %ext
}
+
+define i32 @extractelt_v16i32_idx7_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+; CHECK-LABEL: extractelt_v16i32_idx7_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v9, 3
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %a = load <16 x i32>, ptr %x
+ %b = extractelement <16 x i32> %a, i32 7
+ ret i32 %b
+}
+
+define i32 @extractelt_v16i32_idx15_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+; CHECK-LABEL: extractelt_v16i32_idx15_exact_vlen:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v11, 3
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+ %a = load <16 x i32>, ptr %x
+ %b = extractelement <16 x i32> %a, i32 15
+ ret i32 %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index f3570495600f3c3..e5bbbd661e6a1df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -1084,3 +1084,133 @@ define i64 @explode_16xi64(<16 x i64> %v) {
%add14 = add i64 %add13, %e15
ret i64 %add14
}
+
+define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
+; RV32-LABEL: explode_16xi32_exact_vlen:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vmv.x.s a0, v12
+; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vmv.x.s a2, v9
+; RV32-NEXT: vslidedown.vi v12, v9, 1
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vslidedown.vi v12, v9, 2
+; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vslidedown.vi v9, v9, 3
+; RV32-NEXT: vmv.x.s a5, v9
+; RV32-NEXT: vmv.x.s a6, v10
+; RV32-NEXT: vslidedown.vi v9, v10, 1
+; RV32-NEXT: vmv.x.s a7, v9
+; RV32-NEXT: vslidedown.vi v9, v10, 2
+; RV32-NEXT: vmv.x.s t0, v9
+; RV32-NEXT: vslidedown.vi v9, v10, 3
+; RV32-NEXT: vmv.x.s t1, v9
+; RV32-NEXT: vmv.x.s t2, v11
+; RV32-NEXT: vslidedown.vi v9, v11, 1
+; RV32-NEXT: vmv.x.s t3, v9
+; RV32-NEXT: vslidedown.vi v9, v11, 2
+; RV32-NEXT: vmv.x.s t4, v9
+; RV32-NEXT: vslidedown.vi v9, v11, 3
+; RV32-NEXT: vmv.x.s t5, v9
+; RV32-NEXT: vmv.s.x v9, zero
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vredxor.vs v8, v8, v9
+; RV32-NEXT: vmv.x.s t6, v8
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, t6, a0
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, a2, a4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add a5, a5, a7
+; RV32-NEXT: add a5, a5, t0
+; RV32-NEXT: add a0, a0, a5
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add t1, t1, t3
+; RV32-NEXT: add t1, t1, t4
+; RV32-NEXT: add t1, t1, t5
+; RV32-NEXT: add a0, a0, t1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: explode_16xi32_exact_vlen:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v8, 2
+; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vslidedown.vi v12, v8, 3
+; RV64-NEXT: vmv.x.s a1, v12
+; RV64-NEXT: vmv.x.s a2, v9
+; RV64-NEXT: vslidedown.vi v12, v9, 1
+; RV64-NEXT: vmv.x.s a3, v12
+; RV64-NEXT: vslidedown.vi v12, v9, 2
+; RV64-NEXT: vmv.x.s a4, v12
+; RV64-NEXT: vslidedown.vi v9, v9, 3
+; RV64-NEXT: vmv.x.s a5, v9
+; RV64-NEXT: vmv.x.s a6, v10
+; RV64-NEXT: vslidedown.vi v9, v10, 1
+; RV64-NEXT: vmv.x.s a7, v9
+; RV64-NEXT: vslidedown.vi v9, v10, 2
+; RV64-NEXT: vmv.x.s t0, v9
+; RV64-NEXT: vslidedown.vi v9, v10, 3
+; RV64-NEXT: vmv.x.s t1, v9
+; RV64-NEXT: vmv.x.s t2, v11
+; RV64-NEXT: vslidedown.vi v9, v11, 1
+; RV64-NEXT: vmv.x.s t3, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 2
+; RV64-NEXT: vmv.x.s t4, v9
+; RV64-NEXT: vslidedown.vi v9, v11, 3
+; RV64-NEXT: vmv.x.s t5, v9
+; RV64-NEXT: vmv.s.x v9, zero
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vredxor.vs v8, v8, v9
+; RV64-NEXT: vmv.x.s t6, v8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: add a0, t6, a0
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: add a5, a5, a7
+; RV64-NEXT: add a5, a5, t0
+; RV64-NEXT: add a0, a0, a5
+; RV64-NEXT: add t1, t1, t2
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add t1, t1, t4
+; RV64-NEXT: add t1, t1, t5
+; RV64-NEXT: addw a0, a0, t1
+; RV64-NEXT: ret
+ %e0 = extractelement <16 x i32> %v, i32 0
+ %e1 = extractelement <16 x i32> %v, i32 1
+ %e2 = extractelement <16 x i32> %v, i32 2
+ %e3 = extractelement <16 x i32> %v, i32 3
+ %e4 = extractelement <16 x i32> %v, i32 4
+ %e5 = extractelement <16 x i32> %v, i32 5
+ %e6 = extractelement <16 x i32> %v, i32 6
+ %e7 = extractelement <16 x i32> %v, i32 7
+ %e8 = extractelement <16 x i32> %v, i32 8
+ %e9 = extractelement <16 x i32> %v, i32 9
+ %e10 = extractelement <16 x i32> %v, i32 10
+ %e11 = extractelement <16 x i32> %v, i32 11
+ %e12 = extractelement <16 x i32> %v, i32 12
+ %e13 = extractelement <16 x i32> %v, i32 13
+ %e14 = extractelement <16 x i32> %v, i32 14
+ %e15 = extractelement <16 x i32> %v, i32 15
+ %add0 = xor i32 %e0, %e1
+ %add1 = add i32 %add0, %e2
+ %add2 = add i32 %add1, %e3
+ %add3 = add i32 %add2, %e4
+ %add4 = add i32 %add3, %e5
+ %add5 = add i32 %add4, %e6
+ %add6 = add i32 %add5, %e7
+ %add7 = add i32 %add6, %e8
+ %add8 = add i32 %add7, %e9
+ %add9 = add i32 %add8, %e10
+ %add10 = add i32 %add9, %e11
+ %add11 = add i32 %add10, %e12
+ %add12 = add i32 %add11, %e13
+ %add13 = add i32 %add12, %e14
+ %add14 = add i32 %add13, %e15
+ ret i32 %add14
+}