[llvm] [RISCV] Fold extract_vector_elt of a load into the scalar load (PR #76151)
Liao Chunyu via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 25 18:18:50 PST 2023
https://github.com/ChunyuLiao updated https://github.com/llvm/llvm-project/pull/76151
>From 996c25a0a3db67f179f14f1e5e99a275e077e8c1 Mon Sep 17 00:00:00 2001
From: Liao Chunyu <chunyu at iscas.ac.cn>
Date: Thu, 21 Dec 2023 03:48:55 -0500
Subject: [PATCH] [RISCV] Fold extract_vector_elt of a load into the scalar
load
extract_vec_elt (load X), C --> scalar load (X+C)
A scalar load appears to be cheaper when only a single element is needed.
X86 has this fold in combineExtractVectorElt.
DAGCombiner has the same fold in DAGCombiner::scalarizeExtractedVectorLoad(),
but it only runs after legalization, so it does not fire for RISC-V.
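As a minimal sketch of the intent (the function name and vector width below are invented for illustration; the real coverage is in the updated tests in this patch), a single-use, constant-index extract of a loaded fixed-length vector now lowers to one scalar load:

  define i32 @example_extract(ptr %x) {
    %v = load <4 x i32>, ptr %x               ; vector load with a single use
    %e = extractelement <4 x i32> %v, i32 2   ; constant index 2 -> byte offset 8
    ret i32 %e                                ; with this patch: lw a0, 8(a0)
  }

This matches the updated CHECK lines for extractelt_v4i32 in
fixed-vectors-extract.ll; a variable index instead takes the masked
add-and-load form shown in the new vecloadextract.ll test.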
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 59 +++-
.../RISCV/rvv/fixed-vectors-extract.ll | 285 ++++--------------
.../RISCV/rvv/fixed-vectors-reduction-fp.ll | 12 +-
.../RISCV/rvv/fixed-vectors-reduction-int.ll | 225 ++++----------
llvm/test/CodeGen/RISCV/vecloadextract.ll | 43 +++
5 files changed, 227 insertions(+), 397 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/vecloadextract.ll
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index de15bea72e4666..dd8ef309574c6f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1399,7 +1399,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR,
ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL,
- ISD::INSERT_VECTOR_ELT});
+ ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT});
if (Subtarget.hasVendorXTHeadMemPair())
setTargetDAGCombine({ISD::LOAD, ISD::STORE});
if (Subtarget.useRVVForFixedLengthVectors())
@@ -14466,6 +14466,59 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
}
+static SDValue
+performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
+ const RISCVTargetLowering &TLI) {
+ SDValue InputVec = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+ SDLoc DL(N);
+
+ EVT InVecVT = InputVec.getValueType();
+ if (InVecVT.isScalableVector())
+ return SDValue();
+
+ if (!InputVec.hasOneUse())
+ return SDValue();
+
+ auto *LoadVec = dyn_cast<LoadSDNode>(InputVec);
+ EVT VecEltVT = InVecVT.getVectorElementType();
+ // extract_vec_elt (load X), C --> scalar load (X+C)
+ if (LoadVec && ISD::isNormalLoad(LoadVec) && LoadVec->isSimple()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue NewPtr = TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(),
+ InVecVT, EltIdx);
+ Align Alignment = LoadVec->getAlign();
+ MachinePointerInfo MPI;
+ if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltIdx)) {
+ int Elt = ConstEltNo->getZExtValue();
+ unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
+ MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
+ Alignment = commonAlignment(Alignment, PtrOff);
+ } else {
+ // Discard the pointer info except the address space because the memory
+ // operand can't represent this new access since the offset is variable.
+ MPI = MachinePointerInfo(LoadVec->getPointerInfo().getAddrSpace());
+ Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
+ }
+
+ // Don't perform the combination if unaligned access is not allowed.
+ unsigned IsFast = 0;
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ VecEltVT, LoadVec->getAddressSpace(), Alignment,
+ LoadVec->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+
+ SDValue Load =
+ DAG.getLoad(VecEltVT, DL, LoadVec->getChain(), NewPtr, MPI, Alignment,
+ LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
+ return Load;
+ }
+ return SDValue();
+}
// If we're concatenating a series of vector loads like
// concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
// Then we can turn this into a strided load by widening the vector elements
@@ -15535,6 +15588,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = performINSERT_VECTOR_ELTCombine(N, DAG, Subtarget, *this))
return V;
break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (SDValue V = performEXTRACT_VECTOR_ELTCombine(N, DAG, Subtarget, *this))
+ return V;
+ break;
case RISCVISD::VFMV_V_F_VL: {
const MVT VT = N->getSimpleValueType(0);
SDValue Passthru = N->getOperand(0);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index 06d1ada300a1d2..9a808cf8e0adcd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -7,10 +7,7 @@
define i8 @extractelt_v16i8(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v16i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 7(a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
%b = extractelement <16 x i8> %a, i32 7
@@ -20,10 +17,7 @@ define i8 @extractelt_v16i8(ptr %x) nounwind {
define i16 @extractelt_v8i16(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 14(a0)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
%b = extractelement <8 x i16> %a, i32 7
@@ -33,10 +27,7 @@ define i16 @extractelt_v8i16(ptr %x) nounwind {
define i32 @extractelt_v4i32(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 8(a0)
; CHECK-NEXT: ret
%a = load <4 x i32>, ptr %x
%b = extractelement <4 x i32> %a, i32 2
@@ -46,20 +37,14 @@ define i32 @extractelt_v4i32(ptr %x) nounwind {
define i64 @extractelt_v2i64(ptr %x) nounwind {
; RV32-LABEL: extractelt_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: extractelt_v2i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
%b = extractelement <2 x i64> %a, i32 0
@@ -69,10 +54,7 @@ define i64 @extractelt_v2i64(ptr %x) nounwind {
define half @extractelt_v8f16(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v8f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: flh fa0, 14(a0)
; CHECK-NEXT: ret
%a = load <8 x half>, ptr %x
%b = extractelement <8 x half> %a, i32 7
@@ -82,10 +64,7 @@ define half @extractelt_v8f16(ptr %x) nounwind {
define float @extractelt_v4f32(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v4f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: flw fa0, 8(a0)
; CHECK-NEXT: ret
%a = load <4 x float>, ptr %x
%b = extractelement <4 x float> %a, i32 2
@@ -95,9 +74,7 @@ define float @extractelt_v4f32(ptr %x) nounwind {
define double @extractelt_v2f64(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: fld fa0, 0(a0)
; CHECK-NEXT: ret
%a = load <2 x double>, ptr %x
%b = extractelement <2 x double> %a, i32 0
@@ -107,12 +84,7 @@ define double @extractelt_v2f64(ptr %x) nounwind {
define i8 @extractelt_v32i8(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 7(a0)
; CHECK-NEXT: ret
%a = load <32 x i8>, ptr %x
%b = extractelement <32 x i8> %a, i32 7
@@ -122,11 +94,7 @@ define i8 @extractelt_v32i8(ptr %x) nounwind {
define i16 @extractelt_v16i16(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v16i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 14(a0)
; CHECK-NEXT: ret
%a = load <16 x i16>, ptr %x
%b = extractelement <16 x i16> %a, i32 7
@@ -136,11 +104,7 @@ define i16 @extractelt_v16i16(ptr %x) nounwind {
define i32 @extractelt_v8i32(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 6
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 24(a0)
; CHECK-NEXT: ret
%a = load <8 x i32>, ptr %x
%b = extractelement <8 x i32> %a, i32 6
@@ -150,23 +114,14 @@ define i32 @extractelt_v8i32(ptr %x) nounwind {
define i64 @extractelt_v4i64(ptr %x) nounwind {
; RV32-LABEL: extractelt_v4i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v10, v8, a0
-; RV32-NEXT: vmv.x.s a1, v10
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 24(a0)
+; RV32-NEXT: lw a1, 28(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: extractelt_v4i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 24(a0)
; RV64-NEXT: ret
%a = load <4 x i64>, ptr %x
%b = extractelement <4 x i64> %a, i32 3
@@ -176,11 +131,7 @@ define i64 @extractelt_v4i64(ptr %x) nounwind {
define half @extractelt_v16f16(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v16f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: flh fa0, 14(a0)
; CHECK-NEXT: ret
%a = load <16 x half>, ptr %x
%b = extractelement <16 x half> %a, i32 7
@@ -190,11 +141,7 @@ define half @extractelt_v16f16(ptr %x) nounwind {
define float @extractelt_v8f32(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: flw fa0, 8(a0)
; CHECK-NEXT: ret
%a = load <8 x float>, ptr %x
%b = extractelement <8 x float> %a, i32 2
@@ -204,9 +151,7 @@ define float @extractelt_v8f32(ptr %x) nounwind {
define double @extractelt_v4f64(ptr %x) nounwind {
; CHECK-LABEL: extractelt_v4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: fld fa0, 0(a0)
; CHECK-NEXT: ret
%a = load <4 x double>, ptr %x
%b = extractelement <4 x double> %a, i32 0
@@ -220,22 +165,14 @@ define double @extractelt_v4f64(ptr %x) nounwind {
define i64 @extractelt_v3i64(ptr %x) nounwind {
; RV32-LABEL: extractelt_v3i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 3, e64, m2, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 4
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vslidedown.vi v8, v8, 5
-; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: lw a2, 16(a0)
+; RV32-NEXT: lw a1, 20(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: extractelt_v3i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 3, e64, m2, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 2
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 16(a0)
; RV64-NEXT: ret
%a = load <3 x i64>, ptr %x
%b = extractelement <3 x i64> %a, i32 2
@@ -244,43 +181,10 @@ define i64 @extractelt_v3i64(ptr %x) nounwind {
; A LMUL8 type
define i32 @extractelt_v32i32(ptr %x) nounwind {
-; RV32-LABEL: extractelt_v32i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -256
-; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 256
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: lw a0, 124(sp)
-; RV32-NEXT: addi sp, s0, -256
-; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 256
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extractelt_v32i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -256
-; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
-; RV64-NEXT: addi s0, sp, 256
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: mv a0, sp
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: lw a0, 124(sp)
-; RV64-NEXT: addi sp, s0, -256
-; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 256
-; RV64-NEXT: ret
+; CHECK-LABEL: extractelt_v32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a0, 124(a0)
+; CHECK-NEXT: ret
%a = load <32 x i32>, ptr %x
%b = extractelement <32 x i32> %a, i32 31
ret i32 %b
@@ -288,45 +192,10 @@ define i32 @extractelt_v32i32(ptr %x) nounwind {
; Exercise type legalization for type beyond LMUL8
define i32 @extractelt_v64i32(ptr %x) nounwind {
-; RV32-LABEL: extractelt_v64i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -256
-; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 256
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: addi a0, a0, 128
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: lw a0, 124(sp)
-; RV32-NEXT: addi sp, s0, -256
-; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 256
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extractelt_v64i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -256
-; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
-; RV64-NEXT: addi s0, sp, 256
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: mv a0, sp
-; RV64-NEXT: vse32.v v8, (a0)
-; RV64-NEXT: lw a0, 124(sp)
-; RV64-NEXT: addi sp, s0, -256
-; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 256
-; RV64-NEXT: ret
+; CHECK-LABEL: extractelt_v64i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a0, 252(a0)
+; CHECK-NEXT: ret
%a = load <64 x i32>, ptr %x
%b = extractelement <64 x i32> %a, i32 63
ret i32 %b
@@ -335,10 +204,9 @@ define i32 @extractelt_v64i32(ptr %x) nounwind {
define i8 @extractelt_v16i8_idx(ptr %x, i32 zeroext %idx) nounwind {
; CHECK-LABEL: extractelt_v16i8_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: andi a1, a1, 15
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
%b = extractelement <16 x i8> %a, i32 %idx
@@ -348,10 +216,10 @@ define i8 @extractelt_v16i8_idx(ptr %x, i32 zeroext %idx) nounwind {
define i16 @extractelt_v8i16_idx(ptr %x, i32 zeroext %idx) nounwind {
; CHECK-LABEL: extractelt_v8i16_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: andi a1, a1, 7
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
%b = extractelement <8 x i16> %a, i32 %idx
@@ -449,12 +317,9 @@ define double @extractelt_v2f64_idx(ptr %x, i32 zeroext %idx) nounwind {
define i8 @extractelt_v32i8_idx(ptr %x, i32 zeroext %idx) nounwind {
; CHECK-LABEL: extractelt_v32i8_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: andi a1, a1, 31
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%a = load <32 x i8>, ptr %x
%b = extractelement <32 x i8> %a, i32 %idx
@@ -464,11 +329,10 @@ define i8 @extractelt_v32i8_idx(ptr %x, i32 zeroext %idx) nounwind {
define i16 @extractelt_v16i16_idx(ptr %x, i32 zeroext %idx) nounwind {
; CHECK-LABEL: extractelt_v16i16_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: andi a1, a1, 15
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%a = load <16 x i16>, ptr %x
%b = extractelement <16 x i16> %a, i32 %idx
@@ -775,11 +639,8 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind {
define void @store_extractelt_v16i8(ptr %x, ptr %p) nounwind {
; CHECK-LABEL: store_extractelt_v16i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: lbu a0, 7(a0)
+; CHECK-NEXT: sb a0, 0(a1)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
%b = extractelement <16 x i8> %a, i32 7
@@ -790,11 +651,8 @@ define void @store_extractelt_v16i8(ptr %x, ptr %p) nounwind {
define void @store_extractelt_v8i16(ptr %x, ptr %p) nounwind {
; CHECK-LABEL: store_extractelt_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: lh a0, 14(a0)
+; CHECK-NEXT: sh a0, 0(a1)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
%b = extractelement <8 x i16> %a, i32 7
@@ -805,11 +663,8 @@ define void @store_extractelt_v8i16(ptr %x, ptr %p) nounwind {
define void @store_extractelt_v4i32(ptr %x, ptr %p) nounwind {
; CHECK-LABEL: store_extractelt_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v8, (a1)
+; CHECK-NEXT: lw a0, 8(a0)
+; CHECK-NEXT: sw a0, 0(a1)
; CHECK-NEXT: ret
%a = load <4 x i32>, ptr %x
%b = extractelement <4 x i32> %a, i32 2
@@ -821,25 +676,16 @@ define void @store_extractelt_v4i32(ptr %x, ptr %p) nounwind {
define void @store_extractelt_v2i64(ptr %x, ptr %p) nounwind {
; RV32-LABEL: store_extractelt_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a0, v9
-; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: lw a2, 8(a0)
+; RV32-NEXT: lw a0, 12(a0)
; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: sw a0, 4(a1)
; RV32-NEXT: ret
;
; RV64-LABEL: store_extractelt_v2i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vslidedown.vi v8, v8, 1
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vse64.v v8, (a1)
+; RV64-NEXT: ld a0, 8(a0)
+; RV64-NEXT: sd a0, 0(a1)
; RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
%b = extractelement <2 x i64> %a, i64 1
@@ -850,11 +696,8 @@ define void @store_extractelt_v2i64(ptr %x, ptr %p) nounwind {
define void @store_extractelt_v2f64(ptr %x, ptr %p) nounwind {
; CHECK-LABEL: store_extractelt_v2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vse64.v v8, (a1)
+; CHECK-NEXT: fld fa5, 8(a0)
+; CHECK-NEXT: fsd fa5, 0(a1)
; CHECK-NEXT: ret
%a = load <2 x double>, ptr %x
%b = extractelement <2 x double> %a, i64 1
@@ -1141,11 +984,7 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
define i32 @extractelt_v16i32_idx7_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
; CHECK-LABEL: extractelt_v16i32_idx7_exact_vlen:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1re32.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 28(a0)
; CHECK-NEXT: ret
%a = load <16 x i32>, ptr %x
%b = extractelement <16 x i32> %a, i32 7
@@ -1155,11 +994,7 @@ define i32 @extractelt_v16i32_idx7_exact_vlen(ptr %x) nounwind vscale_range(2,2)
define i32 @extractelt_v16i32_idx15_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
; CHECK-LABEL: extractelt_v16i32_idx15_exact_vlen:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, a0, 48
-; CHECK-NEXT: vl1re32.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 60(a0)
; CHECK-NEXT: ret
%a = load <16 x i32>, ptr %x
%b = extractelement <16 x i32> %a, i32 15
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 95a4c9a249e7f1..dd386c64fac290 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -7,9 +7,7 @@ declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>)
define half @vreduce_fadd_v1f16(ptr %x, half %s) {
; CHECK-LABEL: vreduce_fadd_v1f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: flh fa5, 0(a0)
; CHECK-NEXT: fadd.h fa0, fa0, fa5
; CHECK-NEXT: ret
%v = load <1 x half>, ptr %x
@@ -258,9 +256,7 @@ declare float @llvm.vector.reduce.fadd.v1f32(float, <1 x float>)
define float @vreduce_fadd_v1f32(ptr %x, float %s) {
; CHECK-LABEL: vreduce_fadd_v1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fadd.s fa0, fa0, fa5
; CHECK-NEXT: ret
%v = load <1 x float>, ptr %x
@@ -727,9 +723,7 @@ declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
define double @vreduce_fadd_v1f64(ptr %x, double %s) {
; CHECK-LABEL: vreduce_fadd_v1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: fld fa5, 0(a0)
; CHECK-NEXT: fadd.d fa0, fa0, fa5
; CHECK-NEXT: ret
%v = load <1 x double>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index 6c75c9b9c29498..617140ecd219ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -7,9 +7,7 @@ declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
define i8 @vreduce_add_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
@@ -156,9 +154,7 @@ declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
define i16 @vreduce_add_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %v)
@@ -568,9 +564,7 @@ declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
define i32 @vreduce_add_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %v)
@@ -927,19 +921,14 @@ declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
define i64 @vreduce_add_v1i64(ptr %x) {
; RV32-LABEL: vreduce_add_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_add_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %v)
@@ -1723,9 +1712,7 @@ declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)
define i8 @vreduce_and_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %v)
@@ -1864,9 +1851,7 @@ declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16>)
define i16 @vreduce_and_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_and_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %v)
@@ -1989,9 +1974,7 @@ declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32>)
define i32 @vreduce_and_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_and_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %v)
@@ -2098,19 +2081,14 @@ declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
define i64 @vreduce_and_v1i64(ptr %x) {
; RV32-LABEL: vreduce_and_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_and_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %v)
@@ -2308,9 +2286,7 @@ declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>)
define i8 @vreduce_or_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_or_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> %v)
@@ -2449,9 +2425,7 @@ declare i16 @llvm.vector.reduce.or.v1i16(<1 x i16>)
define i16 @vreduce_or_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_or_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.or.v1i16(<1 x i16> %v)
@@ -2574,9 +2548,7 @@ declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32>)
define i32 @vreduce_or_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_or_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %v)
@@ -2683,19 +2655,14 @@ declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
define i64 @vreduce_or_v1i64(ptr %x) {
; RV32-LABEL: vreduce_or_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_or_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %v)
@@ -2893,9 +2860,7 @@ declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>)
define i8 @vreduce_xor_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_xor_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> %v)
@@ -3042,9 +3007,7 @@ declare i16 @llvm.vector.reduce.xor.v1i16(<1 x i16>)
define i16 @vreduce_xor_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_xor_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.xor.v1i16(<1 x i16> %v)
@@ -3174,9 +3137,7 @@ declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32>)
define i32 @vreduce_xor_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_xor_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %v)
@@ -3289,19 +3250,14 @@ declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
define i64 @vreduce_xor_v1i64(ptr %x) {
; RV32-LABEL: vreduce_xor_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_xor_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %v)
@@ -3511,9 +3467,7 @@ declare i8 @llvm.vector.reduce.smin.v1i8(<1 x i8>)
define i8 @vreduce_smin_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_smin_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> %v)
@@ -3652,9 +3606,7 @@ declare i16 @llvm.vector.reduce.smin.v1i16(<1 x i16>)
define i16 @vreduce_smin_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_smin_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.smin.v1i16(<1 x i16> %v)
@@ -3777,9 +3729,7 @@ declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32>)
define i32 @vreduce_smin_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_smin_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %v)
@@ -3886,19 +3836,14 @@ declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
define i64 @vreduce_smin_v1i64(ptr %x) {
; RV32-LABEL: vreduce_smin_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_smin_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %v)
@@ -4096,9 +4041,7 @@ declare i8 @llvm.vector.reduce.smax.v1i8(<1 x i8>)
define i8 @vreduce_smax_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_smax_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> %v)
@@ -4237,9 +4180,7 @@ declare i16 @llvm.vector.reduce.smax.v1i16(<1 x i16>)
define i16 @vreduce_smax_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_smax_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.smax.v1i16(<1 x i16> %v)
@@ -4362,9 +4303,7 @@ declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32>)
define i32 @vreduce_smax_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_smax_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %v)
@@ -4471,19 +4410,14 @@ declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
define i64 @vreduce_smax_v1i64(ptr %x) {
; RV32-LABEL: vreduce_smax_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_smax_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %v)
@@ -4681,9 +4615,7 @@ declare i8 @llvm.vector.reduce.umin.v1i8(<1 x i8>)
define i8 @vreduce_umin_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_umin_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> %v)
@@ -4822,9 +4754,7 @@ declare i16 @llvm.vector.reduce.umin.v1i16(<1 x i16>)
define i16 @vreduce_umin_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_umin_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.umin.v1i16(<1 x i16> %v)
@@ -4947,9 +4877,7 @@ declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32>)
define i32 @vreduce_umin_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_umin_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %v)
@@ -5056,19 +4984,14 @@ declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
define i64 @vreduce_umin_v1i64(ptr %x) {
; RV32-LABEL: vreduce_umin_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_umin_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %v)
@@ -5266,9 +5189,7 @@ declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8>)
define i8 @vreduce_umax_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_umax_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %v)
@@ -5407,9 +5328,7 @@ declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16>)
define i16 @vreduce_umax_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_umax_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %v)
@@ -5532,9 +5451,7 @@ declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32>)
define i32 @vreduce_umax_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_umax_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %v)
@@ -5641,19 +5558,14 @@ declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
define i64 @vreduce_umax_v1i64(ptr %x) {
; RV32-LABEL: vreduce_umax_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_umax_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %v)
@@ -5851,9 +5763,7 @@ declare i8 @llvm.vector.reduce.mul.v1i8(<1 x i8>)
define i8 @vreduce_mul_v1i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lbu a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v1i8(<1 x i8> %v)
@@ -6059,9 +5969,7 @@ declare i16 @llvm.vector.reduce.mul.v1i16(<1 x i16>)
define i16 @vreduce_mul_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lh a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v1i16(<1 x i16> %v)
@@ -6233,9 +6141,7 @@ declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32>)
define i32 @vreduce_mul_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: lw a0, 0(a0)
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %v)
@@ -6376,19 +6282,14 @@ declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)
define i64 @vreduce_mul_v1i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v1i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v1i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ld a0, 0(a0)
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %v)
diff --git a/llvm/test/CodeGen/RISCV/vecloadextract.ll b/llvm/test/CodeGen/RISCV/vecloadextract.ll
new file mode 100644
index 00000000000000..cd86c061e382c4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vecloadextract.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+m,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+m,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+m,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+m,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+
+; This test is copied from X86.
+
+define i32 @const_index(ptr %v) {
+; CHECK-LABEL: const_index:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a0, 4(a0)
+; CHECK-NEXT: ret
+ %a = load <8 x i32>, ptr %v
+ %b = extractelement <8 x i32> %a, i32 1
+ ret i32 %b
+}
+
+define i32 @variable_index(ptr %v, i32 %i) {
+; CHECK-LABEL: variable_index:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a1, a1, 7
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lw a0, 0(a0)
+; CHECK-NEXT: ret
+ %a = load <8 x i32>, ptr %v
+ %b = extractelement <8 x i32> %a, i32 %i
+ ret i32 %b
+}
+
+define i32 @variable_index_with_addrspace(ptr addrspace(1) %v, i32 %i) {
+; CHECK-LABEL: variable_index_with_addrspace:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a1, a1, 7
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: lw a0, 0(a0)
+; CHECK-NEXT: ret
+ %a = load <8 x i32>, ptr addrspace(1) %v
+ %b = extractelement <8 x i32> %a, i32 %i
+ ret i32 %b
+}