[llvm] [RISCV] Lower fixed-length strided VP loads and stores for zvfhmin/zvfbfmin (PR #114750)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 3 23:49:21 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Luke Lau (lukel97)
Changes:
Similarly to #114731, these don't actually require any instructions from the zvfhmin/zvfbfmin extensions.
The motivation for this and #114731 is to eventually enable isLegalElementTypeForRVV for f16 with zvfhmin and for bf16 with zvfbfmin, in order to enable scalable vectorization.
Although the scalable codegen support for f16 and bf16 is now complete enough for anything the loop vectorizer may emit, enabling isLegalElementTypeForRVV would also make certain hooks like isLegalInterleavedAccessType and isLegalStridedLoadStore return true for f16 and bf16. That means SLP would start emitting these intrinsics, so we need to add fixed-length codegen support too.
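As a minimal sketch (the function name below is made up for illustration; the declaration mirrors the existing v2f16 test in fixed-vectors-strided-vpload.ll), this is the shape of intrinsic call SLP could start emitting once isLegalStridedLoadStore returns true for f16. With this patch it lowers to a single vlse16.v even when only zvfhmin is available, since the lowering only needs 16-bit element loads/stores, not zvfh arithmetic:

```llvm
declare <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr, i32, <2 x i1>, i32)

; Hypothetical example of a fixed-length strided VP load of f16 elements.
define <2 x half> @slp_style_strided_load_f16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
  %v = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x half> %v
}
```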
---
Full diff: https://github.com/llvm/llvm-project/pull/114750.diff
3 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+5-4)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll (+103-28)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll (+58-6)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 920b06c7ba6ecd..6ef857ca2e2b48 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1347,6 +1347,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
Custom);
+ setOperationAction({ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
+ ISD::EXPERIMENTAL_VP_STRIDED_STORE},
+ VT, Custom);
+
if (VT.getVectorElementType() == MVT::f16 &&
!Subtarget.hasVInstructionsF16()) {
setOperationAction(ISD::BITCAST, VT, Custom);
@@ -1410,10 +1414,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(
{ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom);
- setOperationAction({ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
- ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
- ISD::VP_SCATTER},
- VT, Custom);
+ setOperationAction({ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom);
setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN, ISD::FSQRT,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 14cc3edffb8c14..47efa058df641a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -1,16 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh,+optimized-zero-stride-load \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh,+zvfbfmin,+optimized-zero-stride-load \
; RUN: -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh,+optimized-zero-stride-load \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh,+zvfbfmin,+optimized-zero-stride-load \
; RUN: -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh,+zvfbfmin \
; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh,+zvfbfmin \
; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin,+optimized-zero-stride-load \
+; RUN: -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin,+optimized-zero-stride-load \
+; RUN: -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin \
+; RUN: -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin \
+; RUN: -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-ZVFHMIN
declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)
@@ -278,6 +290,62 @@ define <8 x i64> @strided_vpload_v8i64(ptr %ptr, i32 signext %stride, <8 x i1> %
ret <8 x i64> %load
}
+declare <2 x bfloat> @llvm.experimental.vp.strided.load.v2bf16.p0.i32(ptr, i32, <2 x i1>, i32)
+
+define <2 x bfloat> @strided_vpload_v2bf16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpload_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ %load = call <2 x bfloat> @llvm.experimental.vp.strided.load.v2bf16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
+ ret <2 x bfloat> %load
+}
+
+define <2 x bfloat> @strided_vpload_v2bf16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpload_v2bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT: vlse16.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %load = call <2 x bfloat> @llvm.experimental.vp.strided.load.v2bf16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> splat (i1 true), i32 %evl)
+ ret <2 x bfloat> %load
+}
+
+declare <4 x bfloat> @llvm.experimental.vp.strided.load.v4bf16.p0.i32(ptr, i32, <4 x i1>, i32)
+
+define <4 x bfloat> @strided_vpload_v4bf16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpload_v4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ %load = call <4 x bfloat> @llvm.experimental.vp.strided.load.v4bf16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
+ ret <4 x bfloat> %load
+}
+
+declare <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i32(ptr, i32, <8 x i1>, i32)
+
+define <8 x bfloat> @strided_vpload_v8bf16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpload_v8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ %load = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
+ ret <8 x bfloat> %load
+}
+
+define <8 x bfloat> @strided_vpload_v8bf16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpload_v8bf16_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %load = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
+ ret <8 x bfloat> %load
+}
+
declare <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr, i32, <2 x i1>, i32)
define <2 x half> @strided_vpload_v2f16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
@@ -477,10 +545,10 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB40_2
+; CHECK-NEXT: bltu a2, a4, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
-; CHECK-NEXT: .LBB40_2:
+; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: mul a4, a3, a1
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: addi a5, a2, -16
@@ -505,10 +573,10 @@ define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB41_2
+; CHECK-NEXT: bltu a2, a4, .LBB46_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
-; CHECK-NEXT: .LBB41_2:
+; CHECK-NEXT: .LBB46_2:
; CHECK-NEXT: mul a4, a3, a1
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: addi a5, a2, -16
@@ -533,10 +601,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: li a5, 32
; CHECK-RV32-NEXT: vmv1r.v v8, v0
; CHECK-RV32-NEXT: mv a3, a4
-; CHECK-RV32-NEXT: bltu a4, a5, .LBB42_2
+; CHECK-RV32-NEXT: bltu a4, a5, .LBB47_2
; CHECK-RV32-NEXT: # %bb.1:
; CHECK-RV32-NEXT: li a3, 32
-; CHECK-RV32-NEXT: .LBB42_2:
+; CHECK-RV32-NEXT: .LBB47_2:
; CHECK-RV32-NEXT: mul a6, a3, a2
; CHECK-RV32-NEXT: addi a5, a4, -32
; CHECK-RV32-NEXT: sltu a7, a4, a5
@@ -544,10 +612,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: and a7, a7, a5
; CHECK-RV32-NEXT: li a5, 16
; CHECK-RV32-NEXT: add a6, a1, a6
-; CHECK-RV32-NEXT: bltu a7, a5, .LBB42_4
+; CHECK-RV32-NEXT: bltu a7, a5, .LBB47_4
; CHECK-RV32-NEXT: # %bb.3:
; CHECK-RV32-NEXT: li a7, 16
-; CHECK-RV32-NEXT: .LBB42_4:
+; CHECK-RV32-NEXT: .LBB47_4:
; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV32-NEXT: vsetvli zero, a7, e64, m8, ta, ma
@@ -556,10 +624,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: sltu a3, a3, a6
; CHECK-RV32-NEXT: addi a3, a3, -1
; CHECK-RV32-NEXT: and a3, a3, a6
-; CHECK-RV32-NEXT: bltu a4, a5, .LBB42_6
+; CHECK-RV32-NEXT: bltu a4, a5, .LBB47_6
; CHECK-RV32-NEXT: # %bb.5:
; CHECK-RV32-NEXT: li a4, 16
-; CHECK-RV32-NEXT: .LBB42_6:
+; CHECK-RV32-NEXT: .LBB47_6:
; CHECK-RV32-NEXT: mul a5, a4, a2
; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 2
@@ -583,10 +651,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: li a5, 32
; CHECK-RV64-NEXT: vmv1r.v v8, v0
; CHECK-RV64-NEXT: mv a4, a3
-; CHECK-RV64-NEXT: bltu a3, a5, .LBB42_2
+; CHECK-RV64-NEXT: bltu a3, a5, .LBB47_2
; CHECK-RV64-NEXT: # %bb.1:
; CHECK-RV64-NEXT: li a4, 32
-; CHECK-RV64-NEXT: .LBB42_2:
+; CHECK-RV64-NEXT: .LBB47_2:
; CHECK-RV64-NEXT: mul a6, a4, a2
; CHECK-RV64-NEXT: addi a5, a3, -32
; CHECK-RV64-NEXT: sltu a7, a3, a5
@@ -594,10 +662,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: and a7, a7, a5
; CHECK-RV64-NEXT: li a5, 16
; CHECK-RV64-NEXT: add a6, a1, a6
-; CHECK-RV64-NEXT: bltu a7, a5, .LBB42_4
+; CHECK-RV64-NEXT: bltu a7, a5, .LBB47_4
; CHECK-RV64-NEXT: # %bb.3:
; CHECK-RV64-NEXT: li a7, 16
-; CHECK-RV64-NEXT: .LBB42_4:
+; CHECK-RV64-NEXT: .LBB47_4:
; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma
@@ -606,10 +674,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: sltu a4, a4, a6
; CHECK-RV64-NEXT: addi a4, a4, -1
; CHECK-RV64-NEXT: and a4, a4, a6
-; CHECK-RV64-NEXT: bltu a3, a5, .LBB42_6
+; CHECK-RV64-NEXT: bltu a3, a5, .LBB47_6
; CHECK-RV64-NEXT: # %bb.5:
; CHECK-RV64-NEXT: li a3, 16
-; CHECK-RV64-NEXT: .LBB42_6:
+; CHECK-RV64-NEXT: .LBB47_6:
; CHECK-RV64-NEXT: mul a5, a3, a2
; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 2
@@ -659,12 +727,19 @@ define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero
; CHECK-OPT-NEXT: ret
;
-; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
-; CHECK-NO-OPT: # %bb.0:
-; CHECK-NO-OPT-NEXT: flh fa5, 0(a0)
-; CHECK-NO-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
-; CHECK-NO-OPT-NEXT: vfmv.v.f v8, fa5
-; CHECK-NO-OPT-NEXT: ret
+; CHECK-NO-OPT-ZVFH-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-NO-OPT-ZVFH: # %bb.0:
+; CHECK-NO-OPT-ZVFH-NEXT: flh fa5, 0(a0)
+; CHECK-NO-OPT-ZVFH-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
+; CHECK-NO-OPT-ZVFH-NEXT: vfmv.v.f v8, fa5
+; CHECK-NO-OPT-ZVFH-NEXT: ret
+;
+; CHECK-NO-OPT-ZVFHMIN-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-NO-OPT-ZVFHMIN: # %bb.0:
+; CHECK-NO-OPT-ZVFHMIN-NEXT: lh a0, 0(a0)
+; CHECK-NO-OPT-ZVFHMIN-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
+; CHECK-NO-OPT-ZVFHMIN-NEXT: vmv.v.x v8, a0
+; CHECK-NO-OPT-ZVFHMIN-NEXT: ret
%load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
ret <4 x half> %load
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index ddd86c3082021a..7ca329835b7aca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -1,8 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh,+zvfbfmin \
; RUN: -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh,+zvfbfmin \
+; RUN: -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin \
+; RUN: -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin \
; RUN: -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
@@ -232,6 +238,52 @@ define void @strided_vpstore_v8i64(<8 x i64> %val, ptr %ptr, i32 signext %stride
ret void
}
+declare void @llvm.experimental.vp.strided.store.v2bf16.p0.i32(<2 x bfloat>, ptr, i32, <2 x i1>, i32)
+
+define void @strided_vpstore_v2bf16(<2 x bfloat> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma
+; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v2bf16.p0.i32(<2 x bfloat> %val, ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
+ ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.v4bf16.p0.i32(<4 x bfloat>, ptr, i32, <4 x i1>, i32)
+
+define void @strided_vpstore_v4bf16(<4 x bfloat> %val, ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v4bf16.p0.i32(<4 x bfloat> %val, ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
+ ret void
+}
+
+declare void @llvm.experimental.vp.strided.store.v8bf16.p0.i32(<8 x bfloat>, ptr, i32, <8 x i1>, i32)
+
+define void @strided_vpstore_v8bf16(<8 x bfloat> %val, ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v8bf16.p0.i32(<8 x bfloat> %val, ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
+ ret void
+}
+
+define void @strided_vpstore_v8bf16_unit_stride(<8 x bfloat> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v8bf16_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vse16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v8bf16.p0.i32(<8 x bfloat> %val, ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v2f16.p0.i32(<2 x half>, ptr, i32, <2 x i1>, i32)
define void @strided_vpstore_v2f16(<2 x half> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
@@ -409,10 +461,10 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB34_2
+; CHECK-NEXT: bltu a2, a4, .LBB38_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
-; CHECK-NEXT: .LBB34_2:
+; CHECK-NEXT: .LBB38_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: mul a3, a3, a1
@@ -435,10 +487,10 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB35_2
+; CHECK-NEXT: bltu a2, a4, .LBB39_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
-; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: .LBB39_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1
; CHECK-NEXT: mul a3, a3, a1
``````````
https://github.com/llvm/llvm-project/pull/114750