[llvm] [ScalarizeMaskedMemIntrin] Use pointer alignment from pointer of masked.compressstore/expandload. (PR #83519)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 29 19:14:15 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Yeting Kuo (yetingk)
Changes
Previously we used Align(1) for all scalarized loads/stores created from masked.compressstore/expandload.
For targets that do not support unaligned accesses, this forces the backend to split
otherwise aligned wide loads/stores into byte-sized loads/stores.
To fix this performance issue, this patch preserves the alignment of the base
pointer when scalarizing.
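
To illustrate the intent, here is a hand-written sketch (not actual pass output; the function names and value names are hypothetical, and the all-true mask is folded into straight-line stores the way the pass handles constant masks):

```llvm
declare void @llvm.masked.compressstore.v2f16(<2 x half>, ptr, <2 x i1>)

; Input: the store target is known to be 2-byte aligned.
define void @src(ptr align 2 %base, <2 x half> %v) {
  call void @llvm.masked.compressstore.v2f16(<2 x half> %v, ptr %base, <2 x i1> <i1 true, i1 true>)
  ret void
}

; Sketch of the scalarized form after this patch: each element store now
; carries the base pointer's alignment instead of align 1.
define void @scalarized(ptr align 2 %base, <2 x half> %v) {
  %elt0 = extractelement <2 x half> %v, i64 0
  store half %elt0, ptr %base, align 2        ; was "align 1" before this patch
  %ptr1 = getelementptr inbounds half, ptr %base, i32 1
  %elt1 = extractelement <2 x half> %v, i64 1
  store half %elt1, ptr %ptr1, align 2        ; was "align 1" before this patch
  ret void
}
```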
---
Patch is 153.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83519.diff
5 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp (+6-4)
- (added) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll (+1079)
- (added) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll (+986)
- (added) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll (+1107)
- (added) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll (+1000)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index c01d03f6447240..2fd5530ad0d0cc 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -627,6 +627,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
Value *Ptr = CI->getArgOperand(0);
Value *Mask = CI->getArgOperand(1);
Value *PassThru = CI->getArgOperand(2);
+ Align Alignment = Ptr->getPointerAlignment(DL);
auto *VecType = cast<FixedVectorType>(CI->getType());
@@ -659,7 +660,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
} else {
Value *NewPtr =
Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
- InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
+ InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Alignment,
"Load" + Twine(Idx));
ShuffleMask[Idx] = Idx;
++MemIndex;
@@ -713,7 +714,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
CondBlock->setName("cond.load");
Builder.SetInsertPoint(CondBlock->getTerminator());
- LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Align(1));
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Alignment);
Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
// Move the pointer if there are more blocks to come.
@@ -755,6 +756,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
Value *Src = CI->getArgOperand(0);
Value *Ptr = CI->getArgOperand(1);
Value *Mask = CI->getArgOperand(2);
+ Align Alignment = Ptr->getPointerAlignment(DL);
auto *VecType = cast<FixedVectorType>(Src->getType());
@@ -778,7 +780,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
Value *OneElt =
Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
- Builder.CreateAlignedStore(OneElt, NewPtr, Align(1));
+ Builder.CreateAlignedStore(OneElt, NewPtr, Alignment);
++MemIndex;
}
CI->eraseFromParent();
@@ -824,7 +826,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
Builder.SetInsertPoint(CondBlock->getTerminator());
Value *OneElt = Builder.CreateExtractElement(Src, Idx);
- Builder.CreateAlignedStore(OneElt, Ptr, Align(1));
+ Builder.CreateAlignedStore(OneElt, Ptr, Alignment);
// Move the pointer if there are more blocks to come.
Value *NewPtr;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
new file mode 100644
index 00000000000000..8989a0c9f2ce1c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
@@ -0,0 +1,1079 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64
+
+declare void @llvm.masked.compressstore.v1f16(<1 x half>, ptr, <1 x i1>)
+define void @compressstore_v1f16(ptr align 2 %base, <1 x half> %v, <1 x i1> %mask) {
+; RV32-LABEL: compressstore_v1f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT: vfirst.m a1, v0
+; RV32-NEXT: bnez a1, .LBB0_2
+; RV32-NEXT: # %bb.1: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: .LBB0_2: # %else
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v1f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT: vfirst.m a1, v0
+; RV64-NEXT: bnez a1, .LBB0_2
+; RV64-NEXT: # %bb.1: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: .LBB0_2: # %else
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v1f16(<1 x half> %v, ptr %base, <1 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v2f16(<2 x half>, ptr, <2 x i1>)
+define void @compressstore_v2f16(ptr align 2 %base, <2 x half> %v, <2 x i1> %mask) {
+; RV32-LABEL: compressstore_v2f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB1_3
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: bnez a1, .LBB1_4
+; RV32-NEXT: .LBB1_2: # %else2
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB1_3: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: beqz a1, .LBB1_2
+; RV32-NEXT: .LBB1_4: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v2f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB1_3
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: bnez a1, .LBB1_4
+; RV64-NEXT: .LBB1_2: # %else2
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB1_3: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: beqz a1, .LBB1_2
+; RV64-NEXT: .LBB1_4: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v2f16(<2 x half> %v, ptr %base, <2 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v4f16(<4 x half>, ptr, <4 x i1>)
+define void @compressstore_v4f16(ptr align 2 %base, <4 x half> %v, <4 x i1> %mask) {
+; RV32-LABEL: compressstore_v4f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB2_5
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: bnez a2, .LBB2_6
+; RV32-NEXT: .LBB2_2: # %else2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: bnez a2, .LBB2_7
+; RV32-NEXT: .LBB2_3: # %else5
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: bnez a1, .LBB2_8
+; RV32-NEXT: .LBB2_4: # %else8
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB2_5: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: beqz a2, .LBB2_2
+; RV32-NEXT: .LBB2_6: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: beqz a2, .LBB2_3
+; RV32-NEXT: .LBB2_7: # %cond.store4
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: beqz a1, .LBB2_4
+; RV32-NEXT: .LBB2_8: # %cond.store7
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v4f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB2_5
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: bnez a2, .LBB2_6
+; RV64-NEXT: .LBB2_2: # %else2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: bnez a2, .LBB2_7
+; RV64-NEXT: .LBB2_3: # %else5
+; RV64-NEXT: andi a1, a1, 8
+; RV64-NEXT: bnez a1, .LBB2_8
+; RV64-NEXT: .LBB2_4: # %else8
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB2_5: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: beqz a2, .LBB2_2
+; RV64-NEXT: .LBB2_6: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: beqz a2, .LBB2_3
+; RV64-NEXT: .LBB2_7: # %cond.store4
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a1, a1, 8
+; RV64-NEXT: beqz a1, .LBB2_4
+; RV64-NEXT: .LBB2_8: # %cond.store7
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v4f16(<4 x half> %v, ptr %base, <4 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v8f16(<8 x half>, ptr, <8 x i1>)
+define void @compressstore_v8f16(ptr align 2 %base, <8 x half> %v, <8 x i1> %mask) {
+; RV32-LABEL: compressstore_v8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB3_9
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: bnez a2, .LBB3_10
+; RV32-NEXT: .LBB3_2: # %else2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: bnez a2, .LBB3_11
+; RV32-NEXT: .LBB3_3: # %else5
+; RV32-NEXT: andi a2, a1, 8
+; RV32-NEXT: bnez a2, .LBB3_12
+; RV32-NEXT: .LBB3_4: # %else8
+; RV32-NEXT: andi a2, a1, 16
+; RV32-NEXT: bnez a2, .LBB3_13
+; RV32-NEXT: .LBB3_5: # %else11
+; RV32-NEXT: andi a2, a1, 32
+; RV32-NEXT: bnez a2, .LBB3_14
+; RV32-NEXT: .LBB3_6: # %else14
+; RV32-NEXT: andi a2, a1, 64
+; RV32-NEXT: bnez a2, .LBB3_15
+; RV32-NEXT: .LBB3_7: # %else17
+; RV32-NEXT: andi a1, a1, -128
+; RV32-NEXT: bnez a1, .LBB3_16
+; RV32-NEXT: .LBB3_8: # %else20
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB3_9: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: beqz a2, .LBB3_2
+; RV32-NEXT: .LBB3_10: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: beqz a2, .LBB3_3
+; RV32-NEXT: .LBB3_11: # %cond.store4
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 8
+; RV32-NEXT: beqz a2, .LBB3_4
+; RV32-NEXT: .LBB3_12: # %cond.store7
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 3
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 16
+; RV32-NEXT: beqz a2, .LBB3_5
+; RV32-NEXT: .LBB3_13: # %cond.store10
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 32
+; RV32-NEXT: beqz a2, .LBB3_6
+; RV32-NEXT: .LBB3_14: # %cond.store13
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 5
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 64
+; RV32-NEXT: beqz a2, .LBB3_7
+; RV32-NEXT: .LBB3_15: # %cond.store16
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 6
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a1, a1, -128
+; RV32-NEXT: beqz a1, .LBB3_8
+; RV32-NEXT: .LBB3_16: # %cond.store19
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB3_9
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: bnez a2, .LBB3_10
+; RV64-NEXT: .LBB3_2: # %else2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: bnez a2, .LBB3_11
+; RV64-NEXT: .LBB3_3: # %else5
+; RV64-NEXT: andi a2, a1, 8
+; RV64-NEXT: bnez a2, .LBB3_12
+; RV64-NEXT: .LBB3_4: # %else8
+; RV64-NEXT: andi a2, a1, 16
+; RV64-NEXT: bnez a2, .LBB3_13
+; RV64-NEXT: .LBB3_5: # %else11
+; RV64-NEXT: andi a2, a1, 32
+; RV64-NEXT: bnez a2, .LBB3_14
+; RV64-NEXT: .LBB3_6: # %else14
+; RV64-NEXT: andi a2, a1, 64
+; RV64-NEXT: bnez a2, .LBB3_15
+; RV64-NEXT: .LBB3_7: # %else17
+; RV64-NEXT: andi a1, a1, -128
+; RV64-NEXT: bnez a1, .LBB3_16
+; RV64-NEXT: .LBB3_8: # %else20
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB3_9: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: beqz a2, .LBB3_2
+; RV64-NEXT: .LBB3_10: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: beqz a2, .LBB3_3
+; RV64-NEXT: .LBB3_11: # %cond.store4
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 8
+; RV64-NEXT: beqz a2, .LBB3_4
+; RV64-NEXT: .LBB3_12: # %cond.store7
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 16
+; RV64-NEXT: beqz a2, .LBB3_5
+; RV64-NEXT: .LBB3_13: # %cond.store10
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 32
+; RV64-NEXT: beqz a2, .LBB3_6
+; RV64-NEXT: .LBB3_14: # %cond.store13
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 5
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 64
+; RV64-NEXT: beqz a2, .LBB3_7
+; RV64-NEXT: .LBB3_15: # %cond.store16
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 6
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a1, a1, -128
+; RV64-NEXT: beqz a1, .LBB3_8
+; RV64-NEXT: .LBB3_16: # %cond.store19
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v8f16(<8 x half> %v, ptr %base, <8 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v1f32(<1 x float>, ptr, <1 x i1>)
+define void @compressstore_v1f32(ptr align 4 %base, <1 x float> %v, <1 x i1> %mask) {
+; RV32-LABEL: compressstore_v1f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT: vfirst.m a1, v0
+; RV32-NEXT: bnez a1, .LBB4_2
+; RV32-NEXT: # %bb.1: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: .LBB4_2: # %else
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v1f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT: vfirst.m a1, v0
+; RV64-NEXT: bnez a1, .LBB4_2
+; RV64-NEXT: # %bb.1: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: .LBB4_2: # %else
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v1f32(<1 x float> %v, ptr %base, <1 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+define void @compressstore_v2f32(ptr align 4 %base, <2 x float> %v, <2 x i1> %mask) {
+; RV32-LABEL: compressstore_v2f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB5_3
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: bnez a1, .LBB5_4
+; RV32-NEXT: .LBB5_2: # %else2
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB5_3: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: beqz a1, .LBB5_2
+; RV32-NEXT: .LBB5_4: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v2f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB5_3
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: bnez a1, .LBB5_4
+; RV64-NEXT: .LBB5_2: # %else2
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB5_3: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 4
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: beqz a1, .LBB5_2
+; RV64-NEXT: .LBB5_4: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v2f32(<2 x float> %v, ptr %base, <2 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+define void @compressstore_v4f32(ptr align 4 %base, <4 x float> %v, <4 x i1> %mask) {
+; RV32-LABEL: compressstore_v4f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB6_5
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: bnez a2, .LBB6_6
+; RV32-NEXT: .LBB6_2: # %else2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: bnez a2, .LBB6_7
+; RV32-NEXT: .LBB6_3: # %else5
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: bnez a1, .LBB6_8
+; RV32-NEXT: .LBB6_4: # %else8
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB6_5: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: beqz a2, .LBB6_2
+; RV32-NEXT: .LBB6_6: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: beqz a2, .LBB6_3
+; RV32-NEXT: .LBB6_7: # %cond.store4
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: beqz a1, .LBB6_4
+; RV32-NEXT: .LBB6_8: # %cond.store7
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v4f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB6_5
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: bnez a2, .LBB6_6
+; RV64-NEXT: .LBB6_2: # %else2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEX...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/83519