[llvm] [AArch64][SVE] Add basic support for `@llvm.masked.compressstore` (PR #168350)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 24 09:50:28 PST 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/168350
>From b68452c165c3db2b77dee237d272605bbf2c2889 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 17 Nov 2025 11:21:09 +0000
Subject: [PATCH 1/5] [AArch64][SVE] Add basic support for
`@llvm.masked.compressstore`
This patch adds SVE support for the `masked.compressstore` intrinsic. The
stored value is compressed via the existing `VECTOR_COMPRESS` lowering, and
the store mask is rebuilt by counting the active lanes with `VECREDUCE_ADD`
and forming a prefix predicate of that length with `GET_ACTIVE_LANE_MASK`.
Currently, only `nxv4[i32|f32]` and `nxv2[i64|f64]` are directly supported;
other types are promoted to these where possible.
This is done in preparation for LV support of this intrinsic, which is
currently being worked on in #140723.
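As a rough illustration of the lowering described above, here is a minimal
scalar model in C++ (the helper name `compressStoreModel` is hypothetical and
not part of the patch): it counts the active mask lanes, compacts the active
elements to the front of the vector, and writes only that leading prefix,
mirroring the `VECREDUCE_ADD` / `VECTOR_COMPRESS` / `GET_ACTIVE_LANE_MASK`
node sequence built in `LowerMSTORE`. This is a sketch of the semantics, not
the SelectionDAG implementation.

    // Scalar model of the compressing-store lowering (illustrative only).
    #include <array>
    #include <cstddef>
    #include <vector>

    template <typename T, std::size_t N>
    void compressStoreModel(const std::array<T, N> &Vec,
                            const std::array<bool, N> &Mask,
                            std::vector<T> &Mem) {
      // VECREDUCE_ADD of the (zero-extended) mask: number of active lanes.
      std::size_t CntActive = 0;
      for (bool M : Mask)
        CntActive += M;

      // VECTOR_COMPRESS: move the active elements to the front.
      std::array<T, N> Compressed{};
      std::size_t Idx = 0;
      for (std::size_t I = 0; I < N; ++I)
        if (Mask[I])
          Compressed[Idx++] = Vec[I];

      // Masked store under a GET_ACTIVE_LANE_MASK-style prefix predicate:
      // only the first CntActive lanes are written to memory.
      for (std::size_t I = 0; I < CntActive; ++I)
        Mem.push_back(Compressed[I]);
    }

For example, with Vec = {1, 2, 3, 4} and Mask = {true, false, true, false},
the model appends {1, 3} to Mem, matching the compact/cntp/whilelo/st1w
sequence checked in the tests below.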
---
.../Target/AArch64/AArch64ISelLowering.cpp | 46 +++++-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 +
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 18 ++-
.../AArch64/AArch64TargetTransformInfo.h | 23 +++
.../AArch64/sve-masked-compressstore.ll | 141 ++++++++++++++++++
5 files changed, 218 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..4ad1c21eee22f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1983,10 +1983,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
// We can lower types that have <vscale x {2|4}> elements to compact.
- for (auto VT :
- {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
- MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+ for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
+ MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
+ MVT::nxv4i32, MVT::nxv4f32}) {
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ // Use a custom lowering for masked stores that could be a supported
+ // compressing store. Note: These types still use the normal (Legal)
+ // lowering for non-compressing masked stores.
setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+ }
// If we have SVE, we can use SVE logic for legal (or smaller than legal)
// NEON vectors in the lowest bits of the SVE register.
@@ -7932,7 +7937,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSTORE:
- return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+ return LowerMSTORE(Op, DAG);
case ISD::MGATHER:
return LowerMGATHER(Op, DAG);
case ISD::MSCATTER:
@@ -30400,6 +30405,36 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
Store->isTruncatingStore());
}
+SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto *Store = cast<MaskedStoreSDNode>(Op);
+ EVT VT = Store->getValue().getValueType();
+ if (VT.isFixedLengthVector())
+ return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+
+ if (!Store->isCompressingStore())
+ return SDValue();
+
+ EVT MaskVT = Store->getMask().getValueType();
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue CntActive =
+ DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Store->getMask());
+ SDValue CompressedValue =
+ DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
+ Store->getMask(), DAG.getPOISON(VT));
+ SDValue CompressedMask =
+ DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
+
+ return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
+ Store->getBasePtr(), Store->getOffset(),
+ CompressedMask, Store->getMemoryVT(),
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore(),
+ /*isCompressing=*/false);
+}
+
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto *Store = cast<MaskedStoreSDNode>(Op);
@@ -30414,7 +30449,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
Mask, Store->getMemoryVT(), Store->getMemOperand(),
- Store->getAddressingMode(), Store->isTruncatingStore());
+ Store->getAddressingMode(), Store->isTruncatingStore(),
+ Store->isCompressingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ca08eb40c956a..32aa913181a21 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -761,6 +761,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 881f7707f0eb7..681f1871ae692 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -645,29 +645,34 @@ def nontrunc_masked_store :
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
cast<MaskedStoreSDNode>(N)->isUnindexed() &&
- !cast<MaskedStoreSDNode>(N)->isNonTemporal();
+ !cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
// truncating masked store fragments.
def trunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
- cast<MaskedStoreSDNode>(N)->isUnindexed();
+ cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i8 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i16 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def trunc_masked_store_i32 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
(trunc_masked_store node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32 &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
def non_temporal_store :
@@ -675,7 +680,8 @@ def non_temporal_store :
(masked_st node:$val, node:$ptr, undef, node:$pred), [{
return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
cast<MaskedStoreSDNode>(N)->isUnindexed() &&
- cast<MaskedStoreSDNode>(N)->isNonTemporal();
+ cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 6cc4987428567..9cae0a36cf9ed 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -332,6 +332,29 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
return isLegalMaskedLoadStore(DataType, Alignment);
}
+ bool isElementTypeLegalForCompressStore(Type *Ty) const {
+ if (Ty->isFloatTy() || Ty->isDoubleTy())
+ return true;
+
+ if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || Ty->isIntegerTy(32) ||
+ Ty->isIntegerTy(64))
+ return true;
+
+ return false;
+ }
+
+ bool isLegalMaskedCompressStore(Type *DataType,
+ Align Alignment) const override {
+ ElementCount EC = cast<VectorType>(DataType)->getElementCount();
+ if (EC.getKnownMinValue() != 2 && EC.getKnownMinValue() != 4)
+ return false;
+
+ if (!isElementTypeLegalForCompressStore(DataType->getScalarType()))
+ return false;
+
+ return isLegalMaskedLoadStore(DataType, Alignment);
+ }
+
bool isLegalMaskedGatherScatter(Type *DataType) const {
if (!ST->isSVEAvailable())
return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
new file mode 100644
index 0000000000000..1be5b1d1fbb6d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+;; Full SVE vectors (supported with +sve)
+
+define void @test_compressstore_nxv4i32(ptr %p, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: cntp x8, p1, p0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4i32(<vscale x 4 x i32> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv2i64(ptr %p, <vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: cntp x8, p1, p0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; TODO: Legal and nonstreaming check
+define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2f64(<vscale x 2 x double> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+;; Promoted SVE vector types promoted to 32/64-bit (non-exhaustive)
+
+define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1b { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> %vec, ptr align 1 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: cntp x8, p1, p0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1h { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv4i16(<vscale x 4 x i16> %vec, ptr align 2 %p, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+;; NEON vector types (promoted to SVE)
+
+define void @test_compressstore_v2f32(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: shl v1.2d, v1.2d, #63
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v2f64(<2 x double> %vec, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v4i32(ptr %p, <4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: cntp x8, p1, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %vec, ptr align 4 %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: shl v1.2d, v1.2d, #63
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %vec, ptr align 8 %p, <2 x i1> %mask)
+ ret void
+}
>From 39f9815fe3bd79fb79b4f9e4479c5d4fd664f9da Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 17 Nov 2025 11:47:32 +0000
Subject: [PATCH 2/5] Update
llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
---
llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
index 1be5b1d1fbb6d..5efe03a161cc1 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -56,7 +56,7 @@ define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vsc
ret void
}
-;; Promoted SVE vector types promoted to 32/64-bit (non-exhaustive)
+;; SVE vector types promoted to 32/64-bit (non-exhaustive)
define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv2i8:
>From 02d3507702994670d0d184c30060b86649e62a87 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 17 Nov 2025 19:01:02 +0000
Subject: [PATCH 3/5] Fixups
---
.../AArch64/AArch64TargetTransformInfo.h | 25 ++-
.../AArch64/sve-masked-compressstore.ll | 153 +++++++++++++++++-
2 files changed, 171 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9cae0a36cf9ed..a1bece496c302 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -345,9 +345,28 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
bool isLegalMaskedCompressStore(Type *DataType,
Align Alignment) const override {
- ElementCount EC = cast<VectorType>(DataType)->getElementCount();
- if (EC.getKnownMinValue() != 2 && EC.getKnownMinValue() != 4)
- return false;
+ auto VecTy = cast<VectorType>(DataType);
+ Type *ElTy = VecTy->getScalarType();
+ unsigned ElSizeInBits = ElTy->getScalarSizeInBits();
+ TypeSize VecSizeInBits = VecTy->getPrimitiveSizeInBits();
+
+ if (isa<FixedVectorType>(VecTy)) {
+ // Each 128-bit segment must contain 2 or 4 elements (packed).
+ if (ElSizeInBits != 32 && ElSizeInBits != 64)
+ return false;
+ if (VecSizeInBits % 128 != 0 ||
+ VecSizeInBits > std::max(128U, ST->getMinSVEVectorSizeInBits()))
+ return false;
+ } else {
+ // Each segment must contain 2 or 4 elements, but the segments can be
+ // < 128-bits for unpacked vector types.
+ if (VecSizeInBits.getKnownMinValue() > 128)
+ return false;
+ unsigned ElementsPerSegment =
+ VecSizeInBits.getKnownMinValue() / ElSizeInBits;
+ if (ElementsPerSegment != 2 && ElementsPerSegment != 4)
+ return false;
+ }
if (!isElementTypeLegalForCompressStore(DataType->getScalarType()))
return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
index 5efe03a161cc1..af0d4384f521a 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
+; RUN: llc -mtriple=aarch64 -aarch64-sve-vector-bits-min=256 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VL256
;; Full SVE vectors (supported with +sve)
@@ -42,7 +43,6 @@ define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vsca
ret void
}
-; TODO: Legal and nonstreaming check
define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv2f64:
; CHECK: // %bb.0:
@@ -56,6 +56,21 @@ define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vsc
ret void
}
+;; Unpacked SVE vector types
+
+define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv2f32(<vscale x 2 x float> %vec, ptr align 4 %p, <vscale x 2 x i1> %mask)
+ ret void
+}
+
;; SVE vector types promoted to 32/64-bit (non-exhaustive)
define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
@@ -86,8 +101,8 @@ define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale
;; NEON vector types (promoted to SVE)
-define void @test_compressstore_v2f32(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
-; CHECK-LABEL: test_compressstore_v2f32:
+define void @test_compressstore_v2f64(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
@@ -139,3 +154,133 @@ define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %vec, ptr align 8 %p, <2 x i1> %mask)
ret void
}
+
+define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v8i32:
+; CHECK-BASE: // %bb.0:
+; CHECK-BASE-NEXT: shl v2.8b, v2.8b, #7
+; CHECK-BASE-NEXT: adrp x8, .LCPI10_0
+; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI10_0]
+; CHECK-BASE-NEXT: cmlt v2.8b, v2.8b, #0
+; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
+; CHECK-BASE-NEXT: addv b2, v2.8b
+; CHECK-BASE-NEXT: fmov w8, s2
+; CHECK-BASE-NEXT: tbnz w8, #0, .LBB10_9
+; CHECK-BASE-NEXT: // %bb.1: // %else
+; CHECK-BASE-NEXT: tbnz w8, #1, .LBB10_10
+; CHECK-BASE-NEXT: .LBB10_2: // %else2
+; CHECK-BASE-NEXT: tbnz w8, #2, .LBB10_11
+; CHECK-BASE-NEXT: .LBB10_3: // %else5
+; CHECK-BASE-NEXT: tbnz w8, #3, .LBB10_12
+; CHECK-BASE-NEXT: .LBB10_4: // %else8
+; CHECK-BASE-NEXT: tbnz w8, #4, .LBB10_13
+; CHECK-BASE-NEXT: .LBB10_5: // %else11
+; CHECK-BASE-NEXT: tbnz w8, #5, .LBB10_14
+; CHECK-BASE-NEXT: .LBB10_6: // %else14
+; CHECK-BASE-NEXT: tbnz w8, #6, .LBB10_15
+; CHECK-BASE-NEXT: .LBB10_7: // %else17
+; CHECK-BASE-NEXT: tbnz w8, #7, .LBB10_16
+; CHECK-BASE-NEXT: .LBB10_8: // %else20
+; CHECK-BASE-NEXT: ret
+; CHECK-BASE-NEXT: .LBB10_9: // %cond.store
+; CHECK-BASE-NEXT: st1 { v0.s }[0], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #1, .LBB10_2
+; CHECK-BASE-NEXT: .LBB10_10: // %cond.store1
+; CHECK-BASE-NEXT: st1 { v0.s }[1], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #2, .LBB10_3
+; CHECK-BASE-NEXT: .LBB10_11: // %cond.store4
+; CHECK-BASE-NEXT: st1 { v0.s }[2], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #3, .LBB10_4
+; CHECK-BASE-NEXT: .LBB10_12: // %cond.store7
+; CHECK-BASE-NEXT: st1 { v0.s }[3], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #4, .LBB10_5
+; CHECK-BASE-NEXT: .LBB10_13: // %cond.store10
+; CHECK-BASE-NEXT: st1 { v1.s }[0], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #5, .LBB10_6
+; CHECK-BASE-NEXT: .LBB10_14: // %cond.store13
+; CHECK-BASE-NEXT: st1 { v1.s }[1], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #6, .LBB10_7
+; CHECK-BASE-NEXT: .LBB10_15: // %cond.store16
+; CHECK-BASE-NEXT: st1 { v1.s }[2], [x0], #4
+; CHECK-BASE-NEXT: tbz w8, #7, .LBB10_8
+; CHECK-BASE-NEXT: .LBB10_16: // %cond.store19
+; CHECK-BASE-NEXT: st1 { v1.s }[3], [x0]
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v8i32:
+; CHECK-VL256: // %bb.0:
+; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT: ptrue p0.s, vl8
+; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT: uunpklo z2.h, z2.b
+; CHECK-VL256-NEXT: ptrue p1.s, vl4
+; CHECK-VL256-NEXT: splice z0.s, p1, z0.s, z1.s
+; CHECK-VL256-NEXT: ptrue p1.s
+; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT: lsl z2.s, z2.s, #31
+; CHECK-VL256-NEXT: asr z2.s, z2.s, #31
+; CHECK-VL256-NEXT: cmpne p0.s, p0/z, z2.s, #0
+; CHECK-VL256-NEXT: cntp x8, p1, p0.s
+; CHECK-VL256-NEXT: compact z0.s, p0, z0.s
+; CHECK-VL256-NEXT: whilelo p0.s, xzr, x8
+; CHECK-VL256-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-VL256-NEXT: ret
+ tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %vec, ptr align 4 %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v4i64:
+; CHECK-BASE: // %bb.0:
+; CHECK-BASE-NEXT: shl v2.4h, v2.4h, #15
+; CHECK-BASE-NEXT: adrp x8, .LCPI11_0
+; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI11_0]
+; CHECK-BASE-NEXT: cmlt v2.4h, v2.4h, #0
+; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
+; CHECK-BASE-NEXT: addv h2, v2.4h
+; CHECK-BASE-NEXT: fmov w8, s2
+; CHECK-BASE-NEXT: tbnz w8, #0, .LBB11_5
+; CHECK-BASE-NEXT: // %bb.1: // %else
+; CHECK-BASE-NEXT: tbnz w8, #1, .LBB11_6
+; CHECK-BASE-NEXT: .LBB11_2: // %else2
+; CHECK-BASE-NEXT: tbnz w8, #2, .LBB11_7
+; CHECK-BASE-NEXT: .LBB11_3: // %else5
+; CHECK-BASE-NEXT: tbnz w8, #3, .LBB11_8
+; CHECK-BASE-NEXT: .LBB11_4: // %else8
+; CHECK-BASE-NEXT: ret
+; CHECK-BASE-NEXT: .LBB11_5: // %cond.store
+; CHECK-BASE-NEXT: st1 { v0.d }[0], [x0], #8
+; CHECK-BASE-NEXT: tbz w8, #1, .LBB11_2
+; CHECK-BASE-NEXT: .LBB11_6: // %cond.store1
+; CHECK-BASE-NEXT: st1 { v0.d }[1], [x0], #8
+; CHECK-BASE-NEXT: tbz w8, #2, .LBB11_3
+; CHECK-BASE-NEXT: .LBB11_7: // %cond.store4
+; CHECK-BASE-NEXT: st1 { v1.d }[0], [x0], #8
+; CHECK-BASE-NEXT: tbz w8, #3, .LBB11_4
+; CHECK-BASE-NEXT: .LBB11_8: // %cond.store7
+; CHECK-BASE-NEXT: st1 { v1.d }[1], [x0]
+; CHECK-BASE-NEXT: ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v4i64:
+; CHECK-VL256: // %bb.0:
+; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT: ptrue p0.d, vl4
+; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT: ptrue p1.d, vl2
+; CHECK-VL256-NEXT: splice z0.d, p1, z0.d, z1.d
+; CHECK-VL256-NEXT: ptrue p1.d
+; CHECK-VL256-NEXT: uunpklo z2.d, z2.s
+; CHECK-VL256-NEXT: lsl z2.d, z2.d, #63
+; CHECK-VL256-NEXT: asr z2.d, z2.d, #63
+; CHECK-VL256-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; CHECK-VL256-NEXT: cntp x8, p1, p0.d
+; CHECK-VL256-NEXT: compact z0.d, p0, z0.d
+; CHECK-VL256-NEXT: whilelo p0.d, xzr, x8
+; CHECK-VL256-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-VL256-NEXT: ret
+ tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %vec, ptr align 8 %p, <4 x i1> %mask)
+ ret void
+}
>From a980f8a2900645f1e3452d03f334a9c9f32550f5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 24 Nov 2025 10:28:07 +0000
Subject: [PATCH 4/5] Avoid VECREDUCE_ADD issues
---
.../Target/AArch64/AArch64ISelLowering.cpp | 11 ++++--
.../AArch64/sve-masked-compressstore.ll | 36 +++++++------------
2 files changed, 21 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4ad1c21eee22f..1d71a5d67f0ca 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30417,10 +30417,17 @@ SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
return SDValue();
EVT MaskVT = Store->getMask().getValueType();
-
+ EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
+ EVT MaskReduceVT = MaskExtVT.getScalarType();
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+ SDValue MaskExt =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
SDValue CntActive =
- DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Store->getMask());
+ DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
+ if (MaskReduceVT != MVT::i64)
+ CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
+
SDValue CompressedValue =
DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
Store->getMask(), DAG.getPOISON(VT));
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
index af0d4384f521a..ca4ccbedf58c8 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -7,9 +7,8 @@
define void @test_compressstore_nxv4i32(ptr %p, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cntp x8, p0, p0.s
; CHECK-NEXT: compact z0.s, p0, z0.s
-; CHECK-NEXT: cntp x8, p1, p0.s
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -20,9 +19,8 @@ define void @test_compressstore_nxv4i32(ptr %p, <vscale x 4 x i32> %vec, <vscale
define void @test_compressstore_nxv2i64(ptr %p, <vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: cntp x8, p0, p0.d
; CHECK-NEXT: compact z0.d, p0, z0.d
-; CHECK-NEXT: cntp x8, p1, p0.d
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -33,9 +31,8 @@ define void @test_compressstore_nxv2i64(ptr %p, <vscale x 2 x i64> %vec, <vscale
define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cntp x8, p0, p0.s
; CHECK-NEXT: compact z0.s, p0, z0.s
-; CHECK-NEXT: cntp x8, p1, p0.s
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -46,9 +43,8 @@ define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vsca
define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: cntp x8, p0, p0.d
; CHECK-NEXT: compact z0.d, p0, z0.d
-; CHECK-NEXT: cntp x8, p1, p0.d
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -61,9 +57,8 @@ define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vsc
define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: cntp x8, p0, p0.d
; CHECK-NEXT: compact z0.d, p0, z0.d
-; CHECK-NEXT: cntp x8, p1, p0.d
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: st1w { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -76,9 +71,8 @@ define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vsca
define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: cntp x8, p0, p0.d
; CHECK-NEXT: compact z0.d, p0, z0.d
-; CHECK-NEXT: cntp x8, p1, p0.d
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: st1b { z0.d }, p0, [x0]
; CHECK-NEXT: ret
@@ -89,9 +83,8 @@ define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x
define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_compressstore_nxv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cntp x8, p0, p0.s
; CHECK-NEXT: compact z0.s, p0, z0.s
-; CHECK-NEXT: cntp x8, p1, p0.s
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
@@ -107,10 +100,9 @@ define void @test_compressstore_v2f64(ptr %p, <2 x double> %vec, <2 x i1> %mask)
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: shl v1.2d, v1.2d, #63
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: cntp x8, p0, p0.d
; CHECK-NEXT: compact z0.d, p0, z0.d
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
@@ -125,10 +117,9 @@ define void @test_compressstore_v4i32(ptr %p, <4 x i32> %vec, <4 x i1> %mask) {
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: shl v1.4s, v1.4s, #31
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
-; CHECK-NEXT: cntp x8, p1, p0.s
+; CHECK-NEXT: cntp x8, p0, p0.s
; CHECK-NEXT: compact z0.s, p0, z0.s
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
@@ -143,10 +134,9 @@ define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: shl v1.2d, v1.2d, #63
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT: cntp x8, p1, p0.d
+; CHECK-NEXT: cntp x8, p0, p0.d
; CHECK-NEXT: compact z0.d, p0, z0.d
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
@@ -216,12 +206,11 @@ define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
; CHECK-VL256-NEXT: uunpklo z2.h, z2.b
; CHECK-VL256-NEXT: ptrue p1.s, vl4
; CHECK-VL256-NEXT: splice z0.s, p1, z0.s, z1.s
-; CHECK-VL256-NEXT: ptrue p1.s
; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
; CHECK-VL256-NEXT: lsl z2.s, z2.s, #31
; CHECK-VL256-NEXT: asr z2.s, z2.s, #31
; CHECK-VL256-NEXT: cmpne p0.s, p0/z, z2.s, #0
-; CHECK-VL256-NEXT: cntp x8, p1, p0.s
+; CHECK-VL256-NEXT: cntp x8, p0, p0.s
; CHECK-VL256-NEXT: compact z0.s, p0, z0.s
; CHECK-VL256-NEXT: whilelo p0.s, xzr, x8
; CHECK-VL256-NEXT: st1w { z0.s }, p0, [x0]
@@ -271,12 +260,11 @@ define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
; CHECK-VL256-NEXT: ptrue p1.d, vl2
; CHECK-VL256-NEXT: splice z0.d, p1, z0.d, z1.d
-; CHECK-VL256-NEXT: ptrue p1.d
; CHECK-VL256-NEXT: uunpklo z2.d, z2.s
; CHECK-VL256-NEXT: lsl z2.d, z2.d, #63
; CHECK-VL256-NEXT: asr z2.d, z2.d, #63
; CHECK-VL256-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-VL256-NEXT: cntp x8, p1, p0.d
+; CHECK-VL256-NEXT: cntp x8, p0, p0.d
; CHECK-VL256-NEXT: compact z0.d, p0, z0.d
; CHECK-VL256-NEXT: whilelo p0.d, xzr, x8
; CHECK-VL256-NEXT: st1d { z0.d }, p0, [x0]
>From 416289e7ef1eac59157dcaf78f8ef475d6158877 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 24 Nov 2025 17:49:20 +0000
Subject: [PATCH 5/5] Fixups
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 25 ++--
.../Target/AArch64/AArch64ISelLowering.cpp | 4 +-
.../AArch64/AArch64TargetTransformInfo.h | 39 +----
.../AArch64/sve-masked-compressstore.ll | 139 ++++++++----------
4 files changed, 89 insertions(+), 118 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5684e0e4c26c4..028fa81f4069a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10607,20 +10607,25 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
assert(DataVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
"Incompatible types of Data and Mask");
if (IsCompressedMemory) {
- if (DataVT.isScalableVector())
- report_fatal_error(
- "Cannot currently handle compressed memory with scalable vectors");
// Incrementing the pointer according to number of '1's in the mask.
- EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
- SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
- if (MaskIntVT.getSizeInBits() < 32) {
- MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
- MaskIntVT = MVT::i32;
+ if (DataVT.isScalableVector()) {
+ EVT MaskExtVT = MaskVT.changeElementType(MVT::i32);
+ SDValue MaskExt = DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Mask);
+ Increment = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, MaskExt);
+ } else {
+ EVT MaskIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
+ SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
+ if (MaskIntVT.getSizeInBits() < 32) {
+ MaskInIntReg =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
+ MaskIntVT = MVT::i32;
+ }
+ Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
}
- // Count '1's with POPCNT.
- Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
+
// Scale is an element size in bytes.
SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
AddrVT);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1d71a5d67f0ca..3c0c83d532771 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1986,11 +1986,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
MVT::nxv4i32, MVT::nxv4f32}) {
- setOperationAction(ISD::MSTORE, VT, Custom);
+ setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
// Use a custom lowering for masked stores that could be a supported
// compressing store. Note: These types still use the normal (Legal)
// lowering for non-compressing masked stores.
- setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
}
// If we have SVE, we can use SVE logic for legal (or smaller than legal)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a1bece496c302..82856d605a56f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -333,45 +333,20 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
}
bool isElementTypeLegalForCompressStore(Type *Ty) const {
- if (Ty->isFloatTy() || Ty->isDoubleTy())
- return true;
-
- if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || Ty->isIntegerTy(32) ||
- Ty->isIntegerTy(64))
- return true;
-
- return false;
+ return Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isIntegerTy(32) ||
+ Ty->isIntegerTy(64);
}
bool isLegalMaskedCompressStore(Type *DataType,
Align Alignment) const override {
- auto VecTy = cast<VectorType>(DataType);
- Type *ElTy = VecTy->getScalarType();
- unsigned ElSizeInBits = ElTy->getScalarSizeInBits();
- TypeSize VecSizeInBits = VecTy->getPrimitiveSizeInBits();
-
- if (isa<FixedVectorType>(VecTy)) {
- // Each 128-bit segment must contain 2 or 4 elements (packed).
- if (ElSizeInBits != 32 && ElSizeInBits != 64)
- return false;
- if (VecSizeInBits % 128 != 0 ||
- VecSizeInBits > std::max(128U, ST->getMinSVEVectorSizeInBits()))
- return false;
- } else {
- // Each segment must contain 2 or 4 elements, but the segments can be
- // < 128-bits for unpacked vector types.
- if (VecSizeInBits.getKnownMinValue() > 128)
- return false;
- unsigned ElementsPerSegment =
- VecSizeInBits.getKnownMinValue() / ElSizeInBits;
- if (ElementsPerSegment != 2 && ElementsPerSegment != 4)
- return false;
- }
+ if (!ST->isSVEAvailable())
+ return false;
- if (!isElementTypeLegalForCompressStore(DataType->getScalarType()))
+ if (isa<FixedVectorType>(DataType) &&
+ DataType->getPrimitiveSizeInBits() < 128)
return false;
- return isLegalMaskedLoadStore(DataType, Alignment);
+ return isElementTypeLegalForCompressStore(DataType->getScalarType());
}
bool isLegalMaskedGatherScatter(Type *DataType) const {
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
index ca4ccbedf58c8..fada89374044e 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -52,6 +52,26 @@ define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vsc
ret void
}
+;; SVE vectors that will be split
+
+define void @test_compressstore_nxv8i32(ptr %p, <vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: cntp x8, p1, p1.s
+; CHECK-NEXT: compact z1.s, p1, z1.s
+; CHECK-NEXT: cntp x9, p0, p0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; CHECK-NEXT: st1w { z0.s }, p1, [x0]
+; CHECK-NEXT: ret
+ tail call void @llvm.masked.compressstore.nxv8i32(<vscale x 8 x i32> %vec, ptr align 4 %p, <vscale x 8 x i1> %mask)
+ ret void
+}
+
;; Unpacked SVE vector types
define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
@@ -148,53 +168,29 @@ define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
; CHECK-BASE-LABEL: test_compressstore_v8i32:
; CHECK-BASE: // %bb.0:
-; CHECK-BASE-NEXT: shl v2.8b, v2.8b, #7
-; CHECK-BASE-NEXT: adrp x8, .LCPI10_0
-; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI10_0]
-; CHECK-BASE-NEXT: cmlt v2.8b, v2.8b, #0
-; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
-; CHECK-BASE-NEXT: addv b2, v2.8b
-; CHECK-BASE-NEXT: fmov w8, s2
-; CHECK-BASE-NEXT: tbnz w8, #0, .LBB10_9
-; CHECK-BASE-NEXT: // %bb.1: // %else
-; CHECK-BASE-NEXT: tbnz w8, #1, .LBB10_10
-; CHECK-BASE-NEXT: .LBB10_2: // %else2
-; CHECK-BASE-NEXT: tbnz w8, #2, .LBB10_11
-; CHECK-BASE-NEXT: .LBB10_3: // %else5
-; CHECK-BASE-NEXT: tbnz w8, #3, .LBB10_12
-; CHECK-BASE-NEXT: .LBB10_4: // %else8
-; CHECK-BASE-NEXT: tbnz w8, #4, .LBB10_13
-; CHECK-BASE-NEXT: .LBB10_5: // %else11
-; CHECK-BASE-NEXT: tbnz w8, #5, .LBB10_14
-; CHECK-BASE-NEXT: .LBB10_6: // %else14
-; CHECK-BASE-NEXT: tbnz w8, #6, .LBB10_15
-; CHECK-BASE-NEXT: .LBB10_7: // %else17
-; CHECK-BASE-NEXT: tbnz w8, #7, .LBB10_16
-; CHECK-BASE-NEXT: .LBB10_8: // %else20
-; CHECK-BASE-NEXT: ret
-; CHECK-BASE-NEXT: .LBB10_9: // %cond.store
-; CHECK-BASE-NEXT: st1 { v0.s }[0], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #1, .LBB10_2
-; CHECK-BASE-NEXT: .LBB10_10: // %cond.store1
-; CHECK-BASE-NEXT: st1 { v0.s }[1], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #2, .LBB10_3
-; CHECK-BASE-NEXT: .LBB10_11: // %cond.store4
-; CHECK-BASE-NEXT: st1 { v0.s }[2], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #3, .LBB10_4
-; CHECK-BASE-NEXT: .LBB10_12: // %cond.store7
-; CHECK-BASE-NEXT: st1 { v0.s }[3], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #4, .LBB10_5
-; CHECK-BASE-NEXT: .LBB10_13: // %cond.store10
-; CHECK-BASE-NEXT: st1 { v1.s }[0], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #5, .LBB10_6
-; CHECK-BASE-NEXT: .LBB10_14: // %cond.store13
-; CHECK-BASE-NEXT: st1 { v1.s }[1], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #6, .LBB10_7
-; CHECK-BASE-NEXT: .LBB10_15: // %cond.store16
-; CHECK-BASE-NEXT: st1 { v1.s }[2], [x0], #4
-; CHECK-BASE-NEXT: tbz w8, #7, .LBB10_8
-; CHECK-BASE-NEXT: .LBB10_16: // %cond.store19
-; CHECK-BASE-NEXT: st1 { v1.s }[3], [x0]
+; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT: zip2 v3.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT: zip1 v2.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT: movi v4.4s, #1
+; CHECK-BASE-NEXT: ptrue p0.s, vl4
+; CHECK-BASE-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT: shl v3.4s, v3.4s, #31
+; CHECK-BASE-NEXT: shl v5.4s, v2.4s, #31
+; CHECK-BASE-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-BASE-NEXT: cmpne p1.s, p0/z, z3.s, #0
+; CHECK-BASE-NEXT: cmpne p0.s, p0/z, z5.s, #0
+; CHECK-BASE-NEXT: addv s2, v2.4s
+; CHECK-BASE-NEXT: fmov w10, s2
+; CHECK-BASE-NEXT: cntp x8, p1, p1.s
+; CHECK-BASE-NEXT: compact z1.s, p1, z1.s
+; CHECK-BASE-NEXT: compact z0.s, p0, z0.s
+; CHECK-BASE-NEXT: cntp x9, p0, p0.s
+; CHECK-BASE-NEXT: whilelo p0.s, xzr, x8
+; CHECK-BASE-NEXT: whilelo p1.s, xzr, x9
+; CHECK-BASE-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
+; CHECK-BASE-NEXT: st1w { z0.s }, p1, [x0]
; CHECK-BASE-NEXT: ret
;
; CHECK-VL256-LABEL: test_compressstore_v8i32:
@@ -222,33 +218,28 @@ define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
; CHECK-BASE-LABEL: test_compressstore_v4i64:
; CHECK-BASE: // %bb.0:
-; CHECK-BASE-NEXT: shl v2.4h, v2.4h, #15
-; CHECK-BASE-NEXT: adrp x8, .LCPI11_0
-; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI11_0]
-; CHECK-BASE-NEXT: cmlt v2.4h, v2.4h, #0
-; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
-; CHECK-BASE-NEXT: addv h2, v2.4h
-; CHECK-BASE-NEXT: fmov w8, s2
-; CHECK-BASE-NEXT: tbnz w8, #0, .LBB11_5
-; CHECK-BASE-NEXT: // %bb.1: // %else
-; CHECK-BASE-NEXT: tbnz w8, #1, .LBB11_6
-; CHECK-BASE-NEXT: .LBB11_2: // %else2
-; CHECK-BASE-NEXT: tbnz w8, #2, .LBB11_7
-; CHECK-BASE-NEXT: .LBB11_3: // %else5
-; CHECK-BASE-NEXT: tbnz w8, #3, .LBB11_8
-; CHECK-BASE-NEXT: .LBB11_4: // %else8
-; CHECK-BASE-NEXT: ret
-; CHECK-BASE-NEXT: .LBB11_5: // %cond.store
-; CHECK-BASE-NEXT: st1 { v0.d }[0], [x0], #8
-; CHECK-BASE-NEXT: tbz w8, #1, .LBB11_2
-; CHECK-BASE-NEXT: .LBB11_6: // %cond.store1
-; CHECK-BASE-NEXT: st1 { v0.d }[1], [x0], #8
-; CHECK-BASE-NEXT: tbz w8, #2, .LBB11_3
-; CHECK-BASE-NEXT: .LBB11_7: // %cond.store4
-; CHECK-BASE-NEXT: st1 { v1.d }[0], [x0], #8
-; CHECK-BASE-NEXT: tbz w8, #3, .LBB11_4
-; CHECK-BASE-NEXT: .LBB11_8: // %cond.store7
-; CHECK-BASE-NEXT: st1 { v1.d }[1], [x0]
+; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT: movi v5.2s, #1
+; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT: ptrue p0.d, vl2
+; CHECK-BASE-NEXT: ushll2 v3.2d, v2.4s, #0
+; CHECK-BASE-NEXT: ushll v4.2d, v2.2s, #0
+; CHECK-BASE-NEXT: and v2.8b, v2.8b, v5.8b
+; CHECK-BASE-NEXT: shl v3.2d, v3.2d, #63
+; CHECK-BASE-NEXT: shl v4.2d, v4.2d, #63
+; CHECK-BASE-NEXT: addp v2.2s, v2.2s, v2.2s
+; CHECK-BASE-NEXT: cmpne p1.d, p0/z, z3.d, #0
+; CHECK-BASE-NEXT: cmpne p0.d, p0/z, z4.d, #0
+; CHECK-BASE-NEXT: fmov w10, s2
+; CHECK-BASE-NEXT: cntp x8, p1, p1.d
+; CHECK-BASE-NEXT: compact z1.d, p1, z1.d
+; CHECK-BASE-NEXT: compact z0.d, p0, z0.d
+; CHECK-BASE-NEXT: cntp x9, p0, p0.d
+; CHECK-BASE-NEXT: whilelo p0.d, xzr, x8
+; CHECK-BASE-NEXT: whilelo p1.d, xzr, x9
+; CHECK-BASE-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
+; CHECK-BASE-NEXT: st1d { z0.d }, p1, [x0]
; CHECK-BASE-NEXT: ret
;
; CHECK-VL256-LABEL: test_compressstore_v4i64:
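For reference, a minimal scalar model (hypothetical helper, not part of the
patch) of the pointer increment that the updated
TargetLowering::IncrementMemoryAddress computes for a compressing store: the
address advances by the number of set mask lanes, scaled by the element size
in bytes. In the actual DAG code this count comes from a VECREDUCE_ADD of the
zero-extended mask for scalable vectors, or a CTPOP of the bitcast mask for
fixed-length vectors; the sketch below only models the arithmetic.

    // Scalar model of the compressed-store address increment (illustrative).
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::uintptr_t incrementCompressedAddress(std::uintptr_t Addr,
                                              const std::vector<bool> &Mask,
                                              unsigned ElemSizeInBytes) {
      std::size_t SetLanes = 0; // VECREDUCE_ADD / CTPOP of the mask.
      for (bool M : Mask)
        SetLanes += M;
      return Addr + SetLanes * ElemSizeInBytes;
    }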