[llvm] [LLVM] Add `llvm.masked.compress` intrinsic (PR #92289)
Lawrence Benson via llvm-commits
llvm-commits at lists.llvm.org
Wed May 15 09:18:36 PDT 2024
https://github.com/lawben updated https://github.com/llvm/llvm-project/pull/92289
>From 3a7b06453eec84b5fd7c3178339fd230f21b5b35 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Wed, 15 May 2024 14:08:37 +0200
Subject: [PATCH 1/6] Add initial code for @llvm.masked.compress intrinsics
---
llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 ++
llvm/include/llvm/IR/Intrinsics.td | 5 ++
.../include/llvm/Target/TargetSelectionDAG.td | 6 ++
.../SelectionDAG/LegalizeIntegerTypes.cpp | 20 +++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 3 +
.../SelectionDAG/LegalizeVectorOps.cpp | 50 ++++++++++++++++
.../SelectionDAG/LegalizeVectorTypes.cpp | 60 +++++++++++++++++++
.../SelectionDAG/SelectionDAGBuilder.cpp | 7 +++
.../SelectionDAG/SelectionDAGDumper.cpp | 1 +
llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 +
10 files changed, 160 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index d8af97957e48e..71dfd8b43b710 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1294,6 +1294,11 @@ enum NodeType {
MLOAD,
MSTORE,
+ // Masked compress - consecutively place vector elements based on mask
+ // e.g., vec = {A, B, C, D} and mask = 1010
+ // --> {A, C, ?, ?} where ? is undefined
+ MCOMPRESS,
+
// Masked gather and scatter - load and store operations for a vector of
// random addresses with additional mask operand that prevents memory
// accesses to the masked-off lanes.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index f1c7d950f9275..e924d28956b0a 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2362,6 +2362,11 @@ def int_masked_compressstore:
[IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
NoCapture<ArgIndex<1>>]>;
+def int_masked_compress:
+ DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [IntrNoMem, IntrWillReturn]>;
+
// Test whether a pointer is associated with a type metadata identifier.
def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
[IntrNoMem, IntrWillReturn, IntrSpeculatable]>;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 1684b424e3b44..061330fb4e08f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -266,6 +266,10 @@ def SDTMaskedScatter : SDTypeProfile<0, 4, [
SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3>
]>;
+def SDTMaskedCompress : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>,
+]>;
+
def SDTVecShuffle : SDTypeProfile<1, 2, [
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
]>;
@@ -731,6 +735,8 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather,
def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def masked_compress : SDNode<"ISD::MCOMPRESS", SDTMaskedCompress>;
+
// Do not use ld, st directly. Use load, extload, sextload, zextload, store,
// and truncst (see below).
def ld : SDNode<"ISD::LOAD" , SDTLoad,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 0aa36deda79dc..80f645b433cbe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -87,6 +87,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
break;
case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
break;
+ case ISD::MCOMPRESS: Res = PromoteIntRes_MCOMPRESS(N); break;
case ISD::SELECT:
case ISD::VSELECT:
case ISD::VP_SELECT:
@@ -948,6 +949,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
return Res;
}
+SDValue DAGTypeLegalizer::PromoteIntRes_MCOMPRESS(SDNode *N) {
+ SDValue Vec = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec, N->getOperand(1));
+}
+
/// Promote the overflow flag of an overflowing arithmetic node.
SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
// Change the return type of the boolean result while obeying
@@ -1855,6 +1861,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
OpNo); break;
case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
OpNo); break;
+ case ISD::MCOMPRESS: Res = PromoteIntOp_MCOMPRESS(N, OpNo); break;
case ISD::VP_TRUNCATE:
case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
case ISD::BF16_TO_FP:
@@ -2335,6 +2342,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
N->getIndexType(), TruncateStore);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+ EVT VT = Vec.getValueType();
+
+ if (OpNo == 0)
+ Vec = GetPromotedInteger(Vec);
+ else
+ Mask = PromoteTargetBoolean(Mask, VT);
+
+ return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), VT, Vec, Mask);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
SDValue Op = GetPromotedInteger(N->getOperand(0));
if (N->getOpcode() == ISD::VP_TRUNCATE)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d925089d5689f..5fb14757f8991 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -321,6 +321,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_LOAD(LoadSDNode *N);
SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
+ SDValue PromoteIntRes_MCOMPRESS(SDNode *N);
SDValue PromoteIntRes_Overflow(SDNode *N);
SDValue PromoteIntRes_FFREXP(SDNode *N);
SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
@@ -390,6 +391,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N);
SDValue PromoteIntOp_FIX(SDNode *N);
SDValue PromoteIntOp_ExpOp(SDNode *N);
@@ -882,6 +884,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi,
bool SplitSETCC = false);
+ void SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 423df9ae6b2a5..759de775ba011 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -134,6 +134,7 @@ class VectorLegalizer {
SDValue ExpandVSELECT(SDNode *Node);
SDValue ExpandVP_SELECT(SDNode *Node);
SDValue ExpandVP_MERGE(SDNode *Node);
+ SDValue ExpandMCOMPRESS(SDNode *Node);
SDValue ExpandVP_REM(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
@@ -442,6 +443,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
case ISD::MGATHER:
+ case ISD::MCOMPRESS:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:
@@ -1101,6 +1103,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
break;
+ case ISD::MCOMPRESS:
+ Results.push_back(ExpandMCOMPRESS(Node));
+ return;
}
SDValue Unrolled = DAG.UnrollVectorOp(Node);
@@ -1505,6 +1510,51 @@ SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) {
return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
}
+SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) {
+ SDLoc DL(Node);
+ SDValue Vec = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+
+ EVT VecVT = Vec.getValueType();
+ EVT ScalarVT = VecVT.getScalarType();
+ EVT MaskScalarVT = Mask.getValueType().getScalarType();
+
+ assert(TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(ScalarVT) && TLI.isTypeLegal(MaskScalarVT) &&
+ "Need legal vector/mask element types to scalarize masked compress.");
+
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Chain = DAG.getEntryNode();
+ SDValue OutPos = DAG.getConstant(0, DL, MVT::i32);
+
+ unsigned NumElms = VecVT.getVectorNumElements();
+ // Store every element unconditionally; lanes whose mask bit is 0 are simply overwritten by the next store.
+ for (unsigned I = 0; I < NumElms; I++) {
+ SDValue Idx = DAG.getVectorIdxConstant(I, DL);
+
+ SDValue ValI =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx);
+ SDValue OutPtr =
+ TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos);
+ Chain = DAG.getStore(Chain, DL, ValI, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+
+ // Skip this for the last element.
+ if (I < NumElms - 1) {
+ // Get the mask value and add it to the current output position. This
+ // either increments by 1 if MaskI is true or adds 0 otherwise.
+ SDValue MaskI =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx);
+ MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI);
+ MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI);
+ OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI);
+ }
+ }
+
+ int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+}
+
SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
// Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
EVT VT = Node->getValueType(0);
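For reference, the expansion above corresponds to the following scalar model (a minimal sketch; the helper name and the std::array-based signature are illustrative and not part of the patch):

  // Scalar model of ExpandMCOMPRESS: every element is stored at the current
  // output position, but the position only advances for selected lanes, so
  // unselected elements are overwritten by later stores.
  #include <array>
  #include <cstddef>

  template <typename T, std::size_t N>
  std::array<T, N> compressModel(const std::array<T, N> &Vec,
                                 const std::array<bool, N> &Mask) {
    std::array<T, N> Out{}; // Trailing lanes are undefined for the intrinsic;
                            // zero-initialized here for determinism.
    std::size_t Pos = 0;
    for (std::size_t I = 0; I < N; ++I) {
      Out[Pos] = Vec[I];        // Unconditional store, like the stack store.
      if (I + 1 < N)            // The increment is skipped for the last element.
        Pos += Mask[I] ? 1 : 0;
    }
    return Out;
  }

With Vec = {A, B, C, D} and Mask = {1, 0, 1, 0}, the first two result lanes are {A, C} and the rest hold leftover values, matching the {A, C, ?, ?} example on the MCOMPRESS node.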
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index cd858003cf03b..62e7febed6568 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1058,6 +1058,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_GATHER:
SplitVecRes_Gather(cast<MemSDNode>(N), Lo, Hi, /*SplitSETCC*/ true);
break;
+ case ISD::MCOMPRESS:
+ SplitVecRes_MCOMPRESS(N, Lo, Hi);
+ break;
case ISD::SETCC:
case ISD::VP_SETCC:
SplitVecRes_SETCC(N, Lo, Hi);
@@ -2304,6 +2307,63 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
ReplaceValueWith(SDValue(N, 1), Ch);
}
+void DAGTypeLegalizer::SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // This is not "trivial", as there is a dependency between the two subvectors:
+ // depending on the number of 1s in the mask, elements from the Hi vector may
+ // have to move into the Lo vector. So we perform this as one "big" operation
+ // (analogous to the default MCOMPRESS expand implementation) by writing the
+ // whole vector to memory and loading the Lo and Hi vectors from it. This
+ // gets rid of MCOMPRESS; the resulting loads and stores are legalized later.
+ SDLoc DL(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+
+ EVT VecVT = Vec.getValueType();
+ EVT SubVecVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ EVT ScalarVT = VecVT.getScalarType();
+ EVT MaskScalarVT = Mask.getValueType().getScalarType();
+
+ // TODO: This code is duplicated here and in LegalizeVectorOps.
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Chain = DAG.getEntryNode();
+ SDValue OutPos = DAG.getConstant(0, DL, MVT::i32);
+
+ unsigned NumElms = VecVT.getVectorNumElements();
+ // Store every element unconditionally; lanes whose mask bit is 0 are simply overwritten by the next store.
+ for (unsigned I = 0; I < NumElms; I++) {
+ SDValue Idx = DAG.getVectorIdxConstant(I, DL);
+
+ SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx);
+ SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos);
+ Chain = DAG.getStore(
+ Chain, DL, ValI, OutPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+
+ // Skip this for the last element.
+ if (I < NumElms - 1) {
+ // Get the mask value and add it to the current output position. This
+ // either increments by 1 if MaskI is true or adds 0 otherwise.
+ SDValue MaskI =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx);
+ MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI);
+ MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI);
+ OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI);
+ }
+ }
+
+ int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+ SDValue HiPtr = TLI.getVectorElementPointer(
+ DAG, StackPtr, VecVT, DAG.getConstant(NumElms / 2, DL, MVT::i32));
+
+ Lo = DAG.getLoad(SubVecVT, DL, Chain, StackPtr, PtrInfo);
+ Hi = DAG.getLoad(
+ SubVecVT, DL, Chain, HiPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+}
+
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
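The cross-subvector dependency described in the comment above is easy to see with a concrete example: if the mask selects only lanes from the Hi half, all selected elements land in the Lo half of the result, so the two halves cannot be compressed independently. A standalone sketch (values arbitrary):

  // With vec = {0,1,2,3,4,5,6,7} and a mask selecting only lanes 4..7,
  // every selected element ends up in the low half of the result.
  #include <cstdio>

  int main() {
    int Vec[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    bool Mask[8] = {false, false, false, false, true, true, true, true};
    int Out[8] = {};
    int Pos = 0;
    for (int I = 0; I < 8; ++I) {
      Out[Pos] = Vec[I]; // Same store/increment scheme as the expansion.
      Pos += Mask[I];
    }
    std::printf("%d %d %d %d\n", Out[0], Out[1], Out[2], Out[3]); // 4 5 6 7
  }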
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ca352da5d36eb..665bab6121837 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6718,6 +6718,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::masked_compressstore:
visitMaskedStore(I, true /* IsCompressing */);
return;
+ case Intrinsic::masked_compress:
+ setValue(&I, DAG.getNode(ISD::MCOMPRESS, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ Flags));
+ return;
case Intrinsic::powi:
setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), DAG));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 59742e90c6791..37288054b0e7b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -416,6 +416,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::MSTORE: return "masked_store";
case ISD::MGATHER: return "masked_gather";
case ISD::MSCATTER: return "masked_scatter";
+ case ISD::MCOMPRESS: return "masked_compress";
case ISD::VAARG: return "vaarg";
case ISD::VACOPY: return "vacopy";
case ISD::VAEND: return "vaend";
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 09b70cfb72278..5ee12be752b27 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -956,6 +956,9 @@ void TargetLoweringBase::initActions() {
// Named vector shuffles default to expand.
setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+ // Only some targets support this vector operation natively. Most need to expand it.
+ setOperationAction(ISD::MCOMPRESS, VT, Expand);
+
// VP operations default to expand.
#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \
setOperationAction(ISD::SDOPC, VT, Expand);
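With this Expand default in place, a target with a native compress instruction can opt in per type from its TargetLowering constructor. A hypothetical sketch (MyTargetLowering and the chosen types are illustrative; no target does this in this patch):

  // Inside a hypothetical target's TargetLowering constructor: mark the
  // operation Legal for natively supported types so the generic
  // stack-temporary expansion is bypassed.
  MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
      : TargetLowering(TM) {
    setOperationAction(ISD::MCOMPRESS, MVT::v16i8, Legal);
    setOperationAction(ISD::MCOMPRESS, MVT::v8i16, Legal);
  }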
>From 75abf0b013f732335ced35002055f0da48f724e0 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Wed, 15 May 2024 15:32:47 +0200
Subject: [PATCH 2/6] Remove requirements for legal types
---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 759de775ba011..ca32db26e511c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1519,9 +1519,6 @@ SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) {
EVT ScalarVT = VecVT.getScalarType();
EVT MaskScalarVT = Mask.getValueType().getScalarType();
- assert(TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(ScalarVT) && TLI.isTypeLegal(MaskScalarVT) &&
- "Need legal vector/mask element types to scalarize masked compress.");
-
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
SDValue Chain = DAG.getEntryNode();
SDValue OutPos = DAG.getConstant(0, DL, MVT::i32);
>From 0329bc9652f9a6c633924d951d6694399d9f6af7 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Wed, 15 May 2024 16:33:36 +0200
Subject: [PATCH 3/6] Add tests for AArch64
---
llvm/test/CodeGen/AArch64/masked-compress.ll | 280 +++++++++++++++++++
1 file changed, 280 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/masked-compress.ll
diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll
new file mode 100644
index 0000000000000..54c3beab82f76
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/masked-compress.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+
+define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str s0, [sp]
+; CHECK-NEXT: shl.4s v1, v1, #31
+; CHECK-NEXT: cmlt.4s v1, v1, #0
+; CHECK-NEXT: mov.s w9, v1[1]
+; CHECK-NEXT: mov.s w10, v1[2]
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: bfi x8, x11, #2, #1
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: add w9, w11, w9
+; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: st1.s { v0 }[1], [x8]
+; CHECK-NEXT: add w10, w9, w10
+; CHECK-NEXT: orr x9, x11, x9, lsl #2
+; CHECK-NEXT: bfi x11, x10, #2, #2
+; CHECK-NEXT: st1.s { v0 }[2], [x9]
+; CHECK-NEXT: st1.s { v0 }[3], [x11]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+ %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask)
+ ret <4 x i32> %out
+}
+
+define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
+; CHECK-LABEL: test_compress_v16i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: shl.16b v1, v1, #7
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: mov x13, sp
+; CHECK-NEXT: cmlt.16b v1, v1, #0
+; CHECK-NEXT: umov.b w9, v1[0]
+; CHECK-NEXT: umov.b w10, v1[1]
+; CHECK-NEXT: umov.b w11, v1[2]
+; CHECK-NEXT: umov.b w14, v1[3]
+; CHECK-NEXT: bfxil x12, x9, #0, #1
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: umov.b w10, v1[4]
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: st1.b { v0 }[1], [x12]
+; CHECK-NEXT: orr x12, x8, x9
+; CHECK-NEXT: add w9, w9, w11
+; CHECK-NEXT: umov.b w11, v1[5]
+; CHECK-NEXT: and w14, w14, #0x1
+; CHECK-NEXT: st1.b { v0 }[2], [x12]
+; CHECK-NEXT: add w14, w9, w14
+; CHECK-NEXT: umov.b w12, v1[6]
+; CHECK-NEXT: orr x9, x8, x9
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: st1.b { v0 }[3], [x9]
+; CHECK-NEXT: orr x9, x8, x14
+; CHECK-NEXT: add w10, w14, w10
+; CHECK-NEXT: umov.b w14, v1[7]
+; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: bfxil x13, x10, #0, #4
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: add w10, w10, w11
+; CHECK-NEXT: umov.b w11, v1[8]
+; CHECK-NEXT: and w12, w12, #0x1
+; CHECK-NEXT: bfxil x9, x10, #0, #4
+; CHECK-NEXT: st1.b { v0 }[5], [x13]
+; CHECK-NEXT: umov.b w13, v1[9]
+; CHECK-NEXT: add w10, w10, w12
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: and w14, w14, #0x1
+; CHECK-NEXT: st1.b { v0 }[6], [x9]
+; CHECK-NEXT: umov.b w9, v1[10]
+; CHECK-NEXT: bfxil x12, x10, #0, #4
+; CHECK-NEXT: add w10, w10, w14
+; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: bfxil x14, x10, #0, #4
+; CHECK-NEXT: add w10, w10, w11
+; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: and w13, w13, #0x1
+; CHECK-NEXT: st1.b { v0 }[7], [x12]
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: bfxil x11, x10, #0, #4
+; CHECK-NEXT: add w10, w10, w13
+; CHECK-NEXT: umov.b w13, v1[11]
+; CHECK-NEXT: st1.b { v0 }[8], [x14]
+; CHECK-NEXT: umov.b w14, v1[12]
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: bfxil x12, x10, #0, #4
+; CHECK-NEXT: add w9, w10, w9
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: st1.b { v0 }[9], [x11]
+; CHECK-NEXT: umov.b w11, v1[13]
+; CHECK-NEXT: bfxil x10, x9, #0, #4
+; CHECK-NEXT: st1.b { v0 }[10], [x12]
+; CHECK-NEXT: umov.b w12, v1[14]
+; CHECK-NEXT: and w13, w13, #0x1
+; CHECK-NEXT: and w14, w14, #0x1
+; CHECK-NEXT: add w9, w9, w13
+; CHECK-NEXT: st1.b { v0 }[11], [x10]
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: add w13, w9, w14
+; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: bfxil x10, x9, #0, #4
+; CHECK-NEXT: and w9, w11, #0x1
+; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: add w9, w13, w9
+; CHECK-NEXT: and w12, w12, #0x1
+; CHECK-NEXT: bfxil x14, x13, #0, #4
+; CHECK-NEXT: bfxil x11, x9, #0, #4
+; CHECK-NEXT: add w9, w9, w12
+; CHECK-NEXT: st1.b { v0 }[12], [x10]
+; CHECK-NEXT: bfxil x8, x9, #0, #4
+; CHECK-NEXT: st1.b { v0 }[13], [x14]
+; CHECK-NEXT: st1.b { v0 }[14], [x11]
+; CHECK-NEXT: st1.b { v0 }[15], [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+ %out = call <16 x i8> @llvm.masked.compress.v16i8(<16 x i8> %vec, <16 x i1> %mask)
+ ret <16 x i8> %out
+}
+
+define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) {
+; CHECK-LABEL: test_compress_large:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #48
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: umov.b w9, v2[0]
+; CHECK-NEXT: umov.b w10, v2[1]
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: umov.b w11, v2[2]
+; CHECK-NEXT: umov.b w13, v2[3]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: umov.b w14, v2[4]
+; CHECK-NEXT: str s0, [sp]
+; CHECK-NEXT: bfi x12, x9, #2, #1
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: and w10, w11, #0x1
+; CHECK-NEXT: and w13, w13, #0x1
+; CHECK-NEXT: orr x11, x8, x9, lsl #2
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: umov.b w10, v2[5]
+; CHECK-NEXT: st1.s { v0 }[1], [x12]
+; CHECK-NEXT: add w13, w9, w13
+; CHECK-NEXT: orr x9, x8, x9, lsl #2
+; CHECK-NEXT: st1.s { v0 }[2], [x11]
+; CHECK-NEXT: umov.b w11, v2[6]
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: and w14, w14, #0x1
+; CHECK-NEXT: bfi x12, x13, #2, #3
+; CHECK-NEXT: st1.s { v0 }[3], [x9]
+; CHECK-NEXT: add w13, w13, w14
+; CHECK-NEXT: and w9, w10, #0x1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: add w9, w13, w9
+; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: str s1, [x12]
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: bfi x10, x9, #2, #3
+; CHECK-NEXT: bfi x14, x13, #2, #3
+; CHECK-NEXT: add w9, w9, w11
+; CHECK-NEXT: bfi x8, x9, #2, #3
+; CHECK-NEXT: st1.s { v1 }[1], [x14]
+; CHECK-NEXT: st1.s { v1 }[2], [x10]
+; CHECK-NEXT: st1.s { v1 }[3], [x8]
+; CHECK-NEXT: ldp q0, q1, [sp]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %out = call <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> %vec, <8 x i1> %mask)
+ ret <8 x i32> %out
+}
+
+define <4 x i32> @test_compress_const(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov x8, #3 ; =0x3
+; CHECK-NEXT: mov w9, #9 ; =0x9
+; CHECK-NEXT: movk x8, #7, lsl #32
+; CHECK-NEXT: str x8, [sp, #-16]!
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov w8, #5 ; =0x5
+; CHECK-NEXT: str w9, [sp, #8]
+; CHECK-NEXT: str w8, [sp]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+ %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> <i32 3, i32 5, i32 7, i32 9>,
+ <4 x i1> <i1 0, i1 1, i1 1, i1 0>)
+ ret <4 x i32> %out
+}
+
+define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_small:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: shl.4h v1, v1, #15
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str h0, [sp, #8]
+; CHECK-NEXT: cmlt.4h v1, v1, #0
+; CHECK-NEXT: umov.h w9, v1[0]
+; CHECK-NEXT: umov.h w10, v1[1]
+; CHECK-NEXT: umov.h w11, v1[2]
+; CHECK-NEXT: bfi x8, x9, #1, #1
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: add x10, sp, #8
+; CHECK-NEXT: add w11, w9, w11
+; CHECK-NEXT: orr x9, x10, x9, lsl #1
+; CHECK-NEXT: st1.h { v0 }[1], [x8]
+; CHECK-NEXT: bfi x10, x11, #1, #2
+; CHECK-NEXT: st1.h { v0 }[2], [x9]
+; CHECK-NEXT: st1.h { v0 }[3], [x10]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %out = call <4 x i8> @llvm.masked.compress.v4i8(<4 x i8> %vec, <4 x i1> %mask)
+ ret <4 x i8> %out
+}
+
+define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_illegal_element_type:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: shl.4h v1, v1, #15
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: str h0, [sp, #8]
+; CHECK-NEXT: cmlt.4h v1, v1, #0
+; CHECK-NEXT: umov.h w9, v1[0]
+; CHECK-NEXT: umov.h w10, v1[1]
+; CHECK-NEXT: umov.h w11, v1[2]
+; CHECK-NEXT: bfi x8, x9, #1, #1
+; CHECK-NEXT: and w10, w10, #0x1
+; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: add x10, sp, #8
+; CHECK-NEXT: add w11, w9, w11
+; CHECK-NEXT: orr x9, x10, x9, lsl #1
+; CHECK-NEXT: st1.h { v0 }[1], [x8]
+; CHECK-NEXT: bfi x10, x11, #1, #2
+; CHECK-NEXT: st1.h { v0 }[2], [x9]
+; CHECK-NEXT: st1.h { v0 }[3], [x10]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %out = call <4 x i4> @llvm.masked.compress.v4i4(<4 x i4> %vec, <4 x i1> %mask)
+ ret <4 x i4> %out
+}
+
+declare <4 x i32> @llvm.masked.compress.v4i32(<4 x i32>, <4 x i1>)
+declare <16 x i8> @llvm.masked.compress.v16i8(<16 x i8>, <16 x i1>)
+declare <4 x i4> @llvm.masked.compress.v4i4(<4 x i4>, <4 x i1>)
+declare <4 x i8> @llvm.masked.compress.v4i8(<4 x i8>, <4 x i1>)
+declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32>, <8 x i1>)
>From 73bfebbdb3d7a1c96c8521e5789ad7e59f665da7 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Wed, 15 May 2024 16:41:13 +0200
Subject: [PATCH 4/6] Add floating point test
---
llvm/test/CodeGen/AArch64/masked-compress.ll | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll
index 54c3beab82f76..a2f39b9620c95 100644
--- a/llvm/test/CodeGen/AArch64/masked-compress.ll
+++ b/llvm/test/CodeGen/AArch64/masked-compress.ll
@@ -32,6 +32,25 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
ret <4 x i32> %out
}
+define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compress_v2f64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str d0, [sp]
+; CHECK-NEXT: shl.2d v1, v1, #63
+; CHECK-NEXT: cmlt.2d v1, v1, #0
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: bfi x8, x9, #3, #1
+; CHECK-NEXT: st1.d { v0 }[1], [x8]
+; CHECK-NEXT: ldr q0, [sp], #16
+; CHECK-NEXT: ret
+ %out = call <2 x double> @llvm.masked.compress.v2f64(<2 x double> %vec, <2 x i1> %mask)
+ ret <2 x double> %out
+}
+
define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
; CHECK-LABEL: test_compress_v16i8:
; CHECK: ; %bb.0:
>From e4423a1b434c086a7787594efbba96aa29e392c4 Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Wed, 15 May 2024 17:47:42 +0200
Subject: [PATCH 5/6] Add documentation
---
llvm/docs/LangRef.rst | 79 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 79 insertions(+)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 06809f8bf445d..773893b83a5d7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -24975,6 +24975,85 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations.
+Masked Vector Compress Intrinsic
+--------------------------------
+
+LLVM provides an intrinsic for compressing data within a vector based on a selection mask.
+Semantically, this is similar to :ref:`llvm.masked.compressstore <int_compressstore>` but with weaker assumptions
+and without storing the results to memory, i.e., the data remains in the vector.
+
+.. _int_masked_compress:
+
+'``llvm.masked.compress.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. A number of scalar values of integer, floating-point, or pointer data type are collected
+from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector.
+
+.. code-block:: llvm
+
+ declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> <value>, <8 x i1> <mask>)
+ declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> <value>, <16 x i1> <mask>)
+
+Overview:
+"""""""""
+
+Selects elements from input vector '``value``' according to the '``mask``'.
+All selected elements are written into adjacent lanes in the result vector, from lower to higher.
+The mask holds a bit for each vector lane, and is used to select elements to be kept.
+The number of valid lanes is equal to the number of set bits in the mask.
+The main difference to :ref:`llvm.masked.compressstore <int_compressstore>` is that the remainder of the result vector
+may contain undefined values.
+This allows for branchless code and better optimization on all targets that do not support the explicit semantics of
+:ref:`llvm.masked.compressstore <int_compressstore>`.
+The result vector can still be stored with a similar effect, as all selected values occupy the lower lanes of the
+vector, but without requiring branches to avoid writes where the mask is 0.
+
+
+Arguments:
+""""""""""
+
+The first operand is the input vector, from which elements are selected.
+The second operand is the mask, a vector of boolean values.
+The mask and the input vector must have the same number of vector elements.
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.compress``' intrinsic is designed for compressing data within a vector, i.e., ideally within a register.
+It allows collecting elements from possibly non-adjacent lanes of a vector and placing them contiguously in the result vector in one IR operation.
+It is useful for all targets that support in-register compress operations (e.g., AVX-512, ARM SVE, RISC-V V). More
+instruction sets support an in-register compress than an explicit compressstore, i.e., ``llvm.masked.compress`` may
+yield better performance on more targets than ``llvm.masked.compressstore`` due to its weaker constraints.
+This intrinsic allows vectorizing loops with cross-iteration dependencies, such as the following example:
+
+.. code-block:: c
+
+ // Consecutively store selected values with branchless code.
+ int *in, *out; bool *mask; int pos = 0;
+ for (int i = 0; i < size; ++i) {
+ out[pos] = in[i];
+ // if mask[i] == 0, the current value is overwritten in the next iteration.
+ pos += mask[i];
+ }
+
+
+.. code-block:: llvm
+
+ ; Load elements from `in`.
+ %vec = load <4 x i32>, ptr %inPtr
+ %mask = load <4 x i1>, ptr %maskPtr
+ %compressed = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask)
+ store <4 x i32> %compressed, ptr %outPtr
+
+ ; Advance the output position in each iteration by the number of '1's in the mask.
+ %iMask = bitcast <4 x i1> %mask to i4
+ %popcnt = call i4 @llvm.ctpop.i4(i4 %iMask)
+ %zextPopcnt = zext i4 %popcnt to i64
+ %nextOut = add i64 %outPos, %zextPopcnt
+
Memory Use Markers
------------------
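The store-and-advance pattern from the C loop and the IR snippet above can also be written as one scalar C++ step (a sketch; the function name and the fixed width of 4 are illustrative):

  // One "vector iteration" of the documented pattern: store all four lanes
  // unconditionally, then advance the output pointer by the popcount of the
  // mask, exactly as the ctpop-based IR above does. Out must have room for
  // four stores even when fewer lanes are selected.
  #include <bit>
  #include <cstdint>

  int *storeCompressed(int *Out, const int (&Vec)[4], std::uint8_t Mask) {
    int Pos = 0;
    for (int I = 0; I < 4; ++I) {
      Out[Pos] = Vec[I];       // Lanes with a 0 mask bit are overwritten.
      Pos += (Mask >> I) & 1;
    }
    return Out + std::popcount(static_cast<unsigned>(Mask & 0xFu));
  }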
>From 3e9967803d110116fc90823ae39a91cfe9d03d2c Mon Sep 17 00:00:00 2001
From: Lawrence Benson <github at lawben.com>
Date: Wed, 15 May 2024 18:06:44 +0200
Subject: [PATCH 6/6] Fix formatting
---
.../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 11 ++++++++---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 10 +++++-----
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 3 +--
3 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 80f645b433cbe..4063144f47393 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -87,7 +87,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
break;
case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
break;
- case ISD::MCOMPRESS: Res = PromoteIntRes_MCOMPRESS(N); break;
+ case ISD::MCOMPRESS:
+ Res = PromoteIntRes_MCOMPRESS(N);
+ break;
case ISD::SELECT:
case ISD::VSELECT:
case ISD::VP_SELECT:
@@ -951,7 +953,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntRes_MCOMPRESS(SDNode *N) {
SDValue Vec = GetPromotedInteger(N->getOperand(0));
- return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec, N->getOperand(1));
+ return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec,
+ N->getOperand(1));
}
/// Promote the overflow flag of an overflowing arithmetic node.
@@ -1861,7 +1864,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
OpNo); break;
case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
OpNo); break;
- case ISD::MCOMPRESS: Res = PromoteIntOp_MCOMPRESS(N, OpNo); break;
+ case ISD::MCOMPRESS:
+ Res = PromoteIntOp_MCOMPRESS(N, OpNo);
+ break;
case ISD::VP_TRUNCATE:
case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
case ISD::BF16_TO_FP:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index ca32db26e511c..ebf0f63775d44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1528,11 +1528,11 @@ SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) {
for (unsigned I = 0; I < NumElms; I++) {
SDValue Idx = DAG.getVectorIdxConstant(I, DL);
- SDValue ValI =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx);
- SDValue OutPtr =
- TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos);
- Chain = DAG.getStore(Chain, DL, ValI, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+ SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx);
+ SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos);
+ Chain = DAG.getStore(
+ Chain, DL, ValI, OutPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
// Skip this for the last element.
if (I < NumElms - 1) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 665bab6121837..20461511ac92f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6722,8 +6722,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, DAG.getNode(ISD::MCOMPRESS, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
- getValue(I.getArgOperand(1)),
- Flags));
+ getValue(I.getArgOperand(1)), Flags));
return;
case Intrinsic::powi:
setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),