[llvm] [NVPTX] Generalize and extend upsizing when lowering 8/16-bit-element vector loads/stores (PR #119622)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 14:18:20 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-nvptx
Author: Drew Kersnar (dakersnar)
Changes:
This addresses the following issue I opened: https://github.com/llvm/llvm-project/issues/118851.
This change generalizes the Type Legalization mechanism that currently handles `v8[i/f/bf]16` upsizing to also cover loads _and_ stores of `v8i8` and `v16i8`, allowing all of the mentioned vectors to be lowered to PTX as vectors of `b32`. The extension also lets us remove the DAGCombine that handled only the exact pattern `load v16i8`, centralizing all of the upsizing logic in one place.
Test changes include adding `v8i8`, `v16i8`, and `v8i16` cases to load-store.ll, and updating the CHECK lines of other tests to match the improved codegen.
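For illustration, a minimal sketch of the lowering on the load side (the IR mirrors the kind of cases added to load-store.ll; the PTX register numbering in the comments is illustrative, not copied from the patch's CHECK lines):

```llvm
; Sketch of the upsized lowering this patch enables.
define <8 x i16> @load_v8i16(ptr %p) {
  ; Type legalization now splits the <8 x i16> into four v2i16 chunks
  ; rather than multiple smaller loads, and instruction selection emits
  ; a single vector load of b32, e.g.:
  ;   ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
  %v = load <8 x i16>, ptr %p, align 16
  ret <8 x i16> %v
}
```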
---
Patch is 87.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119622.diff
7 Files Affected:
- (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+12-10)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+110-104)
- (modified) llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll (+42-44)
- (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+2-4)
- (modified) llvm/test/CodeGen/NVPTX/load-store.ll (+1473-1)
- (modified) llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll (+8-10)
- (modified) llvm/test/CodeGen/NVPTX/vector-stores.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index e1fb2d7fcee0309..8536be18b89e01b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1400,11 +1400,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
- // v8x16 is a special case. PTX doesn't have ld.v8.16
- // instruction. Instead, we split the vector into v2x16 chunks and
- // load them with ld.v4.b32.
- if (Isv2x16VT(EltVT)) {
- assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than ld.v4 for those element types.
+ // In Type Legalization, rather than splitting those vectors into multiple
+ // loads, we split the vector into v2x16/v4i8 chunks. Now, we lower to PTX as
+ // vector loads of b32.
+ if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
EltVT = MVT::i32;
FromType = NVPTX::PTXLdStInstCode::Untyped;
FromTypeWidth = 32;
@@ -2084,11 +2085,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return false;
}
- // v8x16 is a special case. PTX doesn't have st.v8.x16
- // instruction. Instead, we split the vector into v2x16 chunks and
- // store them with st.v4.b32.
- if (Isv2x16VT(EltVT)) {
- assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than st.v4 for those element types.
+ // In Type Legalization, rather than splitting those vectors into multiple
+ // stores, we split the vector into v2x16/v4i8 chunks. Now, we lower to
+ // PTX as vector stores of b32.
+ if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
EltVT = MVT::i32;
ToType = NVPTX::PTXLdStInstCode::Untyped;
ToTypeWidth = 32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 62647b312851886..68d6edda8dddfd8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -136,6 +136,8 @@ static bool IsPTXVectorType(MVT VT) {
case MVT::v4i1:
case MVT::v2i8:
case MVT::v4i8:
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v16i8: // <4 x i8x4>
case MVT::v2i16:
case MVT::v4i16:
case MVT::v8i16: // <4 x i16x2>
@@ -761,8 +763,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
- ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::VSELECT});
+ ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -3157,6 +3158,13 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than st.v4 for those element types.
+ // Here in Type Legalization, rather than splitting those vectors into
+ // multiple stores, we split the vector into v2x16/v4i8 chunks. Later, in
+ // Instruction Selection, we lower to PTX as vector stores of b32.
+ bool UpsizeElementTypes = false;
+
if (ValVT.isVector()) {
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
@@ -3180,10 +3188,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case MVT::v4f16:
case MVT::v4bf16:
case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
- // This is a "native" vector type
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
break;
}
@@ -3206,6 +3219,33 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
EVT EltVT = ValVT.getVectorElementType();
unsigned NumElts = ValVT.getVectorNumElements();
+ if (UpsizeElementTypes) {
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected Vector Type");
+ case MVT::v8i8: // <2 x i8x4>
+ NumElts = 2;
+ EltVT = MVT::v4i8;
+ break;
+ case MVT::v8f16: // <4 x f16x2>
+ NumElts = 4;
+ EltVT = MVT::v2f16;
+ break;
+ case MVT::v8bf16: // <4 x bf16x2>
+ NumElts = 4;
+ EltVT = MVT::v2bf16;
+ break;
+ case MVT::v8i16: // <4 x i16x2>
+ NumElts = 4;
+ EltVT = MVT::v2i16;
+ break;
+ case MVT::v16i8: // <4 x i8x4>
+ NumElts = 4;
+ EltVT = MVT::v4i8;
+ break;
+ }
+ }
+
// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// stored type to i16 and propagate the "real" type as the memory type.
@@ -3213,7 +3253,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
if (EltVT.getSizeInBits() < 16)
NeedExt = true;
- bool StoreF16x2 = false;
switch (NumElts) {
default:
return SDValue();
@@ -3223,14 +3262,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case 4:
Opcode = NVPTXISD::StoreV4;
break;
- case 8:
- // v8f16 is a special case. PTX doesn't have st.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // store them with st.v4.b32.
- assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
- Opcode = NVPTXISD::StoreV4;
- StoreF16x2 = true;
- break;
}
SmallVector<SDValue, 8> Ops;
@@ -3238,17 +3269,23 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// First is the chain
Ops.push_back(N->getOperand(0));
- if (StoreF16x2) {
- // Combine f16,f16 -> v2f16
- NumElts /= 2;
+ if (UpsizeElementTypes) {
+ // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
+ // stored as b32s
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i * 2, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i * 2 + 1, DL));
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
- SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
- Ops.push_back(V2);
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
+ SDValue E = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getVectorElementType(), Val,
+ DAG.getIntPtrConstant(i * NumEltsPerSubVector + j, DL));
+ Elts.push_back(E);
+ }
+ EVT VecVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT.getVectorElementType(),
+ NumEltsPerSubVector);
+ SDValue SubVector = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elts);
+ Ops.push_back(SubVector);
}
} else {
// Then the split values
@@ -6136,49 +6173,6 @@ static SDValue PerformVSELECTCombine(SDNode *N,
return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}
-static SDValue PerformLOADCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
- // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
- // letting ReplaceLoadVector split it into smaller loads during legalization.
- // This is done at dag-combine1 time, so that vector operations with i8
- // elements can be optimised away instead of being needlessly split during
- // legalization, which involves storing to the stack and loading it back.
- EVT VT = N->getValueType(0);
- bool CorrectlyAligned =
- DCI.DAG.getTargetLoweringInfo().allowsMemoryAccessForAlignment(
- *DAG.getContext(), DAG.getDataLayout(), LD->getMemoryVT(),
- *LD->getMemOperand());
- if (!(VT == MVT::v16i8 && CorrectlyAligned))
- return SDValue();
-
- SDLoc DL(N);
-
- // Create a v4i32 vector load operation, effectively <4 x v4i8>.
- unsigned Opc = NVPTXISD::LoadV4;
- EVT NewVT = MVT::v4i32;
- EVT EltVT = NewVT.getVectorElementType();
- unsigned NumElts = NewVT.getVectorNumElements();
- EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
- SDVTList RetVTList = DAG.getVTList(RetVTs);
- SmallVector<SDValue, 8> Ops(N->ops());
- Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
- SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
- LD->getMemOperand());
- SDValue NewChain = NewLoad.getValue(NumElts);
-
- // Create a vector of the same type returned by the original load.
- SmallVector<SDValue, 4> Elts;
- for (unsigned i = 0; i < NumElts; i++)
- Elts.push_back(NewLoad.getValue(i));
- return DCI.DAG.getMergeValues(
- {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
- NewChain},
- DL);
-}
-
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6199,8 +6193,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformREMCombine(N, DCI, OptLevel);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
- case ISD::LOAD:
- return PerformLOADCombine(N, DCI);
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
@@ -6247,6 +6239,13 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than ld.v4 for those element types.
+ // Here in Type Legalization, rather than splitting those vectors into
+ // multiple loads, we split the vector into v2x16/v4i8 chunks. Later, in
+ // Instruction Selection, we lower to PTX as vector loads of b32.
+ bool UpsizeElementTypes = false;
+
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 loads of <2 x double> here
// but I'm leaving that as a TODO for now.
@@ -6267,10 +6266,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
case MVT::v4f16:
case MVT::v4bf16:
case MVT::v4f32:
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
- // This is a "native" vector type
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
break;
}
@@ -6292,6 +6296,33 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
EVT EltVT = ResVT.getVectorElementType();
unsigned NumElts = ResVT.getVectorNumElements();
+ if (UpsizeElementTypes) {
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected Vector Type");
+ case MVT::v8i8: // <2 x i8x4>
+ NumElts = 2;
+ EltVT = MVT::v4i8;
+ break;
+ case MVT::v8f16: // <4 x f16x2>
+ NumElts = 4;
+ EltVT = MVT::v2f16;
+ break;
+ case MVT::v8bf16: // <4 x bf16x2>
+ NumElts = 4;
+ EltVT = MVT::v2bf16;
+ break;
+ case MVT::v8i16: // <4 x i16x2>
+ NumElts = 4;
+ EltVT = MVT::v2i16;
+ break;
+ case MVT::v16i8: // <4 x i8x4>
+ NumElts = 4;
+ EltVT = MVT::v4i8;
+ break;
+ }
+ }
+
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// loaded type to i16 and propagate the "real" type as the memory type.
@@ -6303,7 +6334,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = 0;
SDVTList LdResVTs;
- bool Load16x2 = false;
switch (NumElts) {
default:
@@ -6318,31 +6348,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LdResVTs = DAG.getVTList(ListVTs);
break;
}
- case 8: {
- // v8f16 is a special case. PTX doesn't have ld.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // load them with ld.v4.b32.
- assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
- Load16x2 = true;
- Opcode = NVPTXISD::LoadV4;
- EVT VVT;
- switch (EltVT.getSimpleVT().SimpleTy) {
- case MVT::f16:
- VVT = MVT::v2f16;
- break;
- case MVT::bf16:
- VVT = MVT::v2bf16;
- break;
- case MVT::i16:
- VVT = MVT::v2i16;
- break;
- default:
- llvm_unreachable("Unsupported v8 vector type.");
- }
- EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
}
// Copy regular operands
@@ -6357,17 +6362,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LD->getMemOperand());
SmallVector<SDValue, 8> ScalarRes;
- if (Load16x2) {
- // Split v2f16 subvectors back into individual elements.
- NumElts /= 2;
+ if (UpsizeElementTypes) {
+ // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+ // into individual elements.
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
SDValue SubVector = NewLD.getValue(i);
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(0, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(1, DL));
- ScalarRes.push_back(E0);
- ScalarRes.push_back(E1);
+ for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
+ SDValue E =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getScalarType(),
+ SubVector, DAG.getIntPtrConstant(j, DL));
+ ScalarRes.push_back(E);
+ }
}
} else {
for (unsigned i = 0; i < NumElts; ++i) {
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 028fab7ae54d6a4..e46657e4a582f31 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -172,30 +172,34 @@ define float @ff(ptr %p) {
define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v16i8(
; ENABLED: {
-; ENABLED-NEXT: .reg .b32 %r<40>;
+; ENABLED-NEXT: .reg .b32 %r<36>;
; ENABLED-NEXT: .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
-; ENABLED-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r21, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r22, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r23, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r24, %r4, 24, 8;
-; ENABLED-NEXT: add.s32 %r25, %r9, %r10;
+; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
+; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r9;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r10;
; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
@@ -206,11 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT: add.s32 %r36, %r35, %r21;
-; ENABLED-NEXT: add.s32 %r37, %r36, %r22;
-; ENABLED-NEXT: add.s32 %r38, %r37, %r23;
-; ENABLED-NEXT: add.s32 %r39, %r38, %r24;
-; ENABLED-NEXT: st.u32 [%rd2], %r39;
+; ENABLED-NEXT: st.u32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
; DISABLED-LABEL: combine_v16i8(
@@ -328,27 +328,25 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
-; ENABLED-NEXT: ld.u32 %r1, [%rd1+4];
-; ENABLED-NEXT: ld.u32 %r2, [%rd1];
+; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT: ld.u32 %r3, [%rd1+12];
-; ENABLED-NEXT: ld.u32 %r4, [%rd1+8];
-; ENABLED-NEXT: bfe.u32 %r5, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r6, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r7, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r8, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r4, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1+8];
+; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
+; ...
[truncated]
``````````
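The store path mirrors the load path; a sketch of what the upsizing in LowerSTOREVector means for a `v16i8` store (the expected PTX in the comment is illustrative):

```llvm
; Sketch only: <16 x i8> is split into four v4i8 chunks during type
; legalization and stored with a single b32 vector store, e.g.:
;   st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
define void @store_v16i8(<16 x i8> %v, ptr %p) {
  store <16 x i8> %v, ptr %p, align 16
  ret void
}
```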
https://github.com/llvm/llvm-project/pull/119622