[llvm] [NVPTX] Generalize and extend upsizing when lowering 8/16-bit-element vector loads/stores (PR #119622)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 14:17:26 PST 2024
https://github.com/dakersnar created https://github.com/llvm/llvm-project/pull/119622
This addresses the following issue I opened: https://github.com/llvm/llvm-project/issues/118851.
This change generalizes the Type Legalization mechanism that currently handles v8[i/f/bf]16 upsizing to also cover loads and stores of v8i8 and v16i8, allowing all of the mentioned vectors to be lowered to PTX as vectors of b32. This extension also lets us remove the DAGCombine that handled only v16i8 loads, centralizing all of the upsizing logic in one place.
Test changes include adding v8i8, v16i8, and v8i16 cases to load-store.ll, and updating the CHECK lines of other tests to match the improved codegen.
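For example, here is the effect on the <8 x i8> load/store round trip in test_ldst_v8i8 (taken from the i8x4-instructions.ll changes in this patch). The IR:

  %t1 = load <8 x i8>, ptr %a
  store <8 x i8> %t1, ptr %b, align 16

previously lowered to two scalar 32-bit accesses:

  ld.u32 %r1, [%rd1];
  ld.u32 %r2, [%rd1+4];
  st.u32 [%rd2+4], %r2;
  st.u32 [%rd2], %r1;

and now lowers to a single vectorized access of two b32 elements:

  ld.v2.b32 {%r1, %r2}, [%rd1];
  st.v2.b32 [%rd2], {%r1, %r2};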
From 87c3df9e2e05416992b68f519cc21a67bf6304e5 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 11 Dec 2024 21:41:44 +0000
Subject: [PATCH] [NVPTX] Generalize and extend upsizing when lowering
8-and-16-bit-element vector loads/stores
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 22 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 214 +--
.../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 86 +-
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 6 +-
llvm/test/CodeGen/NVPTX/load-store.ll | 1474 ++++++++++++++++-
.../CodeGen/NVPTX/shuffle-vec-undef-init.ll | 18 +-
llvm/test/CodeGen/NVPTX/vector-stores.ll | 4 +-
7 files changed, 1649 insertions(+), 175 deletions(-)
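For reference, the upsizing introduced here covers the following mapping from vector type to legalized subvector chunks and the resulting PTX access (collected from the switch statements added in the patch below):

  v8i8   -> 2 x v4i8   -> ld/st.v2.b32
  v16i8  -> 4 x v4i8   -> ld/st.v4.b32
  v8f16  -> 4 x v2f16  -> ld/st.v4.b32
  v8bf16 -> 4 x v2bf16 -> ld/st.v4.b32
  v8i16  -> 4 x v2i16  -> ld/st.v4.b32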
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index e1fb2d7fcee0309..8536be18b89e01b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1400,11 +1400,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
- // v8x16 is a special case. PTX doesn't have ld.v8.16
- // instruction. Instead, we split the vector into v2x16 chunks and
- // load them with ld.v4.b32.
- if (Isv2x16VT(EltVT)) {
- assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than ld.v4 for those element types.
+ // In Type Legalization, rather than splitting those vectors into multiple
+ // loads, we split the vector into v2x16/v4i8 chunks. Now, we lower to PTX as
+ // vector loads of b32.
+ if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
EltVT = MVT::i32;
FromType = NVPTX::PTXLdStInstCode::Untyped;
FromTypeWidth = 32;
@@ -2084,11 +2085,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return false;
}
- // v8x16 is a special case. PTX doesn't have st.v8.x16
- // instruction. Instead, we split the vector into v2x16 chunks and
- // store them with st.v4.b32.
- if (Isv2x16VT(EltVT)) {
- assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than st.v4 for those element types.
+ // In Type Legalization, rather than splitting those vectors into multiple
+ // stores, we split the vector into v2x16/v4i8 chunks. Now, we lower to
+ // PTX as vector stores of b32.
+ if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
EltVT = MVT::i32;
ToType = NVPTX::PTXLdStInstCode::Untyped;
ToTypeWidth = 32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 62647b312851886..68d6edda8dddfd8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -136,6 +136,8 @@ static bool IsPTXVectorType(MVT VT) {
case MVT::v4i1:
case MVT::v2i8:
case MVT::v4i8:
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v16i8: // <4 x i8x4>
case MVT::v2i16:
case MVT::v4i16:
case MVT::v8i16: // <4 x i16x2>
@@ -761,8 +763,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
- ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::VSELECT});
+ ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -3157,6 +3158,13 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than st.v4 for those element types.
+ // Here in Type Legalization, rather than splitting those vectors into
+ // multiple stores, we split the vector into v2x16/v4i8 chunks. Later, in
+ // Instruction Selection, we lower to PTX as vector stores of b32.
+ bool UpsizeElementTypes = false;
+
if (ValVT.isVector()) {
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
@@ -3180,10 +3188,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case MVT::v4f16:
case MVT::v4bf16:
case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
- // This is a "native" vector type
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
break;
}
@@ -3206,6 +3219,33 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
EVT EltVT = ValVT.getVectorElementType();
unsigned NumElts = ValVT.getVectorNumElements();
+ if (UpsizeElementTypes) {
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected Vector Type");
+ case MVT::v8i8: // <2 x i8x4>
+ NumElts = 2;
+ EltVT = MVT::v4i8;
+ break;
+ case MVT::v8f16: // <4 x f16x2>
+ NumElts = 4;
+ EltVT = MVT::v2f16;
+ break;
+ case MVT::v8bf16: // <4 x bf16x2>
+ NumElts = 4;
+ EltVT = MVT::v2bf16;
+ break;
+ case MVT::v8i16: // <4 x i16x2>
+ NumElts = 4;
+ EltVT = MVT::v2i16;
+ break;
+ case MVT::v16i8: // <4 x i8x4>
+ NumElts = 4;
+ EltVT = MVT::v4i8;
+ break;
+ }
+ }
+
// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// stored type to i16 and propagate the "real" type as the memory type.
@@ -3213,7 +3253,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
if (EltVT.getSizeInBits() < 16)
NeedExt = true;
- bool StoreF16x2 = false;
switch (NumElts) {
default:
return SDValue();
@@ -3223,14 +3262,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case 4:
Opcode = NVPTXISD::StoreV4;
break;
- case 8:
- // v8f16 is a special case. PTX doesn't have st.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // store them with st.v4.b32.
- assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
- Opcode = NVPTXISD::StoreV4;
- StoreF16x2 = true;
- break;
}
SmallVector<SDValue, 8> Ops;
@@ -3238,17 +3269,23 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// First is the chain
Ops.push_back(N->getOperand(0));
- if (StoreF16x2) {
- // Combine f16,f16 -> v2f16
- NumElts /= 2;
+ if (UpsizeElementTypes) {
+ // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
+ // stored as b32s
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i * 2, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i * 2 + 1, DL));
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
- SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
- Ops.push_back(V2);
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
+ SDValue E = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getVectorElementType(), Val,
+ DAG.getIntPtrConstant(i * NumEltsPerSubVector + j, DL));
+ Elts.push_back(E);
+ }
+ EVT VecVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT.getVectorElementType(),
+ NumEltsPerSubVector);
+ SDValue SubVector = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elts);
+ Ops.push_back(SubVector);
}
} else {
// Then the split values
@@ -6136,49 +6173,6 @@ static SDValue PerformVSELECTCombine(SDNode *N,
return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}
-static SDValue PerformLOADCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
- // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
- // letting ReplaceLoadVector split it into smaller loads during legalization.
- // This is done at dag-combine1 time, so that vector operations with i8
- // elements can be optimised away instead of being needlessly split during
- // legalization, which involves storing to the stack and loading it back.
- EVT VT = N->getValueType(0);
- bool CorrectlyAligned =
- DCI.DAG.getTargetLoweringInfo().allowsMemoryAccessForAlignment(
- *DAG.getContext(), DAG.getDataLayout(), LD->getMemoryVT(),
- *LD->getMemOperand());
- if (!(VT == MVT::v16i8 && CorrectlyAligned))
- return SDValue();
-
- SDLoc DL(N);
-
- // Create a v4i32 vector load operation, effectively <4 x v4i8>.
- unsigned Opc = NVPTXISD::LoadV4;
- EVT NewVT = MVT::v4i32;
- EVT EltVT = NewVT.getVectorElementType();
- unsigned NumElts = NewVT.getVectorNumElements();
- EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
- SDVTList RetVTList = DAG.getVTList(RetVTs);
- SmallVector<SDValue, 8> Ops(N->ops());
- Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
- SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
- LD->getMemOperand());
- SDValue NewChain = NewLoad.getValue(NumElts);
-
- // Create a vector of the same type returned by the original load.
- SmallVector<SDValue, 4> Elts;
- for (unsigned i = 0; i < NumElts; i++)
- Elts.push_back(NewLoad.getValue(i));
- return DCI.DAG.getMergeValues(
- {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
- NewChain},
- DL);
-}
-
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6199,8 +6193,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformREMCombine(N, DCI, OptLevel);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
- case ISD::LOAD:
- return PerformLOADCombine(N, DCI);
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
@@ -6247,6 +6239,13 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than ld.v4 for those element types.
+ // Here in Type Legalization, rather than splitting those vectors into
+ // multiple loads, we split the vector into v2x16/v4i8 chunks. Later, in
+ // Instruction Selection, we lower to PTX as vector loads of b32.
+ bool UpsizeElementTypes = false;
+
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 loads of <2 x double> here
// but I'm leaving that as a TODO for now.
@@ -6267,10 +6266,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
case MVT::v4f16:
case MVT::v4bf16:
case MVT::v4f32:
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
- // This is a "native" vector type
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
break;
}
@@ -6292,6 +6296,33 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
EVT EltVT = ResVT.getVectorElementType();
unsigned NumElts = ResVT.getVectorNumElements();
+ if (UpsizeElementTypes) {
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected Vector Type");
+ case MVT::v8i8: // <2 x i8x4>
+ NumElts = 2;
+ EltVT = MVT::v4i8;
+ break;
+ case MVT::v8f16: // <4 x f16x2>
+ NumElts = 4;
+ EltVT = MVT::v2f16;
+ break;
+ case MVT::v8bf16: // <4 x bf16x2>
+ NumElts = 4;
+ EltVT = MVT::v2bf16;
+ break;
+ case MVT::v8i16: // <4 x i16x2>
+ NumElts = 4;
+ EltVT = MVT::v2i16;
+ break;
+ case MVT::v16i8: // <4 x i8x4>
+ NumElts = 4;
+ EltVT = MVT::v4i8;
+ break;
+ }
+ }
+
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// loaded type to i16 and propagate the "real" type as the memory type.
@@ -6303,7 +6334,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = 0;
SDVTList LdResVTs;
- bool Load16x2 = false;
switch (NumElts) {
default:
@@ -6318,31 +6348,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LdResVTs = DAG.getVTList(ListVTs);
break;
}
- case 8: {
- // v8f16 is a special case. PTX doesn't have ld.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // load them with ld.v4.b32.
- assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
- Load16x2 = true;
- Opcode = NVPTXISD::LoadV4;
- EVT VVT;
- switch (EltVT.getSimpleVT().SimpleTy) {
- case MVT::f16:
- VVT = MVT::v2f16;
- break;
- case MVT::bf16:
- VVT = MVT::v2bf16;
- break;
- case MVT::i16:
- VVT = MVT::v2i16;
- break;
- default:
- llvm_unreachable("Unsupported v8 vector type.");
- }
- EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
}
// Copy regular operands
@@ -6357,17 +6362,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LD->getMemOperand());
SmallVector<SDValue, 8> ScalarRes;
- if (Load16x2) {
- // Split v2f16 subvectors back into individual elements.
- NumElts /= 2;
+ if (UpsizeElementTypes) {
+ // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+ // into individual elements.
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
SDValue SubVector = NewLD.getValue(i);
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(0, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(1, DL));
- ScalarRes.push_back(E0);
- ScalarRes.push_back(E1);
+ for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
+ SDValue E =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getScalarType(),
+ SubVector, DAG.getIntPtrConstant(j, DL));
+ ScalarRes.push_back(E);
+ }
}
} else {
for (unsigned i = 0; i < NumElts; ++i) {
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 028fab7ae54d6a4..e46657e4a582f31 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -172,30 +172,34 @@ define float @ff(ptr %p) {
define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v16i8(
; ENABLED: {
-; ENABLED-NEXT: .reg .b32 %r<40>;
+; ENABLED-NEXT: .reg .b32 %r<36>;
; ENABLED-NEXT: .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
-; ENABLED-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r21, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r22, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r23, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r24, %r4, 24, 8;
-; ENABLED-NEXT: add.s32 %r25, %r9, %r10;
+; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
+; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r9;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r10;
; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
@@ -206,11 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT: add.s32 %r36, %r35, %r21;
-; ENABLED-NEXT: add.s32 %r37, %r36, %r22;
-; ENABLED-NEXT: add.s32 %r38, %r37, %r23;
-; ENABLED-NEXT: add.s32 %r39, %r38, %r24;
-; ENABLED-NEXT: st.u32 [%rd2], %r39;
+; ENABLED-NEXT: st.u32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
; DISABLED-LABEL: combine_v16i8(
@@ -328,27 +328,25 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
-; ENABLED-NEXT: ld.u32 %r1, [%rd1+4];
-; ENABLED-NEXT: ld.u32 %r2, [%rd1];
+; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT: ld.u32 %r3, [%rd1+12];
-; ENABLED-NEXT: ld.u32 %r4, [%rd1+8];
-; ENABLED-NEXT: bfe.u32 %r5, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r6, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r7, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r8, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r4, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1+8];
+; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index c143d7674a7923a..3853ec5c4151a43 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -809,10 +809,8 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
-; CHECK-NEXT: ld.u32 %r1, [%rd1];
-; CHECK-NEXT: ld.u32 %r2, [%rd1+4];
-; CHECK-NEXT: st.u32 [%rd2+4], %r2;
-; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2};
; CHECK-NEXT: ret;
%t1 = load <8 x i8>, ptr %a
store <8 x i8> %t1, ptr %b, align 16
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index cd35949ab290a2c..82991b4c8d6ceb5 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
-; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+; TODO: add i1 and <6 x i8> vector tests.
; TODO: add test for vectors that exceed 128-bit length
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
@@ -194,6 +194,156 @@ define void @generic_4xi8(ptr %a) {
ret void
}
+define void @generic_8xi8(ptr %a) {
+; CHECK-LABEL: generic_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi8_param_0];
+; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_16xi8(ptr %a) {
+; CHECK-LABEL: generic_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi8_param_0];
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr %a
+ ret void
+}
+
define void @generic_2xi16(ptr %a) {
; CHECK-LABEL: generic_2xi16(
; CHECK: {
@@ -237,6 +387,40 @@ define void @generic_4xi16(ptr %a) {
ret void
}
+define void @generic_8xi16(ptr %a) {
+; CHECK-LABEL: generic_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi16_param_0];
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr %a
+ ret void
+}
+
define void @generic_2xi32(ptr %a) {
; CHECK-LABEL: generic_2xi32(
; CHECK: {
@@ -538,6 +722,156 @@ define void @generic_volatile_4xi8(ptr %a) {
ret void
}
+define void @generic_volatile_8xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_16xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr %a
+ ret void
+}
+
define void @generic_volatile_2xi16(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi16(
; CHECK: {
@@ -581,6 +915,40 @@ define void @generic_volatile_4xi16(ptr %a) {
ret void
}
+define void @generic_volatile_8xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr %a
+ ret void
+}
+
define void @generic_volatile_2xi32(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi32(
; CHECK: {
@@ -1443,6 +1811,156 @@ define void @global_4xi8(ptr addrspace(1) %a) {
ret void
}
+define void @global_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0];
+; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr addrspace(1) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_16xi8_param_0];
+; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr addrspace(1) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_2xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi16(
; CHECK: {
@@ -1486,6 +2004,40 @@ define void @global_4xi16(ptr addrspace(1) %a) {
ret void
}
+define void @global_8xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi16_param_0];
+; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr addrspace(1) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_2xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi32(
; CHECK: {
@@ -1768,6 +2320,156 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
ret void
}
+define void @global_volatile_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.global.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr addrspace(1) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr addrspace(1) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_volatile_2xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi16(
; CHECK: {
@@ -1811,6 +2513,40 @@ define void @global_volatile_4xi16(ptr addrspace(1) %a) {
ret void
}
+define void @global_volatile_8xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr addrspace(1) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_volatile_2xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi32(
; CHECK: {
@@ -2815,6 +3551,156 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
ret void
}
+define void @shared_8xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi8_param_0];
+; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.shared.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr addrspace(3) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_16xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi8_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr addrspace(3) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_2xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi16(
; CHECK: {
@@ -2858,6 +3744,40 @@ define void @shared_4xi16(ptr addrspace(3) %a) {
ret void
}
+define void @shared_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi16_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr addrspace(3) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_2xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi32(
; CHECK: {
@@ -3140,6 +4060,156 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
ret void
}
+define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr addrspace(3) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi16(
; CHECK: {
@@ -3183,6 +4253,40 @@ define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
ret void
}
+define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi32(
; CHECK: {
@@ -4045,6 +5149,156 @@ define void @local_4xi8(ptr addrspace(5) %a) {
ret void
}
+define void @local_8xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi8_param_0];
+; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr addrspace(5) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_16xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi8_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr addrspace(5) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_2xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi16(
; CHECK: {
@@ -4088,6 +5342,40 @@ define void @local_4xi16(ptr addrspace(5) %a) {
ret void
}
+define void @local_8xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi16_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr addrspace(5) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_2xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi32(
; CHECK: {
@@ -4370,6 +5658,156 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
ret void
}
+define void @local_volatile_8xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr addrspace(5) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_16xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr addrspace(5) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_volatile_2xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi16(
; CHECK: {
@@ -4413,6 +5851,40 @@ define void @local_volatile_4xi16(ptr addrspace(5) %a) {
ret void
}
+define void @local_volatile_8xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr addrspace(5) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_volatile_2xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi32(
; CHECK: {
diff --git a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
index 4c7a51b70bc33a9..d5043c2c3047c10 100644
--- a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
+++ b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
@@ -6,19 +6,17 @@ target triple = "nvptx64-unknown-unknown"
define void @kernel_func(ptr %in.vec, ptr %out.vec0) nounwind {
; CHECK-LABEL: kernel_func(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<14>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [kernel_func_param_0];
-; CHECK-NEXT: ld.u32 %r2, [%r1+8];
-; CHECK-NEXT: ld.u32 %r3, [%r1];
-; CHECK-NEXT: ld.u32 %r4, [%r1+24];
-; CHECK-NEXT: ld.u32 %r5, [%r1+16];
-; CHECK-NEXT: ld.param.u32 %r6, [kernel_func_param_1];
-; CHECK-NEXT: prmt.b32 %r7, %r5, %r4, 0x4000U;
-; CHECK-NEXT: prmt.b32 %r8, %r3, %r2, 0x40U;
-; CHECK-NEXT: prmt.b32 %r9, %r8, %r7, 0x7610U;
-; CHECK-NEXT: st.u32 [%r6], %r9;
+; CHECK-NEXT: ld.v4.b32 {%r2, %r3, %r4, %r5}, [%r1];
+; CHECK-NEXT: ld.v4.b32 {%r6, %r7, %r8, %r9}, [%r1+16];
+; CHECK-NEXT: ld.param.u32 %r10, [kernel_func_param_1];
+; CHECK-NEXT: prmt.b32 %r11, %r6, %r8, 0x4000U;
+; CHECK-NEXT: prmt.b32 %r12, %r2, %r4, 0x40U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x7610U;
+; CHECK-NEXT: st.u32 [%r10], %r13;
; CHECK-NEXT: ret;
%wide.vec = load <32 x i8>, ptr %in.vec, align 64
%vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index df14553a7720576..c914aa6e24082d1 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -31,8 +31,8 @@ define void @foo4(<4 x i32> %val, ptr %ptr) {
; CHECK-LABEL: .visible .func v16i8
define void @v16i8(ptr %a, ptr %b) {
-; CHECK: ld.v4.u32
-; CHECK: st.v4.u32
+; CHECK: ld.v4.b32
+; CHECK: st.v4.b32
%v = load <16 x i8>, ptr %a
store <16 x i8> %v, ptr %b
ret void