[llvm] [NVPTX] Generalize and extend upsizing when lowering 8/16-bit-element vector loads/stores (PR #119622)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 16 15:39:24 PST 2024
https://github.com/dakersnar updated https://github.com/llvm/llvm-project/pull/119622
From 87c3df9e2e05416992b68f519cc21a67bf6304e5 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 11 Dec 2024 21:41:44 +0000
Subject: [PATCH 01/11] [NVPTX] Generalize and extend upsizing when lowering
8-and-16-bit-element vector loads/stores
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 22 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 214 +--
.../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 86 +-
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 6 +-
llvm/test/CodeGen/NVPTX/load-store.ll | 1474 ++++++++++++++++-
.../CodeGen/NVPTX/shuffle-vec-undef-init.ll | 18 +-
llvm/test/CodeGen/NVPTX/vector-stores.ll | 4 +-
7 files changed, 1649 insertions(+), 175 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index e1fb2d7fcee030..8536be18b89e01 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1400,11 +1400,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
- // v8x16 is a special case. PTX doesn't have ld.v8.16
- // instruction. Instead, we split the vector into v2x16 chunks and
- // load them with ld.v4.b32.
- if (Isv2x16VT(EltVT)) {
- assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than ld.v4 for those element types.
+ // In Type Legalization, rather than splitting those vectors into multiple
+ // loads, we split the vector into v2x16/v4i8 chunks. Now, we lower to PTX as
+ // vector loads of b32.
+ if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
EltVT = MVT::i32;
FromType = NVPTX::PTXLdStInstCode::Untyped;
FromTypeWidth = 32;
@@ -2084,11 +2085,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return false;
}
- // v8x16 is a special case. PTX doesn't have st.v8.x16
- // instruction. Instead, we split the vector into v2x16 chunks and
- // store them with st.v4.b32.
- if (Isv2x16VT(EltVT)) {
- assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than st.v4 for those element types.
+ // In Type Legalization, rather than splitting those vectors into multiple
+ // stores, we split the vector into v2x16/v4i8 chunks. Now, we lower to
+ // PTX as vector stores of b32.
+ if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
EltVT = MVT::i32;
ToType = NVPTX::PTXLdStInstCode::Untyped;
ToTypeWidth = 32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 62647b31285188..68d6edda8dddfd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -136,6 +136,8 @@ static bool IsPTXVectorType(MVT VT) {
case MVT::v4i1:
case MVT::v2i8:
case MVT::v4i8:
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v16i8: // <4 x i8x4>
case MVT::v2i16:
case MVT::v4i16:
case MVT::v8i16: // <4 x i16x2>
@@ -761,8 +763,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
- ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::VSELECT});
+ ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -3157,6 +3158,13 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than st.v4 for those element types.
+ // Here in Type Legalization, rather than splitting those vectors into
+ // multiple stores, we split the vector into v2x16/v4i8 chunks. Later, in
+ // Instruction Selection, we lower to PTX as vector stores of b32.
+ bool UpsizeElementTypes = false;
+
if (ValVT.isVector()) {
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
@@ -3180,10 +3188,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case MVT::v4f16:
case MVT::v4bf16:
case MVT::v4f32:
- case MVT::v8f16: // <4 x f16x2>
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
- // This is a "native" vector type
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
break;
}
@@ -3206,6 +3219,33 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
EVT EltVT = ValVT.getVectorElementType();
unsigned NumElts = ValVT.getVectorNumElements();
+ if (UpsizeElementTypes) {
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected Vector Type");
+ case MVT::v8i8: // <2 x i8x4>
+ NumElts = 2;
+ EltVT = MVT::v4i8;
+ break;
+ case MVT::v8f16: // <4 x f16x2>
+ NumElts = 4;
+ EltVT = MVT::v2f16;
+ break;
+ case MVT::v8bf16: // <4 x bf16x2>
+ NumElts = 4;
+ EltVT = MVT::v2bf16;
+ break;
+ case MVT::v8i16: // <4 x i16x2>
+ NumElts = 4;
+ EltVT = MVT::v2i16;
+ break;
+ case MVT::v16i8: // <4 x i8x4>
+ NumElts = 4;
+ EltVT = MVT::v4i8;
+ break;
+ }
+ }
+
// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// stored type to i16 and propagate the "real" type as the memory type.
@@ -3213,7 +3253,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
if (EltVT.getSizeInBits() < 16)
NeedExt = true;
- bool StoreF16x2 = false;
switch (NumElts) {
default:
return SDValue();
@@ -3223,14 +3262,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case 4:
Opcode = NVPTXISD::StoreV4;
break;
- case 8:
- // v8f16 is a special case. PTX doesn't have st.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // store them with st.v4.b32.
- assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
- Opcode = NVPTXISD::StoreV4;
- StoreF16x2 = true;
- break;
}
SmallVector<SDValue, 8> Ops;
@@ -3238,17 +3269,23 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// First is the chain
Ops.push_back(N->getOperand(0));
- if (StoreF16x2) {
- // Combine f16,f16 -> v2f16
- NumElts /= 2;
+ if (UpsizeElementTypes) {
+ // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
+ // stored as b32s
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i * 2, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i * 2 + 1, DL));
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
- SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
- Ops.push_back(V2);
+ SmallVector<SDValue, 8> Elts;
+ for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
+ SDValue E = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getVectorElementType(), Val,
+ DAG.getIntPtrConstant(i * NumEltsPerSubVector + j, DL));
+ Elts.push_back(E);
+ }
+ EVT VecVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT.getVectorElementType(),
+ NumEltsPerSubVector);
+ SDValue SubVector = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elts);
+ Ops.push_back(SubVector);
}
} else {
// Then the split values
@@ -6136,49 +6173,6 @@ static SDValue PerformVSELECTCombine(SDNode *N,
return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}
-static SDValue PerformLOADCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
- // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
- // letting ReplaceLoadVector split it into smaller loads during legalization.
- // This is done at dag-combine1 time, so that vector operations with i8
- // elements can be optimised away instead of being needlessly split during
- // legalization, which involves storing to the stack and loading it back.
- EVT VT = N->getValueType(0);
- bool CorrectlyAligned =
- DCI.DAG.getTargetLoweringInfo().allowsMemoryAccessForAlignment(
- *DAG.getContext(), DAG.getDataLayout(), LD->getMemoryVT(),
- *LD->getMemOperand());
- if (!(VT == MVT::v16i8 && CorrectlyAligned))
- return SDValue();
-
- SDLoc DL(N);
-
- // Create a v4i32 vector load operation, effectively <4 x v4i8>.
- unsigned Opc = NVPTXISD::LoadV4;
- EVT NewVT = MVT::v4i32;
- EVT EltVT = NewVT.getVectorElementType();
- unsigned NumElts = NewVT.getVectorNumElements();
- EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
- SDVTList RetVTList = DAG.getVTList(RetVTs);
- SmallVector<SDValue, 8> Ops(N->ops());
- Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
- SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
- LD->getMemOperand());
- SDValue NewChain = NewLoad.getValue(NumElts);
-
- // Create a vector of the same type returned by the original load.
- SmallVector<SDValue, 4> Elts;
- for (unsigned i = 0; i < NumElts; i++)
- Elts.push_back(NewLoad.getValue(i));
- return DCI.DAG.getMergeValues(
- {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
- NewChain},
- DL);
-}
-
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6199,8 +6193,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformREMCombine(N, DCI, OptLevel);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
- case ISD::LOAD:
- return PerformLOADCombine(N, DCI);
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
@@ -6247,6 +6239,13 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
+ // Vectors of 8-and-16-bit elements above a certain size are special cases.
+ // PTX doesn't have anything larger than ld.v4 for those element types.
+ // Here in Type Legalization, rather than splitting those vectors into
+ // multiple loads, we split the vector into v2x16/v4i8 chunks. Later, in
+ // Instruction Selection, we lower to PTX as vector loads of b32.
+ bool UpsizeElementTypes = false;
+
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 loads of <2 x double> here
// but I'm leaving that as a TODO for now.
@@ -6267,10 +6266,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
case MVT::v4f16:
case MVT::v4bf16:
case MVT::v4f32:
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
- // This is a "native" vector type
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
break;
}
@@ -6292,6 +6296,33 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
EVT EltVT = ResVT.getVectorElementType();
unsigned NumElts = ResVT.getVectorNumElements();
+ if (UpsizeElementTypes) {
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected Vector Type");
+ case MVT::v8i8: // <2 x i8x4>
+ NumElts = 2;
+ EltVT = MVT::v4i8;
+ break;
+ case MVT::v8f16: // <4 x f16x2>
+ NumElts = 4;
+ EltVT = MVT::v2f16;
+ break;
+ case MVT::v8bf16: // <4 x bf16x2>
+ NumElts = 4;
+ EltVT = MVT::v2bf16;
+ break;
+ case MVT::v8i16: // <4 x i16x2>
+ NumElts = 4;
+ EltVT = MVT::v2i16;
+ break;
+ case MVT::v16i8: // <4 x i8x4>
+ NumElts = 4;
+ EltVT = MVT::v4i8;
+ break;
+ }
+ }
+
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// loaded type to i16 and propagate the "real" type as the memory type.
@@ -6303,7 +6334,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = 0;
SDVTList LdResVTs;
- bool Load16x2 = false;
switch (NumElts) {
default:
@@ -6318,31 +6348,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LdResVTs = DAG.getVTList(ListVTs);
break;
}
- case 8: {
- // v8f16 is a special case. PTX doesn't have ld.v8.f16
- // instruction. Instead, we split the vector into v2f16 chunks and
- // load them with ld.v4.b32.
- assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
- Load16x2 = true;
- Opcode = NVPTXISD::LoadV4;
- EVT VVT;
- switch (EltVT.getSimpleVT().SimpleTy) {
- case MVT::f16:
- VVT = MVT::v2f16;
- break;
- case MVT::bf16:
- VVT = MVT::v2bf16;
- break;
- case MVT::i16:
- VVT = MVT::v2i16;
- break;
- default:
- llvm_unreachable("Unsupported v8 vector type.");
- }
- EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
}
// Copy regular operands
@@ -6357,17 +6362,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LD->getMemOperand());
SmallVector<SDValue, 8> ScalarRes;
- if (Load16x2) {
- // Split v2f16 subvectors back into individual elements.
- NumElts /= 2;
+ if (UpsizeElementTypes) {
+ // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+ // into individual elements.
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
SDValue SubVector = NewLD.getValue(i);
- SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(0, DL));
- SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
- DAG.getIntPtrConstant(1, DL));
- ScalarRes.push_back(E0);
- ScalarRes.push_back(E1);
+ for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
+ SDValue E =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getScalarType(),
+ SubVector, DAG.getIntPtrConstant(j, DL));
+ ScalarRes.push_back(E);
+ }
}
} else {
for (unsigned i = 0; i < NumElts; ++i) {
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 028fab7ae54d6a..e46657e4a582f3 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -172,30 +172,34 @@ define float @ff(ptr %p) {
define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v16i8(
; ENABLED: {
-; ENABLED-NEXT: .reg .b32 %r<40>;
+; ENABLED-NEXT: .reg .b32 %r<36>;
; ENABLED-NEXT: .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
-; ENABLED-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r21, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r22, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r23, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r24, %r4, 24, 8;
-; ENABLED-NEXT: add.s32 %r25, %r9, %r10;
+; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
+; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
+; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
+; ENABLED-NEXT: add.s32 %r24, %r23, %r9;
+; ENABLED-NEXT: add.s32 %r25, %r24, %r10;
; ENABLED-NEXT: add.s32 %r26, %r25, %r11;
; ENABLED-NEXT: add.s32 %r27, %r26, %r12;
; ENABLED-NEXT: add.s32 %r28, %r27, %r13;
@@ -206,11 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT: add.s32 %r36, %r35, %r21;
-; ENABLED-NEXT: add.s32 %r37, %r36, %r22;
-; ENABLED-NEXT: add.s32 %r38, %r37, %r23;
-; ENABLED-NEXT: add.s32 %r39, %r38, %r24;
-; ENABLED-NEXT: st.u32 [%rd2], %r39;
+; ENABLED-NEXT: st.u32 [%rd2], %r35;
; ENABLED-NEXT: ret;
;
; DISABLED-LABEL: combine_v16i8(
@@ -328,27 +328,25 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
-; ENABLED-NEXT: ld.u32 %r1, [%rd1+4];
-; ENABLED-NEXT: ld.u32 %r2, [%rd1];
+; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT: ld.u32 %r3, [%rd1+12];
-; ENABLED-NEXT: ld.u32 %r4, [%rd1+8];
-; ENABLED-NEXT: bfe.u32 %r5, %r2, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r6, %r2, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r7, %r2, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r8, %r2, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r13, %r4, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r14, %r4, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r15, %r4, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r16, %r4, 24, 8;
-; ENABLED-NEXT: bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT: bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT: bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT: bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1+8];
+; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT: bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT: bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT: bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT: bfe.u32 %r20, %r4, 24, 8;
; ENABLED-NEXT: add.s32 %r21, %r5, %r6;
; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
; ENABLED-NEXT: add.s32 %r23, %r22, %r8;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index c143d7674a7923..3853ec5c4151a4 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -809,10 +809,8 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
-; CHECK-NEXT: ld.u32 %r1, [%rd1];
-; CHECK-NEXT: ld.u32 %r2, [%rd1+4];
-; CHECK-NEXT: st.u32 [%rd2+4], %r2;
-; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2};
; CHECK-NEXT: ret;
%t1 = load <8 x i8>, ptr %a
store <8 x i8> %t1, ptr %b, align 16
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index cd35949ab290a2..82991b4c8d6ceb 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
-; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+; TODO: add i1 and <6 x i8> vector tests.
; TODO: add test for vectors that exceed 128-bit length
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
@@ -194,6 +194,156 @@ define void @generic_4xi8(ptr %a) {
ret void
}
+define void @generic_8xi8(ptr %a) {
+; CHECK-LABEL: generic_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi8_param_0];
+; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_16xi8(ptr %a) {
+; CHECK-LABEL: generic_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi8_param_0];
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr %a
+ ret void
+}
+
define void @generic_2xi16(ptr %a) {
; CHECK-LABEL: generic_2xi16(
; CHECK: {
@@ -237,6 +387,40 @@ define void @generic_4xi16(ptr %a) {
ret void
}
+define void @generic_8xi16(ptr %a) {
+; CHECK-LABEL: generic_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi16_param_0];
+; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr %a
+ ret void
+}
+
define void @generic_2xi32(ptr %a) {
; CHECK-LABEL: generic_2xi32(
; CHECK: {
@@ -538,6 +722,156 @@ define void @generic_volatile_4xi8(ptr %a) {
ret void
}
+define void @generic_volatile_8xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_16xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr %a
+ ret void
+}
+
define void @generic_volatile_2xi16(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi16(
; CHECK: {
@@ -581,6 +915,40 @@ define void @generic_volatile_4xi16(ptr %a) {
ret void
}
+define void @generic_volatile_8xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr %a
+ ret void
+}
+
define void @generic_volatile_2xi32(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi32(
; CHECK: {
@@ -1443,6 +1811,156 @@ define void @global_4xi8(ptr addrspace(1) %a) {
ret void
}
+define void @global_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0];
+; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr addrspace(1) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_16xi8_param_0];
+; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr addrspace(1) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_2xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi16(
; CHECK: {
@@ -1486,6 +2004,40 @@ define void @global_4xi16(ptr addrspace(1) %a) {
ret void
}
+define void @global_8xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi16_param_0];
+; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr addrspace(1) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_2xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi32(
; CHECK: {
@@ -1768,6 +2320,156 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
ret void
}
+define void @global_volatile_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.global.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr addrspace(1) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr addrspace(1) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_volatile_2xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi16(
; CHECK: {
@@ -1811,6 +2513,40 @@ define void @global_volatile_4xi16(ptr addrspace(1) %a) {
ret void
}
+define void @global_volatile_8xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr addrspace(1) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(1) %a
+ ret void
+}
+
define void @global_volatile_2xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi32(
; CHECK: {
@@ -2815,6 +3551,156 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
ret void
}
+define void @shared_8xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi8_param_0];
+; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.shared.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr addrspace(3) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_16xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi8_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr addrspace(3) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_2xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi16(
; CHECK: {
@@ -2858,6 +3744,40 @@ define void @shared_4xi16(ptr addrspace(3) %a) {
ret void
}
+define void @shared_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi16_param_0];
+; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr addrspace(3) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_2xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi32(
; CHECK: {
@@ -3140,6 +4060,156 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
ret void
}
+define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr addrspace(3) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi16(
; CHECK: {
@@ -3183,6 +4253,40 @@ define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
ret void
}
+define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(3) %a
+ ret void
+}
+
define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi32(
; CHECK: {
@@ -4045,6 +5149,156 @@ define void @local_4xi8(ptr addrspace(5) %a) {
ret void
}
+define void @local_8xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi8_param_0];
+; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i8>, ptr addrspace(5) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_16xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi8_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load <16 x i8>, ptr addrspace(5) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <16 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_2xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi16(
; CHECK: {
@@ -4088,6 +5342,40 @@ define void @local_4xi16(ptr addrspace(5) %a) {
ret void
}
+define void @local_8xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi16_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load <8 x i16>, ptr addrspace(5) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store <8 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_2xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi32(
; CHECK: {
@@ -4370,6 +5658,156 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
ret void
}
+define void @local_volatile_8xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i8>, ptr addrspace(5) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <8 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_16xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <16 x i8>, ptr addrspace(5) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_volatile_2xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi16(
; CHECK: {
@@ -4413,6 +5851,40 @@ define void @local_volatile_4xi16(ptr addrspace(5) %a) {
ret void
}
+define void @local_volatile_8xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+ %a.load = load volatile <8 x i16>, ptr addrspace(5) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(5) %a
+ ret void
+}
+
define void @local_volatile_2xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi32(
; CHECK: {
diff --git a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
index 4c7a51b70bc33a..d5043c2c3047c1 100644
--- a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
+++ b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
@@ -6,19 +6,17 @@ target triple = "nvptx64-unknown-unknown"
define void @kernel_func(ptr %in.vec, ptr %out.vec0) nounwind {
; CHECK-LABEL: kernel_func(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<14>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [kernel_func_param_0];
-; CHECK-NEXT: ld.u32 %r2, [%r1+8];
-; CHECK-NEXT: ld.u32 %r3, [%r1];
-; CHECK-NEXT: ld.u32 %r4, [%r1+24];
-; CHECK-NEXT: ld.u32 %r5, [%r1+16];
-; CHECK-NEXT: ld.param.u32 %r6, [kernel_func_param_1];
-; CHECK-NEXT: prmt.b32 %r7, %r5, %r4, 0x4000U;
-; CHECK-NEXT: prmt.b32 %r8, %r3, %r2, 0x40U;
-; CHECK-NEXT: prmt.b32 %r9, %r8, %r7, 0x7610U;
-; CHECK-NEXT: st.u32 [%r6], %r9;
+; CHECK-NEXT: ld.v4.b32 {%r2, %r3, %r4, %r5}, [%r1];
+; CHECK-NEXT: ld.v4.b32 {%r6, %r7, %r8, %r9}, [%r1+16];
+; CHECK-NEXT: ld.param.u32 %r10, [kernel_func_param_1];
+; CHECK-NEXT: prmt.b32 %r11, %r6, %r8, 0x4000U;
+; CHECK-NEXT: prmt.b32 %r12, %r2, %r4, 0x40U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x7610U;
+; CHECK-NEXT: st.u32 [%r10], %r13;
; CHECK-NEXT: ret;
%wide.vec = load <32 x i8>, ptr %in.vec, align 64
%vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index df14553a772057..c914aa6e24082d 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -31,8 +31,8 @@ define void @foo4(<4 x i32> %val, ptr %ptr) {
; CHECK-LABEL: .visible .func v16i8
define void @v16i8(ptr %a, ptr %b) {
-; CHECK: ld.v4.u32
-; CHECK: st.v4.u32
+; CHECK: ld.v4.b32
+; CHECK: st.v4.b32
%v = load <16 x i8>, ptr %a
store <16 x i8> %v, ptr %b
ret void
>From 8a23c78c2edc09b58a50b73ffc6baff79f02fda4 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 12 Dec 2024 22:38:49 +0000
Subject: [PATCH 02/11] Split load-store.ll into two separate files for scalars
and vectors
---
llvm/test/CodeGen/NVPTX/load-store-scalars.ll | 3206 ++++++++++++++
.../{load-store.ll => load-store-vectors.ll} | 3772 ++---------------
2 files changed, 3506 insertions(+), 3472 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/load-store-scalars.ll
rename llvm/test/CodeGen/NVPTX/{load-store.ll => load-store-vectors.ll} (53%)
diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
new file mode 100644
index 00000000000000..133c5e7a138a22
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
@@ -0,0 +1,3206 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to local statespace
+; by generating atomic or volatile operations.
+
+; TODO: add weak,atomic,volatile,atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared .
+
+;; generic statespace
+
+; generic
+
+define void @generic_i8(ptr %a) {
+; CHECK-LABEL: generic_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i8_param_0];
+; CHECK-NEXT: ld.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i8, ptr %a
+ %a.add = add i8 %a.load, 1
+ store i8 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_i16(ptr %a) {
+; CHECK-LABEL: generic_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i16_param_0];
+; CHECK-NEXT: ld.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_i32(ptr %a) {
+; CHECK-LABEL: generic_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i32_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_i64(ptr %a) {
+; CHECK-LABEL: generic_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_i64_param_0];
+; CHECK-NEXT: ld.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_float(ptr %a) {
+; CHECK-LABEL: generic_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_float_param_0];
+; CHECK-NEXT: ld.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr %a
+ ret void
+}
+
+define void @generic_double(ptr %a) {
+; CHECK-LABEL: generic_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_double_param_0];
+; CHECK-NEXT: ld.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr %a
+ ret void
+}
+
+; generic_volatile
+
+define void @generic_volatile_i8(ptr %a) {
+; CHECK-LABEL: generic_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i8_param_0];
+; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i8, ptr %a
+ %a.add = add i8 %a.load, 1
+ store volatile i8 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_i16(ptr %a) {
+; CHECK-LABEL: generic_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i16_param_0];
+; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_i32(ptr %a) {
+; CHECK-LABEL: generic_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i32_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_i64(ptr %a) {
+; CHECK-LABEL: generic_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i64_param_0];
+; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_float(ptr %a) {
+; CHECK-LABEL: generic_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_float_param_0];
+; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr %a
+ ret void
+}
+
+define void @generic_volatile_double(ptr %a) {
+; CHECK-LABEL: generic_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_double_param_0];
+; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr %a
+ ret void
+}
+
+; generic_unordered_sys
+
+define void @generic_unordered_sys_i8(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i8, ptr %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr %a unordered, align 1
+ ret void
+}
+
+define void @generic_unordered_sys_i16(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr %a unordered, align 2
+ ret void
+}
+
+define void @generic_unordered_sys_i32(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr %a unordered, align 4
+ ret void
+}
+
+define void @generic_unordered_sys_i64(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr %a unordered, align 8
+ ret void
+}
+
+define void @generic_unordered_sys_float(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
+; SM60-NEXT: ld.volatile.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr %a unordered, align 4
+ ret void
+}
+
+define void @generic_unordered_sys_double(ptr %a) {
+; SM60-LABEL: generic_unordered_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
+; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_unordered_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr %a unordered, align 8
+ ret void
+}
+
+; generic_unordered_volatile_sys
+
+define void @generic_unordered_volatile_sys_i8(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr %a unordered, align 1
+ ret void
+}
+
+define void @generic_unordered_volatile_sys_i16(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr %a unordered, align 2
+ ret void
+}
+
+define void @generic_unordered_volatile_sys_i32(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr %a unordered, align 4
+ ret void
+}
+
+define void @generic_unordered_volatile_sys_i64(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr %a unordered, align 8
+ ret void
+}
+
+define void @generic_unordered_volatile_sys_float(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr %a unordered, align 4
+ ret void
+}
+
+define void @generic_unordered_volatile_sys_double(ptr %a) {
+; CHECK-LABEL: generic_unordered_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr %a unordered, align 8
+ ret void
+}
+
+; generic_monotonic_sys
+
+define void @generic_monotonic_sys_i8(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i8, ptr %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr %a monotonic, align 1
+ ret void
+}
+
+define void @generic_monotonic_sys_i16(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr %a monotonic, align 2
+ ret void
+}
+
+define void @generic_monotonic_sys_i32(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr %a monotonic, align 4
+ ret void
+}
+
+define void @generic_monotonic_sys_i64(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr %a monotonic, align 8
+ ret void
+}
+
+define void @generic_monotonic_sys_float(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
+; SM60-NEXT: ld.volatile.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr %a monotonic, align 4
+ ret void
+}
+
+define void @generic_monotonic_sys_double(ptr %a) {
+; SM60-LABEL: generic_monotonic_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
+; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: generic_monotonic_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr %a monotonic, align 8
+ ret void
+}
+
+; generic_monotonic_volatile_sys
+
+define void @generic_monotonic_volatile_sys_i8(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+ ret void
+}
+
+define void @generic_monotonic_volatile_sys_i16(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr %a monotonic, align 2
+ ret void
+}
+
+define void @generic_monotonic_volatile_sys_i32(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr %a monotonic, align 4
+ ret void
+}
+
+define void @generic_monotonic_volatile_sys_i64(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr %a monotonic, align 8
+ ret void
+}
+
+define void @generic_monotonic_volatile_sys_float(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr %a monotonic, align 4
+ ret void
+}
+
+define void @generic_monotonic_volatile_sys_double(ptr %a) {
+; CHECK-LABEL: generic_monotonic_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr %a monotonic, align 8
+ ret void
+}
+
+;; global statespace
+
+; global
+
+define void @global_i8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i8_param_0];
+; CHECK-NEXT: ld.global.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.global.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i8, ptr addrspace(1) %a
+ %a.add = add i8 %a.load, 1
+ store i8 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_i16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i16_param_0];
+; CHECK-NEXT: ld.global.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.global.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr addrspace(1) %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_i32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i32_param_0];
+; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.global.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr addrspace(1) %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_i64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_i64_param_0];
+; CHECK-NEXT: ld.global.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.global.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr addrspace(1) %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_float(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_float_param_0];
+; CHECK-NEXT: ld.global.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.global.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr addrspace(1) %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_double(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_double_param_0];
+; CHECK-NEXT: ld.global.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.global.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr addrspace(1) %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+; global_volatile
+
+define void @global_volatile_i8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i8_param_0];
+; CHECK-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i8, ptr addrspace(1) %a
+ %a.add = add i8 %a.load, 1
+ store volatile i8 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_i16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i16_param_0];
+; CHECK-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr addrspace(1) %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_i32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i32_param_0];
+; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr addrspace(1) %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_i64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i64_param_0];
+; CHECK-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr addrspace(1) %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_float(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_float_param_0];
+; CHECK-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr addrspace(1) %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+define void @global_volatile_double(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_double_param_0];
+; CHECK-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr addrspace(1) %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr addrspace(1) %a
+ ret void
+}
+
+; global_unordered_sys
+
+define void @global_unordered_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ ret void
+}
+
+define void @global_unordered_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(1) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(1) %a unordered, align 2
+ ret void
+}
+
+define void @global_unordered_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+define void @global_unordered_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+; global_unordered_volatile_sys
+
+define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ ret void
+}
+
+define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(1) %a unordered, align 2
+ ret void
+}
+
+define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(1) %a unordered, align 4
+ ret void
+}
+
+define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_unordered_volatile_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(1) %a unordered, align 8
+ ret void
+}
+
+; global_monotonic_sys
+
+; Monotonic atomic i8 load, add of 1 and store through a global (addrspace 1)
+; pointer. The SM60 prefix expects ld/st.volatile.global.u8; the SM70 prefix
+; expects ld/st.relaxed.sys.global.u8. The add itself is emitted as add.s16.
+define void @global_monotonic_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ ret void
+}
+
+; Monotonic atomic i16 load, add of 1 and store through a global (addrspace 1)
+; pointer. The SM60 prefix expects ld/st.volatile.global.u16; the SM70 prefix
+; expects ld/st.relaxed.sys.global.u16.
+define void @global_monotonic_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(1) %a monotonic, align 2
+ ret void
+}
+
+; Monotonic atomic i32 load, add of 1 and store through a global (addrspace 1)
+; pointer. The SM60 prefix expects ld/st.volatile.global.u32; the SM70 prefix
+; expects ld/st.relaxed.sys.global.u32.
+define void @global_monotonic_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+; Monotonic atomic i64 load, add of 1 and store through a global (addrspace 1)
+; pointer. The SM60 prefix expects ld/st.volatile.global.u64; the SM70 prefix
+; expects ld/st.relaxed.sys.global.u64.
+define void @global_monotonic_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+; Monotonic atomic float load, fadd of 1.0 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.f32; the
+; SM70 prefix expects ld/st.relaxed.sys.global.f32.
+define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+; Monotonic atomic double load, fadd of 1.0 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.f64; the
+; SM70 prefix expects ld/st.relaxed.sys.global.f64.
+define void @global_monotonic_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+; global_monotonic_volatile_sys
+
+; Monotonic atomic volatile i8 load, add of 1 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.u8; the
+; SM70 prefix expects ld/st.mmio.relaxed.sys.global.u8. The add is add.s16.
+define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ ret void
+}
+
+; Monotonic atomic volatile i16 load, add of 1 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.u16; the
+; SM70 prefix expects ld/st.mmio.relaxed.sys.global.u16.
+define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(1) %a monotonic, align 2
+ ret void
+}
+
+; Monotonic atomic volatile i32 load, add of 1 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.u32; the
+; SM70 prefix expects ld/st.mmio.relaxed.sys.global.u32.
+define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+; Monotonic atomic volatile i64 load, add of 1 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.u64; the
+; SM70 prefix expects ld/st.mmio.relaxed.sys.global.u64.
+define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+; Monotonic atomic volatile float load, fadd of 1.0 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.f32; the
+; SM70 prefix expects ld/st.mmio.relaxed.sys.global.f32.
+define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
+; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(1) %a monotonic, align 4
+ ret void
+}
+
+; Monotonic atomic volatile double load, fadd of 1.0 and store through a global
+; (addrspace 1) pointer. The SM60 prefix expects ld/st.volatile.global.f64; the
+; SM70 prefix expects ld/st.mmio.relaxed.sys.global.f64.
+define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) {
+; SM60-LABEL: global_monotonic_volatile_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
+; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
+; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(1) %a monotonic, align 8
+ ret void
+}
+
+;; shared statespace
+
+; shared
+
+; Plain (non-atomic, non-volatile) i8 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.shared.u8 and st.shared.u8, with
+; the add emitted as add.s16.
+define void @shared_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i8_param_0];
+; CHECK-NEXT: ld.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i8, ptr addrspace(3) %a
+ %a.add = add i8 %a.load, 1
+ store i8 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Plain (non-atomic, non-volatile) i16 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.shared.u16 and st.shared.u16.
+define void @shared_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i16_param_0];
+; CHECK-NEXT: ld.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr addrspace(3) %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Plain (non-atomic, non-volatile) i32 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.shared.u32 and st.shared.u32.
+define void @shared_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i32_param_0];
+; CHECK-NEXT: ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr addrspace(3) %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Plain (non-atomic, non-volatile) i64 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.shared.u64 and st.shared.u64.
+define void @shared_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_i64_param_0];
+; CHECK-NEXT: ld.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr addrspace(3) %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Plain (non-atomic, non-volatile) float load, fadd of 1.0 and store through a
+; shared (addrspace 3) pointer. Expected PTX uses ld.shared.f32 and
+; st.shared.f32.
+define void @shared_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_float_param_0];
+; CHECK-NEXT: ld.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr addrspace(3) %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Plain (non-atomic, non-volatile) double load, fadd of 1.0 and store through a
+; shared (addrspace 3) pointer. Expected PTX uses ld.shared.f64 and
+; st.shared.f64.
+define void @shared_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_double_param_0];
+; CHECK-NEXT: ld.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr addrspace(3) %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; shared_volatile
+
+; Volatile (non-atomic) i8 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u8 and
+; st.volatile.shared.u8, with the add emitted as add.s16.
+define void @shared_volatile_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i8, ptr addrspace(3) %a
+ %a.add = add i8 %a.load, 1
+ store volatile i8 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Volatile (non-atomic) i16 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u16 and
+; st.volatile.shared.u16.
+define void @shared_volatile_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr addrspace(3) %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Volatile (non-atomic) i32 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u32 and
+; st.volatile.shared.u32.
+define void @shared_volatile_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i32_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr addrspace(3) %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Volatile (non-atomic) i64 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u64 and
+; st.volatile.shared.u64.
+define void @shared_volatile_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i64_param_0];
+; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr addrspace(3) %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Volatile (non-atomic) float load, fadd of 1.0 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.f32 and
+; st.volatile.shared.f32.
+define void @shared_volatile_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_float_param_0];
+; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr addrspace(3) %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; Volatile (non-atomic) double load, fadd of 1.0 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.f64 and
+; st.volatile.shared.f64.
+define void @shared_volatile_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_double_param_0];
+; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr addrspace(3) %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr addrspace(3) %a
+ ret void
+}
+
+; shared_unordered_sys
+
+; Unordered atomic i8 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u8; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u8. The add itself is emitted as add.s16.
+define void @shared_unordered_sys_i8(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ ret void
+}
+
+; Unordered atomic i16 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u16; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u16.
+define void @shared_unordered_sys_i16(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(3) %a unordered, align 2
+ ret void
+}
+
+; Unordered atomic i32 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u32; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u32.
+define void @shared_unordered_sys_i32(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+; Unordered atomic i64 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u64; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u64.
+define void @shared_unordered_sys_i64(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
+
+; Unordered atomic float load, fadd of 1.0 and store through a shared
+; (addrspace 3) pointer. The SM60 prefix expects ld/st.volatile.shared.f32; the
+; SM70 prefix expects ld/st.relaxed.sys.shared.f32.
+define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
+; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+; Unordered atomic double load, fadd of 1.0 and store through a shared
+; (addrspace 3) pointer. The SM60 prefix expects ld/st.volatile.shared.f64; the
+; SM70 prefix expects ld/st.relaxed.sys.shared.f64.
+define void @shared_unordered_sys_double(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_unordered_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
+; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_unordered_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
+
+; shared_unordered_volatile_sys
+
+; Unordered atomic volatile i8 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u8 and
+; st.volatile.shared.u8, with the add emitted as add.s16.
+define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ ret void
+}
+
+; Unordered atomic volatile i16 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u16 and
+; st.volatile.shared.u16.
+define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(3) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(3) %a unordered, align 2
+ ret void
+}
+
+; Unordered atomic volatile i32 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u32 and
+; st.volatile.shared.u32.
+define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(3) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+; Unordered atomic volatile i64 load, add of 1 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.u64 and
+; st.volatile.shared.u64.
+define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
+
+; Unordered atomic volatile float load, fadd of 1.0 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.f32 and
+; st.volatile.shared.f32.
+define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(3) %a unordered, align 4
+ ret void
+}
+
+; Unordered atomic volatile double load, fadd of 1.0 and store through a shared
+; (addrspace 3) pointer. Expected PTX uses ld.volatile.shared.f64 and
+; st.volatile.shared.f64.
+define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_unordered_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(3) %a unordered, align 8
+ ret void
+}
+
+; shared_monotonic_sys
+
+; Monotonic atomic i8 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u8; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u8. The add itself is emitted as add.s16.
+define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i8(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
+; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ ret void
+}
+
+; Monotonic atomic i16 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u16; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u16.
+define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i16(
+; SM60: {
+; SM60-NEXT: .reg .b16 %rs<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
+; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; SM60-NEXT: add.s16 %rs2, %rs1, 1;
+; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .b16 %rs<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1];
+; SM70-NEXT: add.s16 %rs2, %rs1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(3) %a monotonic, align 2
+ ret void
+}
+
+; Monotonic atomic i32 load, add of 1 and store through a shared (addrspace 3)
+; pointer. The SM60 prefix expects ld/st.volatile.shared.u32; the SM70 prefix
+; expects ld/st.relaxed.sys.shared.u32.
+define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i32(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
+; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; SM60-NEXT: add.s32 %r2, %r1, 1;
+; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1];
+; SM70-NEXT: add.s32 %r2, %r1, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2;
+; SM70-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_i64(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
+; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; SM60-NEXT: add.s64 %rd3, %rd2, 1;
+; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1];
+; SM70-NEXT: add.s64 %rd3, %rd2, 1;
+; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3;
+; SM70-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
+
+define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_float(
+; SM60: {
+; SM60-NEXT: .reg .f32 %f<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
+; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_float(
+; SM70: {
+; SM70-NEXT: .reg .f32 %f<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1];
+; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2;
+; SM70-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_sys_double(ptr addrspace(3) %a) {
+; SM60-LABEL: shared_monotonic_sys_double(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
+; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; SM60-NEXT: ret;
+;
+; SM70-LABEL: shared_monotonic_sys_double(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
+; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1];
+; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2;
+; SM70-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
+
+; shared_monotonic_volatile_sys
+
+define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ ret void
+}
+
+define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(3) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(3) %a monotonic, align 2
+ ret void
+}
+
+define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
+
+define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(3) %a monotonic, align 4
+ ret void
+}
+
+define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_monotonic_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(3) %a monotonic, align 8
+ ret void
+}
+
+;; local statespace
+
+; local
+
+define void @local_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i8, ptr addrspace(5) %a
+ %a.add = add i8 %a.load, 1
+ store i8 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load i16, ptr addrspace(5) %a
+ %a.add = add i16 %a.load, 1
+ store i16 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load i32, ptr addrspace(5) %a
+ %a.add = add i32 %a.load, 1
+ store i32 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load i64, ptr addrspace(5) %a
+ %a.add = add i64 %a.load, 1
+ store i64 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load float, ptr addrspace(5) %a
+ %a.add = fadd float %a.load, 1.
+ store float %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load double, ptr addrspace(5) %a
+ %a.add = fadd double %a.load, 1.
+ store double %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+; local_volatile
+
+define void @local_volatile_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i8, ptr addrspace(5) %a
+ %a.add = add i8 %a.load, 1
+ store volatile i8 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i16, ptr addrspace(5) %a
+ %a.add = add i16 %a.load, 1
+ store volatile i16 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i32, ptr addrspace(5) %a
+ %a.add = add i32 %a.load, 1
+ store volatile i32 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load volatile i64, ptr addrspace(5) %a
+ %a.add = add i64 %a.load, 1
+ store volatile i64 %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile float, ptr addrspace(5) %a
+ %a.add = fadd float %a.load, 1.
+ store volatile float %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+define void @local_volatile_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load volatile double, ptr addrspace(5) %a
+ %a.add = fadd double %a.load, 1.
+ store volatile double %a.add, ptr addrspace(5) %a
+ ret void
+}
+
+; local_unordered_sys
+
+define void @local_unordered_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ ret void
+}
+
+define void @local_unordered_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(5) %a unordered, align 2
+ ret void
+}
+
+define void @local_unordered_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
+
+define void @local_unordered_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(5) %a unordered, align 8
+ ret void
+}
+
+define void @local_unordered_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
+
+define void @local_unordered_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(5) %a unordered, align 8
+ ret void
+}
+
+; local_unordered_volatile_sys
+
+define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ ret void
+}
+
+define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(5) %a unordered, align 2
+ ret void
+}
+
+define void @local_unordered_volatile_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
+
+define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(5) %a unordered, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(5) %a unordered, align 8
+ ret void
+}
+
+define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(5) %a unordered, align 4
+ ret void
+}
+
+define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_unordered_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(5) %a unordered, align 8
+ ret void
+}
+
+; local_monotonic_sys
+
+define void @local_monotonic_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ ret void
+}
+
+define void @local_monotonic_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic i16 %a.add, ptr addrspace(5) %a monotonic, align 2
+ ret void
+}
+
+define void @local_monotonic_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic i32 %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+
+define void @local_monotonic_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic i64 %a.add, ptr addrspace(5) %a monotonic, align 8
+ ret void
+}
+
+define void @local_monotonic_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic float %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+
+define void @local_monotonic_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic double %a.add, ptr addrspace(5) %a monotonic, align 8
+ ret void
+}
+
+; local_monotonic_volatile_sys
+
+define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2
+ %a.add = add i16 %a.load, 1
+ store atomic volatile i16 %a.add, ptr addrspace(5) %a monotonic, align 2
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4
+ %a.add = add i32 %a.load, 1
+ store atomic volatile i32 %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
+; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8
+ %a.add = add i64 %a.load, 1
+ store atomic volatile i64 %a.add, ptr addrspace(5) %a monotonic, align 8
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4
+ %a.add = fadd float %a.load, 1.
+ store atomic volatile float %a.add, ptr addrspace(5) %a monotonic, align 4
+ ret void
+}
+
+define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_monotonic_volatile_sys_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ret;
+ %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8
+ %a.add = fadd double %a.load, 1.
+ store atomic volatile double %a.add, ptr addrspace(5) %a monotonic, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
similarity index 53%
rename from llvm/test/CodeGen/NVPTX/load-store.ll
rename to llvm/test/CodeGen/NVPTX/load-store-vectors.ll
index 82991b4c8d6ceb..faf96dd0c1a7c2 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -1,8 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
-; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
; TODO: add i1, and <6 x i8> vector tests.
@@ -19,119 +17,10 @@
; TODO: add weak,atomic,volatile,atomic volatile tests
; for .const and .param statespaces.
-; TODO: optimize .sys.shared into .cta.shared or .cluster.shared .
-
;; generic statespace
; generic
-define void @generic_i8(ptr %a) {
-; CHECK-LABEL: generic_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_i8_param_0];
-; CHECK-NEXT: ld.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load i8, ptr %a
- %a.add = add i8 %a.load, 1
- store i8 %a.add, ptr %a
- ret void
-}
-
-define void @generic_i16(ptr %a) {
-; CHECK-LABEL: generic_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_i16_param_0];
-; CHECK-NEXT: ld.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load i16, ptr %a
- %a.add = add i16 %a.load, 1
- store i16 %a.add, ptr %a
- ret void
-}
-
-define void @generic_i32(ptr %a) {
-; CHECK-LABEL: generic_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_i32_param_0];
-; CHECK-NEXT: ld.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load i32, ptr %a
- %a.add = add i32 %a.load, 1
- store i32 %a.add, ptr %a
- ret void
-}
-
-define void @generic_i64(ptr %a) {
-; CHECK-LABEL: generic_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_i64_param_0];
-; CHECK-NEXT: ld.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load i64, ptr %a
- %a.add = add i64 %a.load, 1
- store i64 %a.add, ptr %a
- ret void
-}
-
-define void @generic_float(ptr %a) {
-; CHECK-LABEL: generic_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_float_param_0];
-; CHECK-NEXT: ld.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load float, ptr %a
- %a.add = fadd float %a.load, 1.
- store float %a.add, ptr %a
- ret void
-}
-
-define void @generic_double(ptr %a) {
-; CHECK-LABEL: generic_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_double_param_0];
-; CHECK-NEXT: ld.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load double, ptr %a
- %a.add = fadd double %a.load, 1.
- store double %a.add, ptr %a
- ret void
-}
-
; TODO: make the lowering of this weak vector ops consistent with
; the ones of the next tests. This test lowers to a weak PTX
; vector op, but next test lowers to a vector PTX op.
@@ -540,113 +429,6 @@ define void @generic_2xdouble(ptr %a) {
; generic_volatile
-define void @generic_volatile_i8(ptr %a) {
-; CHECK-LABEL: generic_volatile_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i8_param_0];
-; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i8, ptr %a
- %a.add = add i8 %a.load, 1
- store volatile i8 %a.add, ptr %a
- ret void
-}
-
-define void @generic_volatile_i16(ptr %a) {
-; CHECK-LABEL: generic_volatile_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i16_param_0];
-; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i16, ptr %a
- %a.add = add i16 %a.load, 1
- store volatile i16 %a.add, ptr %a
- ret void
-}
-
-define void @generic_volatile_i32(ptr %a) {
-; CHECK-LABEL: generic_volatile_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i32_param_0];
-; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i32, ptr %a
- %a.add = add i32 %a.load, 1
- store volatile i32 %a.add, ptr %a
- ret void
-}
-
-define void @generic_volatile_i64(ptr %a) {
-; CHECK-LABEL: generic_volatile_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i64_param_0];
-; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load volatile i64, ptr %a
- %a.add = add i64 %a.load, 1
- store volatile i64 %a.add, ptr %a
- ret void
-}
-
-define void @generic_volatile_float(ptr %a) {
-; CHECK-LABEL: generic_volatile_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_float_param_0];
-; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load volatile float, ptr %a
- %a.add = fadd float %a.load, 1.
- store volatile float %a.add, ptr %a
- ret void
-}
-
-define void @generic_volatile_double(ptr %a) {
-; CHECK-LABEL: generic_volatile_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_double_param_0];
-; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load volatile double, ptr %a
- %a.add = fadd double %a.load, 1.
- store volatile double %a.add, ptr %a
- ret void
-}
-
; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
; Currently, LLVM:
; - does not allow atomic operations on vectors.
@@ -1066,812 +848,127 @@ define void @generic_volatile_2xdouble(ptr %a) {
ret void
}
-; generic_unordered_sys
-
-define void @generic_unordered_sys_i8(ptr %a) {
-; SM60-LABEL: generic_unordered_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_unordered_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
-; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i8, ptr %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr %a unordered, align 1
- ret void
-}
-
-define void @generic_unordered_sys_i16(ptr %a) {
-; SM60-LABEL: generic_unordered_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_unordered_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
-; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i16, ptr %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr %a unordered, align 2
- ret void
-}
-
-define void @generic_unordered_sys_i32(ptr %a) {
-; SM60-LABEL: generic_unordered_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_unordered_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
-; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic i32, ptr %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr %a unordered, align 4
- ret void
-}
-
-define void @generic_unordered_sys_i64(ptr %a) {
-; SM60-LABEL: generic_unordered_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_unordered_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
-; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic i64, ptr %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr %a unordered, align 8
- ret void
-}
-
-define void @generic_unordered_sys_float(ptr %a) {
-; SM60-LABEL: generic_unordered_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
-; SM60-NEXT: ld.volatile.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_unordered_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
-; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic float, ptr %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr %a unordered, align 4
- ret void
-}
-
-define void @generic_unordered_sys_double(ptr %a) {
-; SM60-LABEL: generic_unordered_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
-; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_unordered_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
-; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic double, ptr %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr %a unordered, align 8
- ret void
-}
+;; global statespace
-; generic_unordered_volatile_sys
+; global
-define void @generic_unordered_volatile_sys_i8(ptr %a) {
-; CHECK-LABEL: generic_unordered_volatile_sys_i8(
+define void @global_2xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0];
-; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0];
+; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT: ret;
- %a.load = load atomic volatile i8, ptr %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr %a unordered, align 1
+ %a.load = load <2 x i8>, ptr addrspace(1) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store <2 x i8> %a.add, ptr addrspace(1) %a
ret void
}
-define void @generic_unordered_volatile_sys_i16(ptr %a) {
-; CHECK-LABEL: generic_unordered_volatile_sys_i16(
+define void @global_4xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0];
-; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0];
+; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i16, ptr %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr %a unordered, align 2
- ret void
-}
-
-define void @generic_unordered_volatile_sys_i32(ptr %a) {
-; CHECK-LABEL: generic_unordered_volatile_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0];
-; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i32, ptr %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr %a unordered, align 4
- ret void
-}
-
-define void @generic_unordered_volatile_sys_i64(ptr %a) {
-; CHECK-LABEL: generic_unordered_volatile_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0];
-; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT: st.global.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
- %a.load = load atomic volatile i64, ptr %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr %a unordered, align 8
+ %a.load = load <4 x i8>, ptr addrspace(1) %a
+ %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+ store <4 x i8> %a.add, ptr addrspace(1) %a
ret void
}
-define void @generic_unordered_volatile_sys_float(ptr %a) {
-; CHECK-LABEL: generic_unordered_volatile_sys_float(
+define void @global_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi8(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0];
+; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT: ret;
- %a.load = load atomic volatile float, ptr %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr %a unordered, align 4
+ %a.load = load <8 x i8>, ptr addrspace(1) %a
+ %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store <8 x i8> %a.add, ptr addrspace(1) %a
ret void
}
-define void @generic_unordered_volatile_sys_double(ptr %a) {
-; CHECK-LABEL: generic_unordered_volatile_sys_double(
+define void @global_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_16xi8(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile double, ptr %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr %a unordered, align 8
- ret void
-}
-
-; generic_monotonic_sys
-
-define void @generic_monotonic_sys_i8(ptr %a) {
-; SM60-LABEL: generic_monotonic_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_monotonic_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
-; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i8, ptr %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr %a monotonic, align 1
- ret void
-}
-
-define void @generic_monotonic_sys_i16(ptr %a) {
-; SM60-LABEL: generic_monotonic_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_monotonic_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
-; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i16, ptr %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr %a monotonic, align 2
- ret void
-}
-
-define void @generic_monotonic_sys_i32(ptr %a) {
-; SM60-LABEL: generic_monotonic_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_monotonic_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
-; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic i32, ptr %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr %a monotonic, align 4
- ret void
-}
-
-define void @generic_monotonic_sys_i64(ptr %a) {
-; SM60-LABEL: generic_monotonic_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_monotonic_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
-; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic i64, ptr %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr %a monotonic, align 8
- ret void
-}
-
-define void @generic_monotonic_sys_float(ptr %a) {
-; SM60-LABEL: generic_monotonic_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
-; SM60-NEXT: ld.volatile.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_monotonic_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
-; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic float, ptr %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr %a monotonic, align 4
- ret void
-}
-
-define void @generic_monotonic_sys_double(ptr %a) {
-; SM60-LABEL: generic_monotonic_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
-; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: generic_monotonic_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
-; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic double, ptr %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr %a monotonic, align 8
- ret void
-}
-
-; generic_monotonic_volatile_sys
-
-define void @generic_monotonic_volatile_sys_i8(ptr %a) {
-; CHECK-LABEL: generic_monotonic_volatile_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0];
-; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
- ret void
-}
-
-define void @generic_monotonic_volatile_sys_i16(ptr %a) {
-; CHECK-LABEL: generic_monotonic_volatile_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0];
-; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i16, ptr %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr %a monotonic, align 2
- ret void
-}
-
-define void @generic_monotonic_volatile_sys_i32(ptr %a) {
-; CHECK-LABEL: generic_monotonic_volatile_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0];
-; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.volatile.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i32, ptr %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr %a monotonic, align 4
- ret void
-}
-
-define void @generic_monotonic_volatile_sys_i64(ptr %a) {
-; CHECK-LABEL: generic_monotonic_volatile_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0];
-; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i64, ptr %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr %a monotonic, align 8
- ret void
-}
-
-define void @generic_monotonic_volatile_sys_float(ptr %a) {
-; CHECK-LABEL: generic_monotonic_volatile_sys_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile float, ptr %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr %a monotonic, align 4
- ret void
-}
-
-define void @generic_monotonic_volatile_sys_double(ptr %a) {
-; CHECK-LABEL: generic_monotonic_volatile_sys_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile double, ptr %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr %a monotonic, align 8
- ret void
-}
-
-;; global statespace
-
-; global
-
-define void @global_i8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_i8_param_0];
-; CHECK-NEXT: ld.global.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.global.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- store i8 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_i16(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_i16_param_0];
-; CHECK-NEXT: ld.global.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.global.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load i16, ptr addrspace(1) %a
- %a.add = add i16 %a.load, 1
- store i16 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_i32(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_i32_param_0];
-; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.global.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load i32, ptr addrspace(1) %a
- %a.add = add i32 %a.load, 1
- store i32 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_i64(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_i64_param_0];
-; CHECK-NEXT: ld.global.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.global.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load i64, ptr addrspace(1) %a
- %a.add = add i64 %a.load, 1
- store i64 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_float(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_float_param_0];
-; CHECK-NEXT: ld.global.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.global.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load float, ptr addrspace(1) %a
- %a.add = fadd float %a.load, 1.
- store float %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_double(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_double_param_0];
-; CHECK-NEXT: ld.global.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.global.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load double, ptr addrspace(1) %a
- %a.add = fadd double %a.load, 1.
- store double %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_2xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0];
-; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3};
-; CHECK-NEXT: ret;
- %a.load = load <2 x i8>, ptr addrspace(1) %a
- %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
- store <2 x i8> %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_4xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_4xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<13>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0];
-; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
-; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
-; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
-; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
-; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U;
-; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT: st.global.u32 [%rd1], %r12;
-; CHECK-NEXT: ret;
- %a.load = load <4 x i8>, ptr addrspace(1) %a
- %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
- store <4 x i8> %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_8xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_8xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<17>;
-; CHECK-NEXT: .reg .b32 %r<25>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0];
-; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
-; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
-; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
-; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
-; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
-; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
-; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
-; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
-; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
-; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
-; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
-; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
-; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
-; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
-; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
-; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
-; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
-; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
-; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
-; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
-; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
-; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r24, %r13};
-; CHECK-NEXT: ret;
- %a.load = load <8 x i8>, ptr addrspace(1) %a
- %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- store <8 x i8> %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_16xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_16xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<33>;
-; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
@@ -2157,144 +1254,37 @@ define void @global_2xdouble(ptr addrspace(1) %a) {
; global_volatile
-define void @global_volatile_i8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_i8(
+define void @global_volatile_2xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i8_param_0];
-; CHECK-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.global.u8 [%rd1], %rs2;
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT: ret;
- %a.load = load volatile i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- store volatile i8 %a.add, ptr addrspace(1) %a
+ %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store volatile <2 x i8> %a.add, ptr addrspace(1) %a
ret void
}
-define void @global_volatile_i16(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_i16(
+define void @global_volatile_4xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i16_param_0];
-; CHECK-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.global.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i16, ptr addrspace(1) %a
- %a.add = add i16 %a.load, 1
- store volatile i16 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_volatile_i32(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i32_param_0];
-; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i32, ptr addrspace(1) %a
- %a.add = add i32 %a.load, 1
- store volatile i32 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_volatile_i64(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i64_param_0];
-; CHECK-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.global.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load volatile i64, ptr addrspace(1) %a
- %a.add = add i64 %a.load, 1
- store volatile i64 %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_volatile_float(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_float_param_0];
-; CHECK-NEXT: ld.volatile.global.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.global.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load volatile float, ptr addrspace(1) %a
- %a.add = fadd float %a.load, 1.
- store volatile float %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_volatile_double(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_double_param_0];
-; CHECK-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.global.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load volatile double, ptr addrspace(1) %a
- %a.add = fadd double %a.load, 1.
- store volatile double %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_volatile_2xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_2xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
-; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
- %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
- store volatile <2 x i8> %a.add, ptr addrspace(1) %a
- ret void
-}
-
-define void @global_volatile_4xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_4xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b32 %r<13>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
-; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
+; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
@@ -2664,837 +1654,10 @@ define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
ret void
}
-; global_unordered_sys
-
-define void @global_unordered_sys_i8(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
- ret void
-}
-
-define void @global_unordered_sys_i16(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i16, ptr addrspace(1) %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr addrspace(1) %a unordered, align 2
- ret void
-}
-
-define void @global_unordered_sys_i32(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr addrspace(1) %a unordered, align 4
- ret void
-}
-
-define void @global_unordered_sys_i64(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr addrspace(1) %a unordered, align 8
- ret void
-}
-
-define void @global_unordered_sys_float(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
-; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr addrspace(1) %a unordered, align 4
- ret void
-}
-
-define void @global_unordered_sys_double(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
-; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr addrspace(1) %a unordered, align 8
- ret void
-}
-
-; global_unordered_volatile_sys
-
-define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_volatile_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_volatile_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
- ret void
-}
-
-define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_volatile_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_volatile_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr addrspace(1) %a unordered, align 2
- ret void
-}
-
-define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_volatile_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_volatile_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr addrspace(1) %a unordered, align 4
- ret void
-}
-
-define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_volatile_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_volatile_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr addrspace(1) %a unordered, align 8
- ret void
-}
-
-define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_volatile_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
-; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_volatile_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr addrspace(1) %a unordered, align 4
- ret void
-}
-
-define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) {
-; SM60-LABEL: global_unordered_volatile_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
-; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_unordered_volatile_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr addrspace(1) %a unordered, align 8
- ret void
-}
-
-; global_monotonic_sys
-
-define void @global_monotonic_sys_i8(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
- ret void
-}
-
-define void @global_monotonic_sys_i16(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr addrspace(1) %a monotonic, align 2
- ret void
-}
-
-define void @global_monotonic_sys_i32(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr addrspace(1) %a monotonic, align 4
- ret void
-}
-
-define void @global_monotonic_sys_i64(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr addrspace(1) %a monotonic, align 8
- ret void
-}
-
-define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
-; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr addrspace(1) %a monotonic, align 4
- ret void
-}
-
-define void @global_monotonic_sys_double(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
-; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
-; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr addrspace(1) %a monotonic, align 8
- ret void
-}
-
-; global_monotonic_volatile_sys
-
-define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_volatile_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_volatile_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
- ret void
-}
-
-define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_volatile_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_volatile_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr addrspace(1) %a monotonic, align 2
- ret void
-}
-
-define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_volatile_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_volatile_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr addrspace(1) %a monotonic, align 4
- ret void
-}
-
-define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_volatile_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_volatile_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr addrspace(1) %a monotonic, align 8
- ret void
-}
-
-define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_volatile_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
-; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_volatile_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr addrspace(1) %a monotonic, align 4
- ret void
-}
-
-define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) {
-; SM60-LABEL: global_monotonic_volatile_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
-; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: global_monotonic_volatile_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
-; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr addrspace(1) %a monotonic, align 8
- ret void
-}
-
;; shared statespace
; shared
-define void @shared_i8(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_i8_param_0];
-; CHECK-NEXT: ld.shared.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.shared.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- store i8 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_i16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_i16_param_0];
-; CHECK-NEXT: ld.shared.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.shared.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load i16, ptr addrspace(3) %a
- %a.add = add i16 %a.load, 1
- store i16 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_i32(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_i32_param_0];
-; CHECK-NEXT: ld.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.shared.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load i32, ptr addrspace(3) %a
- %a.add = add i32 %a.load, 1
- store i32 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_i64(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_i64_param_0];
-; CHECK-NEXT: ld.shared.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.shared.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load i64, ptr addrspace(3) %a
- %a.add = add i64 %a.load, 1
- store i64 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_float(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_float_param_0];
-; CHECK-NEXT: ld.shared.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.shared.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load float, ptr addrspace(3) %a
- %a.add = fadd float %a.load, 1.
- store float %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_double(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_double_param_0];
-; CHECK-NEXT: ld.shared.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.shared.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load double, ptr addrspace(3) %a
- %a.add = fadd double %a.load, 1.
- store double %a.add, ptr addrspace(3) %a
- ret void
-}
-
define void @shared_2xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi8(
; CHECK: {
@@ -3897,113 +2060,6 @@ define void @shared_2xdouble(ptr addrspace(3) %a) {
; shared_volatile
-define void @shared_volatile_i8(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i8_param_0];
-; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- store volatile i8 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_i16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i16_param_0];
-; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i16, ptr addrspace(3) %a
- %a.add = add i16 %a.load, 1
- store volatile i16 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_i32(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i32_param_0];
-; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i32, ptr addrspace(3) %a
- %a.add = add i32 %a.load, 1
- store volatile i32 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_i64(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i64_param_0];
-; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load volatile i64, ptr addrspace(3) %a
- %a.add = add i64 %a.load, 1
- store volatile i64 %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_float(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_float_param_0];
-; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load volatile float, ptr addrspace(3) %a
- %a.add = fadd float %a.load, 1.
- store volatile float %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_double(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_double_param_0];
-; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load volatile double, ptr addrspace(3) %a
- %a.add = fadd double %a.load, 1.
- store volatile double %a.add, ptr addrspace(3) %a
- ret void
-}
-
define void @shared_volatile_2xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi8(
; CHECK: {
@@ -4188,911 +2244,226 @@ define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
-; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
-; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
-; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
-; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
-; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
-; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
-; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
-; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
-; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
-; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
-; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
-; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
-; CHECK-NEXT: ret;
- %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
- %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- store volatile <16 x i8> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_2xi16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0];
-; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
- %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
- store volatile <2 x i16> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_4xi16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<9>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0];
-; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
-; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
-; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
-; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
-; CHECK-NEXT: ret;
- %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
- %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
- store volatile <4 x i16> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_8xi16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<17>;
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
-; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
-; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
-; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
-; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
-; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
-; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
-; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
-; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
-; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
-; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
-; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
-; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
-; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
-; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
-; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
-; CHECK-NEXT: ret;
- %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
- %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
- store volatile <8 x i16> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_2xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0];
-; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT: add.s32 %r3, %r2, 1;
-; CHECK-NEXT: add.s32 %r4, %r1, 1;
-; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3};
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
- %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
- store volatile <2 x i32> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_4xi32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0];
-; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: add.s32 %r5, %r4, 1;
-; CHECK-NEXT: add.s32 %r6, %r3, 1;
-; CHECK-NEXT: add.s32 %r7, %r2, 1;
-; CHECK-NEXT: add.s32 %r8, %r1, 1;
-; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
-; CHECK-NEXT: ret;
- %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
- %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
- store volatile <4 x i32> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_2xi64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<6>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0];
-; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
-; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4};
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
- %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
- store volatile <2 x i64> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0];
-; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3};
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x float>, ptr addrspace(3) %a
- %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
- store volatile <2 x float> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_4xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0];
-; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT: ret;
- %a.load = load volatile <4 x float>, ptr addrspace(3) %a
- %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
- store volatile <4 x float> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_volatile_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0];
-; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3};
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x double>, ptr addrspace(3) %a
- %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
- store volatile <2 x double> %a.add, ptr addrspace(3) %a
- ret void
-}
-
-; shared_unordered_sys
-
-define void @shared_unordered_sys_i8(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_unordered_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_unordered_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
- ret void
-}
-
-define void @shared_unordered_sys_i16(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_unordered_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_unordered_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr addrspace(3) %a unordered, align 2
- ret void
-}
-
-define void @shared_unordered_sys_i32(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_unordered_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_unordered_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr addrspace(3) %a unordered, align 4
- ret void
-}
-
-define void @shared_unordered_sys_i64(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_unordered_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_unordered_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr addrspace(3) %a unordered, align 8
- ret void
-}
-
-define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_unordered_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
-; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_unordered_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr addrspace(3) %a unordered, align 4
- ret void
-}
-
-define void @shared_unordered_sys_double(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_unordered_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
-; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_unordered_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr addrspace(3) %a unordered, align 8
- ret void
-}
-
-; shared_unordered_volatile_sys
-
-define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_unordered_volatile_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0];
-; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
- ret void
-}
-
-define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_unordered_volatile_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0];
-; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i16, ptr addrspace(3) %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr addrspace(3) %a unordered, align 2
- ret void
-}
-
-define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_unordered_volatile_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0];
-; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i32, ptr addrspace(3) %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr addrspace(3) %a unordered, align 4
- ret void
-}
-
-define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_unordered_volatile_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0];
-; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr addrspace(3) %a unordered, align 8
- ret void
-}
-
-define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_unordered_volatile_sys_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr addrspace(3) %a unordered, align 4
- ret void
-}
-
-define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_unordered_volatile_sys_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr addrspace(3) %a unordered, align 8
- ret void
-}
-
-; shared_monotonic_sys
-
-define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_monotonic_sys_i8(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
-; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_monotonic_sys_i8(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
- ret void
-}
-
-define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_monotonic_sys_i16(
-; SM60: {
-; SM60-NEXT: .reg .b16 %rs<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
-; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
-; SM60-NEXT: add.s16 %rs2, %rs1, 1;
-; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_monotonic_sys_i16(
-; SM70: {
-; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1];
-; SM70-NEXT: add.s16 %rs2, %rs1, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2;
-; SM70-NEXT: ret;
- %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr addrspace(3) %a monotonic, align 2
- ret void
-}
-
-define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_monotonic_sys_i32(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
-; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; SM60-NEXT: add.s32 %r2, %r1, 1;
-; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_monotonic_sys_i32(
-; SM70: {
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1];
-; SM70-NEXT: add.s32 %r2, %r1, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2;
-; SM70-NEXT: ret;
- %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr addrspace(3) %a monotonic, align 4
- ret void
-}
-
-define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_monotonic_sys_i64(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<4>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
-; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
-; SM60-NEXT: add.s64 %rd3, %rd2, 1;
-; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_monotonic_sys_i64(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<4>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1];
-; SM70-NEXT: add.s64 %rd3, %rd2, 1;
-; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3;
-; SM70-NEXT: ret;
- %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr addrspace(3) %a monotonic, align 8
- ret void
-}
-
-define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_monotonic_sys_float(
-; SM60: {
-; SM60-NEXT: .reg .f32 %f<3>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
-; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
-; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_monotonic_sys_float(
-; SM70: {
-; SM70-NEXT: .reg .f32 %f<3>;
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1];
-; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2;
-; SM70-NEXT: ret;
- %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr addrspace(3) %a monotonic, align 4
- ret void
-}
-
-define void @shared_monotonic_sys_double(ptr addrspace(3) %a) {
-; SM60-LABEL: shared_monotonic_sys_double(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-NEXT: .reg .f64 %fd<3>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
-; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
-; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
-; SM60-NEXT: ret;
-;
-; SM70-LABEL: shared_monotonic_sys_double(
-; SM70: {
-; SM70-NEXT: .reg .b64 %rd<2>;
-; SM70-NEXT: .reg .f64 %fd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
-; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1];
-; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2;
-; SM70-NEXT: ret;
- %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr addrspace(3) %a monotonic, align 8
- ret void
-}
-
-; shared_monotonic_volatile_sys
-
-define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_monotonic_volatile_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0];
-; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
- ret void
-}
-
-define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_monotonic_volatile_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0];
-; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT: ret;
- %a.load = load atomic volatile i16, ptr addrspace(3) %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr addrspace(3) %a monotonic, align 2
+ %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
+ %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ store volatile <16 x i8> %a.add, ptr addrspace(3) %a
ret void
}
-define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_monotonic_volatile_sys_i32(
+define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi16(
; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0];
; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
; CHECK-NEXT: ret;
- %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr addrspace(3) %a monotonic, align 4
- ret void
-}
-
-define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_monotonic_volatile_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0];
-; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr addrspace(3) %a monotonic, align 8
+ %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
+ %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+ store volatile <2 x i16> %a.add, ptr addrspace(3) %a
ret void
}
-define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_monotonic_volatile_sys_float(
+define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi16(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b16 %rs<9>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT: ret;
- %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr addrspace(3) %a monotonic, align 4
+ %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
+ %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+ store volatile <4 x i16> %a.add, ptr addrspace(3) %a
ret void
}
-define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) {
-; CHECK-LABEL: shared_monotonic_volatile_sys_double(
+define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi16(
; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT: ret;
- %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr addrspace(3) %a monotonic, align 8
+ %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
+ %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ store volatile <8 x i16> %a.add, ptr addrspace(3) %a
ret void
}
-;; local statespace
-
-; local
-
-define void @local_i8(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_i8(
+define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi32(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_i8_param_0];
-; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3};
; CHECK-NEXT: ret;
- %a.load = load i8, ptr addrspace(5) %a
- %a.add = add i8 %a.load, 1
- store i8 %a.add, ptr addrspace(5) %a
+ %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
+ %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+ store volatile <2 x i32> %a.add, ptr addrspace(3) %a
ret void
}
-define void @local_i16(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_i16(
+define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi32(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_i16_param_0];
-; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT: ret;
- %a.load = load i16, ptr addrspace(5) %a
- %a.add = add i16 %a.load, 1
- store i16 %a.add, ptr addrspace(5) %a
+ %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
+ %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+ store volatile <4 x i32> %a.add, ptr addrspace(3) %a
ret void
}
-define void @local_i32(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_i32(
+define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi64(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_i32_param_0];
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT: ret;
- %a.load = load i32, ptr addrspace(5) %a
- %a.add = add i32 %a.load, 1
- store i32 %a.add, ptr addrspace(5) %a
+ %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store volatile <2 x i64> %a.add, ptr addrspace(3) %a
ret void
}
-define void @local_i64(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_i64(
+define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xfloat(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_i64_param_0];
-; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3};
; CHECK-NEXT: ret;
- %a.load = load i64, ptr addrspace(5) %a
- %a.add = add i64 %a.load, 1
- store i64 %a.add, ptr addrspace(5) %a
+ %a.load = load volatile <2 x float>, ptr addrspace(3) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store volatile <2 x float> %a.add, ptr addrspace(3) %a
ret void
}
-define void @local_float(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_float(
+define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xfloat(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .f32 %f<9>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_float_param_0];
-; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
; CHECK-NEXT: ret;
- %a.load = load float, ptr addrspace(5) %a
- %a.add = fadd float %a.load, 1.
- store float %a.add, ptr addrspace(5) %a
+ %a.load = load volatile <4 x float>, ptr addrspace(3) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store volatile <4 x float> %a.add, ptr addrspace(3) %a
ret void
}
-define void @local_double(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_double(
+define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_double_param_0];
-; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3};
; CHECK-NEXT: ret;
- %a.load = load double, ptr addrspace(5) %a
- %a.add = fadd double %a.load, 1.
- store double %a.add, ptr addrspace(5) %a
+ %a.load = load volatile <2 x double>, ptr addrspace(3) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store volatile <2 x double> %a.add, ptr addrspace(3) %a
ret void
}
+;; local statespace
+
+; local
+
define void @local_2xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi8(
; CHECK: {
@@ -5424,184 +2795,77 @@ define void @local_2xi64(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0];
; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
-; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4};
-; CHECK-NEXT: ret;
- %a.load = load <2 x i64>, ptr addrspace(5) %a
- %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
- store <2 x i64> %a.add, ptr addrspace(5) %a
- ret void
-}
-
-define void @local_2xfloat(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_2xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0];
-; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3};
-; CHECK-NEXT: ret;
- %a.load = load <2 x float>, ptr addrspace(5) %a
- %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
- store <2 x float> %a.add, ptr addrspace(5) %a
- ret void
-}
-
-define void @local_4xfloat(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_4xfloat(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<9>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0];
-; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT: ret;
- %a.load = load <4 x float>, ptr addrspace(5) %a
- %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
- store <4 x float> %a.add, ptr addrspace(5) %a
- ret void
-}
-
-define void @local_2xdouble(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0];
-; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
-; CHECK-NEXT: ret;
- %a.load = load <2 x double>, ptr addrspace(5) %a
- %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
- store <2 x double> %a.add, ptr addrspace(5) %a
- ret void
-}
-
-; local_volatile
-
-define void @local_volatile_i8(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i8_param_0];
-; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i8, ptr addrspace(5) %a
- %a.add = add i8 %a.load, 1
- store volatile i8 %a.add, ptr addrspace(5) %a
- ret void
-}
-
-define void @local_volatile_i16(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i16_param_0];
-; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load volatile i16, ptr addrspace(5) %a
- %a.add = add i16 %a.load, 1
- store volatile i16 %a.add, ptr addrspace(5) %a
- ret void
-}
-
-define void @local_volatile_i32(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i32_param_0];
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT: ret;
- %a.load = load volatile i32, ptr addrspace(5) %a
- %a.add = add i32 %a.load, 1
- store volatile i32 %a.add, ptr addrspace(5) %a
+ %a.load = load <2 x i64>, ptr addrspace(5) %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store <2 x i64> %a.add, ptr addrspace(5) %a
ret void
}
-define void @local_volatile_i64(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_i64(
+define void @local_2xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xfloat(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i64_param_0];
-; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0];
+; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3};
; CHECK-NEXT: ret;
- %a.load = load volatile i64, ptr addrspace(5) %a
- %a.add = add i64 %a.load, 1
- store volatile i64 %a.add, ptr addrspace(5) %a
+ %a.load = load <2 x float>, ptr addrspace(5) %a
+ %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
+ store <2 x float> %a.add, ptr addrspace(5) %a
ret void
}
-define void @local_volatile_float(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_float(
+define void @local_4xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xfloat(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .f32 %f<9>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_float_param_0];
-; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.f32 [%rd1], %f2;
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0];
+; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
; CHECK-NEXT: ret;
- %a.load = load volatile float, ptr addrspace(5) %a
- %a.add = fadd float %a.load, 1.
- store volatile float %a.add, ptr addrspace(5) %a
+ %a.load = load <4 x float>, ptr addrspace(5) %a
+ %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
+ store <4 x float> %a.add, ptr addrspace(5) %a
ret void
}
-define void @local_volatile_double(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_double(
+define void @local_2xdouble(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_double_param_0];
-; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0];
+; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
; CHECK-NEXT: ret;
- %a.load = load volatile double, ptr addrspace(5) %a
- %a.add = fadd double %a.load, 1.
- store volatile double %a.add, ptr addrspace(5) %a
+ %a.load = load <2 x double>, ptr addrspace(5) %a
+ %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
+ store <2 x double> %a.add, ptr addrspace(5) %a
ret void
}
+; local_volatile
+
define void @local_volatile_2xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi8(
; CHECK: {
@@ -6001,439 +3265,3 @@ define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
store volatile <2 x double> %a.add, ptr addrspace(5) %a
ret void
}
-
-; local_unordered_sys
-
-define void @local_unordered_sys_i8(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i8_param_0];
-; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
- ret void
-}
-
-define void @local_unordered_sys_i16(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i16_param_0];
-; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr addrspace(5) %a unordered, align 2
- ret void
-}
-
-define void @local_unordered_sys_i32(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i32_param_0];
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr addrspace(5) %a unordered, align 4
- ret void
-}
-
-define void @local_unordered_sys_i64(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i64_param_0];
-; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr addrspace(5) %a unordered, align 8
- ret void
-}
-
-define void @local_unordered_sys_float(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_sys_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_float_param_0];
-; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr addrspace(5) %a unordered, align 4
- ret void
-}
-
-define void @local_unordered_sys_double(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_sys_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_double_param_0];
-; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr addrspace(5) %a unordered, align 8
- ret void
-}
-
-; local_unordered_volatile_sys
-
-define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_volatile_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0];
-; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
- ret void
-}
-
-define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_volatile_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0];
-; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr addrspace(5) %a unordered, align 2
- ret void
-}
-
-define void @local_unordered_volatile_sys_i32(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_volatile_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0];
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr addrspace(5) %a unordered, align 4
- ret void
-}
-
-define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_volatile_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0];
-; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i64, ptr addrspace(5) %a unordered, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr addrspace(5) %a unordered, align 8
- ret void
-}
-
-define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_volatile_sys_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr addrspace(5) %a unordered, align 4
- ret void
-}
-
-define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_unordered_volatile_sys_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr addrspace(5) %a unordered, align 8
- ret void
-}
-
-; local_monotonic_sys
-
-define void @local_monotonic_sys_i8(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0];
-; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
- ret void
-}
-
-define void @local_monotonic_sys_i16(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0];
-; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic i16 %a.add, ptr addrspace(5) %a monotonic, align 2
- ret void
-}
-
-define void @local_monotonic_sys_i32(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0];
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic i32 %a.add, ptr addrspace(5) %a monotonic, align 4
- ret void
-}
-
-define void @local_monotonic_sys_i64(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0];
-; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic i64 %a.add, ptr addrspace(5) %a monotonic, align 8
- ret void
-}
-
-define void @local_monotonic_sys_float(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_sys_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_float_param_0];
-; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic float %a.add, ptr addrspace(5) %a monotonic, align 4
- ret void
-}
-
-define void @local_monotonic_sys_double(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_sys_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_double_param_0];
-; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic double %a.add, ptr addrspace(5) %a monotonic, align 8
- ret void
-}
-
-; local_monotonic_volatile_sys
-
-define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_volatile_sys_i8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0];
-; CHECK-NEXT: ld.local.u8 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u8 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
- ret void
-}
-
-define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_volatile_sys_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i16_param_0];
-; CHECK-NEXT: ld.local.u16 %rs1, [%rd1];
-; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.u16 [%rd1], %rs2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2
- %a.add = add i16 %a.load, 1
- store atomic volatile i16 %a.add, ptr addrspace(5) %a monotonic, align 2
- ret void
-}
-
-define void @local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_volatile_sys_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0];
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.u32 [%rd1], %r2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4
- %a.add = add i32 %a.load, 1
- store atomic volatile i32 %a.add, ptr addrspace(5) %a monotonic, align 4
- ret void
-}
-
-define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_volatile_sys_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0];
-; CHECK-NEXT: ld.local.u64 %rd2, [%rd1];
-; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.u64 [%rd1], %rd3;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8
- %a.add = add i64 %a.load, 1
- store atomic volatile i64 %a.add, ptr addrspace(5) %a monotonic, align 8
- ret void
-}
-
-define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_volatile_sys_float(
-; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT: ld.local.f32 %f1, [%rd1];
-; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT: st.local.f32 [%rd1], %f2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4
- %a.add = fadd float %a.load, 1.
- store atomic volatile float %a.add, ptr addrspace(5) %a monotonic, align 4
- ret void
-}
-
-define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_monotonic_volatile_sys_double(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT: ld.local.f64 %fd1, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.f64 [%rd1], %fd2;
-; CHECK-NEXT: ret;
- %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8
- %a.add = fadd double %a.load, 1.
- store atomic volatile double %a.add, ptr addrspace(5) %a monotonic, align 8
- ret void
-}
>From 2710318724dfca6409c42b0b1d05f3872274e288 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 12 Dec 2024 23:16:27 +0000
Subject: [PATCH 03/11] Address review feedback
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 25 ++---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 113 +++++++-------------
2 files changed, 49 insertions(+), 89 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 8536be18b89e01..548457baf99533 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1346,6 +1346,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
return true;
}
+static bool isVectorElementTypeUpsized(EVT EltVT) {
+ // Returns true if EltVT is a word-sized packed type (v2x16 or v4i8).
+ // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+ // total load/store size, PTX syntax only supports v2/v4, so vectorized
+ // loads/stores with the actual 8/16-bit element type would require v8/v16
+ // variants that do not exist. To load/store such vectors efficiently, Type
+ // Legalization splits them into word-sized chunks (v2x16/v4i8); callers
+ // of this helper then lower those chunks to PTX as b32 vector elements.
+ return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
+}
+
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
MemSDNode *MemSD = cast<MemSDNode>(N);
EVT LoadedVT = MemSD->getMemoryVT();
@@ -1400,12 +1411,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
- // Vectors of 8-and-16-bit elements above a certain size are special cases.
- // PTX doesn't have anything larger than ld.v4 for those element types.
- // In Type Legalization, rather than splitting those vectors into multiple
- // loads, we split the vector into v2x16/v4i8 chunks. Now, we lower to PTX as
- // vector loads of b32.
- if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
+ if (isVectorElementTypeUpsized(EltVT)) {
EltVT = MVT::i32;
FromType = NVPTX::PTXLdStInstCode::Untyped;
FromTypeWidth = 32;
@@ -2085,12 +2091,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return false;
}
- // Vectors of 8-and-16-bit elements above a certain size are special cases.
- // PTX doesn't have anything larger than st.v4 for those element types.
- // In Type Legalization, rather than splitting those vectors into multiple
- // stores, we split the vector into v2x16/v4i8 chunks. Now, we lower to
- // PTX as vector stores of b32.
- if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) {
+ if (isVectorElementTypeUpsized(EltVT)) {
EltVT = MVT::i32;
ToType = NVPTX::PTXLdStInstCode::Untyped;
ToTypeWidth = 32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a7d6ccc2853c60..7e06f4ae3e94ec 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -100,6 +100,16 @@ static cl::opt<bool> ForceMinByValParamAlign(
" params of device functions."),
cl::init(false));
+static auto GetUpsizedNumEltsAndEltVT = [](unsigned OldNumElts, EVT OldEltVT) {
+ // Number of OldEltVT elements that pack into one 32-bit word.
+ unsigned NPerWord = 32 / OldEltVT.getSizeInBits();
+ // Word-sized vector type (e.g. v4i8 for i8) used as the new element type.
+ EVT NewEltVT = MVT::getVectorVT(OldEltVT.getSimpleVT(), NPerWord);
+ // Number of whole word-sized vectors in OldNumElts elements.
+ unsigned NewNumElts = OldNumElts / NPerWord;
+ return std::pair(NewNumElts, NewEltVT);
+};
+
int NVPTXTargetLowering::getDivF32Level() const {
if (UsePrecDivF32.getNumOccurrences() > 0) {
// If nvptx-prec-div32=N is used on the command-line, always honor it
@@ -3163,11 +3173,13 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
- // Vectors of 8-and-16-bit elements above a certain size are special cases.
- // PTX doesn't have anything larger than st.v4 for those element types.
- // Here in Type Legalization, rather than splitting those vectors into
- // multiple stores, we split the vector into v2x16/v4i8 chunks. Later, in
- // Instruction Selection, we lower to PTX as vector stores of b32.
+ // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+ // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
+ // vectorized loads/stores with the actual element type for i8/i16 as that
+ // would require v8/v16 variants that do not exist.
+ // In order to load/store such vectors efficiently, here in Type Legalization,
+ // we split the vector into word-sized chunks (v2x16/v4i8). Later, we will
+ // lower to PTX as vectors of b32.
bool UpsizeElementTypes = false;
if (ValVT.isVector()) {
@@ -3225,30 +3237,9 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
unsigned NumElts = ValVT.getVectorNumElements();
if (UpsizeElementTypes) {
- switch (ValVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("Unexpected Vector Type");
- case MVT::v8i8: // <2 x i8x4>
- NumElts = 2;
- EltVT = MVT::v4i8;
- break;
- case MVT::v8f16: // <4 x f16x2>
- NumElts = 4;
- EltVT = MVT::v2f16;
- break;
- case MVT::v8bf16: // <4 x bf16x2>
- NumElts = 4;
- EltVT = MVT::v2bf16;
- break;
- case MVT::v8i16: // <4 x i16x2>
- NumElts = 4;
- EltVT = MVT::v2i16;
- break;
- case MVT::v16i8: // <4 x i8x4>
- NumElts = 4;
- EltVT = MVT::v4i8;
- break;
- }
+ auto [NewNumElts, NewEltVT] = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
+ NumElts = NewNumElts;
+ EltVT = NewEltVT;
}
// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
@@ -3279,17 +3270,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
// stored as b32s
unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
- SmallVector<SDValue, 8> Elts;
- for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
- SDValue E = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getVectorElementType(), Val,
- DAG.getIntPtrConstant(i * NumEltsPerSubVector + j, DL));
- Elts.push_back(E);
- }
- EVT VecVT =
- EVT::getVectorVT(*DAG.getContext(), EltVT.getVectorElementType(),
- NumEltsPerSubVector);
- SDValue SubVector = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elts);
+ SmallVector<SDValue, 4> SubVectorElts;
+ DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
+ NumEltsPerSubVector);
+ SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
Ops.push_back(SubVector);
}
} else {
@@ -6244,11 +6228,13 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
- // Vectors of 8-and-16-bit elements above a certain size are special cases.
- // PTX doesn't have anything larger than ld.v4 for those element types.
- // Here in Type Legalization, rather than splitting those vectors into
- // multiple loads, we split the vector into v2x16/v4i8 chunks. Later, in
- // Instruction Selection, we lower to PTX as vector loads of b32.
+ // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+ // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
+ // vectorized loads/stores with the actual element type for i8/i16 as that
+ // would require v8/v16 variants that do not exist.
+ // In order to load/store such vectors efficiently, here in Type Legalization,
+ // we split the vector into word-sized chunks (v2x16/v4i8). Later, we will
+ // lower to PTX as vectors of b32.
bool UpsizeElementTypes = false;
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
@@ -6302,30 +6288,9 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = ResVT.getVectorNumElements();
if (UpsizeElementTypes) {
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("Unexpected Vector Type");
- case MVT::v8i8: // <2 x i8x4>
- NumElts = 2;
- EltVT = MVT::v4i8;
- break;
- case MVT::v8f16: // <4 x f16x2>
- NumElts = 4;
- EltVT = MVT::v2f16;
- break;
- case MVT::v8bf16: // <4 x bf16x2>
- NumElts = 4;
- EltVT = MVT::v2bf16;
- break;
- case MVT::v8i16: // <4 x i16x2>
- NumElts = 4;
- EltVT = MVT::v2i16;
- break;
- case MVT::v16i8: // <4 x i8x4>
- NumElts = 4;
- EltVT = MVT::v4i8;
- break;
- }
+ auto [NewNumElts, NewEltVT] = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
+ NumElts = NewNumElts;
+ EltVT = NewEltVT;
}
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
@@ -6366,19 +6331,13 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LD->getMemoryVT(),
LD->getMemOperand());
- SmallVector<SDValue, 8> ScalarRes;
+ SmallVector<SDValue> ScalarRes;
if (UpsizeElementTypes) {
// Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
// into individual elements.
- unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
SDValue SubVector = NewLD.getValue(i);
- for (unsigned j = 0; j < NumEltsPerSubVector; ++j) {
- SDValue E =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT.getScalarType(),
- SubVector, DAG.getIntPtrConstant(j, DL));
- ScalarRes.push_back(E);
- }
+ DAG.ExtractVectorElements(SubVector, ScalarRes);
}
} else {
for (unsigned i = 0; i < NumElts; ++i) {
>From c4437530e2e055efd388468cf111370a09759310 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Fri, 13 Dec 2024 00:35:19 +0000
Subject: [PATCH 04/11] Fix ld.global.nc edge case, it was crashing
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2 +-
llvm/test/CodeGen/NVPTX/ldg-invariant.ll | 36 +++++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 548457baf99533..612dc2dda8cadc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1584,7 +1584,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NumElts /= 2;
} else if (OrigType == MVT::v4i8) {
EltVT = OrigType;
- NumElts = 1;
+ NumElts /= 4;
}
}
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 5d0db6f80a83d9..f7ca67fe1f6a5d 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -56,6 +56,42 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
ret half %sum
}
+; CHECK-LABEL: @ld_global_v8i8
+define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
+; CHECK: ld.global.nc.v2.u32
+ %a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
+ %v1 = extractelement <8 x i8> %a, i32 0
+ %v2 = extractelement <8 x i8> %a, i32 2
+ %v3 = extractelement <8 x i8> %a, i32 4
+ %v4 = extractelement <8 x i8> %a, i32 6
+ %sum1 = add i8 %v1, %v2
+ %sum2 = add i8 %v3, %v4
+ %sum = add i8 %sum1, %sum2
+ ret i8 %sum
+}
+
+; CHECK-LABEL: @ld_global_v16i8
+define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
+; CHECK: ld.global.nc.v4.u32
+ %a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
+ %v1 = extractelement <16 x i8> %a, i32 0
+ %v2 = extractelement <16 x i8> %a, i32 2
+ %v3 = extractelement <16 x i8> %a, i32 4
+ %v4 = extractelement <16 x i8> %a, i32 6
+ %v5 = extractelement <16 x i8> %a, i32 8
+ %v6 = extractelement <16 x i8> %a, i32 10
+ %v7 = extractelement <16 x i8> %a, i32 12
+ %v8 = extractelement <16 x i8> %a, i32 14
+ %sum1 = add i8 %v1, %v2
+ %sum2 = add i8 %v3, %v4
+ %sum3 = add i8 %v5, %v6
+ %sum4 = add i8 %v7, %v8
+ %sum5 = add i8 %sum1, %sum2
+ %sum6 = add i8 %sum3, %sum4
+ %sum7 = add i8 %sum5, %sum6
+ ret i8 %sum7
+}
+
; CHECK-LABEL: @ld_global_v2i32
define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
; CHECK: ld.global.nc.v2.{{[a-z]}}32
>From 06d0adef6afd17ed017773ae007a0e53eb4c3c4d Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Fri, 13 Dec 2024 21:53:04 +0000
Subject: [PATCH 05/11] Update ldg-invariant.ll to be auto generated
---
llvm/test/CodeGen/NVPTX/ldg-invariant.ll | 203 ++++++++++++++++++++---
1 file changed, 183 insertions(+), 20 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index f7ca67fe1f6a5d..8875b0d6a3fde8 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -1,21 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
; Check that invariant loads from the global addrspace are lowered to
; ld.global.nc.
-; CHECK-LABEL: @ld_global
define i32 @ld_global(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.{{[a-z]}}32
+; CHECK-LABEL: ld_global(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_param_0];
+; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
ret i32 %a
}
-; CHECK-LABEL: @ld_global_v2f16
define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
; Load of v2f16 is weird. We consider it to be a legal type, which happens to be
; loaded/stored as a 32-bit scalar.
-; CHECK: ld.global.nc.u32
+; CHECK-LABEL: ld_global_v2f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2f16_param_0];
+; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT: add.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: ret;
%a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <2 x half> %a, i32 0
%v2 = extractelement <2 x half> %a, i32 1
@@ -23,12 +47,33 @@ define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
ret half %sum
}
-; CHECK-LABEL: @ld_global_v4f16
define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
; Larger f16 vectors may be split into individual f16 elements and multiple
; loads/stores may be vectorized using f16 element type. Practically it's
; limited to v4 variant only.
-; CHECK: ld.global.nc.v4.u16
+; CHECK-LABEL: ld_global_v4f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<8>;
+; CHECK-NEXT: .reg .f32 %f<10>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v4f16_param_0];
+; CHECK-NEXT: ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT: add.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT: cvt.f32.f16 %f4, %rs4;
+; CHECK-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-NEXT: add.rn.f32 %f6, %f5, %f4;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT: cvt.f32.f16 %f7, %rs6;
+; CHECK-NEXT: cvt.f32.f16 %f8, %rs5;
+; CHECK-NEXT: add.rn.f32 %f9, %f8, %f7;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs7, %f9;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT: ret;
%a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <4 x half> %a, i32 0
%v2 = extractelement <4 x half> %a, i32 1
@@ -40,11 +85,37 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
ret half %sum
}
-; CHECK-LABEL: @ld_global_v8f16
define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of loads/stores,
; so load/store vectorizer has to convert v8f16 -> v4 x v2f16.
-; CHECK: ld.global.nc.v4.u32
+; CHECK-LABEL: ld_global_v8f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<8>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .f32 %f<10>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v8f16_param_0];
+; CHECK-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs4;
+; CHECK-NEXT: cvt.f32.f16 %f2, %rs3;
+; CHECK-NEXT: add.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT: cvt.f32.f16 %f4, %rs2;
+; CHECK-NEXT: cvt.f32.f16 %f5, %rs1;
+; CHECK-NEXT: add.rn.f32 %f6, %f5, %f4;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT: cvt.f32.f16 %f7, %rs6;
+; CHECK-NEXT: cvt.f32.f16 %f8, %rs5;
+; CHECK-NEXT: add.rn.f32 %f9, %f8, %f7;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs7, %f9;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT: ret;
%a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <8 x half> %a, i32 0
%v2 = extractelement <8 x half> %a, i32 2
@@ -56,9 +127,31 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
ret half %sum
}
-; CHECK-LABEL: @ld_global_v8i8
define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.v2.u32
+; CHECK-LABEL: ld_global_v8i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<8>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v8i8_param_0];
+; CHECK-NEXT: ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT: bfe.u32 %r5, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: bfe.u32 %r6, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT: add.s16 %rs5, %rs4, %rs3;
+; CHECK-NEXT: add.s16 %rs6, %rs2, %rs1;
+; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
+; CHECK-NEXT: cvt.u32.u16 %r7, %rs7;
+; CHECK-NEXT: and.b32 %r8, %r7, 255;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
%a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <8 x i8> %a, i32 0
%v2 = extractelement <8 x i8> %a, i32 2
@@ -70,9 +163,43 @@ define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
ret i8 %sum
}
-; CHECK-LABEL: @ld_global_v16i8
define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.v4.u32
+; CHECK-LABEL: ld_global_v16i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<16>;
+; CHECK-NEXT: .reg .b32 %r<15>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v16i8_param_0];
+; CHECK-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: bfe.u32 %r6, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r6;
+; CHECK-NEXT: bfe.u32 %r7, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: bfe.u32 %r8, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r8;
+; CHECK-NEXT: bfe.u32 %r9, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r9;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs6, %r10;
+; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r11;
+; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r12;
+; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
+; CHECK-NEXT: add.s16 %rs10, %rs6, %rs5;
+; CHECK-NEXT: add.s16 %rs11, %rs4, %rs3;
+; CHECK-NEXT: add.s16 %rs12, %rs2, %rs1;
+; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
+; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
+; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs15;
+; CHECK-NEXT: and.b32 %r14, %r13, 255;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
+; CHECK-NEXT: ret;
%a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x i8> %a, i32 0
%v2 = extractelement <16 x i8> %a, i32 2
@@ -92,9 +219,18 @@ define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
ret i8 %sum7
}
-; CHECK-LABEL: @ld_global_v2i32
define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.v2.{{[a-z]}}32
+; CHECK-LABEL: ld_global_v2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2i32_param_0];
+; CHECK-NEXT: ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%a = load <2 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <2 x i32> %a, i32 0
%v2 = extractelement <2 x i32> %a, i32 1
@@ -102,9 +238,20 @@ define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
ret i32 %sum
}
-; CHECK-LABEL: @ld_global_v4i32
define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.v4.{{[a-z]}}32
+; CHECK-LABEL: ld_global_v4i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v4i32_param_0];
+; CHECK-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r1, %r2;
+; CHECK-NEXT: add.s32 %r6, %r3, %r4;
+; CHECK-NEXT: add.s32 %r7, %r5, %r6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
%a = load <4 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <4 x i32> %a, i32 0
%v2 = extractelement <4 x i32> %a, i32 1
@@ -116,16 +263,32 @@ define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
ret i32 %sum3
}
-; CHECK-LABEL: @ld_not_invariant
define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.{{[a-z]}}32
+; CHECK-LABEL: ld_not_invariant(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_not_invariant_param_0];
+; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%a = load i32, ptr addrspace(1) %ptr
ret i32 %a
}
-; CHECK-LABEL: @ld_not_global_addrspace
define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
-; CHECK: ld.{{[a-z]}}32
+; CHECK-LABEL: ld_not_global_addrspace(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_not_global_addrspace_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%a = load i32, ptr addrspace(0) %ptr
ret i32 %a
}
>From 7a248026bf6e07f44415c871c6ccb32bd6c38ce4 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Fri, 13 Dec 2024 22:02:27 +0000
Subject: [PATCH 06/11] Add assert
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 612dc2dda8cadc..d561bb69a88a10 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1583,6 +1583,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
EltVT = OrigType;
NumElts /= 2;
} else if (OrigType == MVT::v4i8) {
+ assert(NumElts % 4 == 0 && "NumElts must be a multiple of 4");
EltVT = OrigType;
NumElts /= 4;
}
>From 3ac4c5941fa1bb6912ad1cb89e8e420972ae1b88 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 16 Dec 2024 19:23:20 +0000
Subject: [PATCH 07/11] Clean up with std::tie
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 28 +++++++++------------
1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7e06f4ae3e94ec..ee65f8cea50f07 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -100,16 +100,6 @@ static cl::opt<bool> ForceMinByValParamAlign(
" params of device functions."),
cl::init(false));
-static auto GetUpsizedNumEltsAndEltVT = [](unsigned OldNumElts, EVT OldEltVT) {
- // Number of elements to pack in one word.
- unsigned NPerWord = 32 / OldEltVT.getSizeInBits();
- // Word-sized vector.
- EVT NewEltVT = MVT::getVectorVT(OldEltVT.getSimpleVT(), NPerWord);
- // Number of word-sized vectors.
- unsigned NewNumElts = OldNumElts / NPerWord;
- return std::pair(NewNumElts, NewEltVT);
-};
-
int NVPTXTargetLowering::getDivF32Level() const {
if (UsePrecDivF32.getNumOccurrences() > 0) {
// If nvptx-prec-div32=N is used on the command-line, always honor it
@@ -172,6 +162,16 @@ static bool Is16bitsType(MVT VT) {
VT.SimpleTy == MVT::i16);
}
+static auto GetUpsizedNumEltsAndEltVT(unsigned OldNumElts, EVT OldEltVT) {
+ // Number of elements to pack in one word.
+ unsigned NPerWord = 32 / OldEltVT.getSizeInBits();
+ // Word-sized vector.
+ EVT NewEltVT = MVT::getVectorVT(OldEltVT.getSimpleVT(), NPerWord);
+ // Number of word-sized vectors.
+ unsigned NewNumElts = OldNumElts / NPerWord;
+ return std::pair(NewNumElts, NewEltVT);
+};
+
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
@@ -3237,9 +3237,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
unsigned NumElts = ValVT.getVectorNumElements();
if (UpsizeElementTypes) {
- auto [NewNumElts, NewEltVT] = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
- NumElts = NewNumElts;
- EltVT = NewEltVT;
+ std::tie(NumElts, EltVT) = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
}
// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
@@ -6288,9 +6286,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = ResVT.getVectorNumElements();
if (UpsizeElementTypes) {
- auto [NewNumElts, NewEltVT] = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
- NumElts = NewNumElts;
- EltVT = NewEltVT;
+ std::tie(NumElts, EltVT) = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
}
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
>From 0bd02be1716bd75390174baca21ada4b36466dca Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 16 Dec 2024 21:11:03 +0000
Subject: [PATCH 08/11] hoist more shared logic into the helper function
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 299 +++++++++-----------
1 file changed, 132 insertions(+), 167 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ee65f8cea50f07..72630fcf5f9764 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -64,6 +64,7 @@
#include <iterator>
#include <optional>
#include <string>
+#include <tuple>
#include <utility>
#include <vector>
@@ -162,14 +163,70 @@ static bool Is16bitsType(MVT VT) {
VT.SimpleTy == MVT::i16);
}
-static auto GetUpsizedNumEltsAndEltVT(unsigned OldNumElts, EVT OldEltVT) {
- // Number of elements to pack in one word.
- unsigned NPerWord = 32 / OldEltVT.getSizeInBits();
- // Word-sized vector.
- EVT NewEltVT = MVT::getVectorVT(OldEltVT.getSimpleVT(), NPerWord);
- // Number of word-sized vectors.
- unsigned NewNumElts = OldNumElts / NPerWord;
- return std::pair(NewNumElts, NewEltVT);
+// When legalizing vector loads/stores, this function is called, which does two things:
+// 1. Determines Whether the vector is something we want to custom lower, std::nullopt is returned if we do not want to custom lower it.
+// 2. If we do want to handle it, returns three parameters:
+// - unsigned int NumElts - The number of elements in the final vector
+// - EVT EltVT - The type of the elements in the final vector
+// - bool UpsizeElementTypes - Whether or not we are upsizing the elements of the vectors
+static std::optional<std::tuple<unsigned int, EVT, bool>> tryGetVectorLoweringParams(EVT ValVT) {
+ // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+ // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
+ // vectorized loads/stores with the actual element type for i8/i16 as that
+ // would require v8/v16 variants that do not exist.
+ // In order to load/store such vectors efficiently, here in Type Legalization,
+ // we split the vector into word-sized chunks (v2x16/v4i8). Later, we will
+ // lower to PTX as vectors of b32.
+ bool UpsizeElementTypes = false;
+
+ if (!ValVT.isVector() || !ValVT.isSimple())
+ return std::nullopt;
+
+ EVT EltVT = ValVT.getVectorElementType();
+ unsigned NumElts = ValVT.getVectorNumElements();
+
+ // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+ // legal. We can (and should) split that into 2 stores of <2 x double> here
+ // but I'm leaving that as a TODO for now.
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ return std::nullopt;
+ case MVT::v2i8:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v2i64:
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ case MVT::v2f32:
+ case MVT::v2f64:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v4i32:
+ case MVT::v4f16:
+ case MVT::v4bf16:
+ case MVT::v4f32:
+ // This is a "native" vector type
+ break;
+ case MVT::v8i8: // <2 x i8x4>
+ case MVT::v8f16: // <4 x f16x2>
+ case MVT::v8bf16: // <4 x bf16x2>
+ case MVT::v8i16: // <4 x i16x2>
+ case MVT::v16i8: // <4 x i8x4>
+ // This can be upsized into a "native" vector type
+ UpsizeElementTypes = true;
+ break;
+ }
+
+ if (UpsizeElementTypes) {
+ // Number of elements to pack in one word.
+ unsigned NPerWord = 32 / EltVT.getSizeInBits();
+ // Word-sized vector.
+ EltVT = MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord);
+ // Number of word-sized vectors.
+ NumElts = NumElts / NPerWord;
+ }
+
+ return std::tuple(NumElts, EltVT, UpsizeElementTypes);
};
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -3173,130 +3230,81 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
- // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
- // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
- // vectorized loads/stores with the actual element type for i8/i16 as that
- // would require v8/v16 variants that do not exist.
- // In order to load/store such vectors efficiently, here in Type Legalization,
- // we split the vector into word-sized chunks (v2x16/v4i8). Later, we will
- // lower to PTX as vectors of b32.
- bool UpsizeElementTypes = false;
-
- if (ValVT.isVector()) {
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 stores of <2 x double> here
- // but I'm leaving that as a TODO for now.
- if (!ValVT.isSimple())
- return SDValue();
- switch (ValVT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v2bf16:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f16:
- case MVT::v4bf16:
- case MVT::v4f32:
- // This is a "native" vector type
- break;
- case MVT::v8i8: // <2 x i8x4>
- case MVT::v8f16: // <4 x f16x2>
- case MVT::v8bf16: // <4 x bf16x2>
- case MVT::v8i16: // <4 x i16x2>
- case MVT::v16i8: // <4 x i8x4>
- // This can be upsized into a "native" vector type
- UpsizeElementTypes = true;
- break;
- }
-
- MemSDNode *MemSD = cast<MemSDNode>(N);
- const DataLayout &TD = DAG.getDataLayout();
-
- Align Alignment = MemSD->getAlign();
- Align PrefAlign =
- TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
- if (Alignment < PrefAlign) {
- // This store is not sufficiently aligned, so bail out and let this vector
- // store be scalarized. Note that we may still be able to emit smaller
- // vector stores. For example, if we are storing a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return SDValue();
- }
+ auto VectorLoweringParams = tryGetVectorLoweringParams(ValVT);
+ if (!VectorLoweringParams)
+ return SDValue();
+ auto [NumElts, EltVT, UpsizeElementTypes] = VectorLoweringParams.value();
- unsigned Opcode = 0;
- EVT EltVT = ValVT.getVectorElementType();
- unsigned NumElts = ValVT.getVectorNumElements();
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ const DataLayout &TD = DAG.getDataLayout();
- if (UpsizeElementTypes) {
- std::tie(NumElts, EltVT) = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
- }
+ Align Alignment = MemSD->getAlign();
+ Align PrefAlign =
+ TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
+ if (Alignment < PrefAlign) {
+ // This store is not sufficiently aligned, so bail out and let this vector
+ // store be scalarized. Note that we may still be able to emit smaller
+ // vector stores. For example, if we are storing a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return SDValue();
+ }
- // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // stored type to i16 and propagate the "real" type as the memory type.
- bool NeedExt = false;
- if (EltVT.getSizeInBits() < 16)
- NeedExt = true;
+ // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // stored type to i16 and propagate the "real" type as the memory type.
+ bool NeedExt = false;
+ if (EltVT.getSizeInBits() < 16)
+ NeedExt = true;
- switch (NumElts) {
- default:
- return SDValue();
- case 2:
- Opcode = NVPTXISD::StoreV2;
- break;
- case 4:
- Opcode = NVPTXISD::StoreV4;
- break;
- }
+ unsigned Opcode = 0;
+ switch (NumElts) {
+ default:
+ return SDValue();
+ case 2:
+ Opcode = NVPTXISD::StoreV2;
+ break;
+ case 4:
+ Opcode = NVPTXISD::StoreV4;
+ break;
+ }
- SmallVector<SDValue, 8> Ops;
+ SmallVector<SDValue, 8> Ops;
- // First is the chain
- Ops.push_back(N->getOperand(0));
+ // First is the chain
+ Ops.push_back(N->getOperand(0));
- if (UpsizeElementTypes) {
- // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
- // stored as b32s
- unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
- for (unsigned i = 0; i < NumElts; ++i) {
- SmallVector<SDValue, 4> SubVectorElts;
- DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
- NumEltsPerSubVector);
- SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
- Ops.push_back(SubVector);
- }
- } else {
- // Then the split values
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
- if (NeedExt)
- ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
- Ops.push_back(ExtVal);
- }
+ // Then the split values
+ if (UpsizeElementTypes) {
+ // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
+ // stored as b32s
+ unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SmallVector<SDValue, 4> SubVectorElts;
+ DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
+ NumEltsPerSubVector);
+ SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
+ Ops.push_back(SubVector);
}
+ } else {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+ DAG.getIntPtrConstant(i, DL));
+ if (NeedExt)
+ ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+ Ops.push_back(ExtVal);
+ }
+ }
- // Then any remaining arguments
- Ops.append(N->op_begin() + 2, N->op_end());
-
- SDValue NewSt =
- DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
+ // Then any remaining arguments
+ Ops.append(N->op_begin() + 2, N->op_end());
- // return DCI.CombineTo(N, NewSt, true);
- return NewSt;
- }
+ SDValue NewSt =
+ DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
- return SDValue();
+ // return DCI.CombineTo(N, NewSt, true);
+ return NewSt;
}
// st i1 v, addr
@@ -6226,46 +6234,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
- // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
- // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
- // vectorized loads/stores with the actual element type for i8/i16 as that
- // would require v8/v16 variants that do not exist.
- // In order to load/store such vectors efficiently, here in Type Legalization,
- // we split the vector into word-sized chunks (v2x16/v4i8). Later, we will
- // lower to PTX as vectors of b32.
- bool UpsizeElementTypes = false;
-
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 loads of <2 x double> here
- // but I'm leaving that as a TODO for now.
- assert(ResVT.isSimple() && "Can only handle simple types");
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
+ auto VectorLoweringParams = tryGetVectorLoweringParams(ResVT);
+ if (!VectorLoweringParams)
return;
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f16:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f16:
- case MVT::v4bf16:
- case MVT::v4f32:
- // This is a "native" vector type
- break;
- case MVT::v8i8: // <2 x i8x4>
- case MVT::v8f16: // <4 x f16x2>
- case MVT::v8bf16: // <4 x bf16x2>
- case MVT::v8i16: // <4 x i16x2>
- case MVT::v16i8: // <4 x i8x4>
- // This can be upsized into a "native" vector type
- UpsizeElementTypes = true;
- break;
- }
+ auto [NumElts, EltVT, UpsizeElementTypes] = VectorLoweringParams.value();
LoadSDNode *LD = cast<LoadSDNode>(N);
@@ -6282,13 +6254,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
return;
}
- EVT EltVT = ResVT.getVectorElementType();
- unsigned NumElts = ResVT.getVectorNumElements();
-
- if (UpsizeElementTypes) {
- std::tie(NumElts, EltVT) = GetUpsizedNumEltsAndEltVT(NumElts, EltVT);
- }
-
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// loaded type to i16 and propagate the "real" type as the memory type.
>From 9540054dc245d0dbb21a360fc1e7dbfb4aa10ff4 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 16 Dec 2024 21:16:45 +0000
Subject: [PATCH 09/11] Clang format
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 72630fcf5f9764..8188b061401ea6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -163,13 +163,17 @@ static bool Is16bitsType(MVT VT) {
VT.SimpleTy == MVT::i16);
}
-// When legalizing vector loads/stores, this function is called, which does two things:
-// 1. Determines Whether the vector is something we want to custom lower, std::nullopt is returned if we do not want to custom lower it.
+// When legalizing vector loads/stores, this function is called, which does two
+// things:
+// 1. Determines Whether the vector is something we want to custom lower,
+// std::nullopt is returned if we do not want to custom lower it.
// 2. If we do want to handle it, returns three parameters:
// - unsigned int NumElts - The number of elements in the final vector
// - EVT EltVT - The type of the elements in the final vector
-// - bool UpsizeElementTypes - Whether or not we are upsizing the elements of the vectors
-static std::optional<std::tuple<unsigned int, EVT, bool>> tryGetVectorLoweringParams(EVT ValVT) {
+// - bool UpsizeElementTypes - Whether or not we are upsizing the elements of
+// the vector
+static std::optional<std::tuple<unsigned int, EVT, bool>>
+tryGetVectorLoweringParams(EVT ValVT) {
// Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
// total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
// vectorized loads/stores with the actual element type for i8/i16 as that
@@ -3239,8 +3243,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
const DataLayout &TD = DAG.getDataLayout();
Align Alignment = MemSD->getAlign();
- Align PrefAlign =
- TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
+ Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
if (Alignment < PrefAlign) {
// This store is not sufficiently aligned, so bail out and let this vector
// store be scalarized. Note that we may still be able to emit smaller
@@ -3289,7 +3292,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
} else {
for (unsigned i = 0; i < NumElts; ++i) {
SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
+ DAG.getIntPtrConstant(i, DL));
if (NeedExt)
ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
Ops.push_back(ExtVal);
>From 09480b055afcb679ac59f4d87eef826a7c98d168 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 16 Dec 2024 23:17:45 +0000
Subject: [PATCH 10/11] Refactor helper function
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 39 +++++++++------------
1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8188b061401ea6..2b53eb16cb1286 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -173,16 +173,7 @@ static bool Is16bitsType(MVT VT) {
// - bool UpsizeElementTypes - Whether or not we are upsizing the elements of
// the vector
static std::optional<std::tuple<unsigned int, EVT, bool>>
-tryGetVectorLoweringParams(EVT ValVT) {
- // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
- // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
- // vectorized loads/stores with the actual element type for i8/i16 as that
- // would require v8/v16 variants that do not exist.
- // In order to load/store such vectors efficiently, here in Type Legalization,
- // we split the vector into word-sized chunks (v2x16/v4i8). Later, we will
- // lower to PTX as vectors of b32.
- bool UpsizeElementTypes = false;
-
+getVectorLoweringShape(EVT ValVT) {
if (!ValVT.isVector() || !ValVT.isSimple())
return std::nullopt;
@@ -210,27 +201,29 @@ tryGetVectorLoweringParams(EVT ValVT) {
case MVT::v4bf16:
case MVT::v4f32:
// This is a "native" vector type
- break;
+ return std::tuple(NumElts, EltVT, /* UpsizeElementTypes = */ false);
case MVT::v8i8: // <2 x i8x4>
case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
case MVT::v8i16: // <4 x i16x2>
case MVT::v16i8: // <4 x i8x4>
- // This can be upsized into a "native" vector type
- UpsizeElementTypes = true;
- break;
- }
+ // This can be upsized into a "native" vector type.
+ // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+ // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
+ // vectorized loads/stores with the actual element type for i8/i16 as that
+ // would require v8/v16 variants that do not exist.
+ // In order to load/store such vectors efficiently, here in Type
+ // Legalization, we split the vector into word-sized chunks (v2x16/v4i8).
+ // Later, we will lower to PTX as vectors of b32.
- if (UpsizeElementTypes) {
// Number of elements to pack in one word.
unsigned NPerWord = 32 / EltVT.getSizeInBits();
- // Word-sized vector.
- EltVT = MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord);
- // Number of word-sized vectors.
- NumElts = NumElts / NPerWord;
+ return std::tuple(NumElts / NPerWord,
+ MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord),
+ /* UpsizeElementTypes = */ true);
}
- return std::tuple(NumElts, EltVT, UpsizeElementTypes);
+ llvm_unreachable("All cases should return.");
};
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -3234,7 +3227,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
- auto VectorLoweringParams = tryGetVectorLoweringParams(ValVT);
+ auto VectorLoweringParams = getVectorLoweringShape(ValVT);
if (!VectorLoweringParams)
return SDValue();
auto [NumElts, EltVT, UpsizeElementTypes] = VectorLoweringParams.value();
@@ -6237,7 +6230,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
- auto VectorLoweringParams = tryGetVectorLoweringParams(ResVT);
+ auto VectorLoweringParams = getVectorLoweringShape(ResVT);
if (!VectorLoweringParams)
return;
auto [NumElts, EltVT, UpsizeElementTypes] = VectorLoweringParams.value();
>From a21ca5fcd0aedc3755b913632daada8a643fdcd8 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 16 Dec 2024 23:39:06 +0000
Subject: [PATCH 11/11] Remove upsized boolean and derive it in the caller
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 50 +++++++++++----------
1 file changed, 27 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2b53eb16cb1286..ef3cf60a03d61c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -64,7 +64,6 @@
#include <iterator>
#include <optional>
#include <string>
-#include <tuple>
#include <utility>
#include <vector>
@@ -167,23 +166,21 @@ static bool Is16bitsType(MVT VT) {
// things:
// 1. Determines Whether the vector is something we want to custom lower,
// std::nullopt is returned if we do not want to custom lower it.
-// 2. If we do want to handle it, returns three parameters:
+// 2. If we do want to handle it, returns two parameters:
// - unsigned int NumElts - The number of elements in the final vector
// - EVT EltVT - The type of the elements in the final vector
-// - bool UpsizeElementTypes - Whether or not we are upsizing the elements of
-// the vector
-static std::optional<std::tuple<unsigned int, EVT, bool>>
-getVectorLoweringShape(EVT ValVT) {
- if (!ValVT.isVector() || !ValVT.isSimple())
+static std::optional<std::pair<unsigned int, EVT>>
+getVectorLoweringShape(EVT VectorVT) {
+ if (!VectorVT.isVector() || !VectorVT.isSimple())
return std::nullopt;
- EVT EltVT = ValVT.getVectorElementType();
- unsigned NumElts = ValVT.getVectorNumElements();
+ EVT EltVT = VectorVT.getVectorElementType();
+ unsigned NumElts = VectorVT.getVectorNumElements();
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
// legal. We can (and should) split that into 2 stores of <2 x double> here
// but I'm leaving that as a TODO for now.
- switch (ValVT.getSimpleVT().SimpleTy) {
+ switch (VectorVT.getSimpleVT().SimpleTy) {
default:
return std::nullopt;
case MVT::v2i8:
@@ -201,7 +198,7 @@ getVectorLoweringShape(EVT ValVT) {
case MVT::v4bf16:
case MVT::v4f32:
// This is a "native" vector type
- return std::tuple(NumElts, EltVT, /* UpsizeElementTypes = */ false);
+ return std::pair(NumElts, EltVT);
case MVT::v8i8: // <2 x i8x4>
case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
@@ -218,12 +215,12 @@ getVectorLoweringShape(EVT ValVT) {
// Number of elements to pack in one word.
unsigned NPerWord = 32 / EltVT.getSizeInBits();
- return std::tuple(NumElts / NPerWord,
- MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord),
- /* UpsizeElementTypes = */ true);
+
+ return std::pair(NumElts / NPerWord,
+ MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord));
}
- llvm_unreachable("All cases should return.");
+ llvm_unreachable("All cases in switch should return.");
};
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
@@ -3227,10 +3224,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(N);
EVT ValVT = Val.getValueType();
- auto VectorLoweringParams = getVectorLoweringShape(ValVT);
- if (!VectorLoweringParams)
+ auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
+ if (!NumEltsAndEltVT)
return SDValue();
- auto [NumElts, EltVT, UpsizeElementTypes] = VectorLoweringParams.value();
+ auto [NumElts, EltVT] = NumEltsAndEltVT.value();
MemSDNode *MemSD = cast<MemSDNode>(N);
const DataLayout &TD = DAG.getDataLayout();
@@ -3271,7 +3268,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
Ops.push_back(N->getOperand(0));
// Then the split values
- if (UpsizeElementTypes) {
+ if (ValVT.getVectorNumElements() > NumElts) {
+ // If the number of elements has changed, getVectorLoweringShape has upsized
+ // the element types
+ assert((Isv2x16VT(EltVT) || EltVT == MVT::v4i8) && "Unexpected upsized type.");
// Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
// stored as b32s
unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
@@ -6230,10 +6230,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
assert(ResVT.isVector() && "Vector load must have vector type");
- auto VectorLoweringParams = getVectorLoweringShape(ResVT);
- if (!VectorLoweringParams)
+ auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
+ if (!NumEltsAndEltVT)
return;
- auto [NumElts, EltVT, UpsizeElementTypes] = VectorLoweringParams.value();
+ auto [NumElts, EltVT] = NumEltsAndEltVT.value();
LoadSDNode *LD = cast<LoadSDNode>(N);
@@ -6289,7 +6289,11 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LD->getMemOperand());
SmallVector<SDValue> ScalarRes;
- if (UpsizeElementTypes) {
+ if (ResVT.getVectorNumElements() > NumElts) {
+ // If the number of elements has changed, getVectorLoweringShape has upsized
+ // the element types
+ assert((Isv2x16VT(EltVT) || EltVT == MVT::v4i8) &&
+ "Unexpected upsized type.");
// Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
// into individual elements.
for (unsigned i = 0; i < NumElts; ++i) {
More information about the llvm-commits
mailing list