[clang] [llvm] [NVPTX] Support inline asm with 128-bit operand in NVPTX backend (PR #97113)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 28 15:14:53 PDT 2024
https://github.com/Chengjunp updated https://github.com/llvm/llvm-project/pull/97113
From b9d056d5aba5185dcfb44ee1062bc6fbc36d002d Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Tue, 25 Jun 2024 17:20:27 +0000
Subject: [PATCH 1/6] [NVPTX] Support inline asm with 128-bit operand in NVPTX
backend
---
clang/lib/Basic/Targets/NVPTX.h | 1 +
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 +
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 2 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 68 +++++++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 94 +++++++++
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 2 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 12 +-
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 4 +
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 3 +
.../CodeGen/NVPTX/inline-asm-b128-test1.ll | 92 +++++++++
.../CodeGen/NVPTX/inline-asm-b128-test2.ll | 57 ++++++
.../CodeGen/NVPTX/inline-asm-b128-test3.ll | 179 ++++++++++++++++++
14 files changed, 525 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
create mode 100644 llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
create mode 100644 llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index f476d49047c01..7e9b6b34df636 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -105,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
case 'l':
case 'f':
case 'd':
+ case 'q':
Info.setAllowsRegister();
return true;
}
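With the 'q' constraint now accepted by clang's NVPTX target, a 128-bit value can be handed to inline PTX directly from CUDA device code. A minimal sketch of the intended usage (function and parameter names are illustrative, not from this patch; assumes sm_70 or newer and that clang binds unsigned __int128 to 'q'):

__global__ void store_u128(unsigned __int128 *out, unsigned long long lo,
                           unsigned long long hi) {
  // Build the 128-bit value in ordinary 64-bit registers; the backend forms
  // the .b128 register pair when the value is passed through "q".
  unsigned __int128 v = ((unsigned __int128)hi << 64) | lo;
  asm volatile("{ st.b128 [%0], %1; }" : : "l"(out), "q"(v) : "memory");
}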
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b7a20c351f5ff..380d878c1f532 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -60,6 +60,9 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const {
case 6:
OS << "%fd";
break;
+ case 7:
+ OS << "%rq";
+ break;
}
unsigned VReg = Reg.id() & 0x0FFFFFFF;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ca077d41d36ba..1645261d74d06 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -315,6 +315,8 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
Ret = (5 << 28);
} else if (RC == &NVPTX::Float64RegsRegClass) {
Ret = (6 << 28);
+ } else if (RC == &NVPTX::Int128RegsRegClass) {
+ Ret = (7 << 28);
} else {
report_fatal_error("Bad register class");
}
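For reference, a small host-side C++ sketch of the encoding that printRegName later undoes: the top nibble of the encoded value selects the register class (7 is the new Int128Regs/%rq class) and the low 28 bits hold the virtual register number. Illustrative only, not part of the patch.

static unsigned encodeRQVirtualReg(unsigned VRegNum) {
  // Mirrors encodeVirtualRegister above: class id in bits 31-28, vreg number below.
  return (7u << 28) | (VRegNum & 0x0FFFFFFF);
}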
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1e1cbb15e33d4..05706e200bda6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -519,6 +519,20 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryConstantFP(N))
return;
break;
+ case ISD::CopyToReg: {
+ if (N->getOperand(1).getValueType() == MVT::i128) {
+ SelectV2I64toI128(N);
+ return;
+ }
+ break;
+ }
+ case ISD::CopyFromReg: {
+ if(N->getOperand(1).getValueType() == MVT::i128){
+ SelectI128toV2I64(N);
+ return;
+ }
+ break;
+ }
default:
break;
}
@@ -3798,6 +3812,60 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
return true;
}
+void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
+ // Lower a CopyToReg with two 64-bit inputs
+ // Dst:i128, lo:i64, hi:i64
+ //
+ // CopyToReg Dst, lo, hi;
+ //
+ // ==>
+ //
+ // tmp = V2I64toI128 {lo, hi};
+ // CopyToReg Dst, tmp;
+ SDValue Dst = N->getOperand(1);
+ SDValue Lo = N->getOperand(2);
+ SDValue Hi = N->getOperand(3);
+
+ SDLoc DL(N);
+ SDNode *Mov =
+ CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
+
+ SmallVector<EVT, 8> ResultsType(N->value_begin(), N->value_end());
+ SmallVector<SDValue, 8> NewOps(N->getNumOperands() - 1);
+ NewOps[0] = N->getOperand(0);
+ NewOps[1] = Dst;
+ NewOps[2] = SDValue(Mov, 0);
+ if (N->getNumOperands() == 5)
+ NewOps[3] = N->getOperand(4);
+ SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
+
+ ReplaceNode(N, NewValue.getNode());
+}
+
+void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
+ // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
+ // Dst:i128, Src:i128
+ //
+ // {lo, hi} = CopyFromReg Src
+ //
+ // ==>
+ //
+ // {lo, hi} = I128toV2I64 Src
+ //
+ SDValue Ch = N->getOperand(0);
+ SDValue Src = N->getOperand(1);
+ SDValue Glue = N->getOperand(2);
+ SDLoc DL(N);
+
+ // Add Glue and Ch to the operands and results to avoid break the execution order
+ SDNode *Mov = CurDAG->getMachineNode(
+ NVPTX::I128toV2I64, DL,
+ {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
+ {Src, Ch, Glue});
+
+ ReplaceNode(N, Mov);
+}
+
/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
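The two selection hooks above rewrite the CopyToReg/CopyFromReg nodes that surround an INLINEASM with 128-bit operands: a 'q' input is packed into one 128-bit register via V2I64toI128, and a '=q' output is unpacked again via I128toV2I64. A hedged CUDA sketch that exercises both directions (the function name is illustrative; assumes sm_70 or newer):

__device__ unsigned __int128 passthrough_u128(unsigned __int128 x) {
  unsigned __int128 y;
  // "q" input: packed into a single 128-bit register before the asm.
  // "=q" output: unpacked back into two 64-bit registers after the asm.
  asm("{ mov.b128 %0, %1; }" : "=q"(y) : "q"(x));
  return y;
}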
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c5524351f2ff9..49626d4051485 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -74,7 +74,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
-
+ void SelectV2I64toI128(SDNode *N);
+ void SelectI128toV2I64(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 76633a437fe71..927c3ef55dcb1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -859,6 +859,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
}
+ // Custom lowering for inline asm with 128-bit operands
+ setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
+ setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
+
// No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
// No FPOW or FREM in PTX.
@@ -2804,6 +2808,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVectorArith(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::CopyToReg:
+ return LowerCopyToReg_128(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
@@ -3094,6 +3100,53 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
+SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
+ // operand so that it can pass the legalization.
+
+ assert(Op.getOperand(1).getValueType() == MVT::i128 &&
+ "Custom lowering for 128-bit CopyToReg only");
+
+ SDNode *Node = Op.getNode();
+ SDLoc DL(Node);
+
+ SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op->getOperand(2));
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
+ DAG.getIntPtrConstant(1, DL));
+
+ SmallVector<SDValue, 8> NewOps(Op->getNumOperands() + 1);
+ SmallVector<EVT, 8> ResultsType(Node->value_begin(), Node->value_end());
+
+ NewOps[0] = Op->getOperand(0); // Chain
+ NewOps[1] = Op->getOperand(1); // Dst Reg
+ NewOps[2] = Lo; // Lower 64-bit
+ NewOps[3] = Hi; // Higher 64-bit
+ if (Op.getNumOperands() == 4)
+ NewOps[4] = Op->getOperand(3); // Glue if exists
+
+ return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
+}
+
+unsigned NVPTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT = std::nullopt) const {
+ if(VT == MVT::i128 && RegisterVT == MVT::i128)
+ return 1;
+ return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
+}
+
+bool NVPTXTargetLowering::splitValueIntoRegisterParts(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
+ if (Val.getValueType() == MVT::i128 && NumParts == 1) {
+ Parts[0] = Val;
+ return true;
+ }
+ return false;
+}
+
// This creates target external symbol for a function parameter.
// Name of the symbol is composed from its index and the function name.
// Negative index corresponds to special parameter (unsized array) used for
@@ -5152,6 +5205,7 @@ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
case 'l':
case 'f':
case 'd':
+ case 'q':
case '0':
case 'N':
return C_RegisterClass;
@@ -5177,6 +5231,12 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'l':
case 'N':
return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+ case 'q': {
+ if (STI.getSmVersion() < 70)
+ report_fatal_error("Inline asm with 128 bit operands is only "
+ "supported for sm_70 and higher!");
+ return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
+ }
case 'f':
return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
case 'd':
@@ -6244,6 +6304,37 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
}
}
+static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) {
+ // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit result
+ // so that it can pass the legalization
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+ SDValue Glue = N->getOperand(2);
+
+ assert(Reg.getValueType() == MVT::i128 &&
+ "Custom lowering for CopyFromReg with 128-bit reg only");
+ SmallVector<EVT, 8> ResultsType(4);
+ SmallVector<SDValue, 8> NewOps(3);
+ ResultsType[0] = MVT::i64;
+ ResultsType[1] = MVT::i64;
+ ResultsType[2] = N->getValueType(1);
+ ResultsType[3] = N->getValueType(2);
+
+ NewOps[0] = Chain;
+ NewOps[1] = Reg;
+ NewOps[2] = Glue;
+
+ SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
+ {NewValue.getValue(0), NewValue.getValue(1)});
+
+ Results.push_back(Pair);
+ Results.push_back(NewValue.getValue(2));
+ Results.push_back(NewValue.getValue(3));
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6255,6 +6346,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::INTRINSIC_W_CHAIN:
ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
return;
+ case ISD::CopyFromReg:
+ ReplaceCopyFromReg_128(N, DAG, Results);
+ return;
}
}
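On the lowering side, the i128 inline-asm operand itself is kept as a single register part (getNumRegisters and splitValueIntoRegisterParts return one i128 part), while the surrounding CopyToReg/CopyFromReg are rewritten to operate on i64 pairs (LowerCopyToReg_128 and ReplaceCopyFromReg_128). A sketch of a 128-bit asm result feeding ordinary arithmetic, which takes the ReplaceCopyFromReg_128 path during type legalization; it mirrors the kernel4 test below and assumes sm_70 or newer:

__device__ void add_one_to_asm_result(unsigned __int128 *out) {
  unsigned __int128 v;
  // The "=q" result comes back as two i64 values joined by a BUILD_PAIR, so
  // the i128 add below legalizes as usual.
  asm("{ mov.b128 %0, 41; }" : "=q"(v));
  *out = v + 1;
}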
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index e211286fcc556..63262961b363e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -640,6 +640,14 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCopyToReg_128(SDValue Op, SelectionDAG &DAG) const;
+ unsigned getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT) const override;
+ bool
+ splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ std::optional<CallingConv::ID> CC) const override;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index b0d792b5ee3fe..673858f92e7ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -51,6 +51,8 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (DestRC == &NVPTX::Int64RegsRegClass) {
Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
: NVPTX::BITCONVERT_64_F2I);
+ } else if (DestRC == &NVPTX::Int128RegsRegClass) {
+ Op = NVPTX::IMOV128rr;
} else if (DestRC == &NVPTX::Float32RegsRegClass) {
Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
: NVPTX::BITCONVERT_32_I2F);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c4c35a1f74ba9..827febe845a4c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2097,6 +2097,8 @@ let IsSimpleMove=1, hasSideEffects=0 in {
"mov.u32 \t$dst, $sss;", []>;
def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
"mov.u64 \t$dst, $sss;", []>;
+ def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
+ "mov.b128 \t$dst, $sss;", []>;
def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
"mov.b16 \t$dst, $sss;", []>;
@@ -3545,6 +3547,9 @@ let hasSideEffects = false in {
def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
(ins Int32Regs:$s1, Int32Regs:$s2),
"mov.b64 \t$d, {{$s1, $s2}};", []>;
+ def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d),
+ (ins Int64Regs:$s1, Int64Regs:$s2),
+ "mov.b128 \t$d, {{$s1, $s2}};", []>;
def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
(ins Float32Regs:$s1, Float32Regs:$s2),
"mov.b64 \t$d, {{$s1, $s2}};", []>;
@@ -3560,6 +3565,9 @@ let hasSideEffects = false in {
def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2}}, $s;", []>;
+ def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
+ (ins Int128Regs:$s),
+ "mov.b128 \t{{$d1, $d2}}, $s;", []>;
def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
(ins Float64Regs:$s),
"mov.b64 \t{{$d1, $d2}}, $s;", []>;
@@ -3629,7 +3637,7 @@ def : Pat<(i32 (ctlz (i32 Int32Regs:$a))), (CLZr32 Int32Regs:$a)>;
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
+def : Pat<(i32 (trunc (i64 (ctlz Int64Regs:$a)))), (CLZr64 Int64Regs:$a)>;
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary. We also need to subtract 16 because
@@ -3667,7 +3675,7 @@ def : Pat<(i32 (ctpop (i32 Int32Regs:$a))), (POPCr32 Int32Regs:$a)>;
// pattern that avoids the type conversion if we're truncating the result to
// i32 anyway.
def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
+def : Pat<(i32 (trunc (i64 (ctpop Int64Regs:$a)))), (POPCr64 Int64Regs:$a)>;
// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
// If we know that we're storing into an i32, we can avoid the final trunc.
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index f1213f030bba7..a8a23f04c1249 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -31,6 +31,8 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
return ".f32";
if (RC == &NVPTX::Float64RegsRegClass)
return ".f64";
+ if (RC == &NVPTX::Int128RegsRegClass)
+ return ".b128";
if (RC == &NVPTX::Int64RegsRegClass)
// We use untyped (.b) integer registers here as NVCC does.
// Correctness of generated code does not depend on register type,
@@ -67,6 +69,8 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
return "%f";
if (RC == &NVPTX::Float64RegsRegClass)
return "%fd";
+ if (RC == &NVPTX::Int128RegsRegClass)
+ return "%rq";
if (RC == &NVPTX::Int64RegsRegClass)
return "%rd";
if (RC == &NVPTX::Int32RegsRegClass)
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index b5231a9cf67f9..2011f0f7e328f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -37,6 +37,7 @@ foreach i = 0...4 in {
def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
def R#i : NVPTXReg<"%r"#i>; // 32-bit
def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+ def RQ#i : NVPTXReg<"%rq"#i>; // 128-bit
def H#i : NVPTXReg<"%h"#i>; // 16-bit float
def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
def F#i : NVPTXReg<"%f"#i>; // 32-bit float
@@ -62,6 +63,8 @@ def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
(add (sequence "R%u", 0, 4),
VRFrame32, VRFrameLocal32)>;
def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
+// 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only.
+def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>;
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
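In the emitted PTX, the Int128Regs virtual registers print with the %rq prefix and exist only to carry inline-asm operands. A sketch of splitting a 'q' operand inside the asm body with scratch .b64 registers, the same pattern the new tests use (names illustrative; assumes sm_70 or newer):

__device__ void store_halves(unsigned __int128 v, unsigned long long *lo_out,
                             unsigned long long *hi_out) {
  asm volatile("{\n\t"
               ".reg .b64 lo, hi;\n\t"
               "mov.b128 {lo, hi}, %0;\n\t"  // unpack the 128-bit operand
               "st.b64 [%1], lo;\n\t"
               "st.b64 [%2], hi;\n\t"
               "}"
               :
               : "q"(v), "l"(lo_out), "l"(hi_out)
               : "memory");
}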
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
new file mode 100644
index 0000000000000..dec0451c34ccc
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+@value = internal addrspace(1) global i128 0, align 16
+@llvm.used = appending global [6 x ptr] [ptr @_Z7kernel1v, ptr @_Z7kernel2Pn, ptr @_Z7kernel3Pb, ptr @_Z7kernel4v, ptr @_Z7kernel5Pn, ptr addrspacecast (ptr addrspace(1) @value to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline convergent mustprogress willreturn
+define void @_Z7kernel1v() #0 {
+ ; CHECK-LABEL: _Z7kernel1v
+ ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
+ ; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42;
+ ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
+ ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
+
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline convergent mustprogress willreturn
+define void @_Z7kernel2Pn(ptr nocapture readonly %data) #0 {
+ ; CHECK-LABEL: _Z7kernel2Pn
+ ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8];
+ ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]];
+ ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
+ ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
+
+ %1 = addrspacecast ptr %data to ptr addrspace(1)
+ %2 = load <2 x i64>, ptr addrspace(1) %1, align 16
+ %3 = bitcast <2 x i64> %2 to i128
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline convergent mustprogress willreturn
+define void @_Z7kernel3Pb(ptr nocapture readonly %flag) #0 {
+ ; CHECK-LABEL: _Z7kernel3Pb
+ ; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}};
+ ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
+ ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
+ ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
+
+ %1 = addrspacecast ptr %flag to ptr addrspace(1)
+ %tmp1 = load i8, ptr addrspace(1) %1, align 1
+ %tobool.not = icmp eq i8 %tmp1, 0
+ %. = select i1 %tobool.not, i128 24, i128 42
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel4v() #1 {
+ ; CHECK-LABEL: _Z7kernel4v
+ ; CHECK-O3: { mov.b128 [[REG_128:%rq[0-9]+]], 41; }
+ ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]];
+
+ %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() #4
+ %add = add nsw i128 %1, 1
+ %2 = bitcast i128 %add to <2 x i64>
+ store <2 x i64> %2, ptr addrspace(1) @value, align 16
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none)
+define void @_Z7kernel5Pn(ptr nocapture readonly %data) #2 {
+ ; CHECK-LABEL: _Z7kernel5Pn
+ ; CHECK-O3: ld.global.v2.u64 {[[REG_LO_IN:%rd[0-9]+]], [[REG_HI_IN:%rd[0-9]+]]}, [{{%rd[0-9]+}}];
+ ; CHECK-O3: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO_IN]], [[REG_HI_IN]]};
+ ; CHECK-O3: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; }
+ ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]];
+
+ %1 = addrspacecast ptr %data to ptr addrspace(1)
+ %2 = load <2 x i64>, ptr addrspace(1) %1, align 16
+ %3 = bitcast <2 x i64> %2 to i128
+ %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3) #4
+ %add = add nsw i128 %4, 1
+ %5 = bitcast i128 %add to <2 x i64>
+ store <2 x i64> %5, ptr addrspace(1) @value, align 16
+ ret void
+}
+
+attributes #0 = { alwaysinline convergent mustprogress willreturn "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
+attributes #1 = { alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
+attributes #2 = { alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
+attributes #3 = { convergent nounwind }
+attributes #4 = { nounwind }
+
+
+!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
+
+!0 = !{i32 2, i32 0, i32 3, i32 1}
+!1 = !{i32 2, i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
new file mode 100644
index 0000000000000..337479a06c3f0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+@u128_max = internal addrspace(1) global i128 0, align 16
+@u128_zero = internal addrspace(1) global i128 0, align 16
+@i128_max = internal addrspace(1) global i128 0, align 16
+@i128_min = internal addrspace(1) global i128 0, align 16
+@v_u128_max = internal addrspace(1) global i128 0, align 16
+@v_u128_zero = internal addrspace(1) global i128 0, align 16
+@v_i128_max = internal addrspace(1) global i128 0, align 16
+@v_i128_min = internal addrspace(1) global i128 0, align 16
+@v64 = internal addrspace(1) global i64* null, align 8
+@llvm.used = appending global [10 x i8*] [i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_min to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*) to i8*), i8* bitcast (i64** addrspacecast (i64* addrspace(1)* @v64 to i64**) to i8*), i8* bitcast (void ()* @_Z6kernelv to i8*)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline
+define void @_Z6kernelv() #0 {
+ ; CHECK-LABLE: _Z6kernelv
+ ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1;
+ ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]};
+ ; CHECK: mov.u64 [[I128_MAX_HI:%rd[0-9]+]], 9223372036854775807;
+ ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I128_MAX_HI]]};
+ ; CHECK: mov.u64 [[I128_MIN_HI:%rd[0-9]+]], -9223372036854775808;
+ ; CHECK: mov.u64 [[ZERO:%rd[0-9]+]], 0;
+ ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[ZERO]], [[I128_MIN_HI]]};
+ ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[ZERO]], [[ZERO]]};
+
+ %tmp = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr = getelementptr inbounds i64, i64* %tmp, i32 0
+ %tmp1 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr2 = getelementptr inbounds i64, i64* %tmp1, i32 1
+ call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, i64* %add.ptr, i64* %add.ptr2, i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*)) #1
+ %tmp3 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr4 = getelementptr inbounds i64, i64* %tmp3, i32 2
+ %tmp5 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr6 = getelementptr inbounds i64, i64* %tmp5, i32 3
+ call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, i64* %add.ptr4, i64* %add.ptr6, i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*)) #1
+ %tmp7 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr8 = getelementptr inbounds i64, i64* %tmp7, i32 4
+ %tmp9 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr10 = getelementptr inbounds i64, i64* %tmp9, i32 5
+ call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, i64* %add.ptr8, i64* %add.ptr10, i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*)) #1
+ %tmp11 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr12 = getelementptr inbounds i64, i64* %tmp11, i32 6
+ %tmp13 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
+ %add.ptr14 = getelementptr inbounds i64, i64* %tmp13, i32 7
+ call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, i64* %add.ptr12, i64* %add.ptr14, i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*)) #1
+ ret void
+}
+
+attributes #0 = { alwaysinline "nvvm.annotations_transplanted" "nvvm.kernel" }
+attributes #1 = { nounwind }
+
+!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
+
+!0 = !{i32 2, i32 0, i32 3, i32 1}
+!1 = !{i32 2, i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
new file mode 100644
index 0000000000000..4f077ec5383c9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -0,0 +1,179 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+@size = internal addrspace(1) global i32 0, align 4
+@value = internal addrspace(1) global i128 0, align 16
+@x = internal addrspace(1) global i128 0, align 16
+@y = internal addrspace(1) global i128 0, align 16
+@z = internal addrspace(1) global i128 0, align 16
+@llvm.used = appending global [6 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @size to ptr), ptr addrspacecast (ptr addrspace(1) @value to ptr), ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr addrspacecast (ptr addrspace(1) @y to ptr), ptr addrspacecast (ptr addrspace(1) @z to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelv() #0 {
+ ; CHECK-LABEL: _Z6kernelv
+ ; CHECK: mov.b128 [[X:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+ ; CHECK: mov.b128 [[Y:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+ ; CHECK: mov.b128 [[Z:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+ ; CHECK: mov.b128 {lo, hi}, [[X]];
+ ; CHECK: mov.b128 [[X]], {lo, hi};
+ ; CHECK: mov.b128 {lo, hi}, [[Y]];
+ ; CHECK: mov.b128 [[Y]], {lo, hi};
+ ; CHECK: mov.b128 {lo, hi}, [[Z]];
+ ; CHECK: mov.b128 [[Z]], {lo, hi};
+ ; CHECK: mov.b128 {[[X_LO:%rd[0-9]+]], [[X_HI:%rd[0-9]+]]}, [[X]];
+ ; CHECK: mov.b128 {[[Y_LO:%rd[0-9]+]], [[Y_HI:%rd[0-9]+]]}, [[Y]];
+ ; CHECK: mov.b128 {[[Z_LO:%rd[0-9]+]], [[Z_HI:%rd[0-9]+]]}, [[Z]];
+ ; CHECK: mov.b128 [[X_NEW:%rq[0-9]+]], {[[X_LO]], [[X_HI]]};
+ ; CHECK: mov.b128 [[Y_NEW:%rq[0-9]+]], {[[Y_LO]], [[Y_HI]]};
+ ; CHECK: mov.b128 [[Z_NEW:%rq[0-9]+]], {[[Z_LO]], [[Z_HI]]};
+ ; CHECK: mov.b128 {lo, hi}, [[X_NEW]];
+ ; CHECK: mov.b128 [[X_NEW]], {lo, hi};
+ ; CHECK: mov.b128 {lo, hi}, [[Y_NEW]];
+ ; CHECK: mov.b128 [[Y_NEW]], {lo, hi};
+ ; CHECK: mov.b128 {lo, hi}, [[Z_NEW]];
+ ; CHECK: mov.b128 [[Z_NEW]], {lo, hi};
+ ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[X_NEW]];
+ ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Y_NEW]];
+ ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Z_NEW]];
+
+ %tmp11 = load i32, ptr addrspace(1) @size, align 4
+ %cmp3.not = icmp eq i32 %tmp11, 0
+ br i1 %cmp3.not, label %._crit_edge, label %.lr.ph.preheader
+
+.lr.ph.preheader: ; preds = %0
+ %x.promoted5 = load i128, ptr addrspace(1) @x, align 16
+ %y.promoted6 = load i128, ptr addrspace(1) @y, align 16
+ %z.promoted7 = load i128, ptr addrspace(1) @z, align 16
+ %value.promoted8 = load i128, ptr addrspace(1) @value, align 16
+ %umax = sext i32 %tmp11 to i64
+ %xtraiter = and i64 %umax, 3
+ %1 = icmp ult i32 %tmp11, 4
+ br i1 %1, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph.preheader.new
+
+.lr.ph.preheader.new: ; preds = %.lr.ph.preheader
+ %unroll_iter = and i64 %umax, -4
+ br label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader.new
+ %2 = phi i128 [ %value.promoted8, %.lr.ph.preheader.new ], [ %add14.3, %.lr.ph ]
+ %3 = phi i128 [ %z.promoted7, %.lr.ph.preheader.new ], [ %asmresult21.3, %.lr.ph ]
+ %4 = phi i128 [ %y.promoted6, %.lr.ph.preheader.new ], [ %asmresult20.3, %.lr.ph ]
+ %5 = phi i128 [ %x.promoted5, %.lr.ph.preheader.new ], [ %asmresult19.3, %.lr.ph ]
+ %i.04 = phi i64 [ 0, %.lr.ph.preheader.new ], [ %inc.3, %.lr.ph ]
+ %niter = phi i64 [ 0, %.lr.ph.preheader.new ], [ %niter.next.3, %.lr.ph ]
+ %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3) #1
+ %asmresult = extractvalue { i128, i128, i128 } %6, 0
+ %asmresult7 = extractvalue { i128, i128, i128 } %6, 1
+ %asmresult8 = extractvalue { i128, i128, i128 } %6, 2
+ %add = add nsw i128 %asmresult, %asmresult7
+ %add12 = add nsw i128 %add, %asmresult8
+ %add14 = add nsw i128 %add12, %2
+ %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8) #1
+ %asmresult19 = extractvalue { i128, i128, i128 } %7, 0
+ %asmresult20 = extractvalue { i128, i128, i128 } %7, 1
+ %asmresult21 = extractvalue { i128, i128, i128 } %7, 2
+ %inc = add nuw nsw i64 %i.04, 1
+ %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21) #1
+ %asmresult.1 = extractvalue { i128, i128, i128 } %8, 0
+ %asmresult7.1 = extractvalue { i128, i128, i128 } %8, 1
+ %asmresult8.1 = extractvalue { i128, i128, i128 } %8, 2
+ %add.1 = add nsw i128 %asmresult.1, %asmresult7.1
+ %add12.1 = add nsw i128 %add.1, %asmresult8.1
+ %add14.1 = add nsw i128 %add12.1, %add14
+ %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1) #1
+ %asmresult19.1 = extractvalue { i128, i128, i128 } %9, 0
+ %asmresult20.1 = extractvalue { i128, i128, i128 } %9, 1
+ %asmresult21.1 = extractvalue { i128, i128, i128 } %9, 2
+ %inc.1 = add nuw nsw i64 %i.04, 2
+ %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1) #1
+ %asmresult.2 = extractvalue { i128, i128, i128 } %10, 0
+ %asmresult7.2 = extractvalue { i128, i128, i128 } %10, 1
+ %asmresult8.2 = extractvalue { i128, i128, i128 } %10, 2
+ %add.2 = add nsw i128 %asmresult.2, %asmresult7.2
+ %add12.2 = add nsw i128 %add.2, %asmresult8.2
+ %add14.2 = add nsw i128 %add12.2, %add14.1
+ %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2) #1
+ %asmresult19.2 = extractvalue { i128, i128, i128 } %11, 0
+ %asmresult20.2 = extractvalue { i128, i128, i128 } %11, 1
+ %asmresult21.2 = extractvalue { i128, i128, i128 } %11, 2
+ %inc.2 = add nuw nsw i64 %i.04, 3
+ %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2) #1
+ %asmresult.3 = extractvalue { i128, i128, i128 } %12, 0
+ %asmresult7.3 = extractvalue { i128, i128, i128 } %12, 1
+ %asmresult8.3 = extractvalue { i128, i128, i128 } %12, 2
+ %add.3 = add nsw i128 %asmresult.3, %asmresult7.3
+ %add12.3 = add nsw i128 %add.3, %asmresult8.3
+ %add14.3 = add nsw i128 %add12.3, %add14.2
+ %13 = bitcast i128 %add14.3 to <2 x i64>
+ store <2 x i64> %13, ptr addrspace(1) @value, align 16
+ %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3) #1
+ %asmresult19.3 = extractvalue { i128, i128, i128 } %14, 0
+ %asmresult20.3 = extractvalue { i128, i128, i128 } %14, 1
+ %asmresult21.3 = extractvalue { i128, i128, i128 } %14, 2
+ %15 = bitcast i128 %asmresult19.3 to <2 x i64>
+ store <2 x i64> %15, ptr addrspace(1) @x, align 16
+ %16 = bitcast i128 %asmresult20.3 to <2 x i64>
+ store <2 x i64> %16, ptr addrspace(1) @y, align 16
+ %17 = bitcast i128 %asmresult21.3 to <2 x i64>
+ store <2 x i64> %17, ptr addrspace(1) @z, align 16
+ %inc.3 = add nuw i64 %i.04, 4
+ %niter.next.3 = add i64 %niter, 4
+ %niter.ncmp.3.not = icmp eq i64 %niter.next.3, %unroll_iter
+ br i1 %niter.ncmp.3.not, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph, !llvm.loop !2
+
+._crit_edge.loopexit.unr-lcssa: ; preds = %.lr.ph, %.lr.ph.preheader
+ %.unr = phi i128 [ %value.promoted8, %.lr.ph.preheader ], [ %add14.3, %.lr.ph ]
+ %.unr9 = phi i128 [ %z.promoted7, %.lr.ph.preheader ], [ %asmresult21.3, %.lr.ph ]
+ %.unr10 = phi i128 [ %y.promoted6, %.lr.ph.preheader ], [ %asmresult20.3, %.lr.ph ]
+ %.unr11 = phi i128 [ %x.promoted5, %.lr.ph.preheader ], [ %asmresult19.3, %.lr.ph ]
+ %i.04.unr = phi i64 [ 0, %.lr.ph.preheader ], [ %inc.3, %.lr.ph ]
+ %lcmp.mod.not = icmp eq i64 %xtraiter, 0
+ br i1 %lcmp.mod.not, label %._crit_edge, label %.lr.ph.epil
+
+.lr.ph.epil: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa
+ %18 = phi i128 [ %add14.epil, %.lr.ph.epil ], [ %.unr, %._crit_edge.loopexit.unr-lcssa ]
+ %19 = phi i128 [ %asmresult21.epil, %.lr.ph.epil ], [ %.unr9, %._crit_edge.loopexit.unr-lcssa ]
+ %20 = phi i128 [ %asmresult20.epil, %.lr.ph.epil ], [ %.unr10, %._crit_edge.loopexit.unr-lcssa ]
+ %21 = phi i128 [ %asmresult19.epil, %.lr.ph.epil ], [ %.unr11, %._crit_edge.loopexit.unr-lcssa ]
+ %i.04.epil = phi i64 [ %inc.epil, %.lr.ph.epil ], [ %i.04.unr, %._crit_edge.loopexit.unr-lcssa ]
+ %epil.iter = phi i64 [ %epil.iter.next, %.lr.ph.epil ], [ 0, %._crit_edge.loopexit.unr-lcssa ]
+ %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19) #1
+ %asmresult.epil = extractvalue { i128, i128, i128 } %22, 0
+ %asmresult7.epil = extractvalue { i128, i128, i128 } %22, 1
+ %asmresult8.epil = extractvalue { i128, i128, i128 } %22, 2
+ %add.epil = add nsw i128 %asmresult.epil, %asmresult7.epil
+ %add12.epil = add nsw i128 %add.epil, %asmresult8.epil
+ %add14.epil = add nsw i128 %add12.epil, %18
+ %23 = bitcast i128 %add14.epil to <2 x i64>
+ store <2 x i64> %23, ptr addrspace(1) @value, align 16
+ %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil) #1
+ %asmresult19.epil = extractvalue { i128, i128, i128 } %24, 0
+ %asmresult20.epil = extractvalue { i128, i128, i128 } %24, 1
+ %asmresult21.epil = extractvalue { i128, i128, i128 } %24, 2
+ %25 = bitcast i128 %asmresult19.epil to <2 x i64>
+ store <2 x i64> %25, ptr addrspace(1) @x, align 16
+ %26 = bitcast i128 %asmresult20.epil to <2 x i64>
+ store <2 x i64> %26, ptr addrspace(1) @y, align 16
+ %27 = bitcast i128 %asmresult21.epil to <2 x i64>
+ store <2 x i64> %27, ptr addrspace(1) @z, align 16
+ %inc.epil = add nuw i64 %i.04.epil, 1
+ %epil.iter.next = add i64 %epil.iter, 1
+ %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter
+ br i1 %epil.iter.cmp.not, label %._crit_edge, label %.lr.ph.epil, !llvm.loop !4
+
+._crit_edge: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa, %0
+ ret void
+}
+
+attributes #0 = { alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
+attributes #1 = { nounwind }
+
+!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
+
+!0 = !{i32 2, i32 0, i32 3, i32 1}
+!1 = !{i32 2, i32 0}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.mustprogress"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.unroll.disable"}
From d3d53434eda400a44121c4a2e52dcaa8b9077227 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Tue, 25 Jun 2024 21:54:48 +0000
Subject: [PATCH 2/6] Format Code & Update tests
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 17 ++---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 23 +++---
.../CodeGen/NVPTX/inline-asm-b128-test1.ll | 28 +++-----
.../CodeGen/NVPTX/inline-asm-b128-test2.ll | 70 ++++++++-----------
.../CodeGen/NVPTX/inline-asm-b128-test3.ll | 24 +++----
5 files changed, 73 insertions(+), 89 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 05706e200bda6..9c0498560db21 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -527,7 +527,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
break;
}
case ISD::CopyFromReg: {
- if(N->getOperand(1).getValueType() == MVT::i128){
+ if (N->getOperand(1).getValueType() == MVT::i128) {
SelectI128toV2I64(N);
return;
}
@@ -3825,17 +3825,17 @@ void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
SDValue Dst = N->getOperand(1);
SDValue Lo = N->getOperand(2);
SDValue Hi = N->getOperand(3);
-
+
SDLoc DL(N);
SDNode *Mov =
CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
-
+
SmallVector<EVT, 8> ResultsType(N->value_begin(), N->value_end());
SmallVector<SDValue, 8> NewOps(N->getNumOperands() - 1);
NewOps[0] = N->getOperand(0);
NewOps[1] = Dst;
NewOps[2] = SDValue(Mov, 0);
- if (N->getNumOperands() == 5)
+ if (N->getNumOperands() == 5)
NewOps[3] = N->getOperand(4);
SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
@@ -3847,17 +3847,18 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
// Dst:i128, Src:i128
//
// {lo, hi} = CopyFromReg Src
- //
+ //
// ==>
- //
+ //
// {lo, hi} = I128toV2I64 Src
- //
+ //
SDValue Ch = N->getOperand(0);
SDValue Src = N->getOperand(1);
SDValue Glue = N->getOperand(2);
SDLoc DL(N);
- // Add Glue and Ch to the operands and results to avoid break the execution order
+ // Add Glue and Ch to the operands and results to avoid break the execution
+ // order
SDNode *Mov = CurDAG->getMachineNode(
NVPTX::I128toV2I64, DL,
{MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 927c3ef55dcb1..3cfac3baeb5c9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3107,15 +3107,15 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
assert(Op.getOperand(1).getValueType() == MVT::i128 &&
"Custom lowering for 128-bit CopyToReg only");
-
+
SDNode *Node = Op.getNode();
SDLoc DL(Node);
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op->getOperand(2));
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
- DAG.getIntPtrConstant(0, DL));
+ DAG.getIntPtrConstant(0, DL));
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
- DAG.getIntPtrConstant(1, DL));
+ DAG.getIntPtrConstant(1, DL));
SmallVector<SDValue, 8> NewOps(Op->getNumOperands() + 1);
SmallVector<EVT, 8> ResultsType(Node->value_begin(), Node->value_end());
@@ -3130,11 +3130,12 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
}
-unsigned NVPTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
- std::optional<MVT> RegisterVT = std::nullopt) const {
- if(VT == MVT::i128 && RegisterVT == MVT::i128)
- return 1;
- return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
+unsigned NVPTXTargetLowering::getNumRegisters(
+ LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT = std::nullopt) const {
+ if (VT == MVT::i128 && RegisterVT == MVT::i128)
+ return 1;
+ return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
}
bool NVPTXTargetLowering::splitValueIntoRegisterParts(
@@ -5205,7 +5206,7 @@ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
case 'l':
case 'f':
case 'd':
- case 'q':
+ case 'q':
case '0':
case 'N':
return C_RegisterClass;
@@ -6306,8 +6307,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
- // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit result
- // so that it can pass the legalization
+ // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
+ // result so that it can pass the legalization
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Reg = N->getOperand(1);
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
index dec0451c34ccc..8b5369d2804b0 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
@@ -3,22 +3,21 @@
target triple = "nvptx64-nvidia-cuda"
@value = internal addrspace(1) global i128 0, align 16
-@llvm.used = appending global [6 x ptr] [ptr @_Z7kernel1v, ptr @_Z7kernel2Pn, ptr @_Z7kernel3Pb, ptr @_Z7kernel4v, ptr @_Z7kernel5Pn, ptr addrspacecast (ptr addrspace(1) @value to ptr)], section "llvm.metadata"
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z7kernel1v() #0 {
+define void @_Z7kernel1v() {
; CHECK-LABEL: _Z7kernel1v
; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42;
; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
- tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42) #3
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42)
ret void
}
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z7kernel2Pn(ptr nocapture readonly %data) #0 {
+define void @_Z7kernel2Pn(ptr nocapture readonly %data) {
; CHECK-LABEL: _Z7kernel2Pn
; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8];
; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]];
@@ -28,12 +27,12 @@ define void @_Z7kernel2Pn(ptr nocapture readonly %data) #0 {
%1 = addrspacecast ptr %data to ptr addrspace(1)
%2 = load <2 x i64>, ptr addrspace(1) %1, align 16
%3 = bitcast <2 x i64> %2 to i128
- tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3) #3
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %3)
ret void
}
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z7kernel3Pb(ptr nocapture readonly %flag) #0 {
+define void @_Z7kernel3Pb(ptr nocapture readonly %flag) {
; CHECK-LABEL: _Z7kernel3Pb
; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}};
; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
@@ -44,17 +43,17 @@ define void @_Z7kernel3Pb(ptr nocapture readonly %flag) #0 {
%tmp1 = load i8, ptr addrspace(1) %1, align 1
%tobool.not = icmp eq i8 %tmp1, 0
%. = select i1 %tobool.not, i128 24, i128 42
- tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.) #3
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.)
ret void
}
; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none)
-define void @_Z7kernel4v() #1 {
+define void @_Z7kernel4v() {
; CHECK-LABEL: _Z7kernel4v
; CHECK-O3: { mov.b128 [[REG_128:%rq[0-9]+]], 41; }
; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]];
- %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"() #4
+ %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"()
%add = add nsw i128 %1, 1
%2 = bitcast i128 %add to <2 x i64>
store <2 x i64> %2, ptr addrspace(1) @value, align 16
@@ -62,7 +61,7 @@ define void @_Z7kernel4v() #1 {
}
; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none)
-define void @_Z7kernel5Pn(ptr nocapture readonly %data) #2 {
+define void @_Z7kernel5Pn(ptr nocapture readonly %data) {
; CHECK-LABEL: _Z7kernel5Pn
; CHECK-O3: ld.global.v2.u64 {[[REG_LO_IN:%rd[0-9]+]], [[REG_HI_IN:%rd[0-9]+]]}, [{{%rd[0-9]+}}];
; CHECK-O3: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO_IN]], [[REG_HI_IN]]};
@@ -72,20 +71,13 @@ define void @_Z7kernel5Pn(ptr nocapture readonly %data) #2 {
%1 = addrspacecast ptr %data to ptr addrspace(1)
%2 = load <2 x i64>, ptr addrspace(1) %1, align 16
%3 = bitcast <2 x i64> %2 to i128
- %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3) #4
+ %4 = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %3)
%add = add nsw i128 %4, 1
%5 = bitcast i128 %add to <2 x i64>
store <2 x i64> %5, ptr addrspace(1) @value, align 16
ret void
}
-attributes #0 = { alwaysinline convergent mustprogress willreturn "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
-attributes #1 = { alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
-attributes #2 = { alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
-attributes #3 = { convergent nounwind }
-attributes #4 = { nounwind }
-
-
!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
!0 = !{i32 2, i32 0, i32 3, i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
index 337479a06c3f0..94b641e8faf05 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
@@ -10,48 +10,40 @@ target triple = "nvptx64-nvidia-cuda"
@v_u128_zero = internal addrspace(1) global i128 0, align 16
@v_i128_max = internal addrspace(1) global i128 0, align 16
@v_i128_min = internal addrspace(1) global i128 0, align 16
-@v64 = internal addrspace(1) global i64* null, align 8
-@llvm.used = appending global [10 x i8*] [i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @i128_min to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*) to i8*), i8* bitcast (i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*) to i8*), i8* bitcast (i64** addrspacecast (i64* addrspace(1)* @v64 to i64**) to i8*), i8* bitcast (void ()* @_Z6kernelv to i8*)], section "llvm.metadata"
+@v64 = internal addrspace(1) global ptr null, align 8
-; Function Attrs: alwaysinline
-define void @_Z6kernelv() #0 {
- ; CHECK-LABLE: _Z6kernelv
- ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1;
- ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]};
- ; CHECK: mov.u64 [[I128_MAX_HI:%rd[0-9]+]], 9223372036854775807;
- ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I128_MAX_HI]]};
- ; CHECK: mov.u64 [[I128_MIN_HI:%rd[0-9]+]], -9223372036854775808;
- ; CHECK: mov.u64 [[ZERO:%rd[0-9]+]], 0;
- ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[ZERO]], [[I128_MIN_HI]]};
- ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[ZERO]], [[ZERO]]};
-
- %tmp = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr = getelementptr inbounds i64, i64* %tmp, i32 0
- %tmp1 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr2 = getelementptr inbounds i64, i64* %tmp1, i32 1
- call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, i64* %add.ptr, i64* %add.ptr2, i128* addrspacecast (i128 addrspace(1)* @v_u128_max to i128*)) #1
- %tmp3 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr4 = getelementptr inbounds i64, i64* %tmp3, i32 2
- %tmp5 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr6 = getelementptr inbounds i64, i64* %tmp5, i32 3
- call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, i64* %add.ptr4, i64* %add.ptr6, i128* addrspacecast (i128 addrspace(1)* @v_i128_max to i128*)) #1
- %tmp7 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr8 = getelementptr inbounds i64, i64* %tmp7, i32 4
- %tmp9 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr10 = getelementptr inbounds i64, i64* %tmp9, i32 5
- call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, i64* %add.ptr8, i64* %add.ptr10, i128* addrspacecast (i128 addrspace(1)* @v_i128_min to i128*)) #1
- %tmp11 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr12 = getelementptr inbounds i64, i64* %tmp11, i32 6
- %tmp13 = load i64*, i64** addrspacecast (i64* addrspace(1)* @v64 to i64**), align 8
- %add.ptr14 = getelementptr inbounds i64, i64* %tmp13, i32 7
- call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, i64* %add.ptr12, i64* %add.ptr14, i128* addrspacecast (i128 addrspace(1)* @v_u128_zero to i128*)) #1
+; Function Attrs: alwaysinline convergent mustprogress willreturn
+define void @_Z6kernelv() {
+ ; CHECK-LABEL: _Z6kernelv
+ ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1;
+ ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]};
+ ; CHECK: mov.u64 [[I64_MAX:%rd[0-9]+]], 9223372036854775807;
+ ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I64_MAX]]}
+ ; CHECK: mov.u64 [[I64_MIN:%rd[0-9]+]], -9223372036854775808;
+ ; CHECK: mov.u64 [[U64_ZERO:%rd[0-9]+]], 0;
+ ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[U64_ZERO]], [[I64_MIN]]}
+ ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[U64_ZERO]], [[U64_ZERO]]}
+
+ %tmp = load ptr, ptr addrspace(1) @v64, align 8
+ %add.ptr2 = getelementptr inbounds i64, ptr %tmp, i64 1
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr))
+ %tmp3 = load ptr, ptr addrspace(1) @v64, align 8
+ %add.ptr4 = getelementptr inbounds i64, ptr %tmp3, i64 2
+ %add.ptr6 = getelementptr inbounds i64, ptr %tmp3, i64 3
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr))
+ %tmp7 = load ptr, ptr addrspace(1) @v64, align 8
+ %add.ptr8 = getelementptr inbounds i64, ptr %tmp7, i64 4
+ %add.ptr10 = getelementptr inbounds i64, ptr %tmp7, i64 5
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr))
+ %tmp11 = load ptr, ptr addrspace(1) @v64, align 8
+ %add.ptr12 = getelementptr inbounds i64, ptr %tmp11, i64 6
+ %add.ptr14 = getelementptr inbounds i64, ptr %tmp11, i64 7
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr))
ret void
}
-attributes #0 = { alwaysinline "nvvm.annotations_transplanted" "nvvm.kernel" }
-attributes #1 = { nounwind }
-!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
+!nvvmir.version = !{!2, !3, !2, !3, !3, !2, !2, !2, !3}
-!0 = !{i32 2, i32 0, i32 3, i32 1}
-!1 = !{i32 2, i32 0}
+!2 = !{i32 2, i32 0, i32 3, i32 1}
+!3 = !{i32 2, i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
index 4f077ec5383c9..9d7a25ca4d467 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -10,7 +10,7 @@ target triple = "nvptx64-nvidia-cuda"
@llvm.used = appending global [6 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @size to ptr), ptr addrspacecast (ptr addrspace(1) @value to ptr), ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr addrspacecast (ptr addrspace(1) @y to ptr), ptr addrspacecast (ptr addrspace(1) @z to ptr)], section "llvm.metadata"
; Function Attrs: alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
-define void @_Z6kernelv() #0 {
+define void @_Z6kernelv() {
; CHECK-LABEL: _Z6kernelv
; CHECK: mov.b128 [[X:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: mov.b128 [[Y:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
@@ -62,43 +62,43 @@ define void @_Z6kernelv() #0 {
%5 = phi i128 [ %x.promoted5, %.lr.ph.preheader.new ], [ %asmresult19.3, %.lr.ph ]
%i.04 = phi i64 [ 0, %.lr.ph.preheader.new ], [ %inc.3, %.lr.ph ]
%niter = phi i64 [ 0, %.lr.ph.preheader.new ], [ %niter.next.3, %.lr.ph ]
- %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3) #1
+ %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3)
%asmresult = extractvalue { i128, i128, i128 } %6, 0
%asmresult7 = extractvalue { i128, i128, i128 } %6, 1
%asmresult8 = extractvalue { i128, i128, i128 } %6, 2
%add = add nsw i128 %asmresult, %asmresult7
%add12 = add nsw i128 %add, %asmresult8
%add14 = add nsw i128 %add12, %2
- %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8) #1
+ %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8)
%asmresult19 = extractvalue { i128, i128, i128 } %7, 0
%asmresult20 = extractvalue { i128, i128, i128 } %7, 1
%asmresult21 = extractvalue { i128, i128, i128 } %7, 2
%inc = add nuw nsw i64 %i.04, 1
- %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21) #1
+ %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21)
%asmresult.1 = extractvalue { i128, i128, i128 } %8, 0
%asmresult7.1 = extractvalue { i128, i128, i128 } %8, 1
%asmresult8.1 = extractvalue { i128, i128, i128 } %8, 2
%add.1 = add nsw i128 %asmresult.1, %asmresult7.1
%add12.1 = add nsw i128 %add.1, %asmresult8.1
%add14.1 = add nsw i128 %add12.1, %add14
- %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1) #1
+ %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1)
%asmresult19.1 = extractvalue { i128, i128, i128 } %9, 0
%asmresult20.1 = extractvalue { i128, i128, i128 } %9, 1
%asmresult21.1 = extractvalue { i128, i128, i128 } %9, 2
%inc.1 = add nuw nsw i64 %i.04, 2
- %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1) #1
+ %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1)
%asmresult.2 = extractvalue { i128, i128, i128 } %10, 0
%asmresult7.2 = extractvalue { i128, i128, i128 } %10, 1
%asmresult8.2 = extractvalue { i128, i128, i128 } %10, 2
%add.2 = add nsw i128 %asmresult.2, %asmresult7.2
%add12.2 = add nsw i128 %add.2, %asmresult8.2
%add14.2 = add nsw i128 %add12.2, %add14.1
- %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2) #1
+ %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2)
%asmresult19.2 = extractvalue { i128, i128, i128 } %11, 0
%asmresult20.2 = extractvalue { i128, i128, i128 } %11, 1
%asmresult21.2 = extractvalue { i128, i128, i128 } %11, 2
%inc.2 = add nuw nsw i64 %i.04, 3
- %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2) #1
+ %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2)
%asmresult.3 = extractvalue { i128, i128, i128 } %12, 0
%asmresult7.3 = extractvalue { i128, i128, i128 } %12, 1
%asmresult8.3 = extractvalue { i128, i128, i128 } %12, 2
@@ -107,7 +107,7 @@ define void @_Z6kernelv() #0 {
%add14.3 = add nsw i128 %add12.3, %add14.2
%13 = bitcast i128 %add14.3 to <2 x i64>
store <2 x i64> %13, ptr addrspace(1) @value, align 16
- %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3) #1
+ %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3)
%asmresult19.3 = extractvalue { i128, i128, i128 } %14, 0
%asmresult20.3 = extractvalue { i128, i128, i128 } %14, 1
%asmresult21.3 = extractvalue { i128, i128, i128 } %14, 2
@@ -138,7 +138,7 @@ define void @_Z6kernelv() #0 {
%21 = phi i128 [ %asmresult19.epil, %.lr.ph.epil ], [ %.unr11, %._crit_edge.loopexit.unr-lcssa ]
%i.04.epil = phi i64 [ %inc.epil, %.lr.ph.epil ], [ %i.04.unr, %._crit_edge.loopexit.unr-lcssa ]
%epil.iter = phi i64 [ %epil.iter.next, %.lr.ph.epil ], [ 0, %._crit_edge.loopexit.unr-lcssa ]
- %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19) #1
+ %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19)
%asmresult.epil = extractvalue { i128, i128, i128 } %22, 0
%asmresult7.epil = extractvalue { i128, i128, i128 } %22, 1
%asmresult8.epil = extractvalue { i128, i128, i128 } %22, 2
@@ -147,7 +147,7 @@ define void @_Z6kernelv() #0 {
%add14.epil = add nsw i128 %add12.epil, %18
%23 = bitcast i128 %add14.epil to <2 x i64>
store <2 x i64> %23, ptr addrspace(1) @value, align 16
- %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil) #1
+ %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil)
%asmresult19.epil = extractvalue { i128, i128, i128 } %24, 0
%asmresult20.epil = extractvalue { i128, i128, i128 } %24, 1
%asmresult21.epil = extractvalue { i128, i128, i128 } %24, 2
@@ -166,8 +166,6 @@ define void @_Z6kernelv() #0 {
ret void
}
-attributes #0 = { alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "nvvm.annotations_transplanted" "nvvm.kernel" "nvvm.restrict_processed" "target-cpu"="sm_89" }
-attributes #1 = { nounwind }
!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
>From 7274c66310656804df9d4f698fa3013c28b5db36 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Wed, 26 Jun 2024 00:04:10 +0000
Subject: [PATCH 3/6] Update kernel names in tests & Update one test for inline
asm in loops
---
.../CodeGen/NVPTX/inline-asm-b128-test1.ll | 36 ++--
.../CodeGen/NVPTX/inline-asm-b128-test2.ll | 7 +-
.../CodeGen/NVPTX/inline-asm-b128-test3.ll | 177 +++---------------
3 files changed, 45 insertions(+), 175 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
index 8b5369d2804b0..8a256d50d6050 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
target triple = "nvptx64-nvidia-cuda"
@value = internal addrspace(1) global i128 0, align 16
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z7kernel1v() {
- ; CHECK-LABEL: _Z7kernel1v
+define void @test_b128_input_from_const() {
+ ; CHECK-LABEL: test_b128_input_from_const
; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42;
; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
@@ -17,8 +18,8 @@ define void @_Z7kernel1v() {
}
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z7kernel2Pn(ptr nocapture readonly %data) {
- ; CHECK-LABEL: _Z7kernel2Pn
+define void @test_b128_input_from_load(ptr nocapture readonly %data) {
+ ; CHECK-LABEL: test_b128_input_from_load
; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8];
; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]];
; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
@@ -32,8 +33,8 @@ define void @_Z7kernel2Pn(ptr nocapture readonly %data) {
}
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z7kernel3Pb(ptr nocapture readonly %flag) {
- ; CHECK-LABEL: _Z7kernel3Pb
+define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
+ ; CHECK-LABEL: test_b128_input_from_select
; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}};
; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
@@ -48,10 +49,10 @@ define void @_Z7kernel3Pb(ptr nocapture readonly %flag) {
}
; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none)
-define void @_Z7kernel4v() {
- ; CHECK-LABEL: _Z7kernel4v
- ; CHECK-O3: { mov.b128 [[REG_128:%rq[0-9]+]], 41; }
- ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]];
+define void @test_store_b128_output() {
+ ; CHECK-LABEL: test_store_b128_output
+ ; CHECK: { mov.b128 [[REG_128:%rq[0-9]+]], 41; }
+ ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]];
%1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"()
%add = add nsw i128 %1, 1
@@ -61,12 +62,13 @@ define void @_Z7kernel4v() {
}
; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none)
-define void @_Z7kernel5Pn(ptr nocapture readonly %data) {
- ; CHECK-LABEL: _Z7kernel5Pn
- ; CHECK-O3: ld.global.v2.u64 {[[REG_LO_IN:%rd[0-9]+]], [[REG_HI_IN:%rd[0-9]+]]}, [{{%rd[0-9]+}}];
- ; CHECK-O3: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO_IN]], [[REG_HI_IN]]};
- ; CHECK-O3: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; }
- ; CHECK-O3: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]];
+define void @test_use_of_b128_output(ptr nocapture readonly %data) {
+ ; CHECK-LABEL: test_use_of_b128_output
+ ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8];
+ ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]];
+ ; CHECK: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
+ ; CHECK: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; }
+ ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]];
%1 = addrspacecast ptr %data to ptr addrspace(1)
%2 = load <2 x i64>, ptr addrspace(1) %1, align 16
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
index 94b641e8faf05..09b648d036a4c 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
target triple = "nvptx64-nvidia-cuda"
@@ -13,8 +14,8 @@ target triple = "nvptx64-nvidia-cuda"
@v64 = internal addrspace(1) global ptr null, align 8
; Function Attrs: alwaysinline convergent mustprogress willreturn
-define void @_Z6kernelv() {
- ; CHECK-LABEL: _Z6kernelv
+define void @test_corner_values() {
+ ; CHECK-LABEL: test_corner_values
; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1;
; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]};
; CHECK: mov.u64 [[I64_MAX:%rd[0-9]+]], 9223372036854775807;
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
index 9d7a25ca4d467..e187aa4370858 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -1,177 +1,44 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_70 -o - 2>&1 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
target triple = "nvptx64-nvidia-cuda"
@size = internal addrspace(1) global i32 0, align 4
- at value = internal addrspace(1) global i128 0, align 16
@x = internal addrspace(1) global i128 0, align 16
- at y = internal addrspace(1) global i128 0, align 16
- at z = internal addrspace(1) global i128 0, align 16
- at llvm.used = appending global [6 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @size to ptr), ptr addrspacecast (ptr addrspace(1) @value to ptr), ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr addrspacecast (ptr addrspace(1) @y to ptr), ptr addrspacecast (ptr addrspace(1) @z to ptr)], section "llvm.metadata"
-; Function Attrs: alwaysinline mustprogress willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
-define void @_Z6kernelv() {
- ; CHECK-LABEL: _Z6kernelv
- ; CHECK: mov.b128 [[X:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
- ; CHECK: mov.b128 [[Y:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
- ; CHECK: mov.b128 [[Z:%rq[0-9]+]], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
- ; CHECK: mov.b128 {lo, hi}, [[X]];
- ; CHECK: mov.b128 [[X]], {lo, hi};
- ; CHECK: mov.b128 {lo, hi}, [[Y]];
- ; CHECK: mov.b128 [[Y]], {lo, hi};
- ; CHECK: mov.b128 {lo, hi}, [[Z]];
- ; CHECK: mov.b128 [[Z]], {lo, hi};
- ; CHECK: mov.b128 {[[X_LO:%rd[0-9]+]], [[X_HI:%rd[0-9]+]]}, [[X]];
- ; CHECK: mov.b128 {[[Y_LO:%rd[0-9]+]], [[Y_HI:%rd[0-9]+]]}, [[Y]];
- ; CHECK: mov.b128 {[[Z_LO:%rd[0-9]+]], [[Z_HI:%rd[0-9]+]]}, [[Z]];
- ; CHECK: mov.b128 [[X_NEW:%rq[0-9]+]], {[[X_LO]], [[X_HI]]};
- ; CHECK: mov.b128 [[Y_NEW:%rq[0-9]+]], {[[Y_LO]], [[Y_HI]]};
- ; CHECK: mov.b128 [[Z_NEW:%rq[0-9]+]], {[[Z_LO]], [[Z_HI]]};
- ; CHECK: mov.b128 {lo, hi}, [[X_NEW]];
- ; CHECK: mov.b128 [[X_NEW]], {lo, hi};
- ; CHECK: mov.b128 {lo, hi}, [[Y_NEW]];
- ; CHECK: mov.b128 [[Y_NEW]], {lo, hi};
- ; CHECK: mov.b128 {lo, hi}, [[Z_NEW]];
- ; CHECK: mov.b128 [[Z_NEW]], {lo, hi};
- ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[X_NEW]];
- ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Y_NEW]];
- ; CHECK: mov.b128 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[Z_NEW]];
-
+define void @test_b128_in_loop() {
+ ; CHECK-LABEL: test_b128_in_loop
+ ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [x+8];
+ ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [x];
+ ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
+ ; CHECK: mov.b128 {lo, hi}, [[REG_128]];
+ ; CHECK: add.cc.u64 lo, lo, {{%rd[0-9]+}};
+ ; CHECK: mov.b128 [[REG_128]], {lo, hi};
+
%tmp11 = load i32, ptr addrspace(1) @size, align 4
%cmp3.not = icmp eq i32 %tmp11, 0
br i1 %cmp3.not, label %._crit_edge, label %.lr.ph.preheader
.lr.ph.preheader: ; preds = %0
%x.promoted5 = load i128, ptr addrspace(1) @x, align 16
- %y.promoted6 = load i128, ptr addrspace(1) @y, align 16
- %z.promoted7 = load i128, ptr addrspace(1) @z, align 16
- %value.promoted8 = load i128, ptr addrspace(1) @value, align 16
%umax = sext i32 %tmp11 to i64
- %xtraiter = and i64 %umax, 3
- %1 = icmp ult i32 %tmp11, 4
- br i1 %1, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph.preheader.new
-
-.lr.ph.preheader.new: ; preds = %.lr.ph.preheader
- %unroll_iter = and i64 %umax, -4
br label %.lr.ph
-.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader.new
- %2 = phi i128 [ %value.promoted8, %.lr.ph.preheader.new ], [ %add14.3, %.lr.ph ]
- %3 = phi i128 [ %z.promoted7, %.lr.ph.preheader.new ], [ %asmresult21.3, %.lr.ph ]
- %4 = phi i128 [ %y.promoted6, %.lr.ph.preheader.new ], [ %asmresult20.3, %.lr.ph ]
- %5 = phi i128 [ %x.promoted5, %.lr.ph.preheader.new ], [ %asmresult19.3, %.lr.ph ]
- %i.04 = phi i64 [ 0, %.lr.ph.preheader.new ], [ %inc.3, %.lr.ph ]
- %niter = phi i64 [ 0, %.lr.ph.preheader.new ], [ %niter.next.3, %.lr.ph ]
- %6 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %5, i128 %4, i128 %3)
- %asmresult = extractvalue { i128, i128, i128 } %6, 0
- %asmresult7 = extractvalue { i128, i128, i128 } %6, 1
- %asmresult8 = extractvalue { i128, i128, i128 } %6, 2
- %add = add nsw i128 %asmresult, %asmresult7
- %add12 = add nsw i128 %add, %asmresult8
- %add14 = add nsw i128 %add12, %2
- %7 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04, i128 %asmresult, i128 %asmresult7, i128 %asmresult8)
- %asmresult19 = extractvalue { i128, i128, i128 } %7, 0
- %asmresult20 = extractvalue { i128, i128, i128 } %7, 1
- %asmresult21 = extractvalue { i128, i128, i128 } %7, 2
- %inc = add nuw nsw i64 %i.04, 1
- %8 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult19, i128 %asmresult20, i128 %asmresult21)
- %asmresult.1 = extractvalue { i128, i128, i128 } %8, 0
- %asmresult7.1 = extractvalue { i128, i128, i128 } %8, 1
- %asmresult8.1 = extractvalue { i128, i128, i128 } %8, 2
- %add.1 = add nsw i128 %asmresult.1, %asmresult7.1
- %add12.1 = add nsw i128 %add.1, %asmresult8.1
- %add14.1 = add nsw i128 %add12.1, %add14
- %9 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc, i128 %asmresult.1, i128 %asmresult7.1, i128 %asmresult8.1)
- %asmresult19.1 = extractvalue { i128, i128, i128 } %9, 0
- %asmresult20.1 = extractvalue { i128, i128, i128 } %9, 1
- %asmresult21.1 = extractvalue { i128, i128, i128 } %9, 2
- %inc.1 = add nuw nsw i64 %i.04, 2
- %10 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult19.1, i128 %asmresult20.1, i128 %asmresult21.1)
- %asmresult.2 = extractvalue { i128, i128, i128 } %10, 0
- %asmresult7.2 = extractvalue { i128, i128, i128 } %10, 1
- %asmresult8.2 = extractvalue { i128, i128, i128 } %10, 2
- %add.2 = add nsw i128 %asmresult.2, %asmresult7.2
- %add12.2 = add nsw i128 %add.2, %asmresult8.2
- %add14.2 = add nsw i128 %add12.2, %add14.1
- %11 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.1, i128 %asmresult.2, i128 %asmresult7.2, i128 %asmresult8.2)
- %asmresult19.2 = extractvalue { i128, i128, i128 } %11, 0
- %asmresult20.2 = extractvalue { i128, i128, i128 } %11, 1
- %asmresult21.2 = extractvalue { i128, i128, i128 } %11, 2
- %inc.2 = add nuw nsw i64 %i.04, 3
- %12 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult19.2, i128 %asmresult20.2, i128 %asmresult21.2)
- %asmresult.3 = extractvalue { i128, i128, i128 } %12, 0
- %asmresult7.3 = extractvalue { i128, i128, i128 } %12, 1
- %asmresult8.3 = extractvalue { i128, i128, i128 } %12, 2
- %add.3 = add nsw i128 %asmresult.3, %asmresult7.3
- %add12.3 = add nsw i128 %add.3, %asmresult8.3
- %add14.3 = add nsw i128 %add12.3, %add14.2
- %13 = bitcast i128 %add14.3 to <2 x i64>
- store <2 x i64> %13, ptr addrspace(1) @value, align 16
- %14 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %inc.2, i128 %asmresult.3, i128 %asmresult7.3, i128 %asmresult8.3)
- %asmresult19.3 = extractvalue { i128, i128, i128 } %14, 0
- %asmresult20.3 = extractvalue { i128, i128, i128 } %14, 1
- %asmresult21.3 = extractvalue { i128, i128, i128 } %14, 2
- %15 = bitcast i128 %asmresult19.3 to <2 x i64>
- store <2 x i64> %15, ptr addrspace(1) @x, align 16
- %16 = bitcast i128 %asmresult20.3 to <2 x i64>
- store <2 x i64> %16, ptr addrspace(1) @y, align 16
- %17 = bitcast i128 %asmresult21.3 to <2 x i64>
- store <2 x i64> %17, ptr addrspace(1) @z, align 16
- %inc.3 = add nuw i64 %i.04, 4
- %niter.next.3 = add i64 %niter, 4
- %niter.ncmp.3.not = icmp eq i64 %niter.next.3, %unroll_iter
- br i1 %niter.ncmp.3.not, label %._crit_edge.loopexit.unr-lcssa, label %.lr.ph, !llvm.loop !2
-
-._crit_edge.loopexit.unr-lcssa: ; preds = %.lr.ph, %.lr.ph.preheader
- %.unr = phi i128 [ %value.promoted8, %.lr.ph.preheader ], [ %add14.3, %.lr.ph ]
- %.unr9 = phi i128 [ %z.promoted7, %.lr.ph.preheader ], [ %asmresult21.3, %.lr.ph ]
- %.unr10 = phi i128 [ %y.promoted6, %.lr.ph.preheader ], [ %asmresult20.3, %.lr.ph ]
- %.unr11 = phi i128 [ %x.promoted5, %.lr.ph.preheader ], [ %asmresult19.3, %.lr.ph ]
- %i.04.unr = phi i64 [ 0, %.lr.ph.preheader ], [ %inc.3, %.lr.ph ]
- %lcmp.mod.not = icmp eq i64 %xtraiter, 0
- br i1 %lcmp.mod.not, label %._crit_edge, label %.lr.ph.epil
-
-.lr.ph.epil: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa
- %18 = phi i128 [ %add14.epil, %.lr.ph.epil ], [ %.unr, %._crit_edge.loopexit.unr-lcssa ]
- %19 = phi i128 [ %asmresult21.epil, %.lr.ph.epil ], [ %.unr9, %._crit_edge.loopexit.unr-lcssa ]
- %20 = phi i128 [ %asmresult20.epil, %.lr.ph.epil ], [ %.unr10, %._crit_edge.loopexit.unr-lcssa ]
- %21 = phi i128 [ %asmresult19.epil, %.lr.ph.epil ], [ %.unr11, %._crit_edge.loopexit.unr-lcssa ]
- %i.04.epil = phi i64 [ %inc.epil, %.lr.ph.epil ], [ %i.04.unr, %._crit_edge.loopexit.unr-lcssa ]
- %epil.iter = phi i64 [ %epil.iter.next, %.lr.ph.epil ], [ 0, %._crit_edge.loopexit.unr-lcssa ]
- %22 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09add.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09add.cc.u64 lo, lo, 3;\0A\09add.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %21, i128 %20, i128 %19)
- %asmresult.epil = extractvalue { i128, i128, i128 } %22, 0
- %asmresult7.epil = extractvalue { i128, i128, i128 } %22, 1
- %asmresult8.epil = extractvalue { i128, i128, i128 } %22, 2
- %add.epil = add nsw i128 %asmresult.epil, %asmresult7.epil
- %add12.epil = add nsw i128 %add.epil, %asmresult8.epil
- %add14.epil = add nsw i128 %add12.epil, %18
- %23 = bitcast i128 %add14.epil to <2 x i64>
- store <2 x i64> %23, ptr addrspace(1) @value, align 16
- %24 = tail call { i128, i128, i128 } asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09sub.cc.u64 lo, lo, 1;\0A\09mov.b128 $0, {lo, hi};\0A\09mov.b128 {lo, hi}, $1;\0A\09sub.cc.u64 hi, hi, 2;\0A\09mov.b128 $1, {lo, hi};\0A\09mov.b128 {lo, hi}, $2;\0A\09sub.cc.u64 lo, lo, 3;\0A\09sub.cc.u64 hi, hi, 3;\0A\09mov.b128 $2, {lo, hi};\0A\09}\0A\09", "=q,=q,=q,l,0,1,2"(i64 %i.04.epil, i128 %asmresult.epil, i128 %asmresult7.epil, i128 %asmresult8.epil)
- %asmresult19.epil = extractvalue { i128, i128, i128 } %24, 0
- %asmresult20.epil = extractvalue { i128, i128, i128 } %24, 1
- %asmresult21.epil = extractvalue { i128, i128, i128 } %24, 2
- %25 = bitcast i128 %asmresult19.epil to <2 x i64>
- store <2 x i64> %25, ptr addrspace(1) @x, align 16
- %26 = bitcast i128 %asmresult20.epil to <2 x i64>
- store <2 x i64> %26, ptr addrspace(1) @y, align 16
- %27 = bitcast i128 %asmresult21.epil to <2 x i64>
- store <2 x i64> %27, ptr addrspace(1) @z, align 16
- %inc.epil = add nuw i64 %i.04.epil, 1
- %epil.iter.next = add i64 %epil.iter, 1
- %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter
- br i1 %epil.iter.cmp.not, label %._crit_edge, label %.lr.ph.epil, !llvm.loop !4
-
-._crit_edge: ; preds = %.lr.ph.epil, %._crit_edge.loopexit.unr-lcssa, %0
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %1 = phi i128 [ %2, %.lr.ph ], [ %x.promoted5, %.lr.ph.preheader ]
+ %i.04 = phi i64 [ %inc, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09} \0A\09", "=q,l,0"(i64 %i.04, i128 %1)
+ %3 = bitcast i128 %2 to <2 x i64>
+ store <2 x i64> %3, ptr addrspace(1) @x, align 16
+ %inc = add nuw i64 %i.04, 1
+ %exitcond.not = icmp eq i64 %inc, %umax
+ br i1 %exitcond.not, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
ret void
}
-
!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
!0 = !{i32 2, i32 0, i32 3, i32 1}
!1 = !{i32 2, i32 0}
-!2 = distinct !{!2, !3}
-!3 = !{!"llvm.loop.mustprogress"}
-!4 = distinct !{!4, !5}
-!5 = !{!"llvm.loop.unroll.disable"}
>From 2443c0bc80a94083c10672630f7d43d01b69afac Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Wed, 26 Jun 2024 20:47:52 +0000
Subject: [PATCH 4/6] Update testcases with checks generated by
update_llc_test_checks.py
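
The FileCheck assertions below come from the update script rather than being hand-written
(see the autogenerated NOTE line added to each test). A typical regeneration — a sketch of
the usual workflow, assuming the freshly built llc is on PATH or is passed via --llc-binary —
looks roughly like:

  python3 llvm/utils/update_llc_test_checks.py --extra_scrub \
      llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll \
      llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll \
      llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll

The script rewrites the CHECK lines from each test's embedded RUN lines and records the
arguments it was invoked with in the UTC_ARGS portion of the NOTE line at the top of the file.
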
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 17 +--
.../CodeGen/NVPTX/inline-asm-b128-test1.ll | 127 +++++++++++++-----
.../CodeGen/NVPTX/inline-asm-b128-test2.ll | 106 +++++++++++++--
.../CodeGen/NVPTX/inline-asm-b128-test3.ll | 44 ++++--
5 files changed, 232 insertions(+), 67 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 9c0498560db21..11193c11ede3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3830,14 +3830,13 @@ void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
SDNode *Mov =
CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
- SmallVector<EVT, 8> ResultsType(N->value_begin(), N->value_end());
- SmallVector<SDValue, 8> NewOps(N->getNumOperands() - 1);
+ SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
NewOps[0] = N->getOperand(0);
NewOps[1] = Dst;
NewOps[2] = SDValue(Mov, 0);
if (N->getNumOperands() == 5)
NewOps[3] = N->getOperand(4);
- SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
+ SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
ReplaceNode(N, NewValue.getNode());
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3cfac3baeb5c9..fd4293e56cfb6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3117,8 +3117,8 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
DAG.getIntPtrConstant(1, DL));
- SmallVector<SDValue, 8> NewOps(Op->getNumOperands() + 1);
- SmallVector<EVT, 8> ResultsType(Node->value_begin(), Node->value_end());
+ SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
+ SmallVector<EVT, 3> ResultsType(Node->value_begin(), Node->value_end());
NewOps[0] = Op->getOperand(0); // Chain
NewOps[1] = Op->getOperand(1); // Dst Reg
@@ -6316,16 +6316,9 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
assert(Reg.getValueType() == MVT::i128 &&
"Custom lowering for CopyFromReg with 128-bit reg only");
- SmallVector<EVT, 8> ResultsType(4);
- SmallVector<SDValue, 8> NewOps(3);
- ResultsType[0] = MVT::i64;
- ResultsType[1] = MVT::i64;
- ResultsType[2] = N->getValueType(1);
- ResultsType[3] = N->getValueType(2);
-
- NewOps[0] = Chain;
- NewOps[1] = Reg;
- NewOps[2] = Glue;
+ SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
+ N->getValueType(2)};
+ SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
index 8a256d50d6050..a04ed40dbf91a 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
@@ -5,25 +6,49 @@ target triple = "nvptx64-nvidia-cuda"
@value = internal addrspace(1) global i128 0, align 16
-; Function Attrs: alwaysinline convergent mustprogress willreturn
define void @test_b128_input_from_const() {
- ; CHECK-LABEL: test_b128_input_from_const
- ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
- ; CHECK: mov.u64 [[REG_LO:%rd[0-9]+]], 42;
- ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
- ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
+; CHECK-LABEL: test_b128_input_from_const(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b128 %rq<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u64 %rd2, 0;
+; CHECK-NEXT: mov.u64 %rd3, 42;
+; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2};
+; CHECK-NEXT: mov.u32 %r1, value;
+; CHECK-NEXT: cvta.global.u32 %r2, %r1;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: { st.b128 [%rd1], %rq1; }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ret;
tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42)
ret void
}
-; Function Attrs: alwaysinline convergent mustprogress willreturn
define void @test_b128_input_from_load(ptr nocapture readonly %data) {
- ; CHECK-LABEL: test_b128_input_from_load
- ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8];
- ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]];
- ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
- ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
+; CHECK-LABEL: test_b128_input_from_load(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b128 %rq<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_load_param_0];
+; CHECK-NEXT: cvta.to.global.u32 %r2, %r1;
+; CHECK-NEXT: ld.global.u64 %rd2, [%r2+8];
+; CHECK-NEXT: ld.global.u64 %rd3, [%r2];
+; CHECK-NEXT: mov.b128 %rq1, {%rd3, %rd2};
+; CHECK-NEXT: mov.u32 %r3, value;
+; CHECK-NEXT: cvta.global.u32 %r4, %r3;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r4;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: { st.b128 [%rd1], %rq1; }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ret;
%1 = addrspacecast ptr %data to ptr addrspace(1)
%2 = load <2 x i64>, ptr addrspace(1) %1, align 16
@@ -32,13 +57,30 @@ define void @test_b128_input_from_load(ptr nocapture readonly %data) {
ret void
}
-; Function Attrs: alwaysinline convergent mustprogress willreturn
define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
- ; CHECK-LABEL: test_b128_input_from_select
- ; CHECK: selp.b64 [[REG_LO:%rd[0-9]+]], 24, 42, {{%p[0-9]+}};
- ; CHECK: mov.u64 [[REG_HI:%rd[0-9]+]], 0;
- ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
- ; CHECK: { st.b128 [{{%rd[0-9]+}}], [[REG_128]]; }
+; CHECK-LABEL: test_b128_input_from_select(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b128 %rq<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_b128_input_from_select_param_0];
+; CHECK-NEXT: cvta.to.global.u32 %r2, %r1;
+; CHECK-NEXT: ld.global.u8 %rs1, [%r2];
+; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0;
+; CHECK-NEXT: selp.b64 %rd2, 24, 42, %p1;
+; CHECK-NEXT: mov.u64 %rd3, 0;
+; CHECK-NEXT: mov.b128 %rq1, {%rd2, %rd3};
+; CHECK-NEXT: mov.u32 %r3, value;
+; CHECK-NEXT: cvta.global.u32 %r4, %r3;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r4;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: { st.b128 [%rd1], %rq1; }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ret;
%1 = addrspacecast ptr %flag to ptr addrspace(1)
%tmp1 = load i8, ptr addrspace(1) %1, align 1
@@ -48,12 +90,23 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
ret void
}
-; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: none, inaccessiblemem: none)
define void @test_store_b128_output() {
- ; CHECK-LABEL: test_store_b128_output
- ; CHECK: { mov.b128 [[REG_128:%rq[0-9]+]], 41; }
- ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128]];
-
+; CHECK-LABEL: test_store_b128_output(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .b128 %rq<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: { mov.b128 %rq1, 41; }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: mov.b128 {%rd1, %rd2}, %rq1;
+; CHECK-NEXT: add.cc.s64 %rd3, %rd1, 1;
+; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, 0;
+; CHECK-NEXT: st.global.u64 [value+8], %rd4;
+; CHECK-NEXT: st.global.u64 [value], %rd3;
+; CHECK-NEXT: ret;
+
%1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"()
%add = add nsw i128 %1, 1
%2 = bitcast i128 %add to <2 x i64>
@@ -61,14 +114,28 @@ define void @test_store_b128_output() {
ret void
}
-; Function Attrs: alwaysinline mustprogress willreturn memory(write, argmem: read, inaccessiblemem: none)
define void @test_use_of_b128_output(ptr nocapture readonly %data) {
- ; CHECK-LABEL: test_use_of_b128_output
- ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [[[REG_Addr:%r[0-9]+]]+8];
- ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [[[REG_Addr]]];
- ; CHECK: mov.b128 [[REG_128_IN:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
- ; CHECK: { mov.b128 [[REG_128_OUT:%rq[0-9]+]], [[REG_128_IN]]; }
- ; CHECK: mov.b128 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [[REG_128_OUT]];
+; CHECK-LABEL: test_use_of_b128_output(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
+; CHECK-NEXT: .reg .b128 %rq<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_use_of_b128_output_param_0];
+; CHECK-NEXT: cvta.to.global.u32 %r2, %r1;
+; CHECK-NEXT: ld.global.u64 %rd1, [%r2+8];
+; CHECK-NEXT: ld.global.u64 %rd2, [%r2];
+; CHECK-NEXT: mov.b128 %rq2, {%rd2, %rd1};
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: { mov.b128 %rq1, %rq2; }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: mov.b128 {%rd3, %rd4}, %rq1;
+; CHECK-NEXT: add.cc.s64 %rd5, %rd3, 1;
+; CHECK-NEXT: addc.cc.s64 %rd6, %rd4, 0;
+; CHECK-NEXT: st.global.u64 [value], %rd5;
+; CHECK-NEXT: st.global.u64 [value+8], %rd6;
+; CHECK-NEXT: ret;
%1 = addrspacecast ptr %data to ptr addrspace(1)
%2 = load <2 x i64>, ptr addrspace(1) %1, align 16
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
index 09b648d036a4c..bb45ff6ba2e27 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
@@ -13,33 +14,110 @@ target triple = "nvptx64-nvidia-cuda"
@v_i128_min = internal addrspace(1) global i128 0, align 16
@v64 = internal addrspace(1) global ptr null, align 8
-; Function Attrs: alwaysinline convergent mustprogress willreturn
define void @test_corner_values() {
- ; CHECK-LABEL: test_corner_values
- ; CHECK: mov.u64 [[U64_MAX:%rd[0-9]+]], -1;
- ; CHECK: mov.b128 [[U128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[U64_MAX]]};
- ; CHECK: mov.u64 [[I64_MAX:%rd[0-9]+]], 9223372036854775807;
- ; CHECK: mov.b128 [[I128_MAX:%rq[0-9]+]], {[[U64_MAX]], [[I64_MAX]]}
- ; CHECK: mov.u64 [[I64_MIN:%rd[0-9]+]], -9223372036854775808;
- ; CHECK: mov.u64 [[U64_ZERO:%rd[0-9]+]], 0;
- ; CHECK: mov.b128 [[I128_MIN:%rq[0-9]+]], {[[U64_ZERO]], [[I64_MIN]]}
- ; CHECK: mov.b128 [[U128_ZERO:%rq[0-9]+]], {[[U64_ZERO]], [[U64_ZERO]]}
+; CHECK-LABEL: test_corner_values(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b64 %rd<17>;
+; CHECK-NEXT: .reg .b128 %rq<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.global.u32 %r1, [v64];
+; CHECK-NEXT: add.s32 %r2, %r1, 8;
+; CHECK-NEXT: mov.u64 %rd13, -1;
+; CHECK-NEXT: mov.b128 %rq1, {%rd13, %rd13};
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r2;
+; CHECK-NEXT: mov.u32 %r3, v_u128_max;
+; CHECK-NEXT: cvta.global.u32 %r4, %r3;
+; CHECK-NEXT: cvt.u64.u32 %rd3, %r4;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b64 hi;
+; CHECK-NEXT: .reg .b64 lo;
+; CHECK-NEXT: mov.b128 {lo, hi}, %rq1;
+; CHECK-NEXT: st.b64 [%rd1], lo;
+; CHECK-NEXT: st.b64 [%rd2], hi;
+; CHECK-NEXT: st.b128 [%rd3], %rq1;
+; CHECK-NEXT: }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ld.global.u32 %r5, [v64];
+; CHECK-NEXT: add.s32 %r6, %r5, 16;
+; CHECK-NEXT: add.s32 %r7, %r5, 24;
+; CHECK-NEXT: mov.u64 %rd14, 9223372036854775807;
+; CHECK-NEXT: mov.b128 %rq2, {%rd13, %rd14};
+; CHECK-NEXT: mov.u32 %r8, v_i128_max;
+; CHECK-NEXT: cvta.global.u32 %r9, %r8;
+; CHECK-NEXT: cvt.u64.u32 %rd6, %r9;
+; CHECK-NEXT: cvt.u64.u32 %rd4, %r6;
+; CHECK-NEXT: cvt.u64.u32 %rd5, %r7;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b64 hi;
+; CHECK-NEXT: .reg .b64 lo;
+; CHECK-NEXT: mov.b128 {lo, hi}, %rq2;
+; CHECK-NEXT: st.b64 [%rd4], lo;
+; CHECK-NEXT: st.b64 [%rd5], hi;
+; CHECK-NEXT: st.b128 [%rd6], %rq2;
+; CHECK-NEXT: }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ld.global.u32 %r10, [v64];
+; CHECK-NEXT: add.s32 %r11, %r10, 32;
+; CHECK-NEXT: add.s32 %r12, %r10, 40;
+; CHECK-NEXT: mov.u64 %rd15, -9223372036854775808;
+; CHECK-NEXT: mov.u64 %rd16, 0;
+; CHECK-NEXT: mov.b128 %rq3, {%rd16, %rd15};
+; CHECK-NEXT: mov.u32 %r13, v_i128_min;
+; CHECK-NEXT: cvta.global.u32 %r14, %r13;
+; CHECK-NEXT: cvt.u64.u32 %rd9, %r14;
+; CHECK-NEXT: cvt.u64.u32 %rd7, %r11;
+; CHECK-NEXT: cvt.u64.u32 %rd8, %r12;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b64 hi;
+; CHECK-NEXT: .reg .b64 lo;
+; CHECK-NEXT: mov.b128 {lo, hi}, %rq3;
+; CHECK-NEXT: st.b64 [%rd7], lo;
+; CHECK-NEXT: st.b64 [%rd8], hi;
+; CHECK-NEXT: st.b128 [%rd9], %rq3;
+; CHECK-NEXT: }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ld.global.u32 %r15, [v64];
+; CHECK-NEXT: add.s32 %r16, %r15, 48;
+; CHECK-NEXT: add.s32 %r17, %r15, 56;
+; CHECK-NEXT: mov.b128 %rq4, {%rd16, %rd16};
+; CHECK-NEXT: mov.u32 %r18, v_u128_zero;
+; CHECK-NEXT: cvta.global.u32 %r19, %r18;
+; CHECK-NEXT: cvt.u64.u32 %rd12, %r19;
+; CHECK-NEXT: cvt.u64.u32 %rd10, %r16;
+; CHECK-NEXT: cvt.u64.u32 %rd11, %r17;
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b64 hi;
+; CHECK-NEXT: .reg .b64 lo;
+; CHECK-NEXT: mov.b128 {lo, hi}, %rq4;
+; CHECK-NEXT: st.b64 [%rd10], lo;
+; CHECK-NEXT: st.b64 [%rd11], hi;
+; CHECK-NEXT: st.b128 [%rd12], %rq4;
+; CHECK-NEXT: }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: ret;
%tmp = load ptr, ptr addrspace(1) @v64, align 8
%add.ptr2 = getelementptr inbounds i64, ptr %tmp, i64 1
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr))
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr))
%tmp3 = load ptr, ptr addrspace(1) @v64, align 8
%add.ptr4 = getelementptr inbounds i64, ptr %tmp3, i64 2
%add.ptr6 = getelementptr inbounds i64, ptr %tmp3, i64 3
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr))
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr))
%tmp7 = load ptr, ptr addrspace(1) @v64, align 8
%add.ptr8 = getelementptr inbounds i64, ptr %tmp7, i64 4
%add.ptr10 = getelementptr inbounds i64, ptr %tmp7, i64 5
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr))
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr))
%tmp11 = load ptr, ptr addrspace(1) @v64, align 8
%add.ptr12 = getelementptr inbounds i64, ptr %tmp11, i64 6
%add.ptr14 = getelementptr inbounds i64, ptr %tmp11, i64 7
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}\0A\09", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr))
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr))
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
index e187aa4370858..081956447345c 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}
@@ -7,13 +8,40 @@ target triple = "nvptx64-nvidia-cuda"
@x = internal addrspace(1) global i128 0, align 16
define void @test_b128_in_loop() {
- ; CHECK-LABEL: test_b128_in_loop
- ; CHECK: ld.global.u64 [[REG_HI:%rd[0-9]+]], [x+8];
- ; CHECK: ld.global.u64 [[REG_LO:%rd[0-9]+]], [x];
- ; CHECK: mov.b128 [[REG_128:%rq[0-9]+]], {[[REG_LO]], [[REG_HI]]};
- ; CHECK: mov.b128 {lo, hi}, [[REG_128]];
- ; CHECK: add.cc.u64 lo, lo, {{%rd[0-9]+}};
- ; CHECK: mov.b128 [[REG_128]], {lo, hi};
+; CHECK-LABEL: test_b128_in_loop(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b64 %rd<15>;
+; CHECK-NEXT: .reg .b128 %rq<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.global.s32 %rd1, [size];
+; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_3;
+; CHECK-NEXT: // %bb.1: // %.lr.ph.preheader
+; CHECK-NEXT: ld.global.u64 %rd13, [x+8];
+; CHECK-NEXT: ld.global.u64 %rd12, [x];
+; CHECK-NEXT: mov.u64 %rd14, 0;
+; CHECK-NEXT: $L__BB0_2: // %.lr.ph
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13};
+; CHECK-NEXT: // begin inline asm
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b64 lo;
+; CHECK-NEXT: .reg .b64 hi;
+; CHECK-NEXT: mov.b128 {lo, hi}, %rq1;
+; CHECK-NEXT: add.cc.u64 lo, lo, %rd14;
+; CHECK-NEXT: mov.b128 %rq1, {lo, hi};
+; CHECK-NEXT: }
+; CHECK-NEXT: // end inline asm
+; CHECK-NEXT: mov.b128 {%rd12, %rd13}, %rq1;
+; CHECK-NEXT: st.global.u64 [x+8], %rd13;
+; CHECK-NEXT: st.global.u64 [x], %rd12;
+; CHECK-NEXT: add.s64 %rd14, %rd14, 1;
+; CHECK-NEXT: setp.ne.s64 %p2, %rd1, %rd14;
+; CHECK-NEXT: @%p2 bra $L__BB0_2;
+; CHECK-NEXT: $L__BB0_3: // %._crit_edge
+; CHECK-NEXT: ret;
%tmp11 = load i32, ptr addrspace(1) @size, align 4
%cmp3.not = icmp eq i32 %tmp11, 0
@@ -27,7 +55,7 @@ define void @test_b128_in_loop() {
.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
%1 = phi i128 [ %2, %.lr.ph ], [ %x.promoted5, %.lr.ph.preheader ]
%i.04 = phi i64 [ %inc, %.lr.ph ], [ 0, %.lr.ph.preheader ]
- %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09} \0A\09", "=q,l,0"(i64 %i.04, i128 %1)
+ %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %i.04, i128 %1)
%3 = bitcast i128 %2 to <2 x i64>
store <2 x i64> %3, ptr addrspace(1) @x, align 16
%inc = add nuw i64 %i.04, 1
>From 6bb8dd74a609120c0b082161a575d4d5ba39ee32 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Wed, 26 Jun 2024 22:25:04 +0000
Subject: [PATCH 5/6] Rename variables in tests & Use DAG helper to build nodes
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 +-
.../CodeGen/NVPTX/inline-asm-b128-test1.ll | 13 ++----
.../CodeGen/NVPTX/inline-asm-b128-test2.ll | 36 +++++++--------
.../CodeGen/NVPTX/inline-asm-b128-test3.ll | 45 +++++++++----------
4 files changed, 41 insertions(+), 57 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index fd4293e56cfb6..c02d874a9a6b3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3111,14 +3111,14 @@ SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
SDNode *Node = Op.getNode();
SDLoc DL(Node);
- SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op->getOperand(2));
+ SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
DAG.getIntPtrConstant(0, DL));
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
DAG.getIntPtrConstant(1, DL));
SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
- SmallVector<EVT, 3> ResultsType(Node->value_begin(), Node->value_end());
+ SmallVector<EVT, 3> ResultsType(Node->values());
NewOps[0] = Op->getOperand(0); // Chain
NewOps[1] = Op->getOperand(1); // Dst Reg
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
index a04ed40dbf91a..3232f40a40a70 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
@@ -83,10 +83,10 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
; CHECK-NEXT: ret;
%1 = addrspacecast ptr %flag to ptr addrspace(1)
- %tmp1 = load i8, ptr addrspace(1) %1, align 1
- %tobool.not = icmp eq i8 %tmp1, 0
- %. = select i1 %tobool.not, i128 24, i128 42
- tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %.)
+ %2 = load i8, ptr addrspace(1) %1, align 1
+ %3 = icmp eq i8 %2, 0
+ %4 = select i1 %3, i128 24, i128 42
+ tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %4)
ret void
}
@@ -146,8 +146,3 @@ define void @test_use_of_b128_output(ptr nocapture readonly %data) {
store <2 x i64> %5, ptr addrspace(1) @value, align 16
ret void
}
-
-!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
-
-!0 = !{i32 2, i32 0, i32 3, i32 1}
-!1 = !{i32 2, i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
index bb45ff6ba2e27..3d1d7fbbe27e8 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
@@ -103,26 +103,20 @@ define void @test_corner_values() {
; CHECK-NEXT: // end inline asm
; CHECK-NEXT: ret;
- %tmp = load ptr, ptr addrspace(1) @v64, align 8
- %add.ptr2 = getelementptr inbounds i64, ptr %tmp, i64 1
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %tmp, ptr nonnull %add.ptr2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr))
- %tmp3 = load ptr, ptr addrspace(1) @v64, align 8
- %add.ptr4 = getelementptr inbounds i64, ptr %tmp3, i64 2
- %add.ptr6 = getelementptr inbounds i64, ptr %tmp3, i64 3
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %add.ptr4, ptr nonnull %add.ptr6, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr))
- %tmp7 = load ptr, ptr addrspace(1) @v64, align 8
- %add.ptr8 = getelementptr inbounds i64, ptr %tmp7, i64 4
- %add.ptr10 = getelementptr inbounds i64, ptr %tmp7, i64 5
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %add.ptr8, ptr nonnull %add.ptr10, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr))
- %tmp11 = load ptr, ptr addrspace(1) @v64, align 8
- %add.ptr12 = getelementptr inbounds i64, ptr %tmp11, i64 6
- %add.ptr14 = getelementptr inbounds i64, ptr %tmp11, i64 7
- tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %add.ptr12, ptr nonnull %add.ptr14, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr))
+ %1 = load ptr, ptr addrspace(1) @v64, align 8
+ %2 = getelementptr inbounds i64, ptr %1, i64 1
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -1, ptr %1, ptr nonnull %2, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_max to ptr))
+ %3 = load ptr, ptr addrspace(1) @v64, align 8
+ %4 = getelementptr inbounds i64, ptr %3, i64 2
+ %5 = getelementptr inbounds i64, ptr %3, i64 3
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 170141183460469231731687303715884105727, ptr nonnull %4, ptr nonnull %5, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_max to ptr))
+ %6 = load ptr, ptr addrspace(1) @v64, align 8
+ %7 = getelementptr inbounds i64, ptr %6, i64 4
+ %8 = getelementptr inbounds i64, ptr %6, i64 5
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 -170141183460469231731687303715884105728, ptr nonnull %7, ptr nonnull %8, ptr nonnull addrspacecast (ptr addrspace(1) @v_i128_min to ptr))
+ %9 = load ptr, ptr addrspace(1) @v64, align 8
+ %10 = getelementptr inbounds i64, ptr %9, i64 6
+ %11 = getelementptr inbounds i64, ptr %9, i64 7
+ tail call void asm sideeffect "{\0A\09.reg .b64 hi;\0A\09.reg .b64 lo;\0A\09mov.b128 {lo, hi}, $0;\0A\09st.b64 [$1], lo;\0A\09st.b64 [$2], hi;\0A\09st.b128 [$3], $0;\0A\09}", "q,l,l,l"(i128 0, ptr nonnull %10, ptr nonnull %11, ptr nonnull addrspacecast (ptr addrspace(1) @v_u128_zero to ptr))
ret void
}
-
-
-!nvvmir.version = !{!2, !3, !2, !3, !3, !2, !2, !2, !3}
-
-!2 = !{i32 2, i32 0, i32 3, i32 1}
-!3 = !{i32 2, i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
index 081956447345c..ae453977123e0 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -18,11 +18,11 @@ define void @test_b128_in_loop() {
; CHECK-NEXT: ld.global.s32 %rd1, [size];
; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
; CHECK-NEXT: @%p1 bra $L__BB0_3;
-; CHECK-NEXT: // %bb.1: // %.lr.ph.preheader
+; CHECK-NEXT: // %bb.1: // %BB1
; CHECK-NEXT: ld.global.u64 %rd13, [x+8];
; CHECK-NEXT: ld.global.u64 %rd12, [x];
; CHECK-NEXT: mov.u64 %rd14, 0;
-; CHECK-NEXT: $L__BB0_2: // %.lr.ph
+; CHECK-NEXT: $L__BB0_2: // %BB2
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13};
; CHECK-NEXT: // begin inline asm
@@ -40,33 +40,28 @@ define void @test_b128_in_loop() {
; CHECK-NEXT: add.s64 %rd14, %rd14, 1;
; CHECK-NEXT: setp.ne.s64 %p2, %rd1, %rd14;
; CHECK-NEXT: @%p2 bra $L__BB0_2;
-; CHECK-NEXT: $L__BB0_3: // %._crit_edge
+; CHECK-NEXT: $L__BB0_3: // %BB3
; CHECK-NEXT: ret;
- %tmp11 = load i32, ptr addrspace(1) @size, align 4
- %cmp3.not = icmp eq i32 %tmp11, 0
- br i1 %cmp3.not, label %._crit_edge, label %.lr.ph.preheader
+ %1 = load i32, ptr addrspace(1) @size, align 4
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %BB3, label %BB1
-.lr.ph.preheader: ; preds = %0
- %x.promoted5 = load i128, ptr addrspace(1) @x, align 16
- %umax = sext i32 %tmp11 to i64
- br label %.lr.ph
+BB1: ; preds = %0
+ %3 = load i128, ptr addrspace(1) @x, align 16
+ %4 = sext i32 %1 to i64
+ br label %BB2
-.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
- %1 = phi i128 [ %2, %.lr.ph ], [ %x.promoted5, %.lr.ph.preheader ]
- %i.04 = phi i64 [ %inc, %.lr.ph ], [ 0, %.lr.ph.preheader ]
- %2 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %i.04, i128 %1)
- %3 = bitcast i128 %2 to <2 x i64>
- store <2 x i64> %3, ptr addrspace(1) @x, align 16
- %inc = add nuw i64 %i.04, 1
- %exitcond.not = icmp eq i64 %inc, %umax
- br i1 %exitcond.not, label %._crit_edge, label %.lr.ph
+BB2: ; preds = %BB2, %BB1
+ %5 = phi i128 [ %7, %BB2 ], [ %3, %BB1 ]
+ %6 = phi i64 [ %9, %BB2 ], [ 0, %BB1 ]
+ %7 = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %6, i128 %5)
+ %8 = bitcast i128 %7 to <2 x i64>
+ store <2 x i64> %8, ptr addrspace(1) @x, align 16
+ %9 = add nuw i64 %6, 1
+ %10 = icmp eq i64 %9, %4
+ br i1 %10, label %BB3, label %BB2
-._crit_edge: ; preds = %.lr.ph, %0
+BB3: ; preds = %BB2, %0
ret void
}
-
-!nvvmir.version = !{!0, !1, !0, !1, !1, !0, !0, !0, !1}
-
-!0 = !{i32 2, i32 0, i32 3, i32 1}
-!1 = !{i32 2, i32 0}
>From e7a3af718bfdbd703a4f616977303649d4b88209 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Fri, 28 Jun 2024 22:05:15 +0000
Subject: [PATCH 6/6] Update LangRef for the new constraint code for 128-bit
values in NVPTX.
---
llvm/docs/LangRef.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index edb362c617565..d5d5a4622da4c 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5381,6 +5381,7 @@ NVPTX:
- ``c`` or ``h``: A 16-bit integer register.
- ``r``: A 32-bit integer register.
- ``l`` or ``N``: A 64-bit integer register.
+- ``q``: A 128-bit integer register.
- ``f``: A 32-bit float register.
- ``d``: A 64-bit float register.
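For illustration, here is a minimal LLVM IR sketch of the new ``q`` constraint, adapted from inline-asm-b128-test3.ll in this patch; the function name test_q_constraint is made up, but the asm string and the "=q,l,0" constraint list are taken from the test. Per the RUN lines in the tests, this requires sm_70 and PTX 8.3 or newer.

define i128 @test_q_constraint(i128 %a, i64 %b) {
  ; Hypothetical example function. The "q" constraint binds %a to a .b128
  ; virtual register; mov.b128 unpacks and repacks it inside the asm body,
  ; and "0" ties the i128 output to the i128 input operand.
  %r = tail call i128 asm "{\0A\09.reg .b64 lo;\0A\09.reg .b64 hi;\0A\09mov.b128 {lo, hi}, $0;\0A\09add.cc.u64 lo, lo, $1;\0A\09mov.b128 $0, {lo, hi};\0A\09}", "=q,l,0"(i64 %b, i128 %a)
  ret i128 %r
}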