[llvm] [NVPTX] Improve lowering of v4i8 (PR #67866)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 2 18:08:07 PDT 2023
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/67866
From 4771c973c4659b814eacbacc23bd3c6c877ce2da Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 29 Sep 2023 14:37:46 -0700
Subject: [PATCH 1/2] [NVPTX] Improve lowering of v4i8
Make it a legal type and plumb through lowering of relevant instructions.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 20 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 20 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 52 +-
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 2 +-
.../NVPTX/load-with-non-coherent-cache.ll | 4 +-
llvm/test/CodeGen/NVPTX/param-load-store.ll | 26 +-
...unfold-masked-merge-vector-variablemask.ll | 518 ++++--------------
llvm/test/CodeGen/NVPTX/vec8.ll | 5 +-
8 files changed, 177 insertions(+), 470 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0aef2591c6e2394..1daa4971981c25c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -829,6 +829,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
case MVT::v2f16:
case MVT::v2bf16:
case MVT::v2i16:
+ case MVT::v4i8:
return Opcode_i32;
case MVT::f32:
return Opcode_f32;
@@ -910,7 +911,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// Vector Setting
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
if (SimpleVT.isVector()) {
- assert(Isv2x16VT(LoadedVT) && "Unexpected vector type");
+ assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
+ "Unexpected vector type");
// v2f16/v2bf16/v2i16 is loaded using ld.b32
fromTypeWidth = 32;
}
@@ -1254,6 +1256,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
SDLoc DL(N);
SDNode *LD;
SDValue Base, Offset, Addr;
+ EVT OrigType = N->getValueType(0);
EVT EltVT = Mem->getMemoryVT();
unsigned NumElts = 1;
@@ -1261,12 +1264,15 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
// vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
- if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) ||
- (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) ||
- (EltVT == MVT::i16 && N->getValueType(0) == MVT::v2i16)) {
+ if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
+ (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
+ (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
assert(NumElts % 2 == 0 && "Vector must have even number of elements");
- EltVT = N->getValueType(0);
+ EltVT = OrigType;
NumElts /= 2;
+ } else if (OrigType == MVT::v4i8) {
+ EltVT = OrigType;
+ NumElts = 1;
}
}
@@ -1601,7 +1607,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
// concept of sign-/zero-extension, so emulate it here by adding an explicit
// CVT instruction. Ptxas should clean up any redundancies here.
- EVT OrigType = N->getValueType(0);
LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
if (OrigType != EltVT &&
@@ -1679,7 +1684,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MVT ScalarVT = SimpleVT.getScalarType();
unsigned toTypeWidth = ScalarVT.getSizeInBits();
if (SimpleVT.isVector()) {
- assert(Isv2x16VT(StoreVT) && "Unexpected vector type");
+ assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
+ "Unexpected vector type");
// v2x16 is stored using st.b32
toTypeWidth = 32;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b24aae4792ce6a6..7880d70fb2c6fea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -221,6 +221,11 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
llvm_unreachable("Unexpected type");
}
NumElts /= 2;
+ } else if (EltVT.getSimpleVT() == MVT::i8 &&
+ (NumElts % 4 == 0 || NumElts == 3)) {
+ // v*i8 are formally lowered as v4i8
+ EltVT = MVT::v4i8;
+ NumElts = (NumElts + 3) / 4;
}
for (unsigned j = 0; j != NumElts; ++j) {
ValueVTs.push_back(EltVT);
@@ -458,6 +463,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
@@ -2631,7 +2637,7 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return expandUnalignedStore(Store, DAG);
// v2f16, v2bf16 and v2i16 don't need special handling.
- if (Isv2x16VT(VT))
+ if (Isv2x16VT(VT) || VT == MVT::v4i8)
return SDValue();
if (VT.isVector())
@@ -2903,7 +2909,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
EVT LoadVT = EltVT;
if (EltVT == MVT::i1)
LoadVT = MVT::i8;
- else if (Isv2x16VT(EltVT))
+ else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
// getLoad needs a vector type, but it can't handle
// vectors which contain v2f16 or v2bf16 elements. So we must load
// using i32 here and then bitcast back.
@@ -2929,7 +2935,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
if (EltVT == MVT::i1)
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
// v2f16 was loaded as an i32. Now we must bitcast it back.
- else if (Isv2x16VT(EltVT))
+ else if (EltVT != LoadVT)
Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
// If a promoted integer type is used, truncate down to the original
@@ -5256,9 +5262,9 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
SDValue Vector = N->getOperand(0);
EVT VectorVT = Vector.getValueType();
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
- IsPTXVectorType(VectorVT.getSimpleVT()))
+ IsPTXVectorType(VectorVT.getSimpleVT()) && VectorVT != MVT::v4i8)
return SDValue(); // Native vector loads already combine nicely w/
- // extract_vector_elt.
+ // extract_vector_elt, except for v4i8.
// Don't mess with singletons or v2*16 types, we already handle them OK.
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT))
return SDValue();
@@ -5289,6 +5295,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
// If element has non-integer type, bitcast it back to the expected type.
if (EltVT != EltIVT)
Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
+ // Past legalizer, we may need to extend i8 -> i16 to match the register type.
+ if (EltVT != N->getValueType(0))
+ Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
+
return Result;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 28c4cadb303ad4f..047161fb2027dee 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,23 +1486,24 @@ defm OR : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
-// Lower logical v2i16 ops as bitwise ops on b32.
-def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
- (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
-def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
- (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
-def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
- (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
-
-// The constants get legalized into a bitcast from i32, so that's what we need
-// to match here.
-def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
- (ORb32ri Int32Regs:$a, imm:$b)>;
-def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
- (XORb32ri Int32Regs:$a, imm:$b)>;
-def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
- (ANDb32ri Int32Regs:$a, imm:$b)>;
-
+// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
+foreach vt = [v2i16, v4i8] in {
+ def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
+ (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+ def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
+ (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+ def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
+ (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+
+ // The constants get legalized into a bitcast from i32, so that's what we need
+ // to match here.
+ def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
+ (ORb32ri Int32Regs:$a, imm:$b)>;
+ def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
+ (XORb32ri Int32Regs:$a, imm:$b)>;
+ def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
+ (ANDb32ri Int32Regs:$a, imm:$b)>;
+}
def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
"not.pred \t$dst, $src;",
@@ -2682,7 +2683,7 @@ foreach vt = [f16, bf16] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
}
-foreach vt = [v2f16, v2bf16, v2i16] in {
+foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
}
@@ -2995,8 +2996,8 @@ def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
(ProxyRegI16 Int16Regs:$a)>;
}
-foreach ta = [v2f16, v2bf16, v2i16, i32] in {
- foreach tb = [v2f16, v2bf16, v2i16, i32] in {
+foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
+ foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
if !ne(ta, tb) then {
def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
(ProxyRegI32 Int32Regs:$a)>;
@@ -3292,6 +3293,10 @@ let hasSideEffects = false in {
(ins Int16Regs:$s1, Int16Regs:$s2,
Int16Regs:$s3, Int16Regs:$s4),
"mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b32 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2),
"mov.b32 \t$d, {{$s1, $s2}};", []>;
@@ -3307,6 +3312,10 @@ let hasSideEffects = false in {
Int16Regs:$d3, Int16Regs:$d4),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ def I32toV4I8 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int32Regs:$s),
+ "mov.b32 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
(ins Int32Regs:$s),
"mov.b32 \t{{$d1, $d2}}, $s;", []>;
@@ -3354,6 +3363,9 @@ def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
+ (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
+ (V4I8toI32 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c, Int16Regs:$d)>;
// Count leading zeros
let hasSideEffects = false in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index ed9dabf39dd7ad9..b5231a9cf67f93a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -58,7 +58,7 @@ foreach i = 0...31 in {
//===----------------------------------------------------------------------===//
def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>;
-def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16], 32,
+def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
(add (sequence "R%u", 0, 4),
VRFrame32, VRFrameLocal32)>;
def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index 9012339fb6b1e20..98ab93774588d28 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -130,9 +130,9 @@ define void @foo12(ptr noalias readonly %from, ptr %to) {
}
; SM20-LABEL: .visible .entry foo13(
-; SM20: ld.global.v4.u8
+; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo13(
-; SM35: ld.global.nc.v4.u8
+; SM35: ld.global.nc.u32
define void @foo13(ptr noalias readonly %from, ptr %to) {
%1 = load <4 x i8>, ptr %from
store <4 x i8> %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 2d87271e30ae0b7..b4208c691c91dfa 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -212,18 +212,16 @@ define signext i8 @test_i8s(i8 signext %a) {
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v3i8(
; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
-; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
-; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v3i8_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+2], [[E2]];
+; CHECK: st.param.b32 [param0+0], [[R]]
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i8,
-; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0+0];
+; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very
+; interesting here, so it's skipped.
+; CHECK: st.param.b32 [func_retval0+0],
; CHECK-NEXT: ret;
define <3 x i8> @test_v3i8(<3 x i8> %a) {
%r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
@@ -233,14 +231,14 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) {
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v4i8(
; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
-; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0]
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: st.param.b32 [param0+0], [[R]];
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v4i8,
-; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[RET]];
; CHECK-NEXT: ret;
define <4 x i8> @test_v4i8(<4 x i8> %a) {
%r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
@@ -250,10 +248,10 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) {
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v5i8(
; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_v5i8_param_0]
; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.v4.b8 [param0+0],
; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 16579de882ed4b6..74087be4834d966 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -60,46 +60,20 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot2[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b32 %r<21>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot2;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_param_2];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: and.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_param_1];
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r8, {%rs9, %rs10};
-; CHECK-NEXT: xor.b32 %r9, %r5, 16711935;
-; CHECK-NEXT: xor.b32 %r10, %r3, 16711935;
-; CHECK-NEXT: and.b32 %r11, %r8, %r10;
-; CHECK-NEXT: and.b32 %r12, %r7, %r9;
-; CHECK-NEXT: or.b32 %r13, %r6, %r12;
-; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r13;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs13, %rs14};
-; CHECK-NEXT: or.b32 %r14, %r4, %r11;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r14;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs15, %rs16};
-; CHECK-NEXT: ld.u16 %r15, [%SP+0];
-; CHECK-NEXT: shl.b32 %r16, %r15, 16;
-; CHECK-NEXT: ld.u16 %r17, [%SP+2];
-; CHECK-NEXT: or.b32 %r18, %r17, %r16;
-; CHECK-NEXT: shr.u32 %r19, %r18, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs17, %r19;
-; CHECK-NEXT: cvt.u16.u32 %rs18, %r15;
-; CHECK-NEXT: bfe.s32 %r20, %r15, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r20;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
+; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_2];
+; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r1;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs1, %rs1};
+; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
+; CHECK-NEXT: and.b32 %r9, %r3, %r8;
+; CHECK-NEXT: or.b32 %r10, %r5, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -111,48 +85,20 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8_undef(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot3[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<22>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot3;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_undef_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_undef_param_2];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: and.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_undef_param_1];
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r8, {%rs9, %rs10};
-; CHECK-NEXT: mov.u16 %rs13, 255;
-; CHECK-NEXT: mov.b32 %r9, {%rs14, %rs13};
-; CHECK-NEXT: xor.b32 %r10, %r5, %r9;
-; CHECK-NEXT: xor.b32 %r11, %r3, 16711935;
-; CHECK-NEXT: and.b32 %r12, %r8, %r11;
-; CHECK-NEXT: and.b32 %r13, %r7, %r10;
-; CHECK-NEXT: or.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r14;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs15, %rs16};
-; CHECK-NEXT: or.b32 %r15, %r4, %r12;
-; CHECK-NEXT: mov.b32 {%rs17, %rs18}, %r15;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs17, %rs18};
-; CHECK-NEXT: ld.u16 %r16, [%SP+0];
-; CHECK-NEXT: shl.b32 %r17, %r16, 16;
-; CHECK-NEXT: ld.u16 %r18, [%SP+2];
-; CHECK-NEXT: or.b32 %r19, %r18, %r17;
-; CHECK-NEXT: shr.u32 %r20, %r19, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r20;
-; CHECK-NEXT: cvt.u16.u32 %rs20, %r16;
-; CHECK-NEXT: bfe.s32 %r21, %r16, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs21, %r21;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs17, %rs19, %rs20, %rs21};
+; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_2];
+; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r1;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs2, %rs1};
+; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
+; CHECK-NEXT: and.b32 %r9, %r3, %r8;
+; CHECK-NEXT: or.b32 %r10, %r5, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -212,84 +158,24 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: out_v8i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot6[8];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<40>;
-; CHECK-NEXT: .reg .b32 %r<38>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<22>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot6;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v8i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v8i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: mov.b32 %r4, {%rs5, %rs6};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v8i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r5, {%rs9, %rs10};
-; CHECK-NEXT: and.b32 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: and.b32 %r8, %r3, %r7;
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v8i8_param_2];
-; CHECK-NEXT: mov.b32 %r9, {%rs13, %rs14};
-; CHECK-NEXT: and.b32 %r10, %r2, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs15, %rs16};
-; CHECK-NEXT: and.b32 %r12, %r1, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v8i8_param_1];
-; CHECK-NEXT: mov.b32 %r13, {%rs19, %rs20};
-; CHECK-NEXT: mov.b32 %r14, {%rs17, %rs18};
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v8i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r15, {%rs23, %rs24};
-; CHECK-NEXT: mov.b32 %r16, {%rs21, %rs22};
-; CHECK-NEXT: xor.b32 %r17, %r11, 16711935;
-; CHECK-NEXT: xor.b32 %r18, %r9, 16711935;
-; CHECK-NEXT: xor.b32 %r19, %r7, 16711935;
-; CHECK-NEXT: xor.b32 %r20, %r5, 16711935;
-; CHECK-NEXT: and.b32 %r21, %r16, %r20;
-; CHECK-NEXT: and.b32 %r22, %r15, %r19;
-; CHECK-NEXT: and.b32 %r23, %r14, %r18;
-; CHECK-NEXT: and.b32 %r24, %r13, %r17;
-; CHECK-NEXT: or.b32 %r25, %r12, %r24;
-; CHECK-NEXT: mov.b32 {%rs25, %rs26}, %r25;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs25, %rs26};
-; CHECK-NEXT: or.b32 %r26, %r10, %r23;
-; CHECK-NEXT: mov.b32 {%rs27, %rs28}, %r26;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs27, %rs28};
-; CHECK-NEXT: or.b32 %r27, %r8, %r22;
-; CHECK-NEXT: mov.b32 {%rs29, %rs30}, %r27;
-; CHECK-NEXT: st.v2.u8 [%SP+4], {%rs29, %rs30};
-; CHECK-NEXT: or.b32 %r28, %r6, %r21;
-; CHECK-NEXT: mov.b32 {%rs31, %rs32}, %r28;
-; CHECK-NEXT: st.v2.u8 [%SP+6], {%rs31, %rs32};
-; CHECK-NEXT: ld.u16 %r29, [%SP+0];
-; CHECK-NEXT: shl.b32 %r30, %r29, 16;
-; CHECK-NEXT: ld.u16 %r31, [%SP+2];
-; CHECK-NEXT: or.b32 %r32, %r31, %r30;
-; CHECK-NEXT: cvt.u64.u32 %rd1, %r32;
-; CHECK-NEXT: ld.u16 %r33, [%SP+4];
-; CHECK-NEXT: shl.b32 %r34, %r33, 16;
-; CHECK-NEXT: ld.u16 %r35, [%SP+6];
-; CHECK-NEXT: or.b32 %r36, %r35, %r34;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r36;
-; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: shr.u32 %r37, %r36, 8;
-; CHECK-NEXT: shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT: cvt.u16.u64 %rs33, %rd5;
-; CHECK-NEXT: shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT: cvt.u16.u64 %rs34, %rd6;
-; CHECK-NEXT: shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs35, %rd7;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
-; CHECK-NEXT: cvt.u16.u32 %rs36, %r37;
-; CHECK-NEXT: bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs37, %rd8;
-; CHECK-NEXT: cvt.u16.u32 %rs38, %r33;
-; CHECK-NEXT: cvt.u16.u32 %rs39, %r35;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_1];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [out_v8i8_param_2];
+; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v8i8_param_0];
+; CHECK-NEXT: and.b32 %r11, %r9, %r5;
+; CHECK-NEXT: and.b32 %r13, %r10, %r6;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r15, {%rs1, %rs1, %rs1, %rs1};
+; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
+; CHECK-NEXT: xor.b32 %r17, %r5, %r15;
+; CHECK-NEXT: and.b32 %r18, %r1, %r17;
+; CHECK-NEXT: and.b32 %r19, %r2, %r16;
+; CHECK-NEXT: or.b32 %r20, %r13, %r19;
+; CHECK-NEXT: or.b32 %r21, %r11, %r18;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
; CHECK-NEXT: ret;
%mx = and <8 x i8> %x, %mask
%notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -408,90 +294,32 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: out_v16i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<65>;
-; CHECK-NEXT: .reg .b32 %r<57>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<42>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v16i8_param_0+12];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v16i8_param_0+8];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v16i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r5, {%rs9, %rs10};
-; CHECK-NEXT: mov.b32 %r6, {%rs11, %rs12};
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v16i8_param_0];
-; CHECK-NEXT: mov.b32 %r7, {%rs13, %rs14};
-; CHECK-NEXT: mov.b32 %r8, {%rs15, %rs16};
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v16i8_param_2];
-; CHECK-NEXT: mov.b32 %r9, {%rs19, %rs20};
-; CHECK-NEXT: and.b32 %r10, %r8, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs17, %rs18};
-; CHECK-NEXT: and.b32 %r12, %r7, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v16i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r13, {%rs23, %rs24};
-; CHECK-NEXT: and.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs21, %rs22};
-; CHECK-NEXT: and.b32 %r16, %r5, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [out_v16i8_param_2+8];
-; CHECK-NEXT: mov.b32 %r17, {%rs27, %rs28};
-; CHECK-NEXT: and.b32 %r18, %r4, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs25, %rs26};
-; CHECK-NEXT: and.b32 %r20, %r3, %r19;
-; CHECK-NEXT: ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [out_v16i8_param_2+12];
-; CHECK-NEXT: mov.b32 %r21, {%rs31, %rs32};
-; CHECK-NEXT: and.b32 %r22, %r2, %r21;
-; CHECK-NEXT: mov.b32 %r23, {%rs29, %rs30};
-; CHECK-NEXT: and.b32 %r24, %r1, %r23;
-; CHECK-NEXT: ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [out_v16i8_param_1+12];
-; CHECK-NEXT: mov.b32 %r25, {%rs33, %rs34};
-; CHECK-NEXT: mov.b32 %r26, {%rs35, %rs36};
-; CHECK-NEXT: ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [out_v16i8_param_1+8];
-; CHECK-NEXT: mov.b32 %r27, {%rs37, %rs38};
-; CHECK-NEXT: mov.b32 %r28, {%rs39, %rs40};
-; CHECK-NEXT: ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [out_v16i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r29, {%rs41, %rs42};
-; CHECK-NEXT: mov.b32 %r30, {%rs43, %rs44};
-; CHECK-NEXT: ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [out_v16i8_param_1];
-; CHECK-NEXT: mov.b32 %r31, {%rs45, %rs46};
-; CHECK-NEXT: mov.b32 %r32, {%rs47, %rs48};
-; CHECK-NEXT: xor.b32 %r33, %r23, 16711935;
-; CHECK-NEXT: xor.b32 %r34, %r21, 16711935;
-; CHECK-NEXT: xor.b32 %r35, %r19, 16711935;
-; CHECK-NEXT: xor.b32 %r36, %r17, 16711935;
-; CHECK-NEXT: xor.b32 %r37, %r15, 16711935;
-; CHECK-NEXT: xor.b32 %r38, %r13, 16711935;
-; CHECK-NEXT: xor.b32 %r39, %r11, 16711935;
-; CHECK-NEXT: xor.b32 %r40, %r9, 16711935;
-; CHECK-NEXT: and.b32 %r41, %r32, %r40;
-; CHECK-NEXT: and.b32 %r42, %r31, %r39;
-; CHECK-NEXT: and.b32 %r43, %r30, %r38;
-; CHECK-NEXT: and.b32 %r44, %r29, %r37;
-; CHECK-NEXT: and.b32 %r45, %r28, %r36;
-; CHECK-NEXT: and.b32 %r46, %r27, %r35;
-; CHECK-NEXT: and.b32 %r47, %r26, %r34;
-; CHECK-NEXT: and.b32 %r48, %r25, %r33;
-; CHECK-NEXT: or.b32 %r49, %r24, %r48;
-; CHECK-NEXT: or.b32 %r50, %r22, %r47;
-; CHECK-NEXT: or.b32 %r51, %r20, %r46;
-; CHECK-NEXT: or.b32 %r52, %r18, %r45;
-; CHECK-NEXT: or.b32 %r53, %r16, %r44;
-; CHECK-NEXT: or.b32 %r54, %r14, %r43;
-; CHECK-NEXT: or.b32 %r55, %r12, %r42;
-; CHECK-NEXT: or.b32 %r56, %r10, %r41;
-; CHECK-NEXT: mov.b32 {%rs49, %rs50}, %r56;
-; CHECK-NEXT: mov.b32 {%rs51, %rs52}, %r55;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
-; CHECK-NEXT: mov.b32 {%rs53, %rs54}, %r54;
-; CHECK-NEXT: mov.b32 {%rs55, %rs56}, %r53;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
-; CHECK-NEXT: mov.b32 {%rs57, %rs58}, %r52;
-; CHECK-NEXT: mov.b32 {%rs59, %rs60}, %r51;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
-; CHECK-NEXT: mov.b32 {%rs61, %rs62}, %r50;
-; CHECK-NEXT: mov.b32 {%rs63, %rs64}, %r49;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_1];
+; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [out_v16i8_param_2];
+; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [out_v16i8_param_0];
+; CHECK-NEXT: and.b32 %r21, %r17, %r9;
+; CHECK-NEXT: and.b32 %r23, %r18, %r10;
+; CHECK-NEXT: and.b32 %r25, %r19, %r11;
+; CHECK-NEXT: and.b32 %r27, %r20, %r12;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r29, {%rs1, %rs1, %rs1, %rs1};
+; CHECK-NEXT: xor.b32 %r30, %r12, %r29;
+; CHECK-NEXT: xor.b32 %r31, %r11, %r29;
+; CHECK-NEXT: xor.b32 %r32, %r10, %r29;
+; CHECK-NEXT: xor.b32 %r33, %r9, %r29;
+; CHECK-NEXT: and.b32 %r34, %r1, %r33;
+; CHECK-NEXT: and.b32 %r35, %r2, %r32;
+; CHECK-NEXT: and.b32 %r36, %r3, %r31;
+; CHECK-NEXT: and.b32 %r37, %r4, %r30;
+; CHECK-NEXT: or.b32 %r38, %r27, %r37;
+; CHECK-NEXT: or.b32 %r39, %r25, %r36;
+; CHECK-NEXT: or.b32 %r40, %r23, %r35;
+; CHECK-NEXT: or.b32 %r41, %r21, %r34;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r41, %r40, %r39, %r38};
; CHECK-NEXT: ret;
%mx = and <16 x i8> %x, %mask
%notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -684,44 +512,16 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind
define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: in_v4i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot18[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot18;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v4i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v4i8_param_1];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: xor.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs5, %rs6};
-; CHECK-NEXT: xor.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v4i8_param_2];
-; CHECK-NEXT: mov.b32 %r7, {%rs9, %rs10};
-; CHECK-NEXT: and.b32 %r8, %r6, %r7;
-; CHECK-NEXT: mov.b32 %r9, {%rs11, %rs12};
-; CHECK-NEXT: and.b32 %r10, %r4, %r9;
-; CHECK-NEXT: xor.b32 %r11, %r10, %r3;
-; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r11;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs13, %rs14};
-; CHECK-NEXT: xor.b32 %r12, %r8, %r5;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r12;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs15, %rs16};
-; CHECK-NEXT: ld.u16 %r13, [%SP+0];
-; CHECK-NEXT: shl.b32 %r14, %r13, 16;
-; CHECK-NEXT: ld.u16 %r15, [%SP+2];
-; CHECK-NEXT: or.b32 %r16, %r15, %r14;
-; CHECK-NEXT: shr.u32 %r17, %r16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs17, %r17;
-; CHECK-NEXT: cvt.u16.u32 %rs18, %r13;
-; CHECK-NEXT: bfe.s32 %r18, %r13, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r18;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
+; CHECK-NEXT: ld.param.u32 %r1, [in_v4i8_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [in_v4i8_param_1];
+; CHECK-NEXT: xor.b32 %r3, %r1, %r2;
+; CHECK-NEXT: ld.param.u32 %r4, [in_v4i8_param_2];
+; CHECK-NEXT: and.b32 %r5, %r3, %r4;
+; CHECK-NEXT: xor.b32 %r6, %r5, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6;
; CHECK-NEXT: ret;
%n0 = xor <4 x i8> %x, %y
%n1 = and <4 x i8> %n0, %mask
@@ -776,80 +576,19 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind
define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: in_v8i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot21[8];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<40>;
-; CHECK-NEXT: .reg .b32 %r<34>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b32 %r<15>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot21;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v8i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v8i8_param_0];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v8i8_param_1];
-; CHECK-NEXT: mov.b32 %r5, {%rs11, %rs12};
-; CHECK-NEXT: xor.b32 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 %r7, {%rs9, %rs10};
-; CHECK-NEXT: xor.b32 %r8, %r3, %r7;
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v8i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r9, {%rs15, %rs16};
-; CHECK-NEXT: xor.b32 %r10, %r2, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs13, %rs14};
-; CHECK-NEXT: xor.b32 %r12, %r1, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v8i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r13, {%rs17, %rs18};
-; CHECK-NEXT: and.b32 %r14, %r12, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs19, %rs20};
-; CHECK-NEXT: and.b32 %r16, %r10, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v8i8_param_2];
-; CHECK-NEXT: mov.b32 %r17, {%rs21, %rs22};
-; CHECK-NEXT: and.b32 %r18, %r8, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs23, %rs24};
-; CHECK-NEXT: and.b32 %r20, %r6, %r19;
-; CHECK-NEXT: xor.b32 %r21, %r20, %r5;
-; CHECK-NEXT: mov.b32 {%rs25, %rs26}, %r21;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs25, %rs26};
-; CHECK-NEXT: xor.b32 %r22, %r18, %r7;
-; CHECK-NEXT: mov.b32 {%rs27, %rs28}, %r22;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs27, %rs28};
-; CHECK-NEXT: xor.b32 %r23, %r16, %r9;
-; CHECK-NEXT: mov.b32 {%rs29, %rs30}, %r23;
-; CHECK-NEXT: st.v2.u8 [%SP+4], {%rs29, %rs30};
-; CHECK-NEXT: xor.b32 %r24, %r14, %r11;
-; CHECK-NEXT: mov.b32 {%rs31, %rs32}, %r24;
-; CHECK-NEXT: st.v2.u8 [%SP+6], {%rs31, %rs32};
-; CHECK-NEXT: ld.u16 %r25, [%SP+0];
-; CHECK-NEXT: shl.b32 %r26, %r25, 16;
-; CHECK-NEXT: ld.u16 %r27, [%SP+2];
-; CHECK-NEXT: or.b32 %r28, %r27, %r26;
-; CHECK-NEXT: cvt.u64.u32 %rd1, %r28;
-; CHECK-NEXT: ld.u16 %r29, [%SP+4];
-; CHECK-NEXT: shl.b32 %r30, %r29, 16;
-; CHECK-NEXT: ld.u16 %r31, [%SP+6];
-; CHECK-NEXT: or.b32 %r32, %r31, %r30;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r32;
-; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: shr.u32 %r33, %r32, 8;
-; CHECK-NEXT: shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT: cvt.u16.u64 %rs33, %rd5;
-; CHECK-NEXT: shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT: cvt.u16.u64 %rs34, %rd6;
-; CHECK-NEXT: shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs35, %rd7;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
-; CHECK-NEXT: cvt.u16.u32 %rs36, %r33;
-; CHECK-NEXT: bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs37, %rd8;
-; CHECK-NEXT: cvt.u16.u32 %rs38, %r29;
-; CHECK-NEXT: cvt.u16.u32 %rs39, %r31;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v8i8_param_0];
+; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [in_v8i8_param_1];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [in_v8i8_param_2];
+; CHECK-NEXT: xor.b32 %r7, %r2, %r4;
+; CHECK-NEXT: and.b32 %r8, %r7, %r6;
+; CHECK-NEXT: xor.b32 %r9, %r8, %r4;
+; CHECK-NEXT: xor.b32 %r11, %r1, %r3;
+; CHECK-NEXT: and.b32 %r12, %r11, %r5;
+; CHECK-NEXT: xor.b32 %r13, %r12, %r3;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r13, %r9};
; CHECK-NEXT: ret;
%n0 = xor <8 x i8> %x, %y
%n1 = and <8 x i8> %n0, %mask
@@ -930,82 +669,25 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind
define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: in_v16i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<65>;
-; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b32 %r<29>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v16i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v16i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: mov.b32 %r4, {%rs5, %rs6};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v16i8_param_0+8];
-; CHECK-NEXT: mov.b32 %r5, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r6, {%rs9, %rs10};
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v16i8_param_0+12];
-; CHECK-NEXT: mov.b32 %r7, {%rs15, %rs16};
-; CHECK-NEXT: mov.b32 %r8, {%rs13, %rs14};
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v16i8_param_1+12];
-; CHECK-NEXT: mov.b32 %r9, {%rs17, %rs18};
-; CHECK-NEXT: xor.b32 %r10, %r8, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs19, %rs20};
-; CHECK-NEXT: xor.b32 %r12, %r7, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v16i8_param_1+8];
-; CHECK-NEXT: mov.b32 %r13, {%rs21, %rs22};
-; CHECK-NEXT: xor.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs23, %rs24};
-; CHECK-NEXT: xor.b32 %r16, %r5, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [in_v16i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r17, {%rs25, %rs26};
-; CHECK-NEXT: xor.b32 %r18, %r4, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs27, %rs28};
-; CHECK-NEXT: xor.b32 %r20, %r3, %r19;
-; CHECK-NEXT: ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [in_v16i8_param_1];
-; CHECK-NEXT: mov.b32 %r21, {%rs29, %rs30};
-; CHECK-NEXT: xor.b32 %r22, %r2, %r21;
-; CHECK-NEXT: mov.b32 %r23, {%rs31, %rs32};
-; CHECK-NEXT: xor.b32 %r24, %r1, %r23;
-; CHECK-NEXT: ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [in_v16i8_param_2];
-; CHECK-NEXT: mov.b32 %r25, {%rs35, %rs36};
-; CHECK-NEXT: and.b32 %r26, %r24, %r25;
-; CHECK-NEXT: mov.b32 %r27, {%rs33, %rs34};
-; CHECK-NEXT: and.b32 %r28, %r22, %r27;
-; CHECK-NEXT: ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [in_v16i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r29, {%rs39, %rs40};
-; CHECK-NEXT: and.b32 %r30, %r20, %r29;
-; CHECK-NEXT: mov.b32 %r31, {%rs37, %rs38};
-; CHECK-NEXT: and.b32 %r32, %r18, %r31;
-; CHECK-NEXT: ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [in_v16i8_param_2+8];
-; CHECK-NEXT: mov.b32 %r33, {%rs43, %rs44};
-; CHECK-NEXT: and.b32 %r34, %r16, %r33;
-; CHECK-NEXT: mov.b32 %r35, {%rs41, %rs42};
-; CHECK-NEXT: and.b32 %r36, %r14, %r35;
-; CHECK-NEXT: ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [in_v16i8_param_2+12];
-; CHECK-NEXT: mov.b32 %r37, {%rs47, %rs48};
-; CHECK-NEXT: and.b32 %r38, %r12, %r37;
-; CHECK-NEXT: mov.b32 %r39, {%rs45, %rs46};
-; CHECK-NEXT: and.b32 %r40, %r10, %r39;
-; CHECK-NEXT: xor.b32 %r41, %r40, %r9;
-; CHECK-NEXT: xor.b32 %r42, %r38, %r11;
-; CHECK-NEXT: xor.b32 %r43, %r36, %r13;
-; CHECK-NEXT: xor.b32 %r44, %r34, %r15;
-; CHECK-NEXT: xor.b32 %r45, %r32, %r17;
-; CHECK-NEXT: xor.b32 %r46, %r30, %r19;
-; CHECK-NEXT: xor.b32 %r47, %r28, %r21;
-; CHECK-NEXT: xor.b32 %r48, %r26, %r23;
-; CHECK-NEXT: mov.b32 {%rs49, %rs50}, %r48;
-; CHECK-NEXT: mov.b32 {%rs51, %rs52}, %r47;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
-; CHECK-NEXT: mov.b32 {%rs53, %rs54}, %r46;
-; CHECK-NEXT: mov.b32 {%rs55, %rs56}, %r45;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
-; CHECK-NEXT: mov.b32 {%rs57, %rs58}, %r44;
-; CHECK-NEXT: mov.b32 {%rs59, %rs60}, %r43;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
-; CHECK-NEXT: mov.b32 {%rs61, %rs62}, %r42;
-; CHECK-NEXT: mov.b32 {%rs63, %rs64}, %r41;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0];
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1];
+; CHECK-NEXT: xor.b32 %r9, %r4, %r8;
+; CHECK-NEXT: xor.b32 %r10, %r3, %r7;
+; CHECK-NEXT: xor.b32 %r11, %r2, %r6;
+; CHECK-NEXT: xor.b32 %r12, %r1, %r5;
+; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2];
+; CHECK-NEXT: and.b32 %r17, %r12, %r13;
+; CHECK-NEXT: and.b32 %r18, %r11, %r14;
+; CHECK-NEXT: and.b32 %r19, %r10, %r15;
+; CHECK-NEXT: and.b32 %r20, %r9, %r16;
+; CHECK-NEXT: xor.b32 %r21, %r20, %r8;
+; CHECK-NEXT: xor.b32 %r23, %r19, %r7;
+; CHECK-NEXT: xor.b32 %r25, %r18, %r6;
+; CHECK-NEXT: xor.b32 %r27, %r17, %r5;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r27, %r25, %r23, %r21};
; CHECK-NEXT: ret;
%n0 = xor <16 x i8> %x, %y
%n1 = and <16 x i8> %n0, %mask
diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll
index 092607462f3329b..8333a9b935d6af8 100644
--- a/llvm/test/CodeGen/NVPTX/vec8.ll
+++ b/llvm/test/CodeGen/NVPTX/vec8.ll
@@ -5,10 +5,9 @@ target triple = "nvptx-unknown-cuda"
; CHECK: .visible .func foo
define void @foo(<8 x i8> %a, ptr %b) {
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
-; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0]
; CHECK-DAG: ld.param.u64 %[[B:rd[0-9+]]], [foo_param_1]
-; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
+; CHECK: add.s16 [[T:%rs[0-9+]]],
; CHECK: st.u8 [%[[B]]], [[T]];
%t0 = extractelement <8 x i8> %a, i32 1
%t1 = extractelement <8 x i8> %a, i32 6
>From bda4bd36ded20dba4ac89824a42b8a2017c41247 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 2 Oct 2023 18:05:42 -0700
Subject: [PATCH 2/2] More work on fleshing out extractelt/build_vector for
v4i8
Verified that NVPTX tests pass with ptxas being able to compile the PTX produced
by llc tests.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 18 ++++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 71 +++++++-------
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 64 +++++++++----
llvm/test/CodeGen/NVPTX/extractelement.ll | 55 ++++++++++-
llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 2 +-
...unfold-masked-merge-vector-variablemask.ll | 95 ++++++++-----------
7 files changed, 196 insertions(+), 110 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1daa4971981c25c..c3bcf8f05a278ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
@@ -3569,6 +3570,23 @@ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
+bool NVPTXDAGToDAGISel::SelectExtractEltFromV4I8(SDValue N, SDValue &V,
+ SDValue &BitOffset) {
+ SDValue Vector = N->getOperand(0);
+ if (!(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Vector->getValueType(0) == MVT::v4i8))
+ return false;
+
+ if (const ConstantSDNode *IdxConst =
+ dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ V = Vector;
+ BitOffset = CurDAG->getTargetConstant(IdxConst->getZExtValue() * 8,
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+ return false;
+}
+
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
unsigned int spN) const {
const Value *Src = nullptr;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 06922331f5e2059..34b5dd449ce086f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -95,6 +95,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
SDValue &Offset);
bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
+ bool SelectExtractEltFromV4I8(SDValue N, SDValue &Value, SDValue &Idx);
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7880d70fb2c6fea..66dcdb53b136b96 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -497,6 +497,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
+ // TODO: we should eventually lower it as PRMT instruction.
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
+
// Operations not directly supported by NVPTX.
for (MVT VT :
{MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::f64,
@@ -2156,45 +2160,47 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
-// We can init constant f16x2 with a single .b32 move. Normally it
+// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
// would get lowered as two constant loads and vector-packing move.
-// mov.b16 %h1, 0x4000;
-// mov.b16 %h2, 0x3C00;
-// mov.b32 %hh2, {%h2, %h1};
// Instead we want just a constant move:
// mov.b32 %hh2, 0x40003C00
-//
-// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
-// generates good SASS in both cases.
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op->getValueType(0);
- if (!(Isv2x16VT(VT)))
+ if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
+ return Op;
+
+ if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
+ return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
+ isa<ConstantFPSDNode>(Operand);
+ }))
return Op;
- APInt E0;
- APInt E1;
- if (VT == MVT::v2f16 || VT == MVT::v2bf16) {
- if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) &&
- isa<ConstantFPSDNode>(Op->getOperand(1))))
- return Op;
-
- E0 = cast<ConstantFPSDNode>(Op->getOperand(0))
- ->getValueAPF()
- .bitcastToAPInt();
- E1 = cast<ConstantFPSDNode>(Op->getOperand(1))
- ->getValueAPF()
- .bitcastToAPInt();
- } else {
- assert(VT == MVT::v2i16);
- if (!(isa<ConstantSDNode>(Op->getOperand(0)) &&
- isa<ConstantSDNode>(Op->getOperand(1))))
- return Op;
- E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue();
- E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue();
+ // Get value of the Nth operand as an APInt(32). Undef values treated as 0.
+ auto GetOperand = [](SDValue Op, int N) -> APInt {
+ const SDValue &Operand = Op->getOperand(N);
+ EVT VT = Op->getValueType(0);
+ if (Operand->isUndef())
+ return APInt(32, 0);
+ APInt Value;
+ if (VT == MVT::v2f16 || VT == MVT::v2bf16)
+ Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
+ else if (VT == MVT::v2i16 || VT == MVT::v4i8)
+ Value = cast<ConstantSDNode>(Operand)->getAPIntValue();
+ else
+ llvm_unreachable("Unsupported type");
+ return Value.zext(32);
+ };
+ APInt Value;
+ if (Isv2x16VT(VT)) {
+ Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
+ } else if (VT == MVT::v4i8) {
+ Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
+ GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
+ } else {
+ llvm_unreachable("Unsupported type");
}
- SDValue Const =
- DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+ SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
}
@@ -5262,11 +5268,12 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
SDValue Vector = N->getOperand(0);
EVT VectorVT = Vector.getValueType();
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
- IsPTXVectorType(VectorVT.getSimpleVT()) && VectorVT != MVT::v4i8)
+ IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
// extract_vector_elt, except for v4i8.
// Don't mess with singletons or v2*16 types, we already handle them OK.
- if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT))
+ if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
+ VectorVT == MVT::v4i8)
return SDValue();
uint64_t VectorBits = VectorVT.getSizeInBits();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 047161fb2027dee..307963aaa800b88 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1738,7 +1738,7 @@ def FUNSHFRCLAMP :
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
-multiclass BFX<string Instr, RegisterClass RC> {
+multiclass BFE<string Instr, RegisterClass RC> {
def rrr
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
@@ -1752,17 +1752,29 @@ multiclass BFX<string Instr, RegisterClass RC> {
(ins RC:$a, i32imm:$b, i32imm:$c),
!strconcat(Instr, " \t$d, $a, $b, $c;"), []>;
}
+multiclass BFI<string Instr, RegisterClass RC> {
+ def rrr
+ : NVPTXInst<(outs RC:$f),
+ (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$f),
+ (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$f),
+ (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+}
let hasSideEffects = false in {
- defm BFE_S32 : BFX<"bfe.s32", Int32Regs>;
- defm BFE_U32 : BFX<"bfe.u32", Int32Regs>;
- defm BFE_S64 : BFX<"bfe.s64", Int64Regs>;
- defm BFE_U64 : BFX<"bfe.u64", Int64Regs>;
-
- defm BFI_S32 : BFX<"bfi.s32", Int32Regs>;
- defm BFI_U32 : BFX<"bfi.u32", Int32Regs>;
- defm BFI_S64 : BFX<"bfi.s64", Int64Regs>;
- defm BFI_U64 : BFX<"bfi.u64", Int64Regs>;
+ defm BFE_S32 : BFE<"bfe.s32", Int32Regs>;
+ defm BFE_U32 : BFE<"bfe.u32", Int32Regs>;
+ defm BFE_S64 : BFE<"bfe.s64", Int64Regs>;
+ defm BFE_U64 : BFE<"bfe.u64", Int64Regs>;
+
+ defm BFI_B32 : BFI<"bfi.b32", Int32Regs>;
+ defm BFI_B64 : BFI<"bfi.b64", Int64Regs>;
}
// Common byte extraction patterns
@@ -1782,6 +1794,24 @@ def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
(CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
+def ExtractFromV4I8 : ComplexPattern<i16, 2, "SelectExtractEltFromV4I8", [extractelt]>;
+def: Pat<(i32 (sext_inreg (i32 (anyext (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)))), i8)),
+ (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8)>;
+def: Pat<(i32 (and (i32 (anyext (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)))), 255)),
+ (BFE_U32rii Int32Regs:$src, imm:$bitidx, 8)>;
+def: Pat<(i16 (sext_inreg (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)), i8)),
+ (CVT_s8_s32 (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8), CvtNONE)>;
+def: Pat<(ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)),
+ (CVT_s16_s32 (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8), CvtNONE)>;
+
+
+def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
+ (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
+ (BFI_B32rii
+ (BFI_B32rii (CVT_u32_u16 Int16Regs:$d, CvtNONE), (CVT_u32_u16 Int16Regs:$c, CvtNONE), 8, 8),
+ (BFI_B32rii (CVT_u32_u16 Int16Regs:$b, CvtNONE), (CVT_u32_u16 Int16Regs:$a, CvtNONE), 8, 8),
+ 16, 16)>;
+
//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------
@@ -3293,10 +3323,6 @@ let hasSideEffects = false in {
(ins Int16Regs:$s1, Int16Regs:$s2,
Int16Regs:$s3, Int16Regs:$s4),
"mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
- def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2,
- Int16Regs:$s3, Int16Regs:$s4),
- "mov.b32 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2),
"mov.b32 \t$d, {{$s1, $s2}};", []>;
@@ -3312,10 +3338,6 @@ let hasSideEffects = false in {
Int16Regs:$d3, Int16Regs:$d4),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
- def I32toV4I8 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
- Int16Regs:$d3, Int16Regs:$d4),
- (ins Int32Regs:$s),
- "mov.b32 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
(ins Int32Regs:$s),
"mov.b32 \t{{$d1, $d2}}, $s;", []>;
@@ -3351,6 +3373,9 @@ def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
(I64toI32H Int64Regs:$s)>;
+def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
+ (CVT_INREG_s32_s16 Int32Regs:$src)>;
+
foreach vt = [v2f16, v2bf16, v2i16] in {
def : Pat<(extractelt (vt Int32Regs:$src), 0),
(I32toI16L Int32Regs:$src)>;
@@ -3363,9 +3388,6 @@ def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
-def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
- (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
- (V4I8toI32 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c, Int16Regs:$d)>;
// Count leading zeros
let hasSideEffects = false in {
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index da07f973501c855..ed7dd45ab7b4502 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -18,7 +18,8 @@ define i16 @test_v2i8(i16 %a) {
; CHECK-LABEL: test_v4i8
; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_param_0];
-; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R]];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R0]];
; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9+]]], [[R1]];
; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
@@ -41,6 +42,58 @@ define i16 @test_v4i8(i32 %a) {
ret i16 %r
}
+; CHECK-LABEL: test_v4i8_s32
+; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_s32_param_0];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+define i32 @test_v4i8_s32(i32 %a) {
+ %v = bitcast i32 %a to <4 x i8>
+ %r0 = extractelement <4 x i8> %v, i64 0
+ %r1 = extractelement <4 x i8> %v, i64 1
+ %r2 = extractelement <4 x i8> %v, i64 2
+ %r3 = extractelement <4 x i8> %v, i64 3
+ %r0i = sext i8 %r0 to i32
+ %r1i = sext i8 %r1 to i32
+ %r2i = sext i8 %r2 to i32
+ %r3i = sext i8 %r3 to i32
+ %r01 = add i32 %r0i, %r1i
+ %r23 = add i32 %r2i, %r3i
+ %r = add i32 %r01, %r23
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_v4i8_u32
+; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_u32_param_0];
+; CHECK-DAG: bfe.u32 [[R0:%r[0-9+]]], [[R]], 0, 8;
+; CHECK-DAG: bfe.u32 [[R1:%r[0-9+]]], [[R]], 8, 8;
+; CHECK-DAG: bfe.u32 [[R2:%r[0-9+]]], [[R]], 16, 8;
+; CHECK-DAG: bfe.u32 [[R3:%r[0-9+]]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+define i32 @test_v4i8_u32(i32 %a) {
+ %v = bitcast i32 %a to <4 x i8>
+ %r0 = extractelement <4 x i8> %v, i64 0
+ %r1 = extractelement <4 x i8> %v, i64 1
+ %r2 = extractelement <4 x i8> %v, i64 2
+ %r3 = extractelement <4 x i8> %v, i64 3
+ %r0i = zext i8 %r0 to i32
+ %r1i = zext i8 %r1 to i32
+ %r2i = zext i8 %r2 to i32
+ %r3i = zext i8 %r3 to i32
+ %r01 = add i32 %r0i, %r1i
+ %r23 = add i32 %r2i, %r3i
+ %r = add i32 %r01, %r23
+ ret i32 %r
+}
+
+
+
; CHECK-LABEL: test_v8i8
; CHECK: ld.param.u64 [[R:%rd[0-9+]]], [test_v8i8_param_0];
; CHECK-DAG: cvt.s8.s64 [[E0:%rs[0-9+]]], [[R]];
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 5a22bbcf7416c17..684e4bc38d83de1 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -5,7 +5,7 @@
; RUN: %if ptxas %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: | %ptxas-verify -arch=sm_90 \
; RUN: %}
; ## No support for i16x2 instructions
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 74087be4834d966..97b1e38a3388413 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -60,20 +60,17 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_2];
; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_1];
; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_param_0];
; CHECK-NEXT: and.b32 %r5, %r4, %r1;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs1, %rs1};
-; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
-; CHECK-NEXT: and.b32 %r9, %r3, %r8;
-; CHECK-NEXT: or.b32 %r10, %r5, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
+; CHECK-NEXT: xor.b32 %r7, %r1, -1;
+; CHECK-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-NEXT: or.b32 %r9, %r5, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -85,20 +82,17 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8_undef(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_2];
; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1];
; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0];
; CHECK-NEXT: and.b32 %r5, %r4, %r1;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs2, %rs1};
-; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
-; CHECK-NEXT: and.b32 %r9, %r3, %r8;
-; CHECK-NEXT: or.b32 %r10, %r5, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
+; CHECK-NEXT: xor.b32 %r7, %r1, -1;
+; CHECK-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-NEXT: or.b32 %r9, %r5, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -158,8 +152,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: out_v8i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b32 %r<21>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_1];
@@ -167,15 +160,13 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v8i8_param_0];
; CHECK-NEXT: and.b32 %r11, %r9, %r5;
; CHECK-NEXT: and.b32 %r13, %r10, %r6;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r15, {%rs1, %rs1, %rs1, %rs1};
-; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
-; CHECK-NEXT: xor.b32 %r17, %r5, %r15;
-; CHECK-NEXT: and.b32 %r18, %r1, %r17;
-; CHECK-NEXT: and.b32 %r19, %r2, %r16;
-; CHECK-NEXT: or.b32 %r20, %r13, %r19;
-; CHECK-NEXT: or.b32 %r21, %r11, %r18;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
+; CHECK-NEXT: xor.b32 %r15, %r6, -1;
+; CHECK-NEXT: xor.b32 %r16, %r5, -1;
+; CHECK-NEXT: and.b32 %r17, %r1, %r16;
+; CHECK-NEXT: and.b32 %r18, %r2, %r15;
+; CHECK-NEXT: or.b32 %r19, %r13, %r18;
+; CHECK-NEXT: or.b32 %r20, %r11, %r17;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
; CHECK-NEXT: ret;
%mx = and <8 x i8> %x, %mask
%notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -213,8 +204,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
; CHECK-LABEL: out_v4i16_undef(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b32 %r<21>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_1];
@@ -222,15 +212,13 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v4i16_undef_param_0];
; CHECK-NEXT: and.b32 %r11, %r9, %r5;
; CHECK-NEXT: and.b32 %r13, %r10, %r6;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r15, {%rs2, %rs1};
-; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
-; CHECK-NEXT: xor.b32 %r17, %r5, -1;
-; CHECK-NEXT: and.b32 %r18, %r1, %r17;
-; CHECK-NEXT: and.b32 %r19, %r2, %r16;
-; CHECK-NEXT: or.b32 %r20, %r13, %r19;
-; CHECK-NEXT: or.b32 %r21, %r11, %r18;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
+; CHECK-NEXT: xor.b32 %r15, %r6, -65536;
+; CHECK-NEXT: xor.b32 %r16, %r5, -1;
+; CHECK-NEXT: and.b32 %r17, %r1, %r16;
+; CHECK-NEXT: and.b32 %r18, %r2, %r15;
+; CHECK-NEXT: or.b32 %r19, %r13, %r18;
+; CHECK-NEXT: or.b32 %r20, %r11, %r17;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
; CHECK-NEXT: ret;
%mx = and <4 x i16> %x, %mask
%notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
@@ -294,8 +282,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: out_v16i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<42>;
+; CHECK-NEXT: .reg .b32 %r<41>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_1];
@@ -305,21 +292,19 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-NEXT: and.b32 %r23, %r18, %r10;
; CHECK-NEXT: and.b32 %r25, %r19, %r11;
; CHECK-NEXT: and.b32 %r27, %r20, %r12;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r29, {%rs1, %rs1, %rs1, %rs1};
-; CHECK-NEXT: xor.b32 %r30, %r12, %r29;
-; CHECK-NEXT: xor.b32 %r31, %r11, %r29;
-; CHECK-NEXT: xor.b32 %r32, %r10, %r29;
-; CHECK-NEXT: xor.b32 %r33, %r9, %r29;
-; CHECK-NEXT: and.b32 %r34, %r1, %r33;
-; CHECK-NEXT: and.b32 %r35, %r2, %r32;
-; CHECK-NEXT: and.b32 %r36, %r3, %r31;
-; CHECK-NEXT: and.b32 %r37, %r4, %r30;
-; CHECK-NEXT: or.b32 %r38, %r27, %r37;
-; CHECK-NEXT: or.b32 %r39, %r25, %r36;
-; CHECK-NEXT: or.b32 %r40, %r23, %r35;
-; CHECK-NEXT: or.b32 %r41, %r21, %r34;
-; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r41, %r40, %r39, %r38};
+; CHECK-NEXT: xor.b32 %r29, %r12, -1;
+; CHECK-NEXT: xor.b32 %r30, %r11, -1;
+; CHECK-NEXT: xor.b32 %r31, %r10, -1;
+; CHECK-NEXT: xor.b32 %r32, %r9, -1;
+; CHECK-NEXT: and.b32 %r33, %r1, %r32;
+; CHECK-NEXT: and.b32 %r34, %r2, %r31;
+; CHECK-NEXT: and.b32 %r35, %r3, %r30;
+; CHECK-NEXT: and.b32 %r36, %r4, %r29;
+; CHECK-NEXT: or.b32 %r37, %r27, %r36;
+; CHECK-NEXT: or.b32 %r38, %r25, %r35;
+; CHECK-NEXT: or.b32 %r39, %r23, %r34;
+; CHECK-NEXT: or.b32 %r40, %r21, %r33;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r40, %r39, %r38, %r37};
; CHECK-NEXT: ret;
%mx = and <16 x i8> %x, %mask
%notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
More information about the llvm-commits
mailing list