[Lldb-commits] [lldb] [NVPTX] Improve lowering of v4i8 (PR #67866)
Artem Belevich via lldb-commits
lldb-commits at lists.llvm.org
Mon Oct 9 12:19:22 PDT 2023
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/67866
>From 4771c973c4659b814eacbacc23bd3c6c877ce2da Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 29 Sep 2023 14:37:46 -0700
Subject: [PATCH 01/10] [NVPTX] Improve lowering of v4i8
Make it a legal type and plumb through lowering of relevant instructions.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 20 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 20 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 52 +-
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 2 +-
.../NVPTX/load-with-non-coherent-cache.ll | 4 +-
llvm/test/CodeGen/NVPTX/param-load-store.ll | 26 +-
...unfold-masked-merge-vector-variablemask.ll | 518 ++++--------------
llvm/test/CodeGen/NVPTX/vec8.ll | 5 +-
8 files changed, 177 insertions(+), 470 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0aef2591c6e2394..1daa4971981c25c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -829,6 +829,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
case MVT::v2f16:
case MVT::v2bf16:
case MVT::v2i16:
+ case MVT::v4i8:
return Opcode_i32;
case MVT::f32:
return Opcode_f32;
@@ -910,7 +911,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// Vector Setting
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
if (SimpleVT.isVector()) {
- assert(Isv2x16VT(LoadedVT) && "Unexpected vector type");
+ assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
+ "Unexpected vector type");
// v2f16/v2bf16/v2i16 is loaded using ld.b32
fromTypeWidth = 32;
}
@@ -1254,6 +1256,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
SDLoc DL(N);
SDNode *LD;
SDValue Base, Offset, Addr;
+ EVT OrigType = N->getValueType(0);
EVT EltVT = Mem->getMemoryVT();
unsigned NumElts = 1;
@@ -1261,12 +1264,15 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
// vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
- if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) ||
- (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) ||
- (EltVT == MVT::i16 && N->getValueType(0) == MVT::v2i16)) {
+ if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
+ (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
+ (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
assert(NumElts % 2 == 0 && "Vector must have even number of elements");
- EltVT = N->getValueType(0);
+ EltVT = OrigType;
NumElts /= 2;
+ } else if (OrigType == MVT::v4i8) {
+ EltVT = OrigType;
+ NumElts = 1;
}
}
@@ -1601,7 +1607,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
// concept of sign-/zero-extension, so emulate it here by adding an explicit
// CVT instruction. Ptxas should clean up any redundancies here.
- EVT OrigType = N->getValueType(0);
LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
if (OrigType != EltVT &&
@@ -1679,7 +1684,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MVT ScalarVT = SimpleVT.getScalarType();
unsigned toTypeWidth = ScalarVT.getSizeInBits();
if (SimpleVT.isVector()) {
- assert(Isv2x16VT(StoreVT) && "Unexpected vector type");
+ assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
+ "Unexpected vector type");
// v2x16 is stored using st.b32
toTypeWidth = 32;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b24aae4792ce6a6..7880d70fb2c6fea 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -221,6 +221,11 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
llvm_unreachable("Unexpected type");
}
NumElts /= 2;
+ } else if (EltVT.getSimpleVT() == MVT::i8 &&
+ (NumElts % 4 == 0 || NumElts == 3)) {
+ // v*i8 are formally lowered as v4i8
+ EltVT = MVT::v4i8;
+ NumElts = (NumElts + 3) / 4;
}
for (unsigned j = 0; j != NumElts; ++j) {
ValueVTs.push_back(EltVT);
@@ -458,6 +463,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
@@ -2631,7 +2637,7 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return expandUnalignedStore(Store, DAG);
// v2f16, v2bf16 and v2i16 don't need special handling.
- if (Isv2x16VT(VT))
+ if (Isv2x16VT(VT) || VT == MVT::v4i8)
return SDValue();
if (VT.isVector())
@@ -2903,7 +2909,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
EVT LoadVT = EltVT;
if (EltVT == MVT::i1)
LoadVT = MVT::i8;
- else if (Isv2x16VT(EltVT))
+ else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
// getLoad needs a vector type, but it can't handle
// vectors which contain v2f16 or v2bf16 elements. So we must load
// using i32 here and then bitcast back.
@@ -2929,7 +2935,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
if (EltVT == MVT::i1)
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
// v2f16 was loaded as an i32. Now we must bitcast it back.
- else if (Isv2x16VT(EltVT))
+ else if (EltVT != LoadVT)
Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
// If a promoted integer type is used, truncate down to the original
@@ -5256,9 +5262,9 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
SDValue Vector = N->getOperand(0);
EVT VectorVT = Vector.getValueType();
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
- IsPTXVectorType(VectorVT.getSimpleVT()))
+ IsPTXVectorType(VectorVT.getSimpleVT()) && VectorVT != MVT::v4i8)
return SDValue(); // Native vector loads already combine nicely w/
- // extract_vector_elt.
+ // extract_vector_elt, except for v4i8.
// Don't mess with singletons or v2*16 types, we already handle them OK.
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT))
return SDValue();
@@ -5289,6 +5295,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
// If element has non-integer type, bitcast it back to the expected type.
if (EltVT != EltIVT)
Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
+ // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
+ if (EltVT != N->getValueType(0))
+ Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
+
return Result;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 28c4cadb303ad4f..047161fb2027dee 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1486,23 +1486,24 @@ defm OR : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;
-// Lower logical v2i16 ops as bitwise ops on b32.
-def: Pat<(or (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
- (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
-def: Pat<(xor (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
- (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
-def: Pat<(and (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)),
- (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
-
-// The constants get legalized into a bitcast from i32, so that's what we need
-// to match here.
-def: Pat<(or Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
- (ORb32ri Int32Regs:$a, imm:$b)>;
-def: Pat<(xor Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
- (XORb32ri Int32Regs:$a, imm:$b)>;
-def: Pat<(and Int32Regs:$a, (v2i16 (bitconvert (i32 imm:$b)))),
- (ANDb32ri Int32Regs:$a, imm:$b)>;
-
+// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
+foreach vt = [v2i16, v4i8] in {
+ def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
+ (ORb32rr Int32Regs:$a, Int32Regs:$b)>;
+ def: Pat<(xor (vt Int32Regs:$a), (vt Int32Regs:$b)),
+ (XORb32rr Int32Regs:$a, Int32Regs:$b)>;
+ def: Pat<(and (vt Int32Regs:$a), (vt Int32Regs:$b)),
+ (ANDb32rr Int32Regs:$a, Int32Regs:$b)>;
+
+ // The constants get legalized into a bitcast from i32, so that's what we need
+ // to match here.
+ def: Pat<(or Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
+ (ORb32ri Int32Regs:$a, imm:$b)>;
+ def: Pat<(xor Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
+ (XORb32ri Int32Regs:$a, imm:$b)>;
+ def: Pat<(and Int32Regs:$a, (vt (bitconvert (i32 imm:$b)))),
+ (ANDb32ri Int32Regs:$a, imm:$b)>;
+}
def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
"not.pred \t$dst, $src;",
@@ -2682,7 +2683,7 @@ foreach vt = [f16, bf16] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
}
-foreach vt = [v2f16, v2bf16, v2i16] in {
+foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
}
@@ -2995,8 +2996,8 @@ def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
(ProxyRegI16 Int16Regs:$a)>;
}
-foreach ta = [v2f16, v2bf16, v2i16, i32] in {
- foreach tb = [v2f16, v2bf16, v2i16, i32] in {
+foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
+ foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
if !ne(ta, tb) then {
def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
(ProxyRegI32 Int32Regs:$a)>;
@@ -3292,6 +3293,10 @@ let hasSideEffects = false in {
(ins Int16Regs:$s1, Int16Regs:$s2,
Int16Regs:$s3, Int16Regs:$s4),
"mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b32 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2),
"mov.b32 \t$d, {{$s1, $s2}};", []>;
@@ -3307,6 +3312,10 @@ let hasSideEffects = false in {
Int16Regs:$d3, Int16Regs:$d4),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ def I32toV4I8 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int32Regs:$s),
+ "mov.b32 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
(ins Int32Regs:$s),
"mov.b32 \t{{$d1, $d2}}, $s;", []>;
@@ -3354,6 +3363,9 @@ def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
+def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
+ (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
+ (V4I8toI32 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c, Int16Regs:$d)>;
// Count leading zeros
let hasSideEffects = false in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index ed9dabf39dd7ad9..b5231a9cf67f93a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -58,7 +58,7 @@ foreach i = 0...31 in {
//===----------------------------------------------------------------------===//
def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>;
-def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16], 32,
+def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
(add (sequence "R%u", 0, 4),
VRFrame32, VRFrameLocal32)>;
def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index 9012339fb6b1e20..98ab93774588d28 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -130,9 +130,9 @@ define void @foo12(ptr noalias readonly %from, ptr %to) {
}
; SM20-LABEL: .visible .entry foo13(
-; SM20: ld.global.v4.u8
+; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo13(
-; SM35: ld.global.nc.v4.u8
+; SM35: ld.global.nc.u32
define void @foo13(ptr noalias readonly %from, ptr %to) {
%1 = load <4 x i8>, ptr %from
store <4 x i8> %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 2d87271e30ae0b7..b4208c691c91dfa 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -212,18 +212,16 @@ define signext i8 @test_i8s(i8 signext %a) {
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v3i8(
; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
-; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
-; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v3i8_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+2], [[E2]];
+; CHECK: st.param.b32 [param0+0], [[R]]
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i8,
-; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0+0];
+; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very
+; interesting here, so it's skipped.
+; CHECK: st.param.b32 [func_retval0+0],
; CHECK-NEXT: ret;
define <3 x i8> @test_v3i8(<3 x i8> %a) {
%r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
@@ -233,14 +231,14 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) {
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v4i8(
; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
-; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0]
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: st.param.b32 [param0+0], [[R]];
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v4i8,
-; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[RET]];
; CHECK-NEXT: ret;
define <4 x i8> @test_v4i8(<4 x i8> %a) {
%r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
@@ -250,10 +248,10 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) {
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v5i8(
; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_v5i8_param_0]
; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.v4.b8 [param0+0],
; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 16579de882ed4b6..74087be4834d966 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -60,46 +60,20 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot2[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b32 %r<21>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot2;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_param_2];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: and.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_param_1];
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r8, {%rs9, %rs10};
-; CHECK-NEXT: xor.b32 %r9, %r5, 16711935;
-; CHECK-NEXT: xor.b32 %r10, %r3, 16711935;
-; CHECK-NEXT: and.b32 %r11, %r8, %r10;
-; CHECK-NEXT: and.b32 %r12, %r7, %r9;
-; CHECK-NEXT: or.b32 %r13, %r6, %r12;
-; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r13;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs13, %rs14};
-; CHECK-NEXT: or.b32 %r14, %r4, %r11;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r14;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs15, %rs16};
-; CHECK-NEXT: ld.u16 %r15, [%SP+0];
-; CHECK-NEXT: shl.b32 %r16, %r15, 16;
-; CHECK-NEXT: ld.u16 %r17, [%SP+2];
-; CHECK-NEXT: or.b32 %r18, %r17, %r16;
-; CHECK-NEXT: shr.u32 %r19, %r18, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs17, %r19;
-; CHECK-NEXT: cvt.u16.u32 %rs18, %r15;
-; CHECK-NEXT: bfe.s32 %r20, %r15, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r20;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
+; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_2];
+; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r1;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs1, %rs1};
+; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
+; CHECK-NEXT: and.b32 %r9, %r3, %r8;
+; CHECK-NEXT: or.b32 %r10, %r5, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -111,48 +85,20 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8_undef(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot3[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<22>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot3;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v4i8_undef_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v4i8_undef_param_2];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: and.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8};
-; CHECK-NEXT: and.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v4i8_undef_param_1];
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r8, {%rs9, %rs10};
-; CHECK-NEXT: mov.u16 %rs13, 255;
-; CHECK-NEXT: mov.b32 %r9, {%rs14, %rs13};
-; CHECK-NEXT: xor.b32 %r10, %r5, %r9;
-; CHECK-NEXT: xor.b32 %r11, %r3, 16711935;
-; CHECK-NEXT: and.b32 %r12, %r8, %r11;
-; CHECK-NEXT: and.b32 %r13, %r7, %r10;
-; CHECK-NEXT: or.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r14;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs15, %rs16};
-; CHECK-NEXT: or.b32 %r15, %r4, %r12;
-; CHECK-NEXT: mov.b32 {%rs17, %rs18}, %r15;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs17, %rs18};
-; CHECK-NEXT: ld.u16 %r16, [%SP+0];
-; CHECK-NEXT: shl.b32 %r17, %r16, 16;
-; CHECK-NEXT: ld.u16 %r18, [%SP+2];
-; CHECK-NEXT: or.b32 %r19, %r18, %r17;
-; CHECK-NEXT: shr.u32 %r20, %r19, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r20;
-; CHECK-NEXT: cvt.u16.u32 %rs20, %r16;
-; CHECK-NEXT: bfe.s32 %r21, %r16, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs21, %r21;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs17, %rs19, %rs20, %rs21};
+; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_2];
+; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r1;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs2, %rs1};
+; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
+; CHECK-NEXT: and.b32 %r9, %r3, %r8;
+; CHECK-NEXT: or.b32 %r10, %r5, %r9;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -212,84 +158,24 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: out_v8i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot6[8];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<40>;
-; CHECK-NEXT: .reg .b32 %r<38>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<22>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot6;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v8i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v8i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: mov.b32 %r4, {%rs5, %rs6};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v8i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r5, {%rs9, %rs10};
-; CHECK-NEXT: and.b32 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 %r7, {%rs11, %rs12};
-; CHECK-NEXT: and.b32 %r8, %r3, %r7;
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v8i8_param_2];
-; CHECK-NEXT: mov.b32 %r9, {%rs13, %rs14};
-; CHECK-NEXT: and.b32 %r10, %r2, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs15, %rs16};
-; CHECK-NEXT: and.b32 %r12, %r1, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v8i8_param_1];
-; CHECK-NEXT: mov.b32 %r13, {%rs19, %rs20};
-; CHECK-NEXT: mov.b32 %r14, {%rs17, %rs18};
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v8i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r15, {%rs23, %rs24};
-; CHECK-NEXT: mov.b32 %r16, {%rs21, %rs22};
-; CHECK-NEXT: xor.b32 %r17, %r11, 16711935;
-; CHECK-NEXT: xor.b32 %r18, %r9, 16711935;
-; CHECK-NEXT: xor.b32 %r19, %r7, 16711935;
-; CHECK-NEXT: xor.b32 %r20, %r5, 16711935;
-; CHECK-NEXT: and.b32 %r21, %r16, %r20;
-; CHECK-NEXT: and.b32 %r22, %r15, %r19;
-; CHECK-NEXT: and.b32 %r23, %r14, %r18;
-; CHECK-NEXT: and.b32 %r24, %r13, %r17;
-; CHECK-NEXT: or.b32 %r25, %r12, %r24;
-; CHECK-NEXT: mov.b32 {%rs25, %rs26}, %r25;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs25, %rs26};
-; CHECK-NEXT: or.b32 %r26, %r10, %r23;
-; CHECK-NEXT: mov.b32 {%rs27, %rs28}, %r26;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs27, %rs28};
-; CHECK-NEXT: or.b32 %r27, %r8, %r22;
-; CHECK-NEXT: mov.b32 {%rs29, %rs30}, %r27;
-; CHECK-NEXT: st.v2.u8 [%SP+4], {%rs29, %rs30};
-; CHECK-NEXT: or.b32 %r28, %r6, %r21;
-; CHECK-NEXT: mov.b32 {%rs31, %rs32}, %r28;
-; CHECK-NEXT: st.v2.u8 [%SP+6], {%rs31, %rs32};
-; CHECK-NEXT: ld.u16 %r29, [%SP+0];
-; CHECK-NEXT: shl.b32 %r30, %r29, 16;
-; CHECK-NEXT: ld.u16 %r31, [%SP+2];
-; CHECK-NEXT: or.b32 %r32, %r31, %r30;
-; CHECK-NEXT: cvt.u64.u32 %rd1, %r32;
-; CHECK-NEXT: ld.u16 %r33, [%SP+4];
-; CHECK-NEXT: shl.b32 %r34, %r33, 16;
-; CHECK-NEXT: ld.u16 %r35, [%SP+6];
-; CHECK-NEXT: or.b32 %r36, %r35, %r34;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r36;
-; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: shr.u32 %r37, %r36, 8;
-; CHECK-NEXT: shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT: cvt.u16.u64 %rs33, %rd5;
-; CHECK-NEXT: shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT: cvt.u16.u64 %rs34, %rd6;
-; CHECK-NEXT: shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs35, %rd7;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
-; CHECK-NEXT: cvt.u16.u32 %rs36, %r37;
-; CHECK-NEXT: bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs37, %rd8;
-; CHECK-NEXT: cvt.u16.u32 %rs38, %r33;
-; CHECK-NEXT: cvt.u16.u32 %rs39, %r35;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_1];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [out_v8i8_param_2];
+; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v8i8_param_0];
+; CHECK-NEXT: and.b32 %r11, %r9, %r5;
+; CHECK-NEXT: and.b32 %r13, %r10, %r6;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r15, {%rs1, %rs1, %rs1, %rs1};
+; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
+; CHECK-NEXT: xor.b32 %r17, %r5, %r15;
+; CHECK-NEXT: and.b32 %r18, %r1, %r17;
+; CHECK-NEXT: and.b32 %r19, %r2, %r16;
+; CHECK-NEXT: or.b32 %r20, %r13, %r19;
+; CHECK-NEXT: or.b32 %r21, %r11, %r18;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
; CHECK-NEXT: ret;
%mx = and <8 x i8> %x, %mask
%notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -408,90 +294,32 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: out_v16i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<65>;
-; CHECK-NEXT: .reg .b32 %r<57>;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<42>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [out_v16i8_param_0+12];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [out_v16i8_param_0+8];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [out_v16i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r5, {%rs9, %rs10};
-; CHECK-NEXT: mov.b32 %r6, {%rs11, %rs12};
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [out_v16i8_param_0];
-; CHECK-NEXT: mov.b32 %r7, {%rs13, %rs14};
-; CHECK-NEXT: mov.b32 %r8, {%rs15, %rs16};
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [out_v16i8_param_2];
-; CHECK-NEXT: mov.b32 %r9, {%rs19, %rs20};
-; CHECK-NEXT: and.b32 %r10, %r8, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs17, %rs18};
-; CHECK-NEXT: and.b32 %r12, %r7, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [out_v16i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r13, {%rs23, %rs24};
-; CHECK-NEXT: and.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs21, %rs22};
-; CHECK-NEXT: and.b32 %r16, %r5, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [out_v16i8_param_2+8];
-; CHECK-NEXT: mov.b32 %r17, {%rs27, %rs28};
-; CHECK-NEXT: and.b32 %r18, %r4, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs25, %rs26};
-; CHECK-NEXT: and.b32 %r20, %r3, %r19;
-; CHECK-NEXT: ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [out_v16i8_param_2+12];
-; CHECK-NEXT: mov.b32 %r21, {%rs31, %rs32};
-; CHECK-NEXT: and.b32 %r22, %r2, %r21;
-; CHECK-NEXT: mov.b32 %r23, {%rs29, %rs30};
-; CHECK-NEXT: and.b32 %r24, %r1, %r23;
-; CHECK-NEXT: ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [out_v16i8_param_1+12];
-; CHECK-NEXT: mov.b32 %r25, {%rs33, %rs34};
-; CHECK-NEXT: mov.b32 %r26, {%rs35, %rs36};
-; CHECK-NEXT: ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [out_v16i8_param_1+8];
-; CHECK-NEXT: mov.b32 %r27, {%rs37, %rs38};
-; CHECK-NEXT: mov.b32 %r28, {%rs39, %rs40};
-; CHECK-NEXT: ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [out_v16i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r29, {%rs41, %rs42};
-; CHECK-NEXT: mov.b32 %r30, {%rs43, %rs44};
-; CHECK-NEXT: ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [out_v16i8_param_1];
-; CHECK-NEXT: mov.b32 %r31, {%rs45, %rs46};
-; CHECK-NEXT: mov.b32 %r32, {%rs47, %rs48};
-; CHECK-NEXT: xor.b32 %r33, %r23, 16711935;
-; CHECK-NEXT: xor.b32 %r34, %r21, 16711935;
-; CHECK-NEXT: xor.b32 %r35, %r19, 16711935;
-; CHECK-NEXT: xor.b32 %r36, %r17, 16711935;
-; CHECK-NEXT: xor.b32 %r37, %r15, 16711935;
-; CHECK-NEXT: xor.b32 %r38, %r13, 16711935;
-; CHECK-NEXT: xor.b32 %r39, %r11, 16711935;
-; CHECK-NEXT: xor.b32 %r40, %r9, 16711935;
-; CHECK-NEXT: and.b32 %r41, %r32, %r40;
-; CHECK-NEXT: and.b32 %r42, %r31, %r39;
-; CHECK-NEXT: and.b32 %r43, %r30, %r38;
-; CHECK-NEXT: and.b32 %r44, %r29, %r37;
-; CHECK-NEXT: and.b32 %r45, %r28, %r36;
-; CHECK-NEXT: and.b32 %r46, %r27, %r35;
-; CHECK-NEXT: and.b32 %r47, %r26, %r34;
-; CHECK-NEXT: and.b32 %r48, %r25, %r33;
-; CHECK-NEXT: or.b32 %r49, %r24, %r48;
-; CHECK-NEXT: or.b32 %r50, %r22, %r47;
-; CHECK-NEXT: or.b32 %r51, %r20, %r46;
-; CHECK-NEXT: or.b32 %r52, %r18, %r45;
-; CHECK-NEXT: or.b32 %r53, %r16, %r44;
-; CHECK-NEXT: or.b32 %r54, %r14, %r43;
-; CHECK-NEXT: or.b32 %r55, %r12, %r42;
-; CHECK-NEXT: or.b32 %r56, %r10, %r41;
-; CHECK-NEXT: mov.b32 {%rs49, %rs50}, %r56;
-; CHECK-NEXT: mov.b32 {%rs51, %rs52}, %r55;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
-; CHECK-NEXT: mov.b32 {%rs53, %rs54}, %r54;
-; CHECK-NEXT: mov.b32 {%rs55, %rs56}, %r53;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
-; CHECK-NEXT: mov.b32 {%rs57, %rs58}, %r52;
-; CHECK-NEXT: mov.b32 {%rs59, %rs60}, %r51;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
-; CHECK-NEXT: mov.b32 {%rs61, %rs62}, %r50;
-; CHECK-NEXT: mov.b32 {%rs63, %rs64}, %r49;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_1];
+; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [out_v16i8_param_2];
+; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [out_v16i8_param_0];
+; CHECK-NEXT: and.b32 %r21, %r17, %r9;
+; CHECK-NEXT: and.b32 %r23, %r18, %r10;
+; CHECK-NEXT: and.b32 %r25, %r19, %r11;
+; CHECK-NEXT: and.b32 %r27, %r20, %r12;
+; CHECK-NEXT: mov.u16 %rs1, -1;
+; CHECK-NEXT: mov.b32 %r29, {%rs1, %rs1, %rs1, %rs1};
+; CHECK-NEXT: xor.b32 %r30, %r12, %r29;
+; CHECK-NEXT: xor.b32 %r31, %r11, %r29;
+; CHECK-NEXT: xor.b32 %r32, %r10, %r29;
+; CHECK-NEXT: xor.b32 %r33, %r9, %r29;
+; CHECK-NEXT: and.b32 %r34, %r1, %r33;
+; CHECK-NEXT: and.b32 %r35, %r2, %r32;
+; CHECK-NEXT: and.b32 %r36, %r3, %r31;
+; CHECK-NEXT: and.b32 %r37, %r4, %r30;
+; CHECK-NEXT: or.b32 %r38, %r27, %r37;
+; CHECK-NEXT: or.b32 %r39, %r25, %r36;
+; CHECK-NEXT: or.b32 %r40, %r23, %r35;
+; CHECK-NEXT: or.b32 %r41, %r21, %r34;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r41, %r40, %r39, %r38};
; CHECK-NEXT: ret;
%mx = and <16 x i8> %x, %mask
%notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -684,44 +512,16 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind
define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: in_v4i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot18[4];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot18;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v4i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v4i8_param_1];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: xor.b32 %r4, %r2, %r3;
-; CHECK-NEXT: mov.b32 %r5, {%rs5, %rs6};
-; CHECK-NEXT: xor.b32 %r6, %r1, %r5;
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v4i8_param_2];
-; CHECK-NEXT: mov.b32 %r7, {%rs9, %rs10};
-; CHECK-NEXT: and.b32 %r8, %r6, %r7;
-; CHECK-NEXT: mov.b32 %r9, {%rs11, %rs12};
-; CHECK-NEXT: and.b32 %r10, %r4, %r9;
-; CHECK-NEXT: xor.b32 %r11, %r10, %r3;
-; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r11;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs13, %rs14};
-; CHECK-NEXT: xor.b32 %r12, %r8, %r5;
-; CHECK-NEXT: mov.b32 {%rs15, %rs16}, %r12;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs15, %rs16};
-; CHECK-NEXT: ld.u16 %r13, [%SP+0];
-; CHECK-NEXT: shl.b32 %r14, %r13, 16;
-; CHECK-NEXT: ld.u16 %r15, [%SP+2];
-; CHECK-NEXT: or.b32 %r16, %r15, %r14;
-; CHECK-NEXT: shr.u32 %r17, %r16, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs17, %r17;
-; CHECK-NEXT: cvt.u16.u32 %rs18, %r13;
-; CHECK-NEXT: bfe.s32 %r18, %r13, 8, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs19, %r18;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs15, %rs17, %rs18, %rs19};
+; CHECK-NEXT: ld.param.u32 %r1, [in_v4i8_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [in_v4i8_param_1];
+; CHECK-NEXT: xor.b32 %r3, %r1, %r2;
+; CHECK-NEXT: ld.param.u32 %r4, [in_v4i8_param_2];
+; CHECK-NEXT: and.b32 %r5, %r3, %r4;
+; CHECK-NEXT: xor.b32 %r6, %r5, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6;
; CHECK-NEXT: ret;
%n0 = xor <4 x i8> %x, %y
%n1 = and <4 x i8> %n0, %mask
@@ -776,80 +576,19 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind
define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: in_v8i8(
; CHECK: {
-; CHECK-NEXT: .local .align 2 .b8 __local_depot21[8];
-; CHECK-NEXT: .reg .b64 %SP;
-; CHECK-NEXT: .reg .b64 %SPL;
-; CHECK-NEXT: .reg .b16 %rs<40>;
-; CHECK-NEXT: .reg .b32 %r<34>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b32 %r<15>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u64 %SPL, __local_depot21;
-; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v8i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: mov.b32 %r2, {%rs3, %rs4};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v8i8_param_0];
-; CHECK-NEXT: mov.b32 %r3, {%rs5, %rs6};
-; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v8i8_param_1];
-; CHECK-NEXT: mov.b32 %r5, {%rs11, %rs12};
-; CHECK-NEXT: xor.b32 %r6, %r4, %r5;
-; CHECK-NEXT: mov.b32 %r7, {%rs9, %rs10};
-; CHECK-NEXT: xor.b32 %r8, %r3, %r7;
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v8i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r9, {%rs15, %rs16};
-; CHECK-NEXT: xor.b32 %r10, %r2, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs13, %rs14};
-; CHECK-NEXT: xor.b32 %r12, %r1, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v8i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r13, {%rs17, %rs18};
-; CHECK-NEXT: and.b32 %r14, %r12, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs19, %rs20};
-; CHECK-NEXT: and.b32 %r16, %r10, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v8i8_param_2];
-; CHECK-NEXT: mov.b32 %r17, {%rs21, %rs22};
-; CHECK-NEXT: and.b32 %r18, %r8, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs23, %rs24};
-; CHECK-NEXT: and.b32 %r20, %r6, %r19;
-; CHECK-NEXT: xor.b32 %r21, %r20, %r5;
-; CHECK-NEXT: mov.b32 {%rs25, %rs26}, %r21;
-; CHECK-NEXT: st.v2.u8 [%SP+0], {%rs25, %rs26};
-; CHECK-NEXT: xor.b32 %r22, %r18, %r7;
-; CHECK-NEXT: mov.b32 {%rs27, %rs28}, %r22;
-; CHECK-NEXT: st.v2.u8 [%SP+2], {%rs27, %rs28};
-; CHECK-NEXT: xor.b32 %r23, %r16, %r9;
-; CHECK-NEXT: mov.b32 {%rs29, %rs30}, %r23;
-; CHECK-NEXT: st.v2.u8 [%SP+4], {%rs29, %rs30};
-; CHECK-NEXT: xor.b32 %r24, %r14, %r11;
-; CHECK-NEXT: mov.b32 {%rs31, %rs32}, %r24;
-; CHECK-NEXT: st.v2.u8 [%SP+6], {%rs31, %rs32};
-; CHECK-NEXT: ld.u16 %r25, [%SP+0];
-; CHECK-NEXT: shl.b32 %r26, %r25, 16;
-; CHECK-NEXT: ld.u16 %r27, [%SP+2];
-; CHECK-NEXT: or.b32 %r28, %r27, %r26;
-; CHECK-NEXT: cvt.u64.u32 %rd1, %r28;
-; CHECK-NEXT: ld.u16 %r29, [%SP+4];
-; CHECK-NEXT: shl.b32 %r30, %r29, 16;
-; CHECK-NEXT: ld.u16 %r31, [%SP+6];
-; CHECK-NEXT: or.b32 %r32, %r31, %r30;
-; CHECK-NEXT: cvt.u64.u32 %rd2, %r32;
-; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-NEXT: shr.u32 %r33, %r32, 8;
-; CHECK-NEXT: shr.u64 %rd5, %rd4, 24;
-; CHECK-NEXT: cvt.u16.u64 %rs33, %rd5;
-; CHECK-NEXT: shr.u64 %rd6, %rd1, 16;
-; CHECK-NEXT: cvt.u16.u64 %rs34, %rd6;
-; CHECK-NEXT: shr.u64 %rd7, %rd1, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs35, %rd7;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs27, %rs35, %rs34, %rs33};
-; CHECK-NEXT: cvt.u16.u32 %rs36, %r33;
-; CHECK-NEXT: bfe.s64 %rd8, %rd2, 24, 8;
-; CHECK-NEXT: cvt.u16.u64 %rs37, %rd8;
-; CHECK-NEXT: cvt.u16.u32 %rs38, %r29;
-; CHECK-NEXT: cvt.u16.u32 %rs39, %r31;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs39, %rs36, %rs38, %rs37};
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v8i8_param_0];
+; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [in_v8i8_param_1];
+; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [in_v8i8_param_2];
+; CHECK-NEXT: xor.b32 %r7, %r2, %r4;
+; CHECK-NEXT: and.b32 %r8, %r7, %r6;
+; CHECK-NEXT: xor.b32 %r9, %r8, %r4;
+; CHECK-NEXT: xor.b32 %r11, %r1, %r3;
+; CHECK-NEXT: and.b32 %r12, %r11, %r5;
+; CHECK-NEXT: xor.b32 %r13, %r12, %r3;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r13, %r9};
; CHECK-NEXT: ret;
%n0 = xor <8 x i8> %x, %y
%n1 = and <8 x i8> %n0, %mask
@@ -930,82 +669,25 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind
define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: in_v16i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<65>;
-; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b32 %r<29>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [in_v16i8_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs3, %rs4};
-; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2};
-; CHECK-NEXT: ld.param.v4.u8 {%rs5, %rs6, %rs7, %rs8}, [in_v16i8_param_0+4];
-; CHECK-NEXT: mov.b32 %r3, {%rs7, %rs8};
-; CHECK-NEXT: mov.b32 %r4, {%rs5, %rs6};
-; CHECK-NEXT: ld.param.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [in_v16i8_param_0+8];
-; CHECK-NEXT: mov.b32 %r5, {%rs11, %rs12};
-; CHECK-NEXT: mov.b32 %r6, {%rs9, %rs10};
-; CHECK-NEXT: ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [in_v16i8_param_0+12];
-; CHECK-NEXT: mov.b32 %r7, {%rs15, %rs16};
-; CHECK-NEXT: mov.b32 %r8, {%rs13, %rs14};
-; CHECK-NEXT: ld.param.v4.u8 {%rs17, %rs18, %rs19, %rs20}, [in_v16i8_param_1+12];
-; CHECK-NEXT: mov.b32 %r9, {%rs17, %rs18};
-; CHECK-NEXT: xor.b32 %r10, %r8, %r9;
-; CHECK-NEXT: mov.b32 %r11, {%rs19, %rs20};
-; CHECK-NEXT: xor.b32 %r12, %r7, %r11;
-; CHECK-NEXT: ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [in_v16i8_param_1+8];
-; CHECK-NEXT: mov.b32 %r13, {%rs21, %rs22};
-; CHECK-NEXT: xor.b32 %r14, %r6, %r13;
-; CHECK-NEXT: mov.b32 %r15, {%rs23, %rs24};
-; CHECK-NEXT: xor.b32 %r16, %r5, %r15;
-; CHECK-NEXT: ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [in_v16i8_param_1+4];
-; CHECK-NEXT: mov.b32 %r17, {%rs25, %rs26};
-; CHECK-NEXT: xor.b32 %r18, %r4, %r17;
-; CHECK-NEXT: mov.b32 %r19, {%rs27, %rs28};
-; CHECK-NEXT: xor.b32 %r20, %r3, %r19;
-; CHECK-NEXT: ld.param.v4.u8 {%rs29, %rs30, %rs31, %rs32}, [in_v16i8_param_1];
-; CHECK-NEXT: mov.b32 %r21, {%rs29, %rs30};
-; CHECK-NEXT: xor.b32 %r22, %r2, %r21;
-; CHECK-NEXT: mov.b32 %r23, {%rs31, %rs32};
-; CHECK-NEXT: xor.b32 %r24, %r1, %r23;
-; CHECK-NEXT: ld.param.v4.u8 {%rs33, %rs34, %rs35, %rs36}, [in_v16i8_param_2];
-; CHECK-NEXT: mov.b32 %r25, {%rs35, %rs36};
-; CHECK-NEXT: and.b32 %r26, %r24, %r25;
-; CHECK-NEXT: mov.b32 %r27, {%rs33, %rs34};
-; CHECK-NEXT: and.b32 %r28, %r22, %r27;
-; CHECK-NEXT: ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [in_v16i8_param_2+4];
-; CHECK-NEXT: mov.b32 %r29, {%rs39, %rs40};
-; CHECK-NEXT: and.b32 %r30, %r20, %r29;
-; CHECK-NEXT: mov.b32 %r31, {%rs37, %rs38};
-; CHECK-NEXT: and.b32 %r32, %r18, %r31;
-; CHECK-NEXT: ld.param.v4.u8 {%rs41, %rs42, %rs43, %rs44}, [in_v16i8_param_2+8];
-; CHECK-NEXT: mov.b32 %r33, {%rs43, %rs44};
-; CHECK-NEXT: and.b32 %r34, %r16, %r33;
-; CHECK-NEXT: mov.b32 %r35, {%rs41, %rs42};
-; CHECK-NEXT: and.b32 %r36, %r14, %r35;
-; CHECK-NEXT: ld.param.v4.u8 {%rs45, %rs46, %rs47, %rs48}, [in_v16i8_param_2+12];
-; CHECK-NEXT: mov.b32 %r37, {%rs47, %rs48};
-; CHECK-NEXT: and.b32 %r38, %r12, %r37;
-; CHECK-NEXT: mov.b32 %r39, {%rs45, %rs46};
-; CHECK-NEXT: and.b32 %r40, %r10, %r39;
-; CHECK-NEXT: xor.b32 %r41, %r40, %r9;
-; CHECK-NEXT: xor.b32 %r42, %r38, %r11;
-; CHECK-NEXT: xor.b32 %r43, %r36, %r13;
-; CHECK-NEXT: xor.b32 %r44, %r34, %r15;
-; CHECK-NEXT: xor.b32 %r45, %r32, %r17;
-; CHECK-NEXT: xor.b32 %r46, %r30, %r19;
-; CHECK-NEXT: xor.b32 %r47, %r28, %r21;
-; CHECK-NEXT: xor.b32 %r48, %r26, %r23;
-; CHECK-NEXT: mov.b32 {%rs49, %rs50}, %r48;
-; CHECK-NEXT: mov.b32 {%rs51, %rs52}, %r47;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+0], {%rs51, %rs52, %rs49, %rs50};
-; CHECK-NEXT: mov.b32 {%rs53, %rs54}, %r46;
-; CHECK-NEXT: mov.b32 {%rs55, %rs56}, %r45;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+4], {%rs55, %rs56, %rs53, %rs54};
-; CHECK-NEXT: mov.b32 {%rs57, %rs58}, %r44;
-; CHECK-NEXT: mov.b32 {%rs59, %rs60}, %r43;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+8], {%rs59, %rs60, %rs57, %rs58};
-; CHECK-NEXT: mov.b32 {%rs61, %rs62}, %r42;
-; CHECK-NEXT: mov.b32 {%rs63, %rs64}, %r41;
-; CHECK-NEXT: st.param.v4.b8 [func_retval0+12], {%rs63, %rs64, %rs61, %rs62};
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0];
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1];
+; CHECK-NEXT: xor.b32 %r9, %r4, %r8;
+; CHECK-NEXT: xor.b32 %r10, %r3, %r7;
+; CHECK-NEXT: xor.b32 %r11, %r2, %r6;
+; CHECK-NEXT: xor.b32 %r12, %r1, %r5;
+; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2];
+; CHECK-NEXT: and.b32 %r17, %r12, %r13;
+; CHECK-NEXT: and.b32 %r18, %r11, %r14;
+; CHECK-NEXT: and.b32 %r19, %r10, %r15;
+; CHECK-NEXT: and.b32 %r20, %r9, %r16;
+; CHECK-NEXT: xor.b32 %r21, %r20, %r8;
+; CHECK-NEXT: xor.b32 %r23, %r19, %r7;
+; CHECK-NEXT: xor.b32 %r25, %r18, %r6;
+; CHECK-NEXT: xor.b32 %r27, %r17, %r5;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r27, %r25, %r23, %r21};
; CHECK-NEXT: ret;
%n0 = xor <16 x i8> %x, %y
%n1 = and <16 x i8> %n0, %mask
diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll
index 092607462f3329b..8333a9b935d6af8 100644
--- a/llvm/test/CodeGen/NVPTX/vec8.ll
+++ b/llvm/test/CodeGen/NVPTX/vec8.ll
@@ -5,10 +5,9 @@ target triple = "nvptx-unknown-cuda"
; CHECK: .visible .func foo
define void @foo(<8 x i8> %a, ptr %b) {
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
-; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0]
; CHECK-DAG: ld.param.u64 %[[B:rd[0-9+]]], [foo_param_1]
-; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
+; CHECK: add.s16 [[T:%rs[0-9+]]],
; CHECK: st.u8 [%[[B]]], [[T]];
%t0 = extractelement <8 x i8> %a, i32 1
%t1 = extractelement <8 x i8> %a, i32 6
>From bda4bd36ded20dba4ac89824a42b8a2017c41247 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 2 Oct 2023 18:05:42 -0700
Subject: [PATCH 02/10] More work on fleshing out extractelt/build_vector for
v4i8
Verified that NVPTX tests pass with ptxas being able to compile the PTX produced by
the llc tests.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 18 ++++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 71 +++++++-------
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 64 +++++++++----
llvm/test/CodeGen/NVPTX/extractelement.ll | 55 ++++++++++-
llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 2 +-
...unfold-masked-merge-vector-variablemask.ll | 95 ++++++++-----------
7 files changed, 196 insertions(+), 110 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1daa4971981c25c..c3bcf8f05a278ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
@@ -3569,6 +3570,23 @@ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
+bool NVPTXDAGToDAGISel::SelectExtractEltFromV4I8(SDValue N, SDValue &V,
+ SDValue &BitOffset) {
+ SDValue Vector = N->getOperand(0);
+ if (!(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Vector->getValueType(0) == MVT::v4i8))
+ return false;
+
+ if (const ConstantSDNode *IdxConst =
+ dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ V = Vector;
+ BitOffset = CurDAG->getTargetConstant(IdxConst->getZExtValue() * 8,
+ SDLoc(N), MVT::i32);
+ return true;
+ }
+ return false;
+}
+
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
unsigned int spN) const {
const Value *Src = nullptr;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 06922331f5e2059..34b5dd449ce086f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -95,6 +95,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
SDValue &Offset);
bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
+ bool SelectExtractEltFromV4I8(SDValue N, SDValue &Value, SDValue &Idx);
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7880d70fb2c6fea..66dcdb53b136b96 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -497,6 +497,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
+ // TODO: we should eventually lower it as PRMT instruction.
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
+
// Operations not directly supported by NVPTX.
for (MVT VT :
{MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::f64,
@@ -2156,45 +2160,47 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
-// We can init constant f16x2 with a single .b32 move. Normally it
+// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
// would get lowered as two constant loads and vector-packing move.
-// mov.b16 %h1, 0x4000;
-// mov.b16 %h2, 0x3C00;
-// mov.b32 %hh2, {%h2, %h1};
// Instead we want just a constant move:
// mov.b32 %hh2, 0x40003C00
-//
-// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
-// generates good SASS in both cases.
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op->getValueType(0);
- if (!(Isv2x16VT(VT)))
+ if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
+ return Op;
+
+ if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
+ return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
+ isa<ConstantFPSDNode>(Operand);
+ }))
return Op;
- APInt E0;
- APInt E1;
- if (VT == MVT::v2f16 || VT == MVT::v2bf16) {
- if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) &&
- isa<ConstantFPSDNode>(Op->getOperand(1))))
- return Op;
-
- E0 = cast<ConstantFPSDNode>(Op->getOperand(0))
- ->getValueAPF()
- .bitcastToAPInt();
- E1 = cast<ConstantFPSDNode>(Op->getOperand(1))
- ->getValueAPF()
- .bitcastToAPInt();
- } else {
- assert(VT == MVT::v2i16);
- if (!(isa<ConstantSDNode>(Op->getOperand(0)) &&
- isa<ConstantSDNode>(Op->getOperand(1))))
- return Op;
- E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue();
- E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue();
+ // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
+ auto GetOperand = [](SDValue Op, int N) -> APInt {
+ const SDValue &Operand = Op->getOperand(N);
+ EVT VT = Op->getValueType(0);
+ if (Operand->isUndef())
+ return APInt(32, 0);
+ APInt Value;
+ if (VT == MVT::v2f16 || VT == MVT::v2bf16)
+ Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
+ else if (VT == MVT::v2i16 || VT == MVT::v4i8)
+ Value = cast<ConstantSDNode>(Operand)->getAPIntValue();
+ else
+ llvm_unreachable("Unsupported type");
+ return Value.zext(32);
+ };
+ APInt Value;
+ if (Isv2x16VT(VT)) {
+ Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
+ } else if (VT == MVT::v4i8) {
+ Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
+ GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
+ } else {
+ llvm_unreachable("Unsupported type");
}
- SDValue Const =
- DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+ SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
}
@@ -5262,11 +5268,12 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
SDValue Vector = N->getOperand(0);
EVT VectorVT = Vector.getValueType();
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
- IsPTXVectorType(VectorVT.getSimpleVT()) && VectorVT != MVT::v4i8)
+ IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
// extract_vector_elt, except for v4i8.
// Don't mess with singletons or v2*16 types, we already handle them OK.
- if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT))
+ if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
+ VectorVT == MVT::v4i8)
return SDValue();
uint64_t VectorBits = VectorVT.getSizeInBits();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 047161fb2027dee..307963aaa800b88 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1738,7 +1738,7 @@ def FUNSHFRCLAMP :
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
-multiclass BFX<string Instr, RegisterClass RC> {
+multiclass BFE<string Instr, RegisterClass RC> {
def rrr
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
@@ -1752,17 +1752,29 @@ multiclass BFX<string Instr, RegisterClass RC> {
(ins RC:$a, i32imm:$b, i32imm:$c),
!strconcat(Instr, " \t$d, $a, $b, $c;"), []>;
}
+multiclass BFI<string Instr, RegisterClass RC> {
+ def rrr
+ : NVPTXInst<(outs RC:$f),
+ (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$f),
+ (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$f),
+ (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+}
let hasSideEffects = false in {
- defm BFE_S32 : BFX<"bfe.s32", Int32Regs>;
- defm BFE_U32 : BFX<"bfe.u32", Int32Regs>;
- defm BFE_S64 : BFX<"bfe.s64", Int64Regs>;
- defm BFE_U64 : BFX<"bfe.u64", Int64Regs>;
-
- defm BFI_S32 : BFX<"bfi.s32", Int32Regs>;
- defm BFI_U32 : BFX<"bfi.u32", Int32Regs>;
- defm BFI_S64 : BFX<"bfi.s64", Int64Regs>;
- defm BFI_U64 : BFX<"bfi.u64", Int64Regs>;
+ defm BFE_S32 : BFE<"bfe.s32", Int32Regs>;
+ defm BFE_U32 : BFE<"bfe.u32", Int32Regs>;
+ defm BFE_S64 : BFE<"bfe.s64", Int64Regs>;
+ defm BFE_U64 : BFE<"bfe.u64", Int64Regs>;
+
+ defm BFI_B32 : BFI<"bfi.b32", Int32Regs>;
+ defm BFI_B64 : BFI<"bfi.b64", Int64Regs>;
}
// Common byte extraction patterns
@@ -1782,6 +1794,24 @@ def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
(CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
+def ExtractFromV4I8 : ComplexPattern<i16, 2, "SelectExtractEltFromV4I8", [extractelt]>;
+def: Pat<(i32 (sext_inreg (i32 (anyext (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)))), i8)),
+ (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8)>;
+def: Pat<(i32 (and (i32 (anyext (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)))), 255)),
+ (BFE_U32rii Int32Regs:$src, imm:$bitidx, 8)>;
+def: Pat<(i16 (sext_inreg (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)), i8)),
+ (CVT_s8_s32 (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8), CvtNONE)>;
+def: Pat<(ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)),
+ (CVT_s16_s32 (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8), CvtNONE)>;
+
+
+def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
+ (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
+ (BFI_B32rii
+ (BFI_B32rii (CVT_u32_u16 Int16Regs:$d, CvtNONE), (CVT_u32_u16 Int16Regs:$c, CvtNONE), 8, 8),
+ (BFI_B32rii (CVT_u32_u16 Int16Regs:$b, CvtNONE), (CVT_u32_u16 Int16Regs:$a, CvtNONE), 8, 8),
+ 16, 16)>;
+
//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------
@@ -3293,10 +3323,6 @@ let hasSideEffects = false in {
(ins Int16Regs:$s1, Int16Regs:$s2,
Int16Regs:$s3, Int16Regs:$s4),
"mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
- def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2,
- Int16Regs:$s3, Int16Regs:$s4),
- "mov.b32 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
(ins Int16Regs:$s1, Int16Regs:$s2),
"mov.b32 \t$d, {{$s1, $s2}};", []>;
@@ -3312,10 +3338,6 @@ let hasSideEffects = false in {
Int16Regs:$d3, Int16Regs:$d4),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
- def I32toV4I8 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
- Int16Regs:$d3, Int16Regs:$d4),
- (ins Int32Regs:$s),
- "mov.b32 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
(ins Int32Regs:$s),
"mov.b32 \t{{$d1, $d2}}, $s;", []>;
@@ -3351,6 +3373,9 @@ def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
(I64toI32H Int64Regs:$s)>;
+def: Pat<(i32 (sext (extractelt (v2i16 Int32Regs:$src), 0))),
+ (CVT_INREG_s32_s16 Int32Regs:$src)>;
+
foreach vt = [v2f16, v2bf16, v2i16] in {
def : Pat<(extractelt (vt Int32Regs:$src), 0),
(I32toI16L Int32Regs:$src)>;
@@ -3363,9 +3388,6 @@ def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
(V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
-def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
- (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
- (V4I8toI32 Int16Regs:$a, Int16Regs:$b, Int16Regs:$c, Int16Regs:$d)>;
// Count leading zeros
let hasSideEffects = false in {
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index da07f973501c855..ed7dd45ab7b4502 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -18,7 +18,8 @@ define i16 @test_v2i8(i16 %a) {
; CHECK-LABEL: test_v4i8
; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_param_0];
-; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R]];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
+; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9+]]], [[R0]];
; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9+]]], [[R1]];
; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
@@ -41,6 +42,58 @@ define i16 @test_v4i8(i32 %a) {
ret i16 %r
}
+; CHECK-LABEL: test_v4i8_s32
+; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_s32_param_0];
+; CHECK-DAG: bfe.s32 [[R0:%r[0-9+]]], [[R]], 0, 8;
+; CHECK-DAG: bfe.s32 [[R1:%r[0-9+]]], [[R]], 8, 8;
+; CHECK-DAG: bfe.s32 [[R2:%r[0-9+]]], [[R]], 16, 8;
+; CHECK-DAG: bfe.s32 [[R3:%r[0-9+]]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+define i32 @test_v4i8_s32(i32 %a) {
+ %v = bitcast i32 %a to <4 x i8>
+ %r0 = extractelement <4 x i8> %v, i64 0
+ %r1 = extractelement <4 x i8> %v, i64 1
+ %r2 = extractelement <4 x i8> %v, i64 2
+ %r3 = extractelement <4 x i8> %v, i64 3
+ %r0i = sext i8 %r0 to i32
+ %r1i = sext i8 %r1 to i32
+ %r2i = sext i8 %r2 to i32
+ %r3i = sext i8 %r3 to i32
+ %r01 = add i32 %r0i, %r1i
+ %r23 = add i32 %r2i, %r3i
+ %r = add i32 %r01, %r23
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_v4i8_u32
+; CHECK: ld.param.u32 [[R:%r[0-9+]]], [test_v4i8_u32_param_0];
+; CHECK-DAG: bfe.u32 [[R0:%r[0-9+]]], [[R]], 0, 8;
+; CHECK-DAG: bfe.u32 [[R1:%r[0-9+]]], [[R]], 8, 8;
+; CHECK-DAG: bfe.u32 [[R2:%r[0-9+]]], [[R]], 16, 8;
+; CHECK-DAG: bfe.u32 [[R3:%r[0-9+]]], [[R]], 24, 8;
+; CHECK-DAG: add.s32 [[R01:%r[0-9+]]], [[R0]], [[R1]]
+; CHECK-DAG: add.s32 [[R23:%r[0-9+]]], [[R2]], [[R3]]
+; CHECK-DAG: add.s32 [[R0123:%r[0-9+]]], [[R01]], [[R23]]
+define i32 @test_v4i8_u32(i32 %a) {
+ %v = bitcast i32 %a to <4 x i8>
+ %r0 = extractelement <4 x i8> %v, i64 0
+ %r1 = extractelement <4 x i8> %v, i64 1
+ %r2 = extractelement <4 x i8> %v, i64 2
+ %r3 = extractelement <4 x i8> %v, i64 3
+ %r0i = zext i8 %r0 to i32
+ %r1i = zext i8 %r1 to i32
+ %r2i = zext i8 %r2 to i32
+ %r3i = zext i8 %r3 to i32
+ %r01 = add i32 %r0i, %r1i
+ %r23 = add i32 %r2i, %r3i
+ %r = add i32 %r01, %r23
+ ret i32 %r
+}
+
+
+
; CHECK-LABEL: test_v8i8
; CHECK: ld.param.u64 [[R:%rd[0-9+]]], [test_v8i8_param_0];
; CHECK-DAG: cvt.s8.s64 [[E0:%rs[0-9+]]], [[R]];
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 5a22bbcf7416c17..684e4bc38d83de1 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -5,7 +5,7 @@
; RUN: %if ptxas %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: | %ptxas-verify -arch=sm_90 \
; RUN: %}
; ## No support for i16x2 instructions
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 74087be4834d966..97b1e38a3388413 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -60,20 +60,17 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_2];
; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_1];
; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_param_0];
; CHECK-NEXT: and.b32 %r5, %r4, %r1;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs1, %rs1};
-; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
-; CHECK-NEXT: and.b32 %r9, %r3, %r8;
-; CHECK-NEXT: or.b32 %r10, %r5, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
+; CHECK-NEXT: xor.b32 %r7, %r1, -1;
+; CHECK-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-NEXT: or.b32 %r9, %r5, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
@@ -85,20 +82,17 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: out_v4i8_undef(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_2];
; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1];
; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0];
; CHECK-NEXT: and.b32 %r5, %r4, %r1;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r7, {%rs1, %rs1, %rs2, %rs1};
-; CHECK-NEXT: xor.b32 %r8, %r1, %r7;
-; CHECK-NEXT: and.b32 %r9, %r3, %r8;
-; CHECK-NEXT: or.b32 %r10, %r5, %r9;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r10;
+; CHECK-NEXT: xor.b32 %r7, %r1, -1;
+; CHECK-NEXT: and.b32 %r8, %r3, %r7;
+; CHECK-NEXT: or.b32 %r9, %r5, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
; CHECK-NEXT: ret;
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
@@ -158,8 +152,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: out_v8i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b32 %r<21>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_1];
@@ -167,15 +160,13 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v8i8_param_0];
; CHECK-NEXT: and.b32 %r11, %r9, %r5;
; CHECK-NEXT: and.b32 %r13, %r10, %r6;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r15, {%rs1, %rs1, %rs1, %rs1};
-; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
-; CHECK-NEXT: xor.b32 %r17, %r5, %r15;
-; CHECK-NEXT: and.b32 %r18, %r1, %r17;
-; CHECK-NEXT: and.b32 %r19, %r2, %r16;
-; CHECK-NEXT: or.b32 %r20, %r13, %r19;
-; CHECK-NEXT: or.b32 %r21, %r11, %r18;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
+; CHECK-NEXT: xor.b32 %r15, %r6, -1;
+; CHECK-NEXT: xor.b32 %r16, %r5, -1;
+; CHECK-NEXT: and.b32 %r17, %r1, %r16;
+; CHECK-NEXT: and.b32 %r18, %r2, %r15;
+; CHECK-NEXT: or.b32 %r19, %r13, %r18;
+; CHECK-NEXT: or.b32 %r20, %r11, %r17;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
; CHECK-NEXT: ret;
%mx = and <8 x i8> %x, %mask
%notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -213,8 +204,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
; CHECK-LABEL: out_v4i16_undef(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<22>;
+; CHECK-NEXT: .reg .b32 %r<21>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_1];
@@ -222,15 +212,13 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
; CHECK-NEXT: ld.param.v2.u32 {%r9, %r10}, [out_v4i16_undef_param_0];
; CHECK-NEXT: and.b32 %r11, %r9, %r5;
; CHECK-NEXT: and.b32 %r13, %r10, %r6;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r15, {%rs2, %rs1};
-; CHECK-NEXT: xor.b32 %r16, %r6, %r15;
-; CHECK-NEXT: xor.b32 %r17, %r5, -1;
-; CHECK-NEXT: and.b32 %r18, %r1, %r17;
-; CHECK-NEXT: and.b32 %r19, %r2, %r16;
-; CHECK-NEXT: or.b32 %r20, %r13, %r19;
-; CHECK-NEXT: or.b32 %r21, %r11, %r18;
-; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r21, %r20};
+; CHECK-NEXT: xor.b32 %r15, %r6, -65536;
+; CHECK-NEXT: xor.b32 %r16, %r5, -1;
+; CHECK-NEXT: and.b32 %r17, %r1, %r16;
+; CHECK-NEXT: and.b32 %r18, %r2, %r15;
+; CHECK-NEXT: or.b32 %r19, %r13, %r18;
+; CHECK-NEXT: or.b32 %r20, %r11, %r17;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {%r20, %r19};
; CHECK-NEXT: ret;
%mx = and <4 x i16> %x, %mask
%notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
@@ -294,8 +282,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: out_v16i8(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<42>;
+; CHECK-NEXT: .reg .b32 %r<41>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_1];
@@ -305,21 +292,19 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
; CHECK-NEXT: and.b32 %r23, %r18, %r10;
; CHECK-NEXT: and.b32 %r25, %r19, %r11;
; CHECK-NEXT: and.b32 %r27, %r20, %r12;
-; CHECK-NEXT: mov.u16 %rs1, -1;
-; CHECK-NEXT: mov.b32 %r29, {%rs1, %rs1, %rs1, %rs1};
-; CHECK-NEXT: xor.b32 %r30, %r12, %r29;
-; CHECK-NEXT: xor.b32 %r31, %r11, %r29;
-; CHECK-NEXT: xor.b32 %r32, %r10, %r29;
-; CHECK-NEXT: xor.b32 %r33, %r9, %r29;
-; CHECK-NEXT: and.b32 %r34, %r1, %r33;
-; CHECK-NEXT: and.b32 %r35, %r2, %r32;
-; CHECK-NEXT: and.b32 %r36, %r3, %r31;
-; CHECK-NEXT: and.b32 %r37, %r4, %r30;
-; CHECK-NEXT: or.b32 %r38, %r27, %r37;
-; CHECK-NEXT: or.b32 %r39, %r25, %r36;
-; CHECK-NEXT: or.b32 %r40, %r23, %r35;
-; CHECK-NEXT: or.b32 %r41, %r21, %r34;
-; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r41, %r40, %r39, %r38};
+; CHECK-NEXT: xor.b32 %r29, %r12, -1;
+; CHECK-NEXT: xor.b32 %r30, %r11, -1;
+; CHECK-NEXT: xor.b32 %r31, %r10, -1;
+; CHECK-NEXT: xor.b32 %r32, %r9, -1;
+; CHECK-NEXT: and.b32 %r33, %r1, %r32;
+; CHECK-NEXT: and.b32 %r34, %r2, %r31;
+; CHECK-NEXT: and.b32 %r35, %r3, %r30;
+; CHECK-NEXT: and.b32 %r36, %r4, %r29;
+; CHECK-NEXT: or.b32 %r37, %r27, %r36;
+; CHECK-NEXT: or.b32 %r38, %r25, %r35;
+; CHECK-NEXT: or.b32 %r39, %r23, %r34;
+; CHECK-NEXT: or.b32 %r40, %r21, %r33;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r40, %r39, %r38, %r37};
; CHECK-NEXT: ret;
%mx = and <16 x i8> %x, %mask
%notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
>From e55bb97942124e2659f8132784131c74e4f6fd10 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Tue, 3 Oct 2023 17:41:20 -0700
Subject: [PATCH 03/10] Down the rabbit hole we go.
To make things work consistently for v4i8, we need to implement other vector
ops.
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 31 +
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 2 +
llvm/lib/Target/NVPTX/NVPTX.h | 12 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 11 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 168 ++-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 5 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 168 ++-
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 1237 +++++++++++++++++
8 files changed, 1580 insertions(+), 54 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 5d27accdc198c1e..b7a20c351f5ff6f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -309,3 +309,34 @@ void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
O << Sym.getName();
}
+
+void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int64_t Imm = MO.getImm();
+
+ switch (Imm) {
+ default:
+ return;
+ case NVPTX::PTXPrmtMode::NONE:
+ break;
+ case NVPTX::PTXPrmtMode::F4E:
+ O << ".f4e";
+ break;
+ case NVPTX::PTXPrmtMode::B4E:
+ O << ".b4e";
+ break;
+ case NVPTX::PTXPrmtMode::RC8:
+ O << ".rc8";
+ break;
+ case NVPTX::PTXPrmtMode::ECL:
+ O << ".ecl";
+ break;
+ case NVPTX::PTXPrmtMode::ECR:
+ O << ".ecr";
+ break;
+ case NVPTX::PTXPrmtMode::RC16:
+ O << ".rc16";
+ break;
+ }
+}
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index 49ad3f269229d5f..e6954f861cd10e2 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -47,6 +47,8 @@ class NVPTXInstPrinter : public MCInstPrinter {
raw_ostream &O, const char *Modifier = nullptr);
void printProtoIdent(const MCInst *MI, int OpNum,
raw_ostream &O, const char *Modifier = nullptr);
+ void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier = nullptr);
};
}
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index c5816b9266dfd9e..f7c8da372cec88c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -180,6 +180,18 @@ enum CmpMode {
FTZ_FLAG = 0x100
};
}
+
+namespace PTXPrmtMode {
+enum PrmtMode {
+ NONE,
+ F4E,
+ B4E,
+ RC8,
+ ECL,
+ ECR,
+ RC16,
+};
+}
}
void initializeNVPTXDAGToDAGISelPass(PassRegistry &);
} // namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index c3bcf8f05a278ad..f442188610715ee 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3577,11 +3577,12 @@ bool NVPTXDAGToDAGISel::SelectExtractEltFromV4I8(SDValue N, SDValue &V,
Vector->getValueType(0) == MVT::v4i8))
return false;
- if (const ConstantSDNode *IdxConst =
- dyn_cast<ConstantSDNode>(N->getOperand(1))) {
- V = Vector;
- BitOffset = CurDAG->getTargetConstant(IdxConst->getZExtValue() * 8,
- SDLoc(N), MVT::i32);
+ SDLoc DL(N);
+ V = Vector;
+ SDValue Index = N->getOperand(1);
+ if (const ConstantSDNode *IdxConst = dyn_cast<ConstantSDNode>(Index)) {
+ BitOffset =
+ CurDAG->getTargetConstant(IdxConst->getZExtValue() * 8, DL, MVT::i32);
return true;
}
return false;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 66dcdb53b136b96..b886b6e2ce5ddde 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -497,18 +497,31 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
- // TODO: we should eventually lower it as PRMT instruction.
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Expand);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
+ // Only logical ops can be done on v4i8 directly, others must be done
+ // elementwise.
+ setOperationAction(
+ {ISD::ADD, ISD::MUL, ISD::ABS, ISD::SMIN,
+ ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::CTPOP,
+ ISD::CTLZ, ISD::ADD, ISD::SUB, ISD::MUL,
+ ISD::SHL, ISD::SREM, ISD::UREM, ISD::SDIV,
+ ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
+ ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP,
+ ISD::UINT_TO_FP},
+ MVT::v4i8, Expand);
// Operations not directly supported by NVPTX.
- for (MVT VT :
- {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::f64,
- MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}) {
+ for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
+ MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
+ MVT::i32, MVT::i64}) {
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::BR_CC, VT, Expand);
}
+
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
// For others we will expand to a SHL/SRA pair.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
@@ -682,7 +695,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL,
- ISD::SREM, ISD::UREM, ISD::EXTRACT_VECTOR_ELT});
+ ISD::SREM, ISD::UREM, ISD::EXTRACT_VECTOR_ELT,
+ ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -891,6 +905,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::FUN_SHFR_CLAMP";
case NVPTXISD::IMAD:
return "NVPTXISD::IMAD";
+ case NVPTXISD::BFE:
+ return "NVPTXISD::BFE";
+ case NVPTXISD::BFI:
+ return "NVPTXISD::BFI";
+ case NVPTXISD::PRMT:
+ return "NVPTXISD::PRMT";
case NVPTXISD::SETP_F16X2:
return "NVPTXISD::SETP_F16X2";
case NVPTXISD::Dummy:
@@ -2163,18 +2183,39 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
// would get lowered as two constant loads and vector-packing move.
// Instead we want just a constant move:
-// mov.b32 %hh2, 0x40003C00
+// mov.b32 %r2, 0x40003C00
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op->getValueType(0);
if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
return Op;
+ SDLoc DL(Op);
+
if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
isa<ConstantFPSDNode>(Operand);
- }))
+ })) {
+ // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
+ // to optimize calculation of constant parts.
+ if (VT == MVT::v4i8) {
+ SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
+ SDValue E01 = DAG.getNode(
+ NVPTXISD::BFI, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
+ DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
+ SDValue E012 =
+ DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), E01,
+ DAG.getConstant(16, DL, MVT::i32), C8);
+ SDValue E0123 =
+ DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), E012,
+ DAG.getConstant(24, DL, MVT::i32), C8);
+ return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
+ }
return Op;
+ }
// Get value or the Nth operand as an APInt(32). Undef values treated as 0.
auto GetOperand = [](SDValue Op, int N) -> APInt {
@@ -2207,13 +2248,26 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Index = Op->getOperand(1);
+ SDValue Vector = Op->getOperand(0);
+ SDLoc DL(Op);
+ EVT VectorVT = Vector.getValueType();
+
+ if (VectorVT == MVT::v4i8) {
+ SDValue BFE =
+ DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
+ {Vector,
+ DAG.getNode(ISD::MUL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(Index, DL, MVT::i32),
+ DAG.getConstant(8, DL, MVT::i32)),
+ DAG.getConstant(8, DL, MVT::i32)});
+ return DAG.getZExtOrTrunc(BFE, DL, Op->getValueType(0));
+ }
+
// Constant index will be matched by tablegen.
if (isa<ConstantSDNode>(Index.getNode()))
return Op;
// Extract individual elements and select one of them.
- SDValue Vector = Op->getOperand(0);
- EVT VectorVT = Vector.getValueType();
assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
EVT EltVT = VectorVT.getVectorElementType();
@@ -2226,6 +2280,34 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
ISD::CondCode::SETEQ);
}
+SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Vector = Op->getOperand(0);
+ EVT VectorVT = Vector.getValueType();
+
+ if (VectorVT != MVT::v4i8)
+ return Op;
+ SDLoc DL(Op);
+ SDValue Value = Op->getOperand(1);
+ if (Value->isUndef())
+ return Vector;
+
+ SDValue Index = Op->getOperand(2);
+
+ SDValue BFI =
+ DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+ {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
+ DAG.getNode(ISD::MUL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(Index, DL, MVT::i32),
+ DAG.getConstant(8, DL, MVT::i32)),
+ DAG.getConstant(8, DL, MVT::i32)});
+ return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
+}
+
+SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ return SDValue();
+}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
/// amount, or
@@ -2476,6 +2558,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return Op;
case ISD::EXTRACT_VECTOR_ELT:
return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::VECTOR_SHUFFLE:
+ return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::CONCAT_VECTORS:
return LowerCONCAT_VECTORS(Op, DAG);
case ISD::STORE:
@@ -4987,6 +5073,32 @@ static SDValue PerformANDCombine(SDNode *N,
}
SDValue AExt;
+
+  // Convert BFE -> truncate i16 -> and 255
+  // To just BFE -> truncate i16, as the value already has all the bits in the
+  // right places.
+ if (Val.getOpcode() == ISD::TRUNCATE) {
+ SDValue BFE = Val.getOperand(0);
+ if (BFE.getOpcode() != NVPTXISD::BFE)
+ return SDValue();
+
+ ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
+ if (!BFEBits)
+ return SDValue();
+ uint64_t BFEBitsVal = BFEBits->getZExtValue();
+
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+ if (!MaskCnst) {
+ // Not an AND with a constant
+ return SDValue();
+ }
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+
+ if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
+ return SDValue();
+ // If we get here, the AND is unnecessary. Just replace it with the trunc
+ DCI.CombineTo(N, Val, false);
+ }
// Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
if (Val.getOpcode() == ISD::ANY_EXTEND) {
AExt = Val;
@@ -5266,6 +5378,7 @@ static SDValue PerformSETCCCombine(SDNode *N,
static SDValue PerformEXTRACTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue Vector = N->getOperand(0);
+ SDLoc DL(N);
EVT VectorVT = Vector.getValueType();
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
IsPTXVectorType(VectorVT.getSimpleVT()))
@@ -5286,7 +5399,6 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
if (!Index || Index->getZExtValue() == 0)
return SDValue();
- SDLoc DL(N);
MVT IVT = MVT::getIntegerVT(VectorBits);
EVT EltVT = VectorVT.getVectorElementType();
@@ -5309,6 +5421,38 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
return Result;
}
+static SDValue PerformVSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue VA = N->getOperand(1);
+ EVT VectorVT = VA.getValueType();
+ if (VectorVT != MVT::v4i8)
+ return SDValue();
+
+  // We need to split vselect into individual per-element operations. Because we
+  // use BFE/BFI instruction for byte extraction/insertion, we do end up with
+ // 32-bit values, so we may as well do comparison as i32 to avoid conversions
+ // to/from i16 normally used for i8 values.
+ SmallVector<SDValue, 4> E;
+ SDLoc DL(N);
+ SDValue VCond = N->getOperand(0);
+ SDValue VB = N->getOperand(2);
+ for (int I = 0; I < 4; ++I) {
+ SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
+ DCI.DAG.getConstant(I, DL, MVT::i32));
+ SDValue EA = DCI.DAG.getAnyExtOrTrunc(
+ DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
+ DCI.DAG.getConstant(I, DL, MVT::i32)),
+ DL, MVT::i32);
+ SDValue EB = DCI.DAG.getAnyExtOrTrunc(
+ DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
+ DCI.DAG.getConstant(I, DL, MVT::i32)),
+ DL, MVT::i32);
+ E.push_back(DCI.DAG.getAnyExtOrTrunc(
+ DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
+ }
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5334,6 +5478,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformStoreRetvalCombine(N);
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACTCombine(N, DCI);
+ case ISD::VSELECT:
+ return PerformVSELECTCombine(N, DCI);
}
return SDValue();
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cd1985cc4219bdf..5c7c10965e2f2ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -57,6 +57,9 @@ enum NodeType : unsigned {
MUL_WIDE_UNSIGNED,
IMAD,
SETP_F16X2,
+ BFE,
+ BFI,
+ PRMT,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -590,6 +593,8 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 307963aaa800b88..2a34d050ed8f707 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -107,6 +107,21 @@ def VecElement : Operand<i32> {
let PrintMethod = "printVecElement";
}
+// PRMT modes
+// These must match the enum in NVPTX.h
+def PrmtNONE : PatLeaf<(i32 0x0)>;
+def PrmtF4E : PatLeaf<(i32 0x1)>;
+def PrmtB4E : PatLeaf<(i32 0x2)>;
+def PrmtRC8 : PatLeaf<(i32 0x3)>;
+def PrmtECL : PatLeaf<(i32 0x4)>;
+def PrmtECR : PatLeaf<(i32 0x5)>;
+def PrmtRC16 : PatLeaf<(i32 0x6)>;
+
+def PrmtMode : Operand<i32> {
+ let PrintMethod = "printPrmtMode";
+}
+
+
//===----------------------------------------------------------------------===//
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//
@@ -742,7 +757,7 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
// def v2f16imm : Operand<v2f16>;
// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
-foreach vt = [v2f16, v2bf16, v2i16] in {
+foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
(SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
}
@@ -1738,46 +1753,119 @@ def FUNSHFRCLAMP :
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
-multiclass BFE<string Instr, RegisterClass RC> {
+def SDTBFE :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
+
+def SDTBFI :
+ SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
+
+def SDTPRMT :
+ SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
+def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
+
+multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
def rrr
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
- !strconcat(Instr, " \t$d, $a, $b, $c;"), []>;
+ !strconcat(Instr, " \t$d, $a, $b, $c;"),
+ [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 Int32Regs:$c)))]>;
def rri
: NVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, i32imm:$c),
- !strconcat(Instr, " \t$d, $a, $b, $c;"), []>;
+ !strconcat(Instr, " \t$d, $a, $b, $c;"),
+ [(set (T RC:$d), (bfe (T RC:$a), (i32 Int32Regs:$b), (i32 imm:$c)))]>;
def rii
: NVPTXInst<(outs RC:$d),
(ins RC:$a, i32imm:$b, i32imm:$c),
- !strconcat(Instr, " \t$d, $a, $b, $c;"), []>;
+ !strconcat(Instr, " \t$d, $a, $b, $c;"),
+ [(set (T RC:$d), (bfe (T RC:$a), (i32 imm:$b), (i32 imm:$c)))]>;
}
-multiclass BFI<string Instr, RegisterClass RC> {
- def rrr
+
+multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
+ def rrrr
: NVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
- def rri
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
+ def rrri
: NVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
- def rii
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
+ def rrii
: NVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"), []>;
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ [(set (T RC:$f), (bfi (T RC:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
+ def irrr
+ : NVPTXInst<(outs RC:$f),
+ (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 Int32Regs:$d)))]>;
+ def irri
+ : NVPTXInst<(outs RC:$f),
+ (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 Int32Regs:$c), (i32 imm:$d)))]>;
+ def irii
+ : NVPTXInst<(outs RC:$f),
+ (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
+ !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ [(set (T RC:$f), (bfi (T imm:$a), (T RC:$b), (i32 imm:$c), (i32 imm:$d)))]>;
+}
+
+multiclass PRMT<ValueType T, RegisterClass RC> {
+ def rrr
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c, i32imm:$mode),
+ !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
+ [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
+ def rri
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c, i32imm:$mode),
+ !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
+ [(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
+ def rii
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, i32imm:$b, i32imm:$c, i32imm:$mode),
+ !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
+ [(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
}
let hasSideEffects = false in {
- defm BFE_S32 : BFE<"bfe.s32", Int32Regs>;
- defm BFE_U32 : BFE<"bfe.u32", Int32Regs>;
- defm BFE_S64 : BFE<"bfe.s64", Int64Regs>;
- defm BFE_U64 : BFE<"bfe.u64", Int64Regs>;
+ defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
+ defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
+ defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
+ defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
- defm BFI_B32 : BFI<"bfi.b32", Int32Regs>;
- defm BFI_B64 : BFI<"bfi.b64", Int64Regs>;
+ defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
+ defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
+
+ defm PRMT_B32 : PRMT<i32, Int32Regs>;
}
-// Common byte extraction patterns
+
+// byte extraction + signed/unsigned extension to i32.
+def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), i8)),
+ (BFE_S32rri Int32Regs:$s, Int32Regs:$o, 8)>;
+def : Pat<(i32 (sext_inreg (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), i8)),
+ (BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
+def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 Int32Regs:$o), 8), 255)),
+ (BFE_U32rri Int32Regs:$s, Int32Regs:$o, 8)>;
+def : Pat<(i32 (and (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8), 255)),
+ (BFE_U32rii Int32Regs:$s, imm:$o, 8)>;
+
+// byte extraction + signed extension to i16
+def : Pat<(i16 (sext_inreg (trunc (bfe (i32 Int32Regs:$s), (i32 imm:$o), 8)), i8)),
+ (CVT_s8_s32 (BFE_S32rii Int32Regs:$s, imm:$o, 8), CvtNONE)>;
+
+
+// Byte extraction via shift/trunc/sext
def : Pat<(i16 (sext_inreg (trunc Int32Regs:$s), i8)),
(CVT_s8_s32 Int32Regs:$s, CvtNONE)>;
def : Pat<(i16 (sext_inreg (trunc (srl (i32 Int32Regs:$s), (i32 imm:$o))), i8)),
@@ -1786,7 +1874,6 @@ def : Pat<(sext_inreg (srl (i32 Int32Regs:$s), (i32 imm:$o)), i8),
(BFE_S32rii Int32Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sra (i16 (trunc Int32Regs:$s)), (i32 8))),
(CVT_s8_s32 (BFE_S32rii Int32Regs:$s, 8, 8), CvtNONE)>;
-
def : Pat<(sext_inreg (srl (i64 Int64Regs:$s), (i32 imm:$o)), i8),
(BFE_S64rii Int64Regs:$s, imm:$o, 8)>;
def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
@@ -1794,24 +1881,6 @@ def : Pat<(i16 (sext_inreg (trunc Int64Regs:$s), i8)),
def : Pat<(i16 (sext_inreg (trunc (srl (i64 Int64Regs:$s), (i32 imm:$o))), i8)),
(CVT_s8_s64 (BFE_S64rii Int64Regs:$s, imm:$o, 8), CvtNONE)>;
-def ExtractFromV4I8 : ComplexPattern<i16, 2, "SelectExtractEltFromV4I8", [extractelt]>;
-def: Pat<(i32 (sext_inreg (i32 (anyext (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)))), i8)),
- (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8)>;
-def: Pat<(i32 (and (i32 (anyext (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)))), 255)),
- (BFE_U32rii Int32Regs:$src, imm:$bitidx, 8)>;
-def: Pat<(i16 (sext_inreg (ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)), i8)),
- (CVT_s8_s32 (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8), CvtNONE)>;
-def: Pat<(ExtractFromV4I8 (v4i8 Int32Regs:$src), (i32 imm:$bitidx)),
- (CVT_s16_s32 (BFE_S32rii Int32Regs:$src, imm:$bitidx, 8), CvtNONE)>;
-
-
-def : Pat<(v4i8 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b),
- (i16 Int16Regs:$c), (i16 Int16Regs:$d))),
- (BFI_B32rii
- (BFI_B32rii (CVT_u32_u16 Int16Regs:$d, CvtNONE), (CVT_u32_u16 Int16Regs:$c, CvtNONE), 8, 8),
- (BFI_B32rii (CVT_u32_u16 Int16Regs:$b, CvtNONE), (CVT_u32_u16 Int16Regs:$a, CvtNONE), 8, 8),
- 16, 16)>;
-
//-----------------------------------
// Comparison instructions (setp, set)
//-----------------------------------
@@ -2141,6 +2210,29 @@ def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
(NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+// comparisons of i8 extracted with BFE as i32
+def: Pat<(setgt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
+ (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGT)>;
+def: Pat<(setge (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
+ (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpGE)>;
+def: Pat<(setlt (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
+ (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLT)>;
+def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32Regs:$b), i8)),
+ (SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;
+
+def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpGTU)>;
+def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpGEU)>;
+def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLTU)>;
+def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLEU)>;
+def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
+def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpNE)>;
+
// i1 compare -> i32
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
(SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
new file mode 100644
index 000000000000000..3b13ac02a7b923b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -0,0 +1,1237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; ## Support v4i8 instructions
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_90 \
+; RUN: %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define <4 x i8> @test_ret_const() #0 {
+; CHECK-LABEL: test_ret_const(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ ret <4 x i8> <i8 1, i8 2, i8 3, i8 4>
+}
+
+define i8 @test_extract_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_extract_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %e = extractelement <4 x i8> %a, i32 0
+ ret i8 %e
+}
+
+define i8 @test_extract_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_extract_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %e = extractelement <4 x i8> %a, i32 1
+ ret i8 %e
+}
+
+define i8 @test_extract_2(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_extract_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %e = extractelement <4 x i8> %a, i32 2
+ ret i8 %e
+}
+
+define i8 @test_extract_3(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_extract_3(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_extract_3_param_0];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %e = extractelement <4 x i8> %a, i32 3
+ ret i8 %e
+}
+
+define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
+; CHECK-LABEL: test_extract_i(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_extract_i_param_0];
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT: shl.b32 %r3, %r2, 3;
+; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %e = extractelement <4 x i8> %a, i64 %idx
+ ret i8 %e
+}
+
+define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_add(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
+; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
+; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
+; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10;
+; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %r = add <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_add_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
+; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
+; CHECK-NEXT: ret;
+ %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_add_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
+; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
+; CHECK-NEXT: ret;
+ %r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_sub(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
+; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
+; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
+; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10;
+; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %r = sub <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_smax(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
+; CHECK-NEXT: setp.gt.s32 %p1, %r3, %r4;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
+; CHECK-NEXT: setp.gt.s32 %p2, %r5, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
+; CHECK-NEXT: setp.gt.s32 %p3, %r7, %r8;
+; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
+; CHECK-NEXT: setp.gt.s32 %p4, %r9, %r10;
+; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %cmp = icmp sgt <4 x i8> %a, %b
+ %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_umax(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
+; CHECK-NEXT: setp.gtu.u32 %p1, %r3, %r4;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
+; CHECK-NEXT: setp.gtu.u32 %p2, %r5, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
+; CHECK-NEXT: setp.gtu.u32 %p3, %r7, %r8;
+; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
+; CHECK-NEXT: setp.gtu.u32 %p4, %r9, %r10;
+; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %cmp = icmp ugt <4 x i8> %a, %b
+ %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_smin(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
+; CHECK-NEXT: setp.le.s32 %p1, %r3, %r4;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
+; CHECK-NEXT: setp.le.s32 %p2, %r5, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
+; CHECK-NEXT: setp.le.s32 %p3, %r7, %r8;
+; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
+; CHECK-NEXT: setp.le.s32 %p4, %r9, %r10;
+; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %cmp = icmp sle <4 x i8> %a, %b
+ %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_umin(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
+; CHECK-NEXT: setp.leu.u32 %p1, %r3, %r4;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
+; CHECK-NEXT: setp.leu.u32 %p2, %r5, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
+; CHECK-NEXT: setp.leu.u32 %p3, %r7, %r8;
+; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
+; CHECK-NEXT: setp.leu.u32 %p4, %r9, %r10;
+; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
+; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
+; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
+; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
+; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %cmp = icmp ule <4 x i8> %a, %b
+ %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
+; CHECK-LABEL: test_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<24>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2];
+; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0];
+; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6;
+; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
+; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8;
+; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10;
+; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
+; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8;
+; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
+; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
+; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8;
+; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
+; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
+; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8;
+; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
+; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
+; CHECK-NEXT: ret;
+ %cmp = icmp eq <4 x i8> %a, %b
+ %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
+; CHECK-LABEL: test_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<24>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2];
+; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0];
+; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
+; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
+; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6;
+; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
+; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8;
+; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10;
+; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
+; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8;
+; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
+; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
+; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8;
+; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
+; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
+; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8;
+; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
+; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
+; CHECK-NEXT: ret;
+ %cmp = icmp ne <4 x i8> %a, %b
+ %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_mul(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<19>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0];
+; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
+; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
+; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
+; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
+; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
+; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10;
+; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
+; CHECK-NEXT: ret;
+ %r = mul <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_or(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_or_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [test_or_param_0];
+; CHECK-NEXT: or.b32 %r5, %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT: ret;
+ %r = or <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_or_computed(i8 %a) {
+; CHECK-LABEL: test_or_computed(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
+; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
+; CHECK-NEXT: or.b32 %r8, %r6, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %r = or <4 x i8> %ins.1, %ins.0
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_or_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0];
+; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = or <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_or_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0];
+; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = or <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_xor(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_xor_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [test_xor_param_0];
+; CHECK-NEXT: xor.b32 %r5, %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT: ret;
+ %r = xor <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor_computed(i8 %a) {
+; CHECK-LABEL: test_xor_computed(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
+; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
+; CHECK-NEXT: xor.b32 %r8, %r6, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %r = xor <4 x i8> %ins.1, %ins.0
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_xor_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0];
+; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = xor <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_xor_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0];
+; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = xor <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_and(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r3, [test_and_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [test_and_param_0];
+; CHECK-NEXT: and.b32 %r5, %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
+; CHECK-NEXT: ret;
+ %r = and <4 x i8> %a, %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and_computed(i8 %a) {
+; CHECK-LABEL: test_and_computed(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
+; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
+; CHECK-NEXT: and.b32 %r8, %r6, %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %r = and <4 x i8> %ins.1, %ins.0
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_and_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = and <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_and_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = and <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> %r
+}
+
+define void @test_ldst_v2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v2i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <4 x i8>, ptr %a
+ store <4 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+define void @test_ldst_v3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v3i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.u16 [%rd2], %r1;
+; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: st.u8 [%rd2+2], %r3;
+; CHECK-NEXT: ret;
+ %t1 = load <3 x i8>, ptr %a
+ store <3 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+define void @test_ldst_v4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v4i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <4 x i8>, ptr %a
+ store <4 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+define void @test_ldst_v8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v8i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
+; CHECK-NEXT: ld.u32 %r1, [%rd1];
+; CHECK-NEXT: ld.u32 %r2, [%rd1+4];
+; CHECK-NEXT: st.u32 [%rd2+4], %r2;
+; CHECK-NEXT: st.u32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <8 x i8>, ptr %a
+ store <8 x i8> %t1, ptr %b, align 16
+ ret void
+}
+
+declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0
+
+define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_call(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0];
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .reg .b32 temp_param_reg;
+; CHECK-NEXT: .param .align 4 .b8 param0[4];
+; CHECK-NEXT: st.param.b32 [param0+0], %r1;
+; CHECK-NEXT: .param .align 4 .b8 param1[4];
+; CHECK-NEXT: st.param.b32 [param1+0], %r2;
+; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b32 %r3, [retval0+0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b)
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_call_flipped(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0];
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .reg .b32 temp_param_reg;
+; CHECK-NEXT: .param .align 4 .b8 param0[4];
+; CHECK-NEXT: st.param.b32 [param0+0], %r2;
+; CHECK-NEXT: .param .align 4 .b8 param1[4];
+; CHECK-NEXT: st.param.b32 [param1+0], %r1;
+; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b32 %r3, [retval0+0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0];
+; CHECK-NEXT: { // callseq 2, 0
+; CHECK-NEXT: .reg .b32 temp_param_reg;
+; CHECK-NEXT: .param .align 4 .b8 param0[4];
+; CHECK-NEXT: st.param.b32 [param0+0], %r2;
+; CHECK-NEXT: .param .align 4 .b8 param1[4];
+; CHECK-NEXT: st.param.b32 [param1+0], %r1;
+; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b32 %r3, [retval0+0];
+; CHECK-NEXT: } // callseq 2
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_param_0];
+; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %r = select i1 %c, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 {
+; CHECK-LABEL: test_select_cc(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<29>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3];
+; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2];
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0];
+; CHECK-NEXT: bfe.s32 %r5, %r4, 24, 8;
+; CHECK-NEXT: bfe.s32 %r6, %r3, 24, 8;
+; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5;
+; CHECK-NEXT: bfe.s32 %r7, %r4, 16, 8;
+; CHECK-NEXT: bfe.s32 %r8, %r3, 16, 8;
+; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7;
+; CHECK-NEXT: bfe.s32 %r9, %r4, 8, 8;
+; CHECK-NEXT: bfe.s32 %r10, %r3, 8, 8;
+; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9;
+; CHECK-NEXT: bfe.s32 %r11, %r4, 0, 8;
+; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
+; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11;
+; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8;
+; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4;
+; CHECK-NEXT: bfe.s32 %r16, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r17, %r1, 8, 8;
+; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3;
+; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8;
+; CHECK-NEXT: bfe.s32 %r20, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r21, %r1, 16, 8;
+; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2;
+; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8;
+; CHECK-NEXT: bfe.s32 %r24, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r25, %r1, 24, 8;
+; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1;
+; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27;
+; CHECK-NEXT: ret;
+ %cc = icmp ne <4 x i8> %c, %d
+ %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
+; CHECK-LABEL: test_select_cc_i32_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<23>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
+; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
+; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
+; CHECK-NEXT: bfe.s32 %r11, %r10, 0, 8;
+; CHECK-NEXT: bfe.s32 %r12, %r9, 0, 8;
+; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11;
+; CHECK-NEXT: bfe.s32 %r13, %r10, 8, 8;
+; CHECK-NEXT: bfe.s32 %r14, %r9, 8, 8;
+; CHECK-NEXT: setp.ne.u32 %p2, %r14, %r13;
+; CHECK-NEXT: bfe.s32 %r15, %r10, 16, 8;
+; CHECK-NEXT: bfe.s32 %r16, %r9, 16, 8;
+; CHECK-NEXT: setp.ne.u32 %p3, %r16, %r15;
+; CHECK-NEXT: bfe.s32 %r17, %r10, 24, 8;
+; CHECK-NEXT: bfe.s32 %r18, %r9, 24, 8;
+; CHECK-NEXT: setp.ne.u32 %p4, %r18, %r17;
+; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4;
+; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3;
+; CHECK-NEXT: selp.b32 %r21, %r2, %r6, %p2;
+; CHECK-NEXT: selp.b32 %r22, %r1, %r5, %p1;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r22, %r21, %r20, %r19};
+; CHECK-NEXT: ret;
+ <4 x i8> %c, <4 x i8> %d) #0 {
+ %cc = icmp ne <4 x i8> %c, %d
+ %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %r
+}
+
+define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
+; CHECK-LABEL: test_select_cc_i8_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
+; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0];
+; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10;
+; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9;
+; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8;
+; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7;
+; CHECK-NEXT: bfe.s32 %r11, %r2, 0, 8;
+; CHECK-NEXT: bfe.s32 %r12, %r1, 0, 8;
+; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4;
+; CHECK-NEXT: bfe.s32 %r14, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r15, %r1, 8, 8;
+; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3;
+; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8;
+; CHECK-NEXT: bfe.s32 %r18, %r2, 16, 8;
+; CHECK-NEXT: bfe.s32 %r19, %r1, 16, 8;
+; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2;
+; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8;
+; CHECK-NEXT: bfe.s32 %r22, %r2, 24, 8;
+; CHECK-NEXT: bfe.s32 %r23, %r1, 24, 8;
+; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1;
+; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
+; CHECK-NEXT: ret;
+ <4 x i32> %c, <4 x i32> %d) #0 {
+ %cc = icmp ne <4 x i32> %c, %d
+ %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
+ ret <4 x i8> %r
+}
+
+
+define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 {
+; CHECK-LABEL: test_trunc_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
+; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8;
+; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7;
+; CHECK-NEXT: ret;
+ %r = trunc <4 x i32> %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 {
+; CHECK-LABEL: test_trunc_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
+; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd2;
+; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8;
+; CHECK-NEXT: cvt.u32.u64 %r4, %rd3;
+; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8;
+; CHECK-NEXT: cvt.u32.u64 %r6, %rd4;
+; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7;
+; CHECK-NEXT: ret;
+ %r = trunc <4 x i64> %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_zext_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r5, %r4, %r3, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <4 x i8> %a to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_zext_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0];
+; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NEXT: and.b64 %rd2, %rd1, 255;
+; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
+; CHECK-NEXT: and.b64 %rd4, %rd3, 255;
+; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd5, %r4;
+; CHECK-NEXT: and.b64 %rd6, %rd5, 255;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8;
+; CHECK-NEXT: cvt.u64.u32 %rd7, %r5;
+; CHECK-NEXT: and.b64 %rd8, %rd7, 255;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd6};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
+; CHECK-NEXT: ret;
+ %r = zext <4 x i8> %a to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i8> @test_bitcast_i32_to_2xi8(i32 %a) #0 {
+; CHECK-LABEL: test_bitcast_i32_to_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_2xi8_param_0];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+ %r = bitcast i32 %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+define i32 @test_bitcast_2xi8_to_i32(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi8_to_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_2xi8_to_i32_param_0];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
+ %r = bitcast <4 x i8> %a to i32
+ ret i32 %r
+}
+
+define <2 x half> @test_bitcast_2xi8_to_2xhalf(i8 %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi8_to_2xhalf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_2xi8_to_2xhalf_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0
+ %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
+ %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2
+ %ins.3 = insertelement <4 x i8> %ins.2, i8 7, i32 3
+ %r = bitcast <4 x i8> %ins.3 to <2 x half>
+ ret <2 x half> %r
+}
+
+
+define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
+; CHECK-LABEL: test_shufflevector(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0];
+; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8;
+; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
+; CHECK-NEXT: bfi.b32 %r4, %r3, %r2, 8, 8;
+; CHECK-NEXT: bfe.s32 %r5, %r1, 8, 8;
+; CHECK-NEXT: bfi.b32 %r6, %r5, %r4, 16, 8;
+; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8;
+; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: ret;
+ %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i8> %s
+}
+
+define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
+; CHECK-LABEL: test_insertelement(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_insertelement_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %i = insertelement <4 x i8> %a, i8 %x, i64 1
+ ret <4 x i8> %i
+}
+
+define <4 x i8> @test_fptosi_2xhalf_to_2xi8(<4 x half> %a) #0 {
+; CHECK-LABEL: test_fptosi_2xhalf_to_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_2xhalf_to_2xi8_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs5;
+; CHECK-NEXT: cvt.u32.u16 %r7, %rs6;
+; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8;
+; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7;
+; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9};
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs11;
+; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs12;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
+; CHECK-NEXT: ret;
+ %r = fptosi <4 x half> %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+define <4 x i8> @test_fptoui_2xhalf_to_2xi8(<4 x half> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xhalf_to_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<15>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_2xhalf_to_2xi8_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs5;
+; CHECK-NEXT: cvt.u32.u16 %r7, %rs6;
+; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
+; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8;
+; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7;
+; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9};
+; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs11;
+; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT: cvt.u32.u16 %r12, %rs12;
+; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
+; CHECK-NEXT: ret;
+ %r = fptoui <4 x half> %a to <4 x i8>
+ ret <4 x i8> %r
+}
+
+attributes #0 = { nounwind }
>From 655c6d5bef8f016335643ad75465d22e216168e0 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 5 Oct 2023 14:22:16 -0700
Subject: [PATCH 04/10] Added vector_shuffle lowering to PRMT.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 17 +++++++++++-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6 ++--
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 29 ++++++++++++++------
3 files changed, 39 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b886b6e2ce5ddde..701d9912150d955 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2306,7 +2306,22 @@ SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
- return SDValue();
+ SDValue V1 = Op.getOperand(0);
+ EVT VectorVT = V1.getValueType();
+ if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
+ return Op;
+
+ // Lower shuffle to PRMT instruction.
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ SDValue V2 = Op.getOperand(1);
+ uint32_t Selector = 0;
+ for (auto I: llvm::enumerate(SVN->getMask()))
+ Selector |= (I.value() << (I.index() * 4));
+
+ SDLoc DL(Op);
+ return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
+ DAG.getConstant(Selector, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
}
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 2a34d050ed8f707..9d0bcbf3e8f50dc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1822,17 +1822,17 @@ multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
multiclass PRMT<ValueType T, RegisterClass RC> {
def rrr
: NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, Int32Regs:$c, i32imm:$mode),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
!strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
[(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 Int32Regs:$c), imm:$mode))]>;
def rri
: NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, i32imm:$c, i32imm:$mode),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c, PrmtMode:$mode),
!strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
[(set (T RC:$d), (prmt (T RC:$a), (T RC:$b), (i32 imm:$c), imm:$mode))]>;
def rii
: NVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, i32imm:$c, i32imm:$mode),
+ (ins RC:$a, i32imm:$b, i32imm:$c, PrmtMode:$mode),
!strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
[(set (T RC:$d), (prmt (T RC:$a), (T imm:$b), (i32 imm:$c), imm:$mode))]>;
}
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 3b13ac02a7b923b..97e33c2f7eefc26 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1138,23 +1138,34 @@ define <2 x half> @test_bitcast_2xi8_to_2xhalf(i8 %a) #0 {
define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
; CHECK-LABEL: test_shufflevector(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0];
-; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8;
-; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
-; CHECK-NEXT: bfi.b32 %r4, %r3, %r2, 8, 8;
-; CHECK-NEXT: bfe.s32 %r5, %r1, 8, 8;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r4, 16, 8;
-; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8;
-; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 24, 8;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
+; CHECK-NEXT: // implicit-def: %r3
+; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 291;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
; CHECK-NEXT: ret;
%s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x i8> %s
}
+define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
+; CHECK-LABEL: test_shufflevector_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r2, [test_shufflevector_2_param_1];
+; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_2_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 9527;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 7, i32 3, i32 5, i32 2>
+ ret <4 x i8> %s
+}
+
+
define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
; CHECK-LABEL: test_insertelement(
; CHECK: {
>From f915e5b855ce969a234cf644413132fe1742fac0 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 5 Oct 2023 14:44:01 -0700
Subject: [PATCH 05/10] Address clang-format complaints.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 701d9912150d955..da78eebb42ed0d9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -521,7 +521,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::BR_CC, VT, Expand);
}
-
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
// For others we will expand to a SHL/SRA pair.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
@@ -2206,12 +2205,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
SDValue E012 =
DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), E01,
- DAG.getConstant(16, DL, MVT::i32), C8);
+ DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
+ E01, DAG.getConstant(16, DL, MVT::i32), C8);
SDValue E0123 =
DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), E012,
- DAG.getConstant(24, DL, MVT::i32), C8);
+ DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
+ E012, DAG.getConstant(24, DL, MVT::i32), C8);
return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
}
return Op;
@@ -5414,7 +5413,6 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
if (!Index || Index->getZExtValue() == 0)
return SDValue();
-
MVT IVT = MVT::getIntegerVT(VectorBits);
EVT EltVT = VectorVT.getVectorElementType();
EVT EltIVT = EltVT.changeTypeToInteger();
>From ef3d5dee67581fd9b9644cf1e0ac54514ee4a884 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 5 Oct 2023 15:31:58 -0700
Subject: [PATCH 06/10] Use .lo/ls/hi/hs suffixes for unsigned setp
instructions.
Removed unused code.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 18 ------------------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 -
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 12 ++++++++----
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 16 ++++++++--------
5 files changed, 17 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index f442188610715ee..68391cdb6ff172b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3570,24 +3570,6 @@ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
-bool NVPTXDAGToDAGISel::SelectExtractEltFromV4I8(SDValue N, SDValue &V,
- SDValue &BitOffset) {
- SDValue Vector = N->getOperand(0);
- if (!(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- Vector->getValueType(0) == MVT::v4i8))
- return false;
-
- SDLoc DL(N);
- V = Vector;
- SDValue Index = N->getOperand(1);
- if (const ConstantSDNode *IdxConst = dyn_cast<ConstantSDNode>(Index)) {
- BitOffset =
- CurDAG->getTargetConstant(IdxConst->getZExtValue() * 8, DL, MVT::i32);
- return true;
- }
- return false;
-}
-
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
unsigned int spN) const {
const Value *Src = nullptr;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 34b5dd449ce086f..06922331f5e2059 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -95,7 +95,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
SDValue &Offset);
bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
- bool SelectExtractEltFromV4I8(SDValue N, SDValue &Value, SDValue &Idx);
bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index da78eebb42ed0d9..8d7a29198d61a11 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2259,7 +2259,7 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getZExtOrTrunc(Index, DL, MVT::i32),
DAG.getConstant(8, DL, MVT::i32)),
DAG.getConstant(8, DL, MVT::i32)});
- return DAG.getZExtOrTrunc(BFE, DL, Op->getValueType(0));
+ return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
}
// Constant index will be matched by tablegen.
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9d0bcbf3e8f50dc..3c9d8167e689a56 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -76,6 +76,10 @@ def CmpLT : PatLeaf<(i32 2)>;
def CmpLE : PatLeaf<(i32 3)>;
def CmpGT : PatLeaf<(i32 4)>;
def CmpGE : PatLeaf<(i32 5)>;
+def CmpLO : PatLeaf<(i32 6)>;
+def CmpLS : PatLeaf<(i32 7)>;
+def CmpHI : PatLeaf<(i32 8)>;
+def CmpHS : PatLeaf<(i32 9)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
@@ -2221,13 +2225,13 @@ def: Pat<(setle (sext_inreg (trunc Int32Regs:$a), i8), (sext_inreg (trunc Int32R
(SETP_s32rr Int32Regs:$a, Int32Regs:$b, CmpLE)>;
def: Pat<(setugt (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpGTU)>;
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHI)>;
def: Pat<(setuge (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpGEU)>;
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpHS)>;
def: Pat<(setult (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLTU)>;
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLO)>;
def: Pat<(setule (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
- (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLEU)>;
+ (SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpLS)>;
def: Pat<(seteq (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
(SETP_u32rr Int32Regs:$a, Int32Regs:$b, CmpEQ)>;
def: Pat<(setne (i16 (and (trunc Int32Regs:$a), 255)), (i16 (and (trunc Int32Regs:$b), 255))),
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 97e33c2f7eefc26..641f2f36f95b353 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -293,16 +293,16 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0];
; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: setp.gtu.u32 %p1, %r3, %r4;
+; CHECK-NEXT: setp.hi.u32 %p1, %r3, %r4;
; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: setp.gtu.u32 %p2, %r5, %r6;
+; CHECK-NEXT: setp.hi.u32 %p2, %r5, %r6;
; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: setp.gtu.u32 %p3, %r7, %r8;
+; CHECK-NEXT: setp.hi.u32 %p3, %r7, %r8;
; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: setp.gtu.u32 %p4, %r9, %r10;
+; CHECK-NEXT: setp.hi.u32 %p4, %r9, %r10;
; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
@@ -363,16 +363,16 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0];
; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
-; CHECK-NEXT: setp.leu.u32 %p1, %r3, %r4;
+; CHECK-NEXT: setp.ls.u32 %p1, %r3, %r4;
; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
-; CHECK-NEXT: setp.leu.u32 %p2, %r5, %r6;
+; CHECK-NEXT: setp.ls.u32 %p2, %r5, %r6;
; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
-; CHECK-NEXT: setp.leu.u32 %p3, %r7, %r8;
+; CHECK-NEXT: setp.ls.u32 %p3, %r7, %r8;
; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
-; CHECK-NEXT: setp.leu.u32 %p4, %r9, %r10;
+; CHECK-NEXT: setp.ls.u32 %p4, %r9, %r10;
; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
>From 9821e908e676d8eedbee7c07c90fb5aae4454f82 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 6 Oct 2023 12:22:41 -0700
Subject: [PATCH 07/10] Fixed calculation of constant v4i8 values.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 ++++
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 4 ++--
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8d7a29198d61a11..6a62e228e8efb39 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2229,6 +2229,10 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
Value = cast<ConstantSDNode>(Operand)->getAPIntValue();
else
llvm_unreachable("Unsupported type");
+ // i8 values are carried around as i16, so we need to zero out upper bits,
+ // so they do not get in the way of combining individual byte values
+ if (VT == MVT::v4i8)
+ Value = Value.trunc(8);
return Value.zext(32);
};
APInt Value;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 641f2f36f95b353..c429bf23417f951 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -17,10 +17,10 @@ define <4 x i8> @test_ret_const() #0 {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.u32 %r1, 67305985;
+; CHECK-NEXT: mov.u32 %r1, -66911489;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
; CHECK-NEXT: ret;
- ret <4 x i8> <i8 1, i8 2, i8 3, i8 4>
+ ret <4 x i8> <i8 -1, i8 2, i8 3, i8 -4>
}
define i8 @test_extract_0(<4 x i8> %a) #0 {
>From 3879bdb03da707fd0fc02e2f92d5c8733a52de1f Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 6 Oct 2023 14:17:39 -0700
Subject: [PATCH 08/10] Updated a test.
---
.../CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 97b1e38a3388413..8633b09af04873c 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -89,7 +89,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_1];
; CHECK-NEXT: ld.param.u32 %r4, [out_v4i8_undef_param_0];
; CHECK-NEXT: and.b32 %r5, %r4, %r1;
-; CHECK-NEXT: xor.b32 %r7, %r1, -1;
+; CHECK-NEXT: xor.b32 %r7, %r1, -16711681;
; CHECK-NEXT: and.b32 %r8, %r3, %r7;
; CHECK-NEXT: or.b32 %r9, %r5, %r8;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r9;
>From 899ab5a3ff06431091441ae3d3f5d136db76ab0e Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 6 Oct 2023 16:21:19 -0700
Subject: [PATCH 09/10] Fixed unaligned load/store of v4i8
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 7 +++---
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 24 ++++++++++++++++++++
2 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6a62e228e8efb39..8e3a80717ba0418 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2694,9 +2694,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1)
return LowerLOADi1(Op, DAG);
- // v2f16/v2bf16/v2i16 are legal, so we can't rely on legalizer to handle
+ // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
// unaligned loads and have to handle it here.
- if (Isv2x16VT(Op.getValueType())) {
+ EVT VT = Op.getValueType();
+ if (Isv2x16VT(VT) || VT == MVT::v4i8) {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT MemVT = Load->getMemoryVT();
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -2741,7 +2742,7 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// v2f16 is legal, so we can't rely on legalizer to handle unaligned
// stores and have to handle it here.
- if (Isv2x16VT(VT) &&
+ if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
VT, *Store->getMemOperand()))
return expandUnalignedStore(Store, DAG);
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index c429bf23417f951..fd48313ad684847 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -757,6 +757,30 @@ define void @test_ldst_v4i8(ptr %a, ptr %b) {
ret void
}
+define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v4i8_unaligned(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
+; CHECK-NEXT: ld.u8 %r1, [%rd1];
+; CHECK-NEXT: ld.u8 %r2, [%rd1+1];
+; CHECK-NEXT: ld.u8 %r3, [%rd1+2];
+; CHECK-NEXT: ld.u8 %r4, [%rd1+3];
+; CHECK-NEXT: st.u8 [%rd2+3], %r4;
+; CHECK-NEXT: st.u8 [%rd2+2], %r3;
+; CHECK-NEXT: st.u8 [%rd2+1], %r2;
+; CHECK-NEXT: st.u8 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ %t1 = load <4 x i8>, ptr %a, align 1
+ store <4 x i8> %t1, ptr %b, align 1
+ ret void
+}
+
+
define void @test_ldst_v8i8(ptr %a, ptr %b) {
; CHECK-LABEL: test_ldst_v8i8(
; CHECK: {
>From 7494e8cf361870ae119a15a6099244bc837abcf9 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 9 Oct 2023 12:18:40 -0700
Subject: [PATCH 10/10] clang-format
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8e3a80717ba0418..36da2e7b40efaab 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2318,7 +2318,7 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
SDValue V2 = Op.getOperand(1);
uint32_t Selector = 0;
- for (auto I: llvm::enumerate(SVN->getMask()))
+ for (auto I : llvm::enumerate(SVN->getMask()))
Selector |= (I.value() << (I.index() * 4));
SDLoc DL(Op);
More information about the lldb-commits
mailing list