[llvm-branch-commits] [llvm] ec1747c - Able to produce a good initial SelectionDAG for ret; resolved extract_subvector legalization; able to build test.ll
Jeffrey Byrnes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Oct 17 10:42:30 PDT 2022
Author: Jeffrey Byrnes
Date: 2022-10-13T10:48:41-07:00
New Revision: ec1747cb71d0db73b268d17367b83652cd4e2ad3
URL: https://github.com/llvm/llvm-project/commit/ec1747cb71d0db73b268d17367b83652cd4e2ad3
DIFF: https://github.com/llvm/llvm-project/commit/ec1747cb71d0db73b268d17367b83652cd4e2ad3.diff
LOG: Able to produce a good initial SelectionDAG for ret; resolved extract_subvector legalization; able to build test.ll
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index e62f57c536b37..629e7b84cf71d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -971,12 +971,16 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
TargetLowering::TypeLegal &&
"Unexpected illegal type!");
- for (const SDValue &Op : Node->op_values())
+ for (const SDValue &Op : Node->op_values()) {
+ if (TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
+ TargetLowering::TypeLegal) errs() << "TargetLowering::TypeLegal\n";
+ if (Op.getOpcode() == ISD::Register) errs() << "Register\n";
assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
TargetLowering::TypeLegal ||
Op.getOpcode() == ISD::TargetConstant ||
Op.getOpcode() == ISD::Register) &&
"Unexpected illegal type!");
+ }
#endif
// Figure out the correct action; the way to query this varies by opcode
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index b6c66077675ff..523788106db63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[
// 32 is reserved for the stack pointer
// 33 is reserved for the frame pointer
// 34 is reserved for the base pointer
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
]>>>,
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
]>>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, v4i8, i1], CCAssignToStack<4, 4>>
]>;
def RetCC_SI_Gfx : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[
def CC_SI_SHADER : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -76,7 +76,7 @@ def CC_SI_SHADER : CallingConv<[
]>>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, v4i8] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -99,7 +99,7 @@ def CC_SI_SHADER : CallingConv<[
def RetCC_SI_Shader : CallingConv<[
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i32, i16] , CCAssignToReg<[
+ CCIfType<[i32, i16, v4i8] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -183,19 +183,19 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, v4i8, i1], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, v4i8, i1], CCAssignToStack<4, 4>>
]>;
// Calling convention for leaf functions
def RetCC_AMDGPU_Func : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, v4i8], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
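
The calling-convention entries above treat <4 x i8> as a single 32-bit value, so it can be assigned to one SGPR/VGPR or a 4-byte stack slot just like i32 or v2i16. A minimal stand-alone C++ sketch of the assumed lane layout (lane i occupying bits [8*i, 8*i+8) of the register); the helper name and constants are illustrative, not taken from the patch:

#include <cassert>
#include <cstdint>

// Pack four 8-bit lanes into the single 32-bit register the calling
// convention now assigns for a v4i8 argument or return value.
static uint32_t packV4i8(uint8_t L0, uint8_t L1, uint8_t L2, uint8_t L3) {
  return static_cast<uint32_t>(L0)         |
         (static_cast<uint32_t>(L1) << 8)  |
         (static_cast<uint32_t>(L2) << 16) |
         (static_cast<uint32_t>(L3) << 24);
}

int main() {
  static_assert(sizeof(uint8_t[4]) == sizeof(uint32_t), "v4i8 is 32 bits wide");
  assert(packV4i8(0x11, 0x22, 0x33, 0x44) == 0x44332211u);
  return 0;
}
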
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9c2247f336ee1..9980e851f9820 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -322,7 +322,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16,
MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64,
- MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
+ MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64, MVT::v2i8,
+ MVT::v4i8},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -1246,6 +1247,15 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SmallVector<SDValue, 8> Args;
EVT VT = Op.getValueType();
+ if (VT == MVT::v4i8) {
+ SDLoc SL(Op);
+ SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+ SDValue BV = DAG.getBuildVector(MVT::v2i16, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
+
+
if (VT == MVT::v4i16 || VT == MVT::v4f16) {
SDLoc SL(Op);
SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
@@ -1270,6 +1280,36 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
EVT SrcVT = Op.getOperand(0).getValueType();
// For these types, we have some TableGen patterns except if the index is 1
+ if ((SrcVT == MVT::v4i8 && VT == MVT::v2i8) &&
+ Start != 1) {
+ SDLoc SL(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ unsigned VecSize = SrcVT.getSizeInBits();
+ EVT EltVT = SrcVT.getVectorElementType();
+ unsigned EltSize = EltVT.getSizeInBits();
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ MVT RestIntVT = MVT::getIntegerVT(VT.getSizeInBits());
+
+ // Convert vector index to bit-index (* EltSize)
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
+
+ // Convert source vector to corresponding scalar
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+
+ // Shift to get the appropriate bits for subvector
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
+
+ // Trunc to bitsize of result vector of extract_subvector
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, RestIntVT, Elt);
+
+ SDValue Recast = DAG.getNode(ISD::BITCAST, SL, VT, Result);
+ return Recast;
+ }
+
+
if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
(SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
Start != 1)
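
For reference, the new v4i8/v2i8 EXTRACT_SUBVECTOR path above reduces the vector operation to scalar bit math: BITCAST the v4i8 to i32, SRL by Start*8, TRUNCATE to i16, then BITCAST back to v2i8. A minimal stand-alone C++ sketch of the same semantics (the function name and test values are illustrative only, not part of the commit):

#include <cassert>
#include <cstdint>

// Extract a 2-element subvector starting at lane `Start` (0 or 2; Start == 1
// is handled by other patterns) from a <4 x i8> held in a 32-bit register.
static uint16_t extractSub2xi8(uint32_t PackedV4i8, unsigned Start) {
  assert(Start == 0 || Start == 2);
  unsigned BitIndex = Start * 8;                        // ScaledIdx = Idx << Log2(EltSize)
  return static_cast<uint16_t>(PackedV4i8 >> BitIndex); // SRL, then TRUNCATE to i16
}

int main() {
  // Lanes {0x11, 0x22, 0x33, 0x44} packed with lane i in bits [8*i, 8*i+8).
  uint32_t Packed = 0x44332211u;
  assert(extractSub2xi8(Packed, 0) == 0x2211);  // lanes {0x11, 0x22}
  assert(extractSub2xi8(Packed, 2) == 0x4433);  // lanes {0x33, 0x44}
  return 0;
}
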
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6e0478ed2f166..f962e49418c50 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -83,6 +83,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
+ addRegisterClass(MVT::v4i8, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::v2i8, &AMDGPU::SReg_32RegClass);
+
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
const SIRegisterInfo *TRI = STI.getRegisterInfo();
@@ -173,20 +176,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
- setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
+ //setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
+ //setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
- setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
- setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
+ //setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ //setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
- setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
+ //setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
@@ -242,7 +245,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
- MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
+ MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, MVT::v4i8}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -607,7 +610,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasVOP3PInsts())
setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
-
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8, Custom);
+
setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
@@ -645,7 +650,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE,
{MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
- MVT::v16f16, MVT::v16i16},
+ MVT::v16f16, MVT::v16i16, MVT::v4i8},
Custom);
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
@@ -826,8 +831,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
return VT.isInteger() ? MVT::i32 : MVT::f32;
}
+ if (Size == 8)
+ return Subtarget->has16BitInsts() ? MVT::v4i8 : MVT::i32;
- if (Size < 16)
+ if (Size < 16 && Size != 8)
return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
}
@@ -850,6 +857,9 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
unsigned Size = ScalarVT.getSizeInBits();
// FIXME: Should probably promote 8-bit vectors to i16.
+ if (Size == 8 && Subtarget->has16BitInsts())
+ return (NumElts + 1) / 4;
+
if (Size == 16 && Subtarget->has16BitInsts())
return (NumElts + 1) / 2;
@@ -872,6 +882,12 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 8 && Subtarget->has16BitInsts()) {
+ RegisterVT = MVT::v4i8;
+ NumIntermediates = (NumElts + 1) / 4;
+ IntermediateVT = RegisterVT;
+ return NumIntermediates;
+ }
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will be still be
// inconsistent.
@@ -5857,8 +5873,16 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
EVT ResultVT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
- EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
- EVT EltVT = PackVT.getVectorElementType();
+ EVT PackVT;
+ EVT EltVT;
+ auto ScalarSize = ResultVT.getVectorElementType().getSizeInBits() ;
+ if (ScalarSize == 8) {
+ PackVT = MVT::v2i8;
+ }
+ else {
+ PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ }
+ EltVT = PackVT.getVectorElementType();
int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
// vector_shuffle <0,1,6,7> lhs, rhs
@@ -5969,32 +5993,56 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
- assert(VT == MVT::v2f16 || VT == MVT::v2i16);
- assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
+ if (VT != MVT::v2i8) {
+ assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+ assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
+ }
+
+
+ EVT SrcVT = Op.getOperand(1).getValueType(); // i8, i16
+ EVT BCVT = (SrcVT) == MVT::f16 ? MVT::i16 : SrcVT;
+
+ unsigned VecSize = VT.getSizeInBits(); // 16, 32
+ EVT EltVT = SrcVT.getVectorElementType();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+ // Convert vector index to bit-index (* EltSize)
+ SDValue ScaledShift = DAG.getNode(ISD::SHL, SL, MVT::i32, DAG.getConstant(1, SL, MVT::i32), ScaleFactor);
+
+ MVT IntVT = MVT::getIntegerVT(VecSize); // i16, i32
+ MVT FloatVT = MVT::getFloatingPointVT(VecSize); // f32
+ MVT RestIntVT = MVT::getIntegerVT(VT.getSizeInBits());
+
+
+
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
// Avoid adding defined bits with the zero_extend.
if (Hi.isUndef()) {
- Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
- SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
+ Lo = DAG.getNode(ISD::BITCAST, SL, BCVT, Lo);
+ SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, IntVT, Lo);
return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
}
- Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
- Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+ Hi = DAG.getNode(ISD::BITCAST, SL, BCVT, Hi);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Hi);
- SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
- DAG.getConstant(16, SL, MVT::i32));
+ SDValue ShlHi = DAG.getNode(ISD::SHL, SL, IntVT, Hi, ScaledShift);
if (Lo.isUndef())
return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
- Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
- Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ Lo = DAG.getNode(ISD::BITCAST, SL, BCVT, Lo);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Lo);
- SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
- return DAG.getNode(ISD::BITCAST, SL, VT, Or);
+ SDValue Or = DAG.getNode(ISD::OR, SL, IntVT, Lo, ShlHi);
+ auto temp = DAG.getNode(ISD::BITCAST, SL, VT, Or);
+ errs() << "Build Final node : \n";
+ temp->dump();
+ errs() << "\n";
+ return temp;
}
bool
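
The generalized lowerBUILD_VECTOR path above reuses the existing v2i16 packing (zero-extend each element, shift the high lane by one element width, OR), with the shift amount derived from the element size, so a v2i8 is packed into a 16-bit value the same way a v2i16 is packed into 32 bits. A small C++ sketch of the resulting bit pattern (helper name and values are made up for illustration):

#include <cassert>
#include <cstdint>

// Mirror of the v2i8 BUILD_VECTOR lowering: ZERO_EXTEND both lanes to the
// vector-sized integer, SHL the high lane by 1 * EltSize bits, then OR.
static uint16_t buildV2i8(uint8_t Lo, uint8_t Hi) {
  uint16_t ExtLo = Lo;                                   // ZERO_EXTEND i8 -> i16
  uint16_t ExtHi = Hi;                                   // ZERO_EXTEND i8 -> i16
  uint16_t ShlHi = static_cast<uint16_t>(ExtHi << 8);    // SHL by ScaledShift (8)
  return static_cast<uint16_t>(ExtLo | ShlHi);           // OR; BITCAST to v2i8
}

int main() {
  assert(buildV2i8(0xAB, 0xCD) == 0xCDAB);  // lane 0 in bits [0,8), lane 1 in [8,16)
  return 0;
}
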
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8100d82d21f30..d88272fc485c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1313,6 +1313,9 @@ def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
+def : BitConvert <v2i8, i16, SReg_32>;
+def : BitConvert <i16, v2i8, SReg_32>;
+
// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
@@ -1329,6 +1332,9 @@ def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;
+def : BitConvert <v4i8, i32, SReg_32>;
+def : BitConvert <v4i8, v2i16, SReg_32>;
+
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index fae76be2b1ddb..c07333b17ff37 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -369,7 +369,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
}
// SGPR 32-bit registers
-def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
@@ -406,7 +406,7 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
// Trap handler TMP 32-bit registers
-def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
+def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v4i8, v2i8], 32,
(add (sequence "TTMP%u", 0, 15))> {
let isAllocatable = 0;
let HasSGPR = 1;
@@ -528,7 +528,7 @@ class RegisterTypes<list<ValueType> reg_types> {
}
def Reg16Types : RegisterTypes<[i16, f16]>;
-def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v4i8, v2i8, p2, p3, p5, p6]>;
let HasVGPR = 1 in {
def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -600,7 +600,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
}
// AccVGPR 32-bit registers
-def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 0;
let Size = 32;
@@ -639,7 +639,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
-def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -662,7 +662,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
let GeneratePressureSet = 0, HasSGPR = 1 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE,
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
@@ -680,7 +680,7 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let AllocationPriority = 0;
}
-def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 0;
}
@@ -691,7 +691,7 @@ def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let AllocationPriority = 0;
}
-def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 0;
}
@@ -710,20 +710,20 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
} // End GeneratePressureSet = 0
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8, i1], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 0;
let HasSGPR = 1;
}
let GeneratePressureSet = 0 in {
-def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasSGPR = 1;
}
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4i8, v2i8], 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 1;
@@ -807,7 +807,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Re
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}
-def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
@@ -887,14 +887,14 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
let HasVGPR = 1;
}
-def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
}
-def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, v4i8, v2i8], 32,
(add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;