[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
Vikram Hegde via llvm-commits
llvm-commits at lists.llvm.org
Tue May 28 02:59:14 PDT 2024
================
@@ -6086,6 +6086,63 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
                      DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
 }
 
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+                           SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  unsigned ValSize = VT.getSizeInBits();
+  unsigned IntrinsicID = N->getConstantOperandVal(0);
+  SDValue Src0 = N->getOperand(1);
+  SDLoc SL(N);
+  MVT IntVT = MVT::getIntegerVT(ValSize);
+
+  auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
+                                  MVT VT) -> SDValue {
+    return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
+            : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
+                   : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+  };
+
+  SDValue Src1, Src2;
+  if (IntrinsicID == Intrinsic::amdgcn_readlane ||
+      IntrinsicID == Intrinsic::amdgcn_writelane) {
+    Src1 = N->getOperand(2);
+    if (IntrinsicID == Intrinsic::amdgcn_writelane)
+      Src2 = N->getOperand(3);
+  }
+
+  if (ValSize == 32) {
+    // Already legal
+    return SDValue();
+  }
+
+  if (ValSize < 32) {
+    bool IsFloat = VT.isFloatingPoint();
+    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+                                SL, MVT::i32);
+    if (Src2.getNode()) {
+      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+                                  SL, MVT::i32);
+    }
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+  }
+
+  if ((ValSize % 32) == 0) {
+    MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+    Src0 = DAG.getBitcast(VecVT, Src0);
+
+    if (Src2.getNode())
+      Src2 = DAG.getBitcast(VecVT, Src2);
+
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+    SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+    return DAG.getBitcast(VT, UnrolledLaneOp);
----------------
vikramRH wrote:
```suggestion
    MVT LaneOpT =
        VT.isVector() && VT.getVectorElementType().getSizeInBits() == 16
            ? MVT::v2i16
            : MVT::i32;
    SDValue Src0SubReg, Src2SubReg;
    SmallVector<SDValue, 4> LaneOps;
    LaneOps.push_back(DAG.getTargetConstant(
        TLI.getRegClassFor(VT.getSimpleVT(), N->isDivergent())->getID(), SL,
        MVT::i32));
    for (unsigned i = 0; i < (ValSize / 32); i++) {
      unsigned SubRegIdx = SIRegisterInfo::getSubRegFromChannel(i);
      Src0SubReg = DAG.getTargetExtractSubreg(SubRegIdx, SL, LaneOpT, Src0);
      if (Src2)
        Src2SubReg = DAG.getTargetExtractSubreg(SubRegIdx, SL, LaneOpT, Src2);
      LaneOps.push_back(createLaneOp(Src0SubReg, Src1, Src2SubReg, LaneOpT));
      LaneOps.push_back(DAG.getTargetConstant(SubRegIdx, SL, MVT::i32));
    }
    return SDValue(
        DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, SL, VT, LaneOps), 0);
```
@arsenm, @jayfoad, here's an alternate idea that is much closer in logic to the GlobalISel implementation and doesn't rely on bitcasts. How does this look?
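For readers following along: the suggestion assembles the wide result from per-channel 32-bit lane ops stitched together with REG_SEQUENCE, rather than bitcasting through a vector of i32 and unrolling. A minimal standalone sketch of that splitting shape in plain C++ (not SelectionDAG code; `readlane32` and `readlane64` are hypothetical stand-ins for the hardware op, used purely for illustration):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for v_readlane_b32. On real hardware this would
// return the 32-bit value held by lane `Lane`; here it just echoes the
// input so the example is runnable.
static uint32_t readlane32(uint32_t LaneVal, unsigned /*Lane*/) {
  return LaneVal;
}

static uint64_t readlane64(uint64_t Val, unsigned Lane) {
  // Split into the two 32-bit channels (sub0/sub1)...
  uint32_t Lo = static_cast<uint32_t>(Val);
  uint32_t Hi = static_cast<uint32_t>(Val >> 32);
  // ...apply the legal 32-bit op per channel...
  uint32_t LoRes = readlane32(Lo, Lane);
  uint32_t HiRes = readlane32(Hi, Lane);
  // ...and reassemble, as REG_SEQUENCE does via subregister indices.
  return (static_cast<uint64_t>(HiRes) << 32) | LoRes;
}

int main() {
  printf("0x%016llx\n",
         static_cast<unsigned long long>(readlane64(0x0123456789abcdefULL, 5)));
  return 0;
}
```

The same decomposition generalizes to any (ValSize % 32) == 0 type: one 32-bit lane op per channel, with the results glued back together under the register class chosen for VT.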
https://github.com/llvm/llvm-project/pull/89217