[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Wed May 29 06:47:48 PDT 2024
================
@@ -6086,6 +6086,63 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ unsigned IntrinsicID = N->getConstantOperandVal(0);
+ SDValue Src0 = N->getOperand(1);
+ SDLoc SL(N);
+ MVT IntVT = MVT::getIntegerVT(ValSize);
+
+ auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
+ MVT VT) -> SDValue {
+ return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
+ : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
+ : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+ };
+
+ SDValue Src1, Src2;
+ if (IntrinsicID == Intrinsic::amdgcn_readlane ||
+ IntrinsicID == Intrinsic::amdgcn_writelane) {
+ Src1 = N->getOperand(2);
+ if (IntrinsicID == Intrinsic::amdgcn_writelane)
+ Src2 = N->getOperand(3);
+ }
+
+ if (ValSize == 32) {
+ // Already legal
+ return SDValue();
+ }
+
+ if (ValSize < 32) {
+ bool IsFloat = VT.isFloatingPoint();
+ Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+ SL, MVT::i32);
+ if (Src2.getNode()) {
+ Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+ SL, MVT::i32);
+ }
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+ SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+ return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+ }
+
+ if ((ValSize % 32) == 0) {
+ MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
----------------
arsenm wrote:
Yes, you need a bitcast to get from FP to the 32-bit scalars. You are only trying to preserve the element types of the 32-bit legal pieces, i.e. v4f16 -> v2f16, v2f16; v4f32 -> f32, f32, f32, f32; v2f64 -> bitcast to v4i32.
https://github.com/llvm/llvm-project/pull/89217
More information about the cfe-commits
mailing list