[clang] [llvm] [AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (PR #89217)

Tue May 28 02:59:14 PDT 2024

================
@@ -6086,6 +6086,63 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
       DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
 }
 
+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+                           SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  unsigned ValSize = VT.getSizeInBits();
+  unsigned IntrinsicID = N->getConstantOperandVal(0);
+  SDValue Src0 = N->getOperand(1);
+  SDLoc SL(N);
+  MVT IntVT = MVT::getIntegerVT(ValSize);
+
+  auto createLaneOp = [&DAG, &SL](SDValue Src0, SDValue Src1, SDValue Src2,
+                                  MVT VT) -> SDValue {
+    return (Src2 ? DAG.getNode(AMDGPUISD::WRITELANE, SL, VT, {Src0, Src1, Src2})
+            : Src1 ? DAG.getNode(AMDGPUISD::READLANE, SL, VT, {Src0, Src1})
+                   : DAG.getNode(AMDGPUISD::READFIRSTLANE, SL, VT, {Src0}));
+  };
+
+  SDValue Src1, Src2;
+  if (IntrinsicID == Intrinsic::amdgcn_readlane ||
+      IntrinsicID == Intrinsic::amdgcn_writelane) {
+    Src1 = N->getOperand(2);
+    if (IntrinsicID == Intrinsic::amdgcn_writelane)
+      Src2 = N->getOperand(3);
+  }
+
+  if (ValSize == 32) {
+    // Already legal
+    return SDValue();
+  }
+
+  if (ValSize < 32) {
+    bool IsFloat = VT.isFloatingPoint();
+    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+                                SL, MVT::i32);
+    if (Src2.getNode()) {
+      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+                                  SL, MVT::i32);
+    }
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+  }
+
+  if ((ValSize % 32) == 0) {
+    MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+    Src0 = DAG.getBitcast(VecVT, Src0);
+
+    if (Src2.getNode())
+      Src2 = DAG.getBitcast(VecVT, Src2);
+
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+    SDValue UnrolledLaneOp = DAG.UnrollVectorOp(LaneOp.getNode());
+    return DAG.getBitcast(VT, UnrolledLaneOp);
----------------
vikramRH wrote:

```suggestion
  MVT LaneOpT =
        VT.isVector() && VT.getVectorElementType().getSizeInBits() == 16
            ? MVT::v2i16
            : MVT::i32;
    SDValue Src0SubReg, Src2SubReg;
    SmallVector<SDValue, 4> LaneOps;
    LaneOps.push_back(DAG.getTargetConstant(
        TLI.getRegClassFor(VT.getSimpleVT(), N->isDivergent())->getID(), SL,
        MVT::i32));
    for (unsigned i = 0; i < (ValSize / 32); i++) {
      unsigned SubRegIdx = SIRegisterInfo::getSubRegFromChannel(i);
      Src0SubReg = DAG.getTargetExtractSubreg(SubRegIdx, SL, LaneOpT, Src0);
      if (Src2)
        Src2SubReg = DAG.getTargetExtractSubreg(SubRegIdx, SL, LaneOpT, Src2);
      LaneOps.push_back(createLaneOp(Src0SubReg, Src1, Src2SubReg, LaneOpT));
      LaneOps.push_back(DAG.getTargetConstant(SubRegIdx, SL, MVT::i32));
    }
    return SDValue(
        DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, SL, VT, LaneOps), 0);
```

@arsenm, @jayfoad, here is an alternate idea that is much closer in logic to the GlobalISel implementation and doesn't rely on bitcasts. How does this look?

https://github.com/llvm/llvm-project/pull/89217