[llvm] [NVPTX] Prefer prmt.b32 over bfi.b32 (PR #110766)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 2 11:52:03 PDT 2024
================
@@ -2328,20 +2328,23 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
// to optimize calculation of constant parts.
if (VT == MVT::v4i8) {
- SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
- SDValue E01 = DAG.getNode(
- NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
- DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
- SDValue E012 =
- DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
- E01, DAG.getConstant(16, DL, MVT::i32), C8);
- SDValue E0123 =
- DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
- DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
- E012, DAG.getConstant(24, DL, MVT::i32), C8);
- return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
+ SDValue PRMT__10 = DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::v4i8,
+ {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32),
+ DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
+ DAG.getConstant(0x3340, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ SDValue PRMT_210 = DAG.getNode(
+ NVPTXISD::PRMT, DL, MVT::v4i8,
+ {PRMT__10, DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
+ DAG.getConstant(0x3410, DL, MVT::i32),
+ DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+ SDValue PRMT3210 = DAG.getNode(
----------------
Artem-B wrote:
Converting from `v = prmt(d, prmt(c, prmt(a,b)))` to `v = prmt(prmt(c,d), prmt(a,b))` may squeeze out a bit more performance here if the GPU can execute the two leaf permutes in parallel, since they are independent of each other.
https://github.com/llvm/llvm-project/pull/110766
More information about the llvm-commits
mailing list