[clang-tools-extra] [NVPTX] Improve lowering of v4i8 (PR #67866)
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Fri Oct 6 12:12:28 PDT 2023
================
@@ -2150,58 +2179,94 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
}
-// We can init constant f16x2 with a single .b32 move. Normally it
+// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
// would get lowered as two constant loads and vector-packing move.
-// mov.b16 %h1, 0x4000;
-// mov.b16 %h2, 0x3C00;
-// mov.b32 %hh2, {%h2, %h1};
// Instead we want just a constant move:
-// mov.b32 %hh2, 0x40003C00
-//
-// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
-// generates good SASS in both cases.
+// mov.b32 %r2, 0x40003C00
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op->getValueType(0);
- if (!(Isv2x16VT(VT)))
+ if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
return Op;
- APInt E0;
- APInt E1;
- if (VT == MVT::v2f16 || VT == MVT::v2bf16) {
- if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) &&
- isa<ConstantFPSDNode>(Op->getOperand(1))))
- return Op;
-
- E0 = cast<ConstantFPSDNode>(Op->getOperand(0))
- ->getValueAPF()
- .bitcastToAPInt();
- E1 = cast<ConstantFPSDNode>(Op->getOperand(1))
- ->getValueAPF()
- .bitcastToAPInt();
- } else {
- assert(VT == MVT::v2i16);
- if (!(isa<ConstantSDNode>(Op->getOperand(0)) &&
- isa<ConstantSDNode>(Op->getOperand(1))))
- return Op;
- E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue();
- E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue();
+ SDLoc DL(Op);
+
+ if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
+ return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
+ isa<ConstantFPSDNode>(Operand);
+ })) {
+ // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
+ // to optimize calculation of constant parts.
+ if (VT == MVT::v4i8) {
+ SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
+ SDValue E01 = DAG.getNode(
+ NVPTXISD::BFI, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
+ DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
+ SDValue E012 =
+ DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
+ E01, DAG.getConstant(16, DL, MVT::i32), C8);
+ SDValue E0123 =
+ DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+ DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
+ E012, DAG.getConstant(24, DL, MVT::i32), C8);
+ return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
+ }
+ return Op;
}
- SDValue Const =
- DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+
+ // Get the value of the Nth operand as an APInt(32). Undef values are treated as 0.
+ auto GetOperand = [](SDValue Op, int N) -> APInt {
+ const SDValue &Operand = Op->getOperand(N);
+ EVT VT = Op->getValueType(0);
+ if (Operand->isUndef())
+ return APInt(32, 0);
+ APInt Value;
+ if (VT == MVT::v2f16 || VT == MVT::v2bf16)
+ Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
+ else if (VT == MVT::v2i16 || VT == MVT::v4i8)
+ Value = cast<ConstantSDNode>(Operand)->getAPIntValue();
+ else
+ llvm_unreachable("Unsupported type");
+ return Value.zext(32);
----------------
Artem-B wrote:
When we extract the v4i8 element values, we must truncate them to 8 bits. The elements are carried in i16, so for negative values we would otherwise end up with 8 unwanted extra sign bits.
https://github.com/llvm/llvm-project/pull/67866
More information about the cfe-commits
mailing list