[clang-tools-extra] [NVPTX] Improve lowering of v4i8 (PR #67866)

Fri Oct 6 12:12:28 PDT 2023

================
@@ -2150,58 +2179,94 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
 }
 
-// We can init constant f16x2 with a single .b32 move.  Normally it
+// We can init constant f16x2/v2i16/v4i8 with a single .b32 move.  Normally it
 // would get lowered as two constant loads and vector-packing move.
-//        mov.b16         %h1, 0x4000;
-//        mov.b16         %h2, 0x3C00;
-//        mov.b32         %hh2, {%h2, %h1};
 // Instead we want just a constant move:
-//        mov.b32         %hh2, 0x40003C00
-//
-// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
-// generates good SASS in both cases.
+//        mov.b32         %r2, 0x40003C00
 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
   EVT VT = Op->getValueType(0);
-  if (!(Isv2x16VT(VT)))
+  if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
     return Op;
-  APInt E0;
-  APInt E1;
-  if (VT == MVT::v2f16 || VT == MVT::v2bf16) {
-    if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) &&
-          isa<ConstantFPSDNode>(Op->getOperand(1))))
-      return Op;
-
-    E0 = cast<ConstantFPSDNode>(Op->getOperand(0))
-             ->getValueAPF()
-             .bitcastToAPInt();
-    E1 = cast<ConstantFPSDNode>(Op->getOperand(1))
-             ->getValueAPF()
-             .bitcastToAPInt();
-  } else {
-    assert(VT == MVT::v2i16);
-    if (!(isa<ConstantSDNode>(Op->getOperand(0)) &&
-          isa<ConstantSDNode>(Op->getOperand(1))))
-      return Op;
 
-    E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue();
-    E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue();
+  SDLoc DL(Op);
+
+  if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
+        return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
+               isa<ConstantFPSDNode>(Operand);
+      })) {
+    // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
+    // to optimize calculation of constant parts.
+    if (VT == MVT::v4i8) {
+      SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
+      SDValue E01 = DAG.getNode(
+          NVPTXISD::BFI, DL, MVT::i32,
+          DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
+          DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
+      SDValue E012 =
+          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+                      DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
+                      E01, DAG.getConstant(16, DL, MVT::i32), C8);
+      SDValue E0123 =
+          DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
+                      DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
+                      E012, DAG.getConstant(24, DL, MVT::i32), C8);
+      return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
+    }
+    return Op;
   }
-  SDValue Const =
-      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+
+  // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
+  auto GetOperand = [](SDValue Op, int N) -> APInt {
+    const SDValue &Operand = Op->getOperand(N);
+    EVT VT = Op->getValueType(0);
+    if (Operand->isUndef())
+      return APInt(32, 0);
+    APInt Value;
+    if (VT == MVT::v2f16 || VT == MVT::v2bf16)
+      Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
+    else if (VT == MVT::v2i16 || VT == MVT::v4i8)
+      Value = cast<ConstantSDNode>(Operand)->getAPIntValue();
+    else
+      llvm_unreachable("Unsupported type");
+    return Value.zext(32);
----------------
Artem-B wrote:

When we extract v4i8 values we must truncate them to 8 bits. They are carried in i16, so for negative values we end up with the unwanted extra 8 sign bits.

https://github.com/llvm/llvm-project/pull/67866