[llvm] [NVPTX] Optimize v2x16 BUILD_VECTORs to PRMT (PR #116675)

Mon Nov 18 10:39:21 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-nvptx

Author: Fraser Cormack (frasercrmck)

<details>
<summary>Changes</summary>

When two 16-bit values are combined into a v2x16 vector, and those values are truncated come from 32-bit values, a PRMT instruction can save registers by selecting bytes directly from the original 32-bit values. We do this during a post-legalize DAG combine, as these opportunities are typically only exposed after the BUILD_VECTOR's operands have been legalized.

Additionally, if the 32-bit values are right-shifted, we can fold in the shift by selecting higher bytes with PRMT. Only logical right-shifts by 16 are supported (for now) since those are the only situations seen in practice. Right shifts by 16 often come up during the legalization of EXTRACT_VECTOR_ELT.

This idea was brought up in a PR comment by @Artem-B.

---

Patch is 23.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116675.diff


6 Files Affected:

- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+54-1) 
- (modified) llvm/test/CodeGen/NVPTX/bf16-instructions.ll (+15-25) 
- (modified) llvm/test/CodeGen/NVPTX/fma-relu-contract.ll (+6-10) 
- (modified) llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll (+6-10) 
- (modified) llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll (+12-20) 
- (modified) llvm/test/CodeGen/NVPTX/i16x2-instructions.ll (+2-5) 


``````````diff

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4ad0200ca5cf83..da38aff1efefd2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -762,7 +762,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT});
+                       ISD::VSELECT, ISD::BUILD_VECTOR});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6176,6 +6176,57 @@ static SDValue PerformLOADCombine(SDNode *N,
       DL);
 }
 
+static SDValue
+PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  auto VT = N->getValueType(0);
+  if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT))
+    return SDValue();
+
+  auto Op0 = N->getOperand(0);
+  auto Op1 = N->getOperand(1);
+
+  // Start out by assuming we want to take the lower 2 bytes of each i32
+  // operand.
+  uint64_t Op0Bytes = 0x10;
+  uint64_t Op1Bytes = 0x54;
+
+  std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
+                                                {&Op1, &Op1Bytes}};
+
+  // Check that each operand is an i16, truncated from an i32 operand. We'll
+  // select individual bytes from those original operands. Optionally, fold in a
+  // shift right of that original operand.
+  for (auto &[Op, OpBytes] : OpData) {
+    // Eat up any bitcast
+    if (Op->getOpcode() == ISD::BITCAST)
+      *Op = Op->getOperand(0);
+
+    if (Op->getValueType() != MVT::i16 || Op->getOpcode() != ISD::TRUNCATE ||
+        Op->getOperand(0).getValueType() != MVT::i32)
+      return SDValue();
+
+    *Op = Op->getOperand(0);
+
+    // Optionally, fold in a shift-right of the original operand and permute
+    // the two higher bytes from the shifted operand
+    if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
+      if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
+        *OpBytes += 0x22;
+        *Op = Op->getOperand(0);
+      }
+    }
+  }
+
+  SDLoc DL(N);
+  auto &DAG = DCI.DAG;
+
+  auto PRMT = DAG.getNode(
+      NVPTXISD::PRMT, DL, MVT::v4i8,
+      {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
+       DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
+  return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
+}
+
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6210,6 +6261,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformEXTRACTCombine(N, DCI);
     case ISD::VSELECT:
       return PerformVSELECTCombine(N, DCI);
+    case ISD::BUILD_VECTOR:
+      return PerformBUILD_VECTORCombine(N, DCI);
   }
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 80815b3ca37c05..bd45d85d393000 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -159,8 +159,8 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-LABEL: test_faddx2(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<13>;
-; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<25>;
 ; SM70-NEXT:    .reg .f32 %f<7>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
@@ -182,7 +182,6 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
 ; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
 ; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    shl.b32 %r14, %r13, 16;
 ; SM70-NEXT:    mov.b32 %f4, %r14;
@@ -197,8 +196,7 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
 ; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
 ; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
-; SM70-NEXT:    mov.b32 %r23, {%rs11, %rs7};
+; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
 ; SM70-NEXT:    ret;
 ;
@@ -266,8 +264,8 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-LABEL: test_fsubx2(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<13>;
-; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<25>;
 ; SM70-NEXT:    .reg .f32 %f<7>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
@@ -289,7 +287,6 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
 ; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
 ; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    shl.b32 %r14, %r13, 16;
 ; SM70-NEXT:    mov.b32 %f4, %r14;
@@ -304,8 +301,7 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
 ; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
 ; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
-; SM70-NEXT:    mov.b32 %r23, {%rs11, %rs7};
+; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
 ; SM70-NEXT:    ret;
 ;
@@ -373,8 +369,8 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-LABEL: test_fmulx2(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<13>;
-; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<25>;
 ; SM70-NEXT:    .reg .f32 %f<7>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
@@ -396,7 +392,6 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
 ; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
 ; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    shl.b32 %r14, %r13, 16;
 ; SM70-NEXT:    mov.b32 %f4, %r14;
@@ -411,8 +406,7 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
 ; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
 ; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
-; SM70-NEXT:    mov.b32 %r23, {%rs11, %rs7};
+; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
 ; SM70-NEXT:    ret;
 ;
@@ -480,8 +474,8 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-LABEL: test_fdiv(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<13>;
-; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<25>;
 ; SM70-NEXT:    .reg .f32 %f<7>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
@@ -503,7 +497,6 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
 ; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
 ; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    shl.b32 %r14, %r13, 16;
 ; SM70-NEXT:    mov.b32 %f4, %r14;
@@ -518,8 +511,7 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
 ; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
 ; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
-; SM70-NEXT:    mov.b32 %r23, {%rs11, %rs7};
+; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
 ; SM70-NEXT:    ret;
 ;
@@ -1724,8 +1716,8 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; SM70-LABEL: test_maxnum_v2(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<13>;
-; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<25>;
 ; SM70-NEXT:    .reg .f32 %f<7>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
@@ -1747,7 +1739,6 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; SM70-NEXT:    setp.nan.f32 %p1, %f3, %f3;
 ; SM70-NEXT:    or.b32 %r11, %r7, 4194304;
 ; SM70-NEXT:    selp.b32 %r12, %r11, %r10, %p1;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    shl.b32 %r14, %r13, 16;
 ; SM70-NEXT:    mov.b32 %f4, %r14;
@@ -1762,8 +1753,7 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; SM70-NEXT:    setp.nan.f32 %p2, %f6, %f6;
 ; SM70-NEXT:    or.b32 %r21, %r17, 4194304;
 ; SM70-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
-; SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
-; SM70-NEXT:    mov.b32 %r23, {%rs11, %rs7};
+; SM70-NEXT:    prmt.b32 %r23, %r22, %r12, 0x7632U;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r23;
 ; SM70-NEXT:    ret;
 ;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 8cc4548f6e85e0..f22d2e7a6d3c9d 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -1050,8 +1050,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<9>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<25>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<61>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<21>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<62>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<19>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -1134,7 +1134,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %f15, %f15;
 ; CHECK-SM70-NEXT:    or.b32 %r49, %r45, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r50, %r49, %r48, %p7;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r50; }
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r51, %rs17;
 ; CHECK-SM70-NEXT:    shl.b32 %r52, %r51, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f16, %r52;
@@ -1148,8 +1147,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %f18, %f18;
 ; CHECK-SM70-NEXT:    or.b32 %r58, %r54, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r59, %r58, %r57, %p8;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs23}, %r59; }
-; CHECK-SM70-NEXT:    mov.b32 %r60, {%rs23, %rs20};
+; CHECK-SM70-NEXT:    prmt.b32 %r60, %r59, %r50, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r60;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x bfloat> %a, %b
@@ -1189,8 +1187,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<17>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<44>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<13>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -1244,7 +1242,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
 ; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
 ; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
 ; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
 ; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
@@ -1255,8 +1252,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
 ; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
-; CHECK-SM70-NEXT:    mov.b32 %r42, {%rs15, %rs13};
+; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul <2 x bfloat> %a, %b
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
index 16219aa9da0950..0f24160af57af1 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -715,8 +715,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<7>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<17>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<57>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<58>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<17>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -790,7 +790,6 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f14, %f14;
 ; CHECK-SM70-NEXT:    or.b32 %r47, %r43, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r48, %r47, %r46, %p5;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r48; }
 ; CHECK-SM70-NEXT:    and.b32 %r49, %r34, -65536;
 ; CHECK-SM70-NEXT:    mov.b32 %f15, %r49;
 ; CHECK-SM70-NEXT:    add.f32 %f16, %f15, %f9;
@@ -801,8 +800,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p6, %f16, %f16;
 ; CHECK-SM70-NEXT:    or.b32 %r54, %r50, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r55, %r54, %r53, %p6;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r55; }
-; CHECK-SM70-NEXT:    mov.b32 %r56, {%rs15, %rs13};
+; CHECK-SM70-NEXT:    prmt.b32 %r56, %r55, %r48, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r56;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
@@ -841,8 +839,8 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70-LABEL: fma_bf16x2_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<17>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<44>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<13>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -896,7 +894,6 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
 ; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
 ; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
 ; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
 ; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
@@ -907,8 +904,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
 ; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
-; CHECK-SM70-NEXT:    mov.b32 %r42, {%rs15, %rs13};
+; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
 ; CHECK-SM70-NEXT:    ret;
   %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
index af21bada7783be..63c2b96be838fa 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
@@ -785,8 +785,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<9>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<25>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<61>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<21>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<62>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<19>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -869,7 +869,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p7, %f15, %f15;
 ; CHECK-SM70-NEXT:    or.b32 %r49, %r45, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r50, %r49, %r48, %p7;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r50; }
 ; CHECK-SM70-NEXT:    cvt.u32.u16 %r51, %rs17;
 ; CHECK-SM70-NEXT:    shl.b32 %r52, %r51, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f16, %r52;
@@ -883,8 +882,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p8, %f18, %f18;
 ; CHECK-SM70-NEXT:    or.b32 %r58, %r54, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r59, %r58, %r57, %p8;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs23}, %r59; }
-; CHECK-SM70-NEXT:    mov.b32 %r60, {%rs23, %rs20};
+; CHECK-SM70-NEXT:    prmt.b32 %r60, %r59, %r50, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r60;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast <2 x bfloat> %a, %b
@@ -924,8 +922,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-LABEL: fma_bf16x2_expanded_maxnum_no_nans(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<5>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<17>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<43>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<44>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<13>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -979,7 +977,6 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p3, %f10, %f10;
 ; CHECK-SM70-NEXT:    or.b32 %r33, %r29, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs13}, %r34; }
 ; CHECK-SM70-NEXT:    and.b32 %r35, %r15, -65536;
 ; CHECK-SM70-NEXT:    mov.b32 %f11, %r35;
 ; CHECK-SM70-NEXT:    max.f32 %f12, %f11, 0f00000000;
@@ -990,8 +987,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p4, %f12, %f12;
 ; CHECK-SM70-NEXT:    or.b32 %r40, %r36, 4194304;
 ; CHECK-SM70-NEXT:    selp.b32 %r41, %r40, %r39, %p4;
-; CHECK-SM70-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs15}, %r41; }
-; CHECK-SM70-NEXT:    mov.b32 %r42, {%rs15, %rs13};
+; CHECK-SM70-NEXT:    prmt.b32 %r42, %r41, %r34, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r42;
 ; CHECK-SM70-NEXT:    ret;
   %1 = fmul fast <2 x bfloat> %a, %b
@@ -1710,8 +1706,8 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK-SM70:       {
 ; CHECK-SM70-NEXT:    .reg .pred %p<7>;
-; CHECK-SM70-NEXT:    .reg .b16 %rs<17>;
-; CHECK-SM70-NEXT:    .reg .b32 %r<57>;
+; CHECK-SM70-NEXT:    .reg .b16 %rs<13>;
+; CHECK-SM70-NEXT:    .reg .b32 %r<58>;
 ; CHECK-SM70-NEXT:    .reg .f32 %f<17>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
@@ -1785,7 +1781,6 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
 ; CHECK-SM70-NEXT:    setp.nan.f32 %p5, %f14, %f14;
 ; ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/116675