[llvm] [NVPTX] Misc table-gen cleanup (NFC) (PR #142877)

Wed Jun 4 17:02:38 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

<details>
<summary>Changes</summary>



---

Patch is 196.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142877.diff


3 Files Affected:

- (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+82-114) 
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+563-1941) 
- (modified) llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp (+420-420) 


``````````diff

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b646d39194c7e..9ca4e8d20650a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -160,7 +160,6 @@ def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
 def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
 
 def True : Predicate<"true">;
-def False : Predicate<"false">;
 
 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
@@ -257,6 +256,11 @@ def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
 //                         "prmt.b32${mode}">;
 //         ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
 //
+//   * BasicFlagsNVPTXInst<(outs Int64Regs:$state),
+//                         (ins ADDR:$addr),
+//                         "mbarrier.arrive.b64">;
+//         ---> "mbarrier.arrive.b64 \t$state, [$addr];"
+//
 class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
                           list<dag> pattern = []>
   : NVPTXInst<
@@ -274,7 +278,11 @@ class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmst
             !if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
             !interleave(
               !foreach(i, !range(!size(ins_dag)),
-                "$" # !getdagname(ins_dag, i)),
+                 !if(!eq(!cast<string>(!getdagarg<DAGOperand>(ins_dag, i)), "ADDR"),
+                    "[$" # !getdagname(ins_dag, i) # "]",
+                    "$" # !getdagname(ins_dag, i)
+                 )
+                ),
               ", "))),
         ";"),
       pattern>;
@@ -956,31 +964,17 @@ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
 
 // Matchers for signed, unsigned mul.wide ISD nodes.
-def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
-          (MULWIDES32 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
-          (MULWIDES32Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
-          (MULWIDEU32 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
-          (MULWIDEU32Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
+let Predicates = [doMulWide] in {
+  def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>;
+  def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>;
+  def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>;
+  def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), (MULWIDEU32Imm $a, imm:$b)>;
 
-def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
-          (MULWIDES64 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
-          (MULWIDES64Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
-          (MULWIDEU64 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
-          (MULWIDEU64Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
+  def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), (MULWIDES64 $a, $b)>;
+  def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), (MULWIDES64Imm $a, imm:$b)>;
+  def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), (MULWIDEU64 $a, $b)>;
+  def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>;
+}
 
 // Predicates used for converting some patterns to mul.wide.
 def SInt32Const : PatLeaf<(imm), [{
@@ -1106,18 +1100,12 @@ defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>;
 defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
 }
 
-def INEG16 :
-  BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
-            "neg.s16",
-            [(set i16:$dst, (ineg i16:$src))]>;
-def INEG32 :
-  BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
-            "neg.s32",
-            [(set i32:$dst, (ineg i32:$src))]>;
-def INEG64 :
-  BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
-            "neg.s64",
-            [(set i64:$dst, (ineg i64:$src))]>;
+foreach t = [I16RT, I32RT, I64RT] in {
+  def NEG_S # t.Size :
+    BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
+              "neg.s" # t.Size,
+              [(set t.Ty:$dst, (ineg t.Ty:$src))]>;
+}
 
 //-----------------------------------
 // Floating Point Arithmetic
@@ -1538,7 +1526,7 @@ def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
 
 def SDTPRMT :
   SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
-                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
+                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
 def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
 
 multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
@@ -1961,7 +1949,7 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // f16 -> pred
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
             (SETP_f16rr $a, $b, ModeFTZ)>,
-        Requires<[useFP16Math,doF32FTZ]>;
+        Requires<[useFP16Math, doF32FTZ]>;
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
             (SETP_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
@@ -1969,7 +1957,7 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // bf16 -> pred
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
             (SETP_bf16rr $a, $b, ModeFTZ)>,
-        Requires<[hasBF16Math,doF32FTZ]>;
+        Requires<[hasBF16Math, doF32FTZ]>;
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
             (SETP_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
@@ -2497,24 +2485,20 @@ def : Pat<(f16 (uint_to_fp i32:$a)), (CVT_f16_u32 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i64:$a)), (CVT_f16_u64 $a, CvtRN)>;
 
 // sint -> bf16
-def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>, 
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>;
+}
 
 // uint -> bf16
-def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>;
+}
 
 // sint -> f32
 def : Pat<(f32 (sint_to_fp  i1:$a)), (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
@@ -2565,27 +2549,25 @@ def : Pat<(i16 (fp_to_uint bf16:$a)), (CVT_u16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint bf16:$a)), (CVT_u32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint bf16:$a)), (CVT_u64_bf16 $a, CvtRZI)>;
 // f32 -> sint
-def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
+let Predicates = [doF32FTZ] in {
+  def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>;
+}
+def : Pat<(i1  (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI)>;
 
 // f32 -> uint
+let Predicates = [doF32FTZ] in {
+  def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>;
+}
 def : Pat<(i1  (fp_to_uint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>, 
-      Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI)>;
 
 // f64 -> sint
@@ -2707,28 +2689,24 @@ let hasSideEffects = false in {
 
   // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
   // unused high/low part.
-  def I32toI16H_Sink  : NVPTXInst<(outs Int16Regs:$high),
-                             (ins Int32Regs:$s),
-                             "mov.b32 \t{{_, $high}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I32toI16L_Sink  : NVPTXInst<(outs Int16Regs:$low),
-                             (ins Int32Regs:$s),
-                             "mov.b32 \t{{$low, _}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I64toI32H_Sink  : NVPTXInst<(outs Int32Regs:$high),
-                             (ins Int64Regs:$s),
-                             "mov.b64 \t{{_, $high}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I64toI32L_Sink  : NVPTXInst<(outs Int32Regs:$low),
-                             (ins Int64Regs:$s),
-                             "mov.b64 \t{{$low, _}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
+  let Predicates = [hasPTX<71>] in {
+    def I32toI16H_Sink  : NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s),
+                              "mov.b32 \t{{_, $high}}, $s;", []>;
+    def I32toI16L_Sink  : NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s),
+                              "mov.b32 \t{{$low, _}}, $s;", []>;
+    def I64toI32H_Sink  : NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s),
+                              "mov.b64 \t{{_, $high}}, $s;", []>;
+    def I64toI32L_Sink  : NVPTXInst<(outs Int32Regs:$low), (ins Int64Regs:$s),
+                              "mov.b64 \t{{$low, _}}, $s;", []>;
+  }
 }
 
-def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
+let Predicates = [hasPTX<71>] in {
+  def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>;
+  def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>;
+  def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>;
+  def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>;
+}
 
 // Fall back to the old way if we don't have PTX 7.1.
 def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>;
@@ -3061,29 +3039,19 @@ def stacksave :
   SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf,
          [SDNPHasChain, SDNPSideEffect]>;
 
-def STACKRESTORE_32 :
-  BasicNVPTXInst<(outs), (ins Int32Regs:$ptr),
-            "stackrestore.u32",
-            [(stackrestore i32:$ptr)]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKSAVE_32 :
-  BasicNVPTXInst<(outs Int32Regs:$dst), (ins),
-            "stacksave.u32",
-            [(set i32:$dst, (i32 stacksave))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKRESTORE_64 :
-  BasicNVPTXInst<(outs), (ins Int64Regs:$ptr),
-            "stackrestore.u64",
-            [(stackrestore i64:$ptr)]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKSAVE_64 :
-  BasicNVPTXInst<(outs Int64Regs:$dst), (ins),
-            "stacksave.u64",
-            [(set i64:$dst, (i64 stacksave))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
+let Predicates = [hasPTX<73>, hasSM<52>] in {
+  foreach t = [I32RT, I64RT] in {
+    def STACKRESTORE_ # t.Size :
+      BasicNVPTXInst<(outs), (ins t.RC:$ptr),
+                "stackrestore.u" # t.Size,
+              [(stackrestore t.Ty:$ptr)]>;
+
+    def STACKSAVE_ # t.Size :
+      BasicNVPTXInst<(outs t.RC:$dst), (ins),
+                "stacksave.u" # t.Size,
+              [(set t.Ty:$dst, (t.Ty stacksave))]>;
+  }
+}
 
 include "NVPTXIntrinsics.td"
 
@@ -3124,7 +3092,7 @@ def : Pat <
 ////////////////////////////////////////////////////////////////////////////////
 
 class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
-    NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
+    BasicNVPTXInst<(outs), (ins), "fence."#sem#"."#scope>,
     Requires<[ptx, hasSM<70>]>;
 
 foreach scope = ["sys", "gpu", "cluster", "cta"] in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d14c03791febb..c008443a0066e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -52,7 +52,7 @@ class PTX {
 def ptx : PTX;
 
 // Generates list of n sequential register names.
-// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ]
+// E.g. RegNames<3, "r">.ret -> ["r0", "r1", "r2" ]
 class RegSeq<int n, string prefix> {
   list<string> ret = !if(n, !listconcat(RegSeq<!sub(n, 1), prefix>.ret,
                                         [prefix # !sub(n, 1)]),
@@ -137,7 +137,7 @@ defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive
 
 class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
                           list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
-        NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
+        BasicNVPTXInst<(outs), (ins), "barrier.cluster."# variant, [(Intr)]>,
         Requires<Preds>;
 
 def barrier_cluster_arrive:
@@ -400,13 +400,9 @@ def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS :
 //-----------------------------------
 
 multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
-  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
-            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
-            [(Intrin i32:$addr)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
-            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
-            [(Intrin i64:$addr)]>,
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+            "cp.async.mbarrier.arrive" # NoInc # AddrSpace # ".b64",
+            [(Intrin addr:$addr)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
 
@@ -420,30 +416,19 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
   CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
 
 multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin, Intrinsic IntrinS> {
-  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
-            !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
-            [(Intrin i32:$dst, i32:$src)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
-            !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
-            [(Intrin i64:$dst, i64:$src)]>,
+  def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src),
+            "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ";",
+            [(Intrin addr:$dst, addr:$src)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
+
   // Variant with src_size parameter
-  def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i32:$dst, i32:$src, i32:$src_size)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i32:$dst, i32:$src, imm:$src_size)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i64:$dst, i64:$src, i32:$src_size)]>,
+  def _s : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$src_size),
+             "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;",
+             [(IntrinS addr:$dst, addr:$src, i32:$src_size)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i64:$dst, i64:$src, imm:$src_size)]>,
+  def _si: NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, i32imm:$src_size),
+             "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;",
+             [(IntrinS addr:$dst, addr:$src, imm:$src_size)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
 
@@ -513,14 +498,14 @@ class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
 }
 
 multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
-  def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+  def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
       !if(has_ch,
           CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;",
           CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"),
       [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
       Requires<[hasPTX<80>, hasSM<90>]>;
 
-  def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
+  def _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
       !if(has_ch,
           CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;",
           CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"),
@@ -533,7 +518,7 @@ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
 multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
   defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
 
-  def NAME : NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
       (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
            Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
       !if(has_ch,
@@ -542,7 +527,7 @@ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
       [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
       Requires<[hasPTX<80>, hasSM<90>]>;
 
-  def NAME # _MC : NVPTXInst<(outs),
+  def _MC : NVPTXInst<(outs),
       (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
            Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
       !if(has_ch,
@@ -561,7 +546,7 @@ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
   Requires<[hasPTX<80>, hasSM<90>]>;
 
 multiclas...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/142877