[Mlir-commits] [mlir] [MLIR][NVVM] Migrate CpAsyncOp to intrinsics (PR #123789)

Wed Jan 22 06:22:51 PST 2025

================
@@ -849,55 +849,30 @@ def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind",
 
 def LoadCacheModifierAttr : EnumAttr<NVVM_Dialect, LoadCacheModifierKind, "load_cache_modifier">;
 
-def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
+def NVVM_CpAsyncOp : NVVM_Op<"cp.async.shared.global">,
   Arguments<(ins LLVM_PointerShared:$dst,
                  LLVM_PointerGlobal:$src,
                  I32Attr:$size,
                  LoadCacheModifierAttr:$modifier,
                  Optional<LLVM_Type>:$cpSize)> {
-  string llvmBuilder = [{
-      llvm::Intrinsic::ID id;
-      switch ($size) {
-        case 4:
-          id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_4;
-          break;
-        case 8:
-          id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_8;
-          break;
-        case 16:
-          if($modifier == NVVM::LoadCacheModifierKind::CG)
-            id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;
-          else if($modifier == NVVM::LoadCacheModifierKind::CA)
-            id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
-          else 
-            llvm_unreachable("unsupported cache modifier");
-          break;
-        default:
-          llvm_unreachable("unsupported async copy size");
-      }
-      createIntrinsicCall(builder, id, {$dst, $src});
-  }];
   let assemblyFormat = "$dst `,` $src `,` $size `,` `cache` `=` $modifier (`,` $cpSize^)? attr-dict `:` type(operands)";
   let hasVerifier = 1;
   let extraClassDeclaration = [{
-    bool hasIntrinsic() { if(getCpSize()) return false; return true; }
-
-    void getAsmValues(RewriterBase &rewriter, 
-        llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues) {
-      asmValues.push_back({getDst(), PTXRegisterMod::Read});
-      asmValues.push_back({getSrc(), PTXRegisterMod::Read});
-      asmValues.push_back({makeConstantI32(rewriter, getSize()), PTXRegisterMod::Read});
-      asmValues.push_back({getCpSize(), PTXRegisterMod::Read});
-    }        
+    static llvm::Intrinsic::ID getIntrinsicID(int size,
+                                              NVVM::LoadCacheModifierKind kind,
+                                              bool hasCpSize);
   }];
-  let extraClassDefinition = [{        
-    std::string $cppClass::getPtx() { 
-      if(getModifier() == NVVM::LoadCacheModifierKind::CG)
-        return std::string("cp.async.cg.shared.global [%0], [%1], %2, %3;\n");
-      if(getModifier() == NVVM::LoadCacheModifierKind::CA)
-        return std::string("cp.async.ca.shared.global [%0], [%1], %2, %3;\n");
-      llvm_unreachable("unsupported cache modifier");      
-    }
+  string llvmBuilder = [{
+    bool hasCpSize = op.getCpSize() ? true : false;
+
+    llvm::SmallVector<llvm::Value *> translatedOperands;
+    translatedOperands.push_back($dst);
+    translatedOperands.push_back($src);
+    if (hasCpSize)
+      translatedOperands.push_back($cpSize);
----------------
durga4github wrote:

I tried this and it seems we have to pass three more arguments to getIntrinsicID() to achieve this.

A short diff of the changes:

auto id = NVVM::CpAsyncOp::getIntrinsicIDAndArgs($size, $modifier, hasCpSize, translatedOperands, *op, moduleTranslation);

Inside getIntrinsicIDAndArgs():
auto op = cast<NVVM::CpAsyncOp>(opInput);
args.push_back(moduleTranslation.lookupValue(op.getDst()));
args.push_back(moduleTranslation.lookupValue(op.getSrc()));
if (hasCpSize)
   args.push_back(moduleTranslation.lookupValue(op.getCpSize()));

It feels like, we are manually doing what the translator generates in the NVVMConversions.inc file.
But, I agree that this reduces the llvmBuilder code in the td file by a few lines.

With this, do you think, packing the args inside the getIntrinsicID(), is a good idea?
Kindly let me know what you think.

https://github.com/llvm/llvm-project/pull/123789