[Mlir-commits] [mlir] cfa82f7 - [mlir][sparse][gpu] introduce flag that controls host to device copy strategies (regular dma default)
Kun Wu
llvmlistbot at llvm.org
Tue Aug 1 15:31:06 PDT 2023
Author: K-Wu
Date: 2023-08-01T22:30:40Z
New Revision: cfa82f778348ceaefc04e0cf2817a2e30e687b57
URL: https://github.com/llvm/llvm-project/commit/cfa82f778348ceaefc04e0cf2817a2e30e687b57
DIFF: https://github.com/llvm/llvm-project/commit/cfa82f778348ceaefc04e0cf2817a2e30e687b57.diff
LOG: [mlir][sparse][gpu] introduce flag that controls host to device copy strategies (regular dma default)
Differential Revision: https://reviews.llvm.org/D155352
Added:
Modified:
mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index febb0113cf9932..5deab8321cbcbd 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -52,6 +52,21 @@ struct SparseCompilerOptions
mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",
"Enable sparse parallelization for any storage and loop."))};
+ PassOptions::Option<mlir::GPUDataTransferStrategy> gpuDataTransfer{
+ *this, "gpu-data-transfer-strategy",
+ ::llvm::cl::desc(
+ "Set the data transfer strategy between the host and the GPUs"),
+ ::llvm::cl::init(mlir::GPUDataTransferStrategy::kRegularDMA),
+ llvm::cl::values(
+ clEnumValN(mlir::GPUDataTransferStrategy::kRegularDMA, "regular-dma",
+ "Default option: malloc on host without additional "
+ "options or care and then use DMA to copy the data"),
+ clEnumValN(mlir::GPUDataTransferStrategy::kPinnedDMA, "pinned-dma",
+ "Based on the default option, pin the host memory to "
+ "accelerate the data transfer"),
+ clEnumValN(mlir::GPUDataTransferStrategy::kZeroCopy, "zero-copy",
+ "Use zero-copy to perform the data transfer from the host "
+ "to the GPU"))};
PassOptions::Option<bool> enableIndexReduction{
*this, "enable-index-reduction",
@@ -138,8 +153,9 @@ struct SparseCompilerOptions
/// Projects out the options for `createSparsificationPass`.
SparsificationOptions sparsificationOptions() const {
- return SparsificationOptions(parallelization, enableIndexReduction,
- enableGPULibgen, enableRuntimeLibrary);
+ return SparsificationOptions(parallelization, gpuDataTransfer,
+ enableIndexReduction, enableGPULibgen,
+ enableRuntimeLibrary);
}
/// Projects out the options for `createSparseTensorConversionPass`.
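For orientation on the hunk above: the pipeline gains a gpu-data-transfer-strategy option (regular-dma by default, plus pinned-dma and zero-copy) that sparsificationOptions() now forwards to the sparsification pass. Below is a minimal, hypothetical C++ sketch of selecting the strategy programmatically; it relies only on the declarations added here, and the helper name configureTransfers is illustrative, not part of the patch:

  #include "mlir/Dialect/SparseTensor/Pipelines/Passes.h"

  // Equivalent to passing gpu-data-transfer-strategy=pinned-dma on the
  // command line (see the updated RUN lines in the integration tests below).
  void configureTransfers(mlir::sparse_tensor::SparseCompilerOptions &options) {
    options.gpuDataTransfer = mlir::GPUDataTransferStrategy::kPinnedDMA;
    // The projected options carry the strategy into the sparsification pass.
    mlir::SparsificationOptions sparseOptions = options.sparsificationOptions();
    (void)sparseOptions;
  }
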
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
index c2942cf7be0b49..2949397214dfd6 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -44,19 +44,26 @@ enum class SparseParallelizationStrategy {
// TODO: support reduction parallelization too?
};
+// TODO: Zero copy is disabled due to correctness bugs. Tracker #64316
+enum class GPUDataTransferStrategy { kRegularDMA, kZeroCopy, kPinnedDMA };
+
#define GEN_PASS_DECL
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"
/// Options for the Sparsification pass.
struct SparsificationOptions {
- SparsificationOptions(SparseParallelizationStrategy p, bool idxReduc,
+ SparsificationOptions(SparseParallelizationStrategy p,
+ GPUDataTransferStrategy t, bool idxReduc,
bool gpuLibgen, bool enableRT)
- : parallelizationStrategy(p), enableIndexReduction(idxReduc),
- enableGPULibgen(gpuLibgen), enableRuntimeLibrary(enableRT) {}
+ : parallelizationStrategy(p), gpuDataTransferStrategy(t),
+ enableIndexReduction(idxReduc), enableGPULibgen(gpuLibgen),
+ enableRuntimeLibrary(enableRT) {}
SparsificationOptions()
- : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
+ : SparsificationOptions(SparseParallelizationStrategy::kNone,
+ GPUDataTransferStrategy::kRegularDMA, false,
false, true) {}
SparseParallelizationStrategy parallelizationStrategy;
+ GPUDataTransferStrategy gpuDataTransferStrategy;
bool enableIndexReduction;
bool enableGPULibgen;
bool enableRuntimeLibrary;
@@ -211,8 +218,8 @@ std::unique_ptr<Pass> createSparseVectorizationPass(unsigned vectorLength,
void populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
unsigned numThreads);
-void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
- bool enableRT);
+void populateSparseGPULibgenPatterns(RewritePatternSet &patterns, bool enableRT,
+ GPUDataTransferStrategy gpuDataTransfer);
std::unique_ptr<Pass> createSparseGPUCodegenPass();
std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
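The widened SparsificationOptions constructor and the populateSparseGPULibgenPatterns entry point declared above can also be used directly. A small hypothetical sketch follows; the function name addLibgenPatterns and the argument choices are illustrative, not part of the patch:

  #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
  #include "mlir/IR/PatternMatch.h"

  // Build sparsification options with an explicit transfer strategy and hand
  // that strategy through to the GPU libgen patterns.
  void addLibgenPatterns(mlir::RewritePatternSet &patterns) {
    mlir::SparsificationOptions options(
        mlir::SparseParallelizationStrategy::kNone,
        mlir::GPUDataTransferStrategy::kPinnedDMA,
        /*idxReduc=*/false, /*gpuLibgen=*/true, /*enableRT=*/true);
    mlir::populateSparseGPULibgenPatterns(patterns, options.enableRuntimeLibrary,
                                          options.gpuDataTransferStrategy);
  }
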
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
index 962399931d9330..95a4feb52256a2 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -102,6 +102,19 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",
"Enable sparse parallelization for any storage and loop."))}]>,
+ Option<"gpuDataTransfer", "gpu-data-transfer-strategy", "mlir::GPUDataTransferStrategy",
+ "mlir::GPUDataTransferStrategy::kRegularDMA",
+ "Set the data transfer strategy", [{llvm::cl::values(
+ clEnumValN(mlir::GPUDataTransferStrategy::kRegularDMA,
+ "regular-dma",
+ "Default option: malloc on host without additional "
+ "options or care and then use DMA to copy the data"),
+ clEnumValN(mlir::GPUDataTransferStrategy::kPinnedDMA, "pinned-dma",
+ "Based on the default option, pin the host memory to "
+ "accelerate the data transfer"),
+ clEnumValN(mlir::GPUDataTransferStrategy::kZeroCopy, "zero-copy",
+ "Use zero-copy to perform the data transfer from the host "
+ "to the GPU"))}]>,
Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
"false",
"Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
@@ -110,6 +123,7 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
];
}
+
def PostSparsificationRewrite : Pass<"post-sparsification-rewrite", "ModuleOp"> {
let summary = "Applies sparse tensor rewriting rules after sparsification";
let description = [{
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index c40bcd178060cc..f9c35d8b14d2e0 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -461,14 +461,18 @@ static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,
}
/// Match and rewrite SpMV kernel.
-static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
- linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSpMV(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+ GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();
Value a = op.getOperand(0);
Value x = op.getOperand(1);
Value y = op.getOperand(2); // we have y = Ax
SmallVector<Value> tokens;
+ bool isZeroCopy =
+ gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;
+
// Only admissible sparse matrix format and dense vectors.
bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);
@@ -487,12 +491,27 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, a);
+ Value memX, memY;
+ Value castR, castC, castV, castX, castY;
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ memX = genTensorToMemref(rewriter, loc, x);
+ memY = genTensorToMemref(rewriter, loc, y);
+ castR = genHostRegisterMemref(rewriter, loc, memR);
+ if (memC)
+ castC = genHostRegisterMemref(rewriter, loc, memC);
+ castV = genHostRegisterMemref(rewriter, loc, memV);
+ castX = genHostRegisterMemref(rewriter, loc, memX);
+ castY = genHostRegisterMemref(rewriter, loc, memY);
+ }
+
Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valA = genAllocCopy(rewriter, loc, memV, tokens);
- Value memX = genTensorToMemref(rewriter, loc, x);
- Value vecX = genAllocCopy(rewriter, loc, memX, tokens);
- Value memY = genTensorToMemref(rewriter, loc, y);
+ if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
+ memX = genTensorToMemref(rewriter, loc, x);
+ Value vecX = isZeroCopy ? memX : genAllocCopy(rewriter, loc, memX, tokens);
+ if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
+ memY = genTensorToMemref(rewriter, loc, y);
Value vecY = genAllocCopy(rewriter, loc, memY, tokens);
genBlockingWait(rewriter, loc, tokens);
tokens.clear();
@@ -546,11 +565,20 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);
token = genDeallocMemRef(rewriter, loc, buffer, token);
- token = genDeallocMemRef(rewriter, loc, vecX, token);
+ if (!isZeroCopy)
+ token = genDeallocMemRef(rewriter, loc, vecX, token);
token = genCopyMemRef(rewriter, loc, memY, vecY, token);
token = genDeallocMemRef(rewriter, loc, vecY, token);
tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ genHostUnregisterMemref(rewriter, loc, castR);
+ if (memC)
+ genHostUnregisterMemref(rewriter, loc, castC);
+ genHostUnregisterMemref(rewriter, loc, castV);
+ genHostUnregisterMemref(rewriter, loc, castX);
+ genHostUnregisterMemref(rewriter, loc, castY);
+ }
tokens.clear();
// Done.
@@ -559,14 +587,18 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
}
/// Match and rewrite SpMM kernel.
-static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
- linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSpMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+ GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();
Value a = op.getOperand(0);
Value b = op.getOperand(1);
Value c = op.getOperand(2); // we have C = AB
SmallVector<Value> tokens;
+ bool isZeroCopy =
+ gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;
+
// Only admissible sparse matrix format and dense matrices.
bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);
@@ -586,12 +618,27 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, a);
+ Value bufB, bufC;
+ Value castR, castC, castV, castB, castBufC;
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ bufB = genTensorToMemref(rewriter, loc, b);
+ bufC = genTensorToMemref(rewriter, loc, c);
+ castR = genHostRegisterMemref(rewriter, loc, memR);
+ if (memC)
+ castC = genHostRegisterMemref(rewriter, loc, memC);
+ castV = genHostRegisterMemref(rewriter, loc, memV);
+ castB = genHostRegisterMemref(rewriter, loc, bufB);
+ castBufC = genHostRegisterMemref(rewriter, loc, bufC);
+ }
+
Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valA = genAllocCopy(rewriter, loc, memV, tokens);
- Value bufB = genTensorToMemref(rewriter, loc, b);
- Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
- Value bufC = genTensorToMemref(rewriter, loc, c);
+ if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
+ bufB = genTensorToMemref(rewriter, loc, b);
+ Value matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
+ if (gpuDataTransferStrategy == GPUDataTransferStrategy::kRegularDMA)
+ bufC = genTensorToMemref(rewriter, loc, c);
Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
genBlockingWait(rewriter, loc, tokens);
tokens.clear();
@@ -649,11 +696,20 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);
token = genDeallocMemRef(rewriter, loc, buffer, token);
- token = genDeallocMemRef(rewriter, loc, matB, token);
+ if (!isZeroCopy)
+ token = genDeallocMemRef(rewriter, loc, matB, token);
token = genCopyMemRef(rewriter, loc, bufC, matC, token);
token = genDeallocMemRef(rewriter, loc, matC, token);
tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ genHostUnregisterMemref(rewriter, loc, castR);
+ if (memC)
+ genHostUnregisterMemref(rewriter, loc, castC);
+ genHostUnregisterMemref(rewriter, loc, castV);
+ genHostUnregisterMemref(rewriter, loc, castB);
+ genHostUnregisterMemref(rewriter, loc, castBufC);
+ }
tokens.clear();
// Done.
@@ -662,23 +718,41 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
}
// Match and rewrite 2:4 SpMM kernels.
-static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
- linalg::GenericOp op) {
+static LogicalResult
+rewrite2To4SpMM(PatternRewriter &rewriter, linalg::GenericOp op,
+ GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();
Value A = op.getOperand(0);
Value B = op.getOperand(1);
Value C = op.getOperand(2); // we have C = AB
SmallVector<Value> tokens;
+ bool isZeroCopy =
+ gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;
+
// All input should be dense tensors.
if (!isDenseTensor(A) || !isDenseTensor(B) || !isDenseTensor(C))
return failure();
+ Value matA, matB;
Value bufA = genTensorToMemref(rewriter, loc, A);
- Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+ if (!isZeroCopy)
+ matA = genAllocCopy(rewriter, loc, bufA, tokens);
Value bufB = genTensorToMemref(rewriter, loc, B);
- Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+ if (!isZeroCopy)
+ matB = genAllocCopy(rewriter, loc, bufB, tokens);
Value bufC = genTensorToMemref(rewriter, loc, C);
+ Value castA, castB, castC;
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ castA = genHostRegisterMemref(rewriter, loc, bufA);
+ castB = genHostRegisterMemref(rewriter, loc, bufB);
+ castC = genHostRegisterMemref(rewriter, loc, bufC);
+ }
+
+ if (isZeroCopy) {
+ matA = bufA;
+ matB = bufB;
+ }
Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
genBlockingWait(rewriter, loc, tokens);
tokens.clear();
@@ -754,26 +828,38 @@ static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
token = genDeallocMemRef(rewriter, loc, buffer, token);
token = genDeallocMemRef(rewriter, loc, buffer2, token);
token = genDeallocMemRef(rewriter, loc, buffer3, token);
- token = genDeallocMemRef(rewriter, loc, matA, token);
- token = genDeallocMemRef(rewriter, loc, matB, token);
+
+ if (!isZeroCopy)
+ token = genDeallocMemRef(rewriter, loc, matA, token);
+ if (!isZeroCopy)
+ token = genDeallocMemRef(rewriter, loc, matB, token);
token = genCopyMemRef(rewriter, loc, bufC, matC, token);
token = genDeallocMemRef(rewriter, loc, matC, token);
tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ genHostUnregisterMemref(rewriter, loc, castA);
+ genHostUnregisterMemref(rewriter, loc, castB);
+ genHostUnregisterMemref(rewriter, loc, castC);
+ }
tokens.clear();
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, bufC);
return success();
}
/// Match and rewrite SDDMM kernel.
-static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
- linalg::GenericOp op, bool enableRT) {
+static LogicalResult
+rewriteSDDMM(PatternRewriter &rewriter, linalg::GenericOp op, bool enableRT,
+ GPUDataTransferStrategy gpuDataTransferStrategy) {
Location loc = op.getLoc();
Value a = op.getOperand(0);
Value b = op.getOperand(1);
Value c = op.getOperand(2);
SmallVector<Value> tokens;
+ bool isZeroCopy =
+ gpuDataTransferStrategy == GPUDataTransferStrategy::kZeroCopy;
+
// Only admissible sparse matrix format and dense matrices, no COO.
bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);
@@ -793,13 +879,31 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
+ Value matA, matB;
Value bufA = genTensorToMemref(rewriter, loc, a);
- Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+ if (!isZeroCopy)
+ matA = genAllocCopy(rewriter, loc, bufA, tokens);
Value bufB = genTensorToMemref(rewriter, loc, b);
- Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+ if (!isZeroCopy)
+ matB = isZeroCopy ? bufB : genAllocCopy(rewriter, loc, bufB, tokens);
Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, c);
+
+ Value castB, castA, castR, castC, castV;
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ castB = genHostRegisterMemref(rewriter, loc, bufB);
+ castA = genHostRegisterMemref(rewriter, loc, bufA);
+ castR = genHostRegisterMemref(rewriter, loc, memR);
+ if (memC)
+ castC = genHostRegisterMemref(rewriter, loc, memC);
+ castV = genHostRegisterMemref(rewriter, loc, memV);
+ }
+
+ if (isZeroCopy) {
+ matA = bufA;
+ matB = bufB;
+ }
Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valC = genAllocCopy(rewriter, loc, memV, tokens);
@@ -850,8 +954,10 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
.getAsyncToken();
token = genDeallocMemRef(rewriter, loc, buffer, token);
- token = genDeallocMemRef(rewriter, loc, matA, token);
- token = genDeallocMemRef(rewriter, loc, matB, token);
+ if (!isZeroCopy) {
+ token = genDeallocMemRef(rewriter, loc, matA, token);
+ token = genDeallocMemRef(rewriter, loc, matB, token);
+ }
token = genDeallocMemRef(rewriter, loc, rowC, token);
if (colC)
token = genDeallocMemRef(rewriter, loc, colC, token);
@@ -859,6 +965,14 @@ static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
token = genDeallocMemRef(rewriter, loc, valC, token);
tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);
+ if (gpuDataTransferStrategy != GPUDataTransferStrategy::kRegularDMA) {
+ genHostUnregisterMemref(rewriter, loc, castB);
+ genHostUnregisterMemref(rewriter, loc, castA);
+ genHostUnregisterMemref(rewriter, loc, castR);
+ if (memC)
+ genHostUnregisterMemref(rewriter, loc, castC);
+ genHostUnregisterMemref(rewriter, loc, castV);
+ }
tokens.clear();
// Done.
@@ -977,8 +1091,8 @@ struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> {
struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
- LinalgOpRewriter(MLIRContext *context, bool rt)
- : OpRewritePattern(context), enableRT(rt) {}
+ LinalgOpRewriter(MLIRContext *context, bool rt, GPUDataTransferStrategy t)
+ : OpRewritePattern(context), enableRT(rt), gpuDataTransferStrategy(t) {}
LogicalResult matchAndRewrite(linalg::GenericOp op,
PatternRewriter &rewriter) const override {
@@ -1004,7 +1118,7 @@ struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
linalg::isReductionIterator(iteratorTypes[1]) &&
// TODO: add transposed {i, j}
maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
- return rewriteSpMV(rewriter, op, enableRT);
+ return rewriteSpMV(rewriter, op, enableRT, gpuDataTransferStrategy);
}
// Recognize a SpMM kernel.
@@ -1016,9 +1130,9 @@ struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
// TODO: maybe add transposed {i, j} in future
maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
if (op->getAttr("DENSE24"))
- return rewrite2To4SpMM(rewriter, op);
+ return rewrite2To4SpMM(rewriter, op, gpuDataTransferStrategy);
- return rewriteSpMM(rewriter, op, enableRT);
+ return rewriteSpMM(rewriter, op, enableRT, gpuDataTransferStrategy);
}
// Recognize a SDDMM kernel.
@@ -1030,7 +1144,7 @@ struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
// TODO: maybe add transposed {i, j} in future
maps == infer({{i, k}, {k, j}, {i, j}}) &&
matchSumReductionOfMulUnary(op)) {
- return rewriteSDDMM(rewriter, op, enableRT);
+ return rewriteSDDMM(rewriter, op, enableRT, gpuDataTransferStrategy);
}
return failure();
@@ -1038,6 +1152,7 @@ struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
private:
bool enableRT;
+ GPUDataTransferStrategy gpuDataTransferStrategy;
};
} // namespace
@@ -1057,7 +1172,9 @@ void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
patterns.add<ForallRewriter>(patterns.getContext(), numThreads);
}
-void mlir::populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
- bool enableRT) {
- patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT);
+void mlir::populateSparseGPULibgenPatterns(
+ RewritePatternSet &patterns, bool enableRT,
+ GPUDataTransferStrategy gpuDataTransfer) {
+ patterns.add<LinalgOpRewriter>(patterns.getContext(), enableRT,
+ gpuDataTransfer);
}
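All four rewrites above (SpMV, SpMM, 2:4 SpMM, SDDMM) repeat the same per-operand choreography: register the host buffers up front for the non-regular-DMA strategies, skip the device copy of the dense inputs only under zero-copy, and unregister everything after the final blocking wait. A schematic, hypothetical helper capturing that choice for one dense input operand, written against the static gen* utilities of this file with the call shapes visible in the diff (the helper itself is not part of the patch):

  // regular-dma: plain host memref plus an async device alloc+copy.
  // pinned-dma:  additionally host-register (pin) the memref, then copy.
  // zero-copy:   host-register the memref and let the kernel read it in
  //              place; no device copy and hence no device dealloc later.
  static Value prepareOperand(PatternRewriter &rewriter, Location loc,
                              Value tensor, GPUDataTransferStrategy strategy,
                              SmallVectorImpl<Value> &tokens,
                              SmallVectorImpl<Value> &registered) {
    Value buf = genTensorToMemref(rewriter, loc, tensor);
    if (strategy != GPUDataTransferStrategy::kRegularDMA)
      registered.push_back(genHostRegisterMemref(rewriter, loc, buf));
    if (strategy == GPUDataTransferStrategy::kZeroCopy)
      return buf;
    return genAllocCopy(rewriter, loc, buf, tokens);
  }

After the kernel's blocking wait, each value collected in `registered` is released again with genHostUnregisterMemref, mirroring the epilogues in the rewrites above.
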
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index cfe3fe881450a9..cce26bc603eeb3 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -65,6 +65,7 @@ struct SparsificationPass
SparsificationPass(const SparsificationPass &pass) = default;
SparsificationPass(const SparsificationOptions &options) {
parallelization = options.parallelizationStrategy;
+ gpuDataTransfer = options.gpuDataTransferStrategy;
enableIndexReduction = options.enableIndexReduction;
enableGPULibgen = options.enableGPULibgen;
enableRuntimeLibrary = options.enableRuntimeLibrary;
@@ -73,12 +74,17 @@ struct SparsificationPass
void runOnOperation() override {
auto *ctx = &getContext();
// Translate strategy flags to strategy options.
- SparsificationOptions options(parallelization, enableIndexReduction,
- enableGPULibgen, enableRuntimeLibrary);
+ SparsificationOptions options(parallelization, gpuDataTransfer,
+ enableIndexReduction, enableGPULibgen,
+ enableRuntimeLibrary);
// Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
RewritePatternSet patterns(ctx);
if (enableGPULibgen) {
- populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+ // TODO: Zero copy is disabled due to correctness bugs. Tracker #64316
+ assert(gpuDataTransfer != GPUDataTransferStrategy::kZeroCopy &&
+ "zero-copy transfer not supported with GPU libgen");
+ populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary,
+ gpuDataTransfer);
}
populateSparsificationPatterns(patterns, options);
scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
index 694d24bf820c32..aa71abbcf0e717 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -1,13 +1,18 @@
//
// NOTE: this test requires gpu-sm80 and cusparselt
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE: --shared-libs=%mlir_cuda_runtime \
+// DEFINE: --shared-libs=%mlir_c_runner_utils \
+// DEFINE: --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+
+// RUN: %{compile}" | %{run}
+// RUN: %{compile} gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} gpu-data-transfer-strategy=zero-copy" | %{run}
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
index 7b3769f195e4fa..0b546c59605552 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -1,14 +1,19 @@
//
// NOTE: this test requires gpu-sm80 and cusparselt
//
-// RUN: mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
-// RUN: %s \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// DEFINE: %{compile} = mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// DEFINE: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// DEFINE: %s
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE: --shared-libs=%mlir_cuda_runtime \
+// DEFINE: --shared-libs=%mlir_c_runner_utils \
+// DEFINE: --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+
+// RUN: %{compile} | %{run}
+// RUN: %{compile} --sparse-compiler="gpu-data-transfer-strategy=pinned-dma" | %{run}
+// RUNNOT: %{compile} --sparse-compiler="gpu-data-transfer-strategy=zero-copy" | %{run}
+
module {
llvm.func @mgpuCreateSparseLtEnv()
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
index 2b471e0e118c4f..60c8e1ca3e43c7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
@@ -1,25 +1,28 @@
//
// NOTE: this test requires gpu-sm80
//
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE: --shared-libs=%mlir_cuda_runtime \
+// DEFINE: --shared-libs=%mlir_c_runner_utils \
+// DEFINE: --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+//
+//
// with RT lib (SoA COO):
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=zero-copy" | %{run}
//
// without RT lib (AoS COO): note, may fall back to CPU
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" | %{run}
#SortedCOO = #sparse_tensor.encoding<{
lvlTypes = [ "compressed-nu", "singleton" ]
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
index 9c2ddcc9282935..bd8b11cb24d506 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
@@ -1,25 +1,28 @@
//
// NOTE: this test requires gpu-sm80
//
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE: --shared-libs=%mlir_cuda_runtime \
+// DEFINE: --shared-libs=%mlir_c_runner_utils \
+// DEFINE: --e main --entry-point-result=void \
+// DEFINE: | FileCheck %s
+//
// with RT lib (SoA COO):
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=zero-copy" | %{run}
//
// without RT lib (AoS COO): note, may fall back to CPU
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e main --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" | %{run}
+//
#SortedCOO = #sparse_tensor.encoding<{
lvlTypes = [ "compressed-nu", "singleton" ]
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index e4a3294f971748..3332e3c4b17a30 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -1,28 +1,29 @@
//
// NOTE: this test requires gpu-sm80
//
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
+// DEFINE: mlir-cpu-runner \
+// DEFINE: --shared-libs=%mlir_cuda_runtime \
+// DEFINE: --shared-libs=%mlir_c_runner_utils \
+// DEFINE: --e entry --entry-point-result=void \
+// DEFINE: | FileCheck %s
+//
// with RT lib:
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e entry --entry-point-result=void \
-// RUN: | FileCheck %s
+// RUN: %{compile} enable-runtime-library=true" | %{run}
+// RUN: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} enable-runtime-library=true gpu-data-transfer-strategy=zero-copy" | %{run}
//
// without RT lib:
//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
-// RUN: | TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: --shared-libs=%mlir_cuda_runtime \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --e entry --entry-point-result=void \
-// RUN: | FileCheck %s
-//
+// RUN: %{compile} enable-runtime-library=false" | %{run}
+// RUN: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=pinned-dma" | %{run}
+// Tracker #64316
+// RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" | %{run}
+//
!Filename = !llvm.ptr<i8>