[Mlir-commits] [mlir] [ml_program] fix bufferizesToMemoryRead for ml_program.global_store (PR #177387)

Thu Jan 22 08:01:37 PST 2026

https://github.com/nmalimban created https://github.com/llvm/llvm-project/pull/177387

This is a fix for the `BufferizableOpInterface` implementation for `ml_program.global_store`.

`bufferizesToMemoryRead` currently returns false for `GlobalStoreOpInterface`, but I believe it should return true: `ml_program.global_store` needs to read its input buffer to know what value to store to the global.

This manifested as a bug in which `one-shot-bufferize` produces MLIR that copies uninitialized data to the global variable instead of the value that was meant to be stored.

For the following MLIR:

```
module {
  ml_program.global private mutable @"state_tensor"(dense<0.0> : tensor<4x75xf32>) : tensor<4x75xf32>
  func.func @main() -> tensor<4x75xf32> {
    %c0 = arith.constant 0 : index
    %cst_val = arith.constant 1.0 : f32
    %initial_state = ml_program.global_load @"state_tensor" : tensor<4x75xf32>
    %val = tensor.extract %initial_state[%c0, %c0] : tensor<4x75xf32>
    %next_val = arith.addf %val, %cst_val : f32
    %updated_tensor = tensor.insert %next_val into %initial_state[%c0, %c0] : tensor<4x75xf32>
    ml_program.global_store @"state_tensor" = %updated_tensor : tensor<4x75xf32>
    return %updated_tensor : tensor<4x75xf32>
  }
}
```
`one-shot-bufferize` produces this incorrect MLIR:
```
module {
  memref.global "private" @state_tensor : memref<4x75xf32> = dense<0.000000e+00>
  func.func @main() -> tensor<4x75xf32> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 1.000000e+00 : f32
    %0 = memref.get_global @state_tensor : memref<4x75xf32>
    %1 = memref.load %0[%c0, %c0] : memref<4x75xf32>
    %2 = arith.addf %1, %cst : f32
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x75xf32>
    memref.copy %0, %alloc : memref<4x75xf32> to memref<4x75xf32>
    memref.store %2, %alloc[%c0, %c0] : memref<4x75xf32>
    %3 = bufferization.to_tensor %alloc : memref<4x75xf32> to tensor<4x75xf32>
    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<4x75xf32>
    %4 = memref.get_global @state_tensor : memref<4x75xf32>
    memref.copy %alloc_0, %4 : memref<4x75xf32> to memref<4x75xf32>
    return %3 : tensor<4x75xf32>
  }
}
```
Note that the final `memref.copy` copies the uninitialized `%alloc_0` to the global variable.

After the change, `one-shot-bufferize` produces the following MLIR:
```
module {
  memref.global "private" @state_tensor : memref<4x75xf32> = dense<0.000000e+00>
  func.func @main() -> tensor<4x75xf32> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 1.000000e+00 : f32
    %0 = memref.get_global @state_tensor : memref<4x75xf32>
    %1 = memref.load %0[%c0, %c0] : memref<4x75xf32>
    %2 = arith.addf %1, %cst : f32
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x75xf32>
    memref.copy %0, %alloc : memref<4x75xf32> to memref<4x75xf32>
    memref.store %2, %alloc[%c0, %c0] : memref<4x75xf32>
    %3 = bufferization.to_tensor %alloc : memref<4x75xf32> to tensor<4x75xf32>
    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<4x75xf32>
    memref.copy %alloc, %alloc_0 : memref<4x75xf32> to memref<4x75xf32>
    %4 = memref.get_global @state_tensor : memref<4x75xf32>
    memref.copy %alloc_0, %4 : memref<4x75xf32> to memref<4x75xf32>
    return %3 : tensor<4x75xf32>
  }
}
```
The relevant data is now copied from `%alloc` into `%alloc_0` before `%alloc_0` is stored to the global.

From f11faeeaea2a275d450bb46d61ed559dd4afcd52 Mon Sep 17 00:00:00 2001
From: Nathan Malimban <nmalimba at ah-nmalimba-l.dhcp.mathworks.com>
Date: Wed, 21 Jan 2026 16:52:24 -0500
Subject: [PATCH] fix bufferizesToMemoryRead for ml_program.global_store

---
 .../BufferizableOpInterfaceImpl.cpp           |  2 +-
 .../Dialect/MLProgram/one-shot-bufferize.mlir | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/MLProgram/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/MLProgram/Transforms/BufferizableOpInterfaceImpl.cpp
index 364e4d385fd62..c2cbc95e94343 100644
--- a/mlir/lib/Dialect/MLProgram/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/MLProgram/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -118,7 +118,7 @@ struct GlobalStoreOpInterface
 
   bool bufferizesToMemoryRead(Operation *, OpOperand &,
                               const AnalysisState &) const {
-    return false;
+    return true;
   }
 
   bool bufferizesToMemoryWrite(Operation *, OpOperand &,
diff --git a/mlir/test/Dialect/MLProgram/one-shot-bufferize.mlir b/mlir/test/Dialect/MLProgram/one-shot-bufferize.mlir
index 942247aef2603..d0eb2606ec7f9 100644
--- a/mlir/test/Dialect/MLProgram/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/MLProgram/one-shot-bufferize.mlir
@@ -50,3 +50,34 @@ func.func @raw_hazard() -> i64 {
   return %extracted : i64
 }
 
+// -----
+
+// CHECK-LABEL: memref.global "private" @state_tensor
+ml_program.global private mutable @"state_tensor"(dense<0.0> : tensor<4x75xf32>) : tensor<4x75xf32>
+
+// CHECK-LABEL: func.func @global_load_store_tensor
+func.func @global_load_store_tensor() -> tensor<4x75xf32> {
+  // CHECK-DAG:     %[[C0:.*]] = arith.constant 0
+  // CHECK-DAG:     %[[CST:.*]] = arith.constant 1.000000e+00
+  // CHECK-DAG:     %[[GLOB:.*]] = memref.get_global @state_tensor
+  // CHECK:         %[[VAL:.*]] = memref.load %[[GLOB]][%[[C0]], %[[C0]]]
+  // CHECK:         %[[ADD:.*]] = arith.addf %[[VAL]], %[[CST]]
+  // CHECK:         %[[ALLOC1:.*]] = memref.alloc() {alignment = 64 : i64}
+  // CHECK:         memref.copy %[[GLOB]], %[[ALLOC1]] 
+  // CHECK:         memref.store %[[ADD]], %[[ALLOC1]][%[[C0]], %[[C0]]] 
+  // CHECK:         %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC1]] 
+  // CHECK:         %[[ALLOC2:.*]] = memref.alloc() {alignment = 64 : i64}
+  // CHECK:         memref.copy %[[ALLOC1]], %[[ALLOC2]] 
+  // CHECK:         %[[GLOB_REF:.*]] = memref.get_global @state_tensor 
+  // CHECK:         memref.copy %[[ALLOC2]], %[[GLOB_REF]] 
+  // CHECK:         return %[[TENSOR]]
+  %c0 = arith.constant 0 : index
+  %cst_val = arith.constant 1.0 : f32
+  %initial_state = ml_program.global_load @"state_tensor" : tensor<4x75xf32>
+  %val = tensor.extract %initial_state[%c0, %c0] : tensor<4x75xf32>
+  %next_val = arith.addf %val, %cst_val : f32
+  %updated_tensor = tensor.insert %next_val into %initial_state[%c0, %c0] : tensor<4x75xf32>
+  ml_program.global_store @"state_tensor" = %updated_tensor : tensor<4x75xf32>
+  return %updated_tensor : tensor<4x75xf32>
+}
+