[clang] [llvm] [OpenMP] Fix non-contiguous array omp target update (PR #156889)

Thu Mar 26 04:45:43 PDT 2026

https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/156889

>From 2e7cf5d3a750cd6df9242370888144413e362bb6 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Thu, 26 Mar 2026 03:29:54 -0700
Subject: [PATCH 1/7] Fix test and xfail it

---
 .../strided_update_count_expression_complex.c    | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/offload/test/offloading/strided_update_count_expression_complex.c b/offload/test/offloading/strided_update_count_expression_complex.c
index a2ebdcd1f510d..92fb9c7da0c46 100644
--- a/offload/test/offloading/strided_update_count_expression_complex.c
+++ b/offload/test/offloading/strided_update_count_expression_complex.c
@@ -1,5 +1,5 @@
 // RUN: %libomptarget-compile-run-and-check-generic
-// XFAIL: intelgpu
+// XFAIL: *
 // Tests non-contiguous array sections with complex expression-based count
 // scenarios including multiple struct arrays and non-zero offset.
 
@@ -130,6 +130,8 @@ void test_2_complex_count_with_offset() {
     }
 
     // Count: (len-offset)/2 with stride 2
+    // s1.arr[2:4:2]
+    // s2.arr[1:4:2]
 #pragma omp target update from(                                                \
         s1.arr[s1.offset : (s1.len - s1.offset) / 2 : 2],                      \
             s2.arr[s2.offset : (s2.len - s2.offset) / 2 : 2])
@@ -238,14 +240,14 @@ void test_2_complex_count_with_offset() {
 // CHECK: s1 results:
 // CHECK-NEXT: 0.000000
 // CHECK-NEXT: 1.000000
-// CHECK-NEXT: 2.000000
-// CHECK-NEXT: 6.000000
 // CHECK-NEXT: 4.000000
-// CHECK-NEXT: 10.000000
-// CHECK-NEXT: 6.000000
-// CHECK-NEXT: 14.000000
+// CHECK-NEXT: 3.000000
 // CHECK-NEXT: 8.000000
-// CHECK-NEXT: 18.000000
+// CHECK-NEXT: 5.000000
+// CHECK-NEXT: 12.000000
+// CHECK-NEXT: 7.000000
+// CHECK-NEXT: 16.000000
+// CHECK-NEXT: 9.000000
 // CHECK: s2 results:
 // CHECK-NEXT: 0.000000
 // CHECK-NEXT: 20.000000

>From 7d4ceb12b68fe42f1dbbfff0288b05c9d734533a Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Fri, 22 Aug 2025 11:50:57 -0700
Subject: [PATCH 2/7] [OpenMP] Fix non-contiguous array omp target update

The existing implementation has three issues which this patch addresses.

1. The last dimension which represents the bytes in the type, has the wrong
stride and count. For example, for a 4 byte int, count=1 and stride=4. The
correct representation here is count=4 and stride=1 because there are 4 bytes
(count=4) that we need to copy and we do not skip any bytes (stride=1).

2. The size of the data copy was computed using the last dimension. However,
this is incorrect in cases where some of the final dimensions get merged into
one. In this case we need to take the combined size of the merged dimensions,
which is (Count * Stride) of the first merged dimension.

3. The Offset into a dimension was computed as a multiple of its Stride.
However, this Stride which is in bytes, already includes the stride multiplier
given by the user. This means that when the user specified 1:3:2, i.e. elements
1, 3, 5, the runtime incorrectly copied elements 2, 4, 6. Fix this by
precomputing at compile time the Offset to be in bytes and correctly multiply
the offset by the stride of the dimension without the user-specified
multiplier.

Remove unnecesasry assert
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |  20 ++--
 clang/test/OpenMP/target_update_codegen.cpp   |  43 +++----
 offload/libomptarget/omptarget.cpp            |  14 ++-
 .../test/offloading/non_contiguous_update.cpp | 107 ++++++++++++------
 4 files changed, 116 insertions(+), 68 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3a57c06e40329..332b439c87472 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -8458,8 +8458,8 @@ class MappableExprsHandler {
     // For supporting stride in array section, we need to initialize the first
     // dimension size as 1, first offset as 0, and first count as 1
     MapValuesArrayTy CurOffsets = {llvm::ConstantInt::get(CGF.CGM.Int64Ty, 0)};
-    MapValuesArrayTy CurCounts = {llvm::ConstantInt::get(CGF.CGM.Int64Ty, 1)};
-    MapValuesArrayTy CurStrides;
+    MapValuesArrayTy CurCounts;
+    MapValuesArrayTy CurStrides = {llvm::ConstantInt::get(CGF.CGM.Int64Ty, 1)};
     MapValuesArrayTy DimSizes{llvm::ConstantInt::get(CGF.CGM.Int64Ty, 1)};
     uint64_t ElementTypeSize;
 
@@ -8483,8 +8483,8 @@ class MappableExprsHandler {
              "Should be either ConstantArray or VariableArray if not the "
              "first Component");
 
-      // Get element size if CurStrides is empty.
-      if (CurStrides.empty()) {
+      // Get element size if CurCounts is empty.
+      if (CurCounts.empty()) {
         const Type *ElementType = nullptr;
         if (CAT)
           ElementType = CAT->getElementType().getTypePtr();
@@ -8516,7 +8516,7 @@ class MappableExprsHandler {
             ElementType = ElementType->getPointeeOrArrayElementType();
           ElementTypeSize =
               Context.getTypeSizeInChars(ElementType).getQuantity();
-          CurStrides.push_back(
+          CurCounts.push_back(
               llvm::ConstantInt::get(CGF.Int64Ty, ElementTypeSize));
         }
       }
@@ -8576,7 +8576,6 @@ class MappableExprsHandler {
                                            CGF.Int64Ty,
                                            /*isSigned=*/false);
       }
-      CurOffsets.push_back(Offset);
 
       // Count
       const Expr *CountExpr = OASE->getLength();
@@ -8613,11 +8612,12 @@ class MappableExprsHandler {
       CurCounts.push_back(Count);
 
       // Stride_n' = Stride_n * (D_0 * D_1 ... * D_n-1) * Unit size
+      // Offset_n' = Offset_n * (D_0 * D_1 ... * D_n-1) * Unit size
       // Take `int arr[5][5][5]` and `arr[0:2:2][1:2:1][0:2:2]` as an example:
       //              Offset      Count     Stride
-      //    D0          0           1         4    (int)    <- dummy dimension
+      //    D0          0           4         1    (int)    <- dummy dimension
       //    D1          0           2         8    (2 * (1) * 4)
-      //    D2          1           2         20   (1 * (1 * 5) * 4)
+      //    D2          100         2         20   (1 * (1 * 5) * 4)
       //    D3          0           2         200  (2 * (1 * 5 * 4) * 4)
       const Expr *StrideExpr = OASE->getStride();
       llvm::Value *Stride =
@@ -8630,6 +8630,10 @@ class MappableExprsHandler {
         CurStrides.push_back(CGF.Builder.CreateNUWMul(DimProd, Stride));
       else
         CurStrides.push_back(DimProd);
+
+      Offset = CGF.Builder.CreateNUWMul(DimProd, Offset);
+      CurOffsets.push_back(Offset);
+
       if (DI != DimSizes.end())
         ++DI;
     }
diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp
index dc51e8969518b..1925d90651721 100644
--- a/clang/test/OpenMP/target_update_codegen.cpp
+++ b/clang/test/OpenMP/target_update_codegen.cpp
@@ -1144,7 +1144,7 @@ void foo(int arg) {
   // CK20: store i64 {{32|64}}, ptr [[STRIDE]],
   // CK20: [[DIM_2:%.+]] = getelementptr inbounds [3 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
   // CK20: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-  // CK20: store i64 1, ptr [[OFFSET_2]],
+  // CK20: store i64 {{8|16}}, ptr [[OFFSET_2]],
   // CK20: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
   // CK20: store i64 4, ptr [[COUNT_2]],
   // CK20: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1153,9 +1153,9 @@ void foo(int arg) {
   // CK20: [[OFFSET_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 0
   // CK20: store i64 0, ptr [[OFFSET_3]],
   // CK20: [[COUNT_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 1
-  // CK20: store i64 1, ptr [[COUNT_3]],
+  // CK20: store i64 {{8|16}}, ptr [[COUNT_3]],
   // CK20: [[STRIDE_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 2
-  // CK20: store i64 {{8|16}}, ptr [[STRIDE_3]],
+  // CK20: store i64 1, ptr [[STRIDE_3]],
   // CK20-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
   // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
   // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1212,7 +1212,7 @@ struct ST {
     // CK21: store i64 {{400|800}}, ptr [[STRIDE_1]],
     // CK21: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
     // CK21: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-    // CK21: store i64 1, ptr [[OFFSET_2]],
+    // CK21: store i64 {{40|80}}, ptr [[OFFSET_2]],
     // CK21: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
     // CK21: store i64 3, ptr [[COUNT_2]],
     // CK21: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1228,9 +1228,9 @@ struct ST {
     // CK21: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
     // CK21: store i64 0, ptr [[OFFSET_4]],
     // CK21: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-    // CK21: store i64 1, ptr [[COUNT_4]],
+    // CK21: store i64 {{4|8}}, ptr [[COUNT_4]],
     // CK21: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-    // CK21: store i64 {{4|8}}, ptr [[STRIDE_4]],
+    // CK21: store i64 1, ptr [[STRIDE_4]],
     // CK21-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 2, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[GEPSZ:%.+]], ptr [[MTYPE]]{{.+}})
     // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
     // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1287,7 +1287,7 @@ struct ST {
     // CK22: store i64 200, ptr [[STRIDE]],
     // CK22: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
     // CK22: [[OFFSET:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-    // CK22: store i64 1, ptr [[OFFSET]],
+    // CK22: store i64 40, ptr [[OFFSET]],
     // CK22: [[COUNT:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
     // CK22: store i64 3, ptr [[COUNT]],
     // CK22: [[STRIDE:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1303,9 +1303,9 @@ struct ST {
     // CK22: [[OFFSET:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
     // CK22: store i64 0, ptr [[OFFSET]],
     // CK22: [[COUNT:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-    // CK22: store i64 1, ptr [[COUNT]],
+    // CK22: store i64 4, ptr [[COUNT]],
     // CK22: [[STRIDE:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-    // CK22: store i64 4, ptr [[STRIDE]],
+    // CK22: store i64 1, ptr [[STRIDE]],
     // CK22-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
     // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
     // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1368,7 +1368,7 @@ void foo(int arg) {
   // CK23: store i64 200, ptr [[STRIDE]],
   // CK23: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
   // CK23: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-  // CK23: store i64 1, ptr [[OFFSET_2]],
+  // CK23: store i64 20, ptr [[OFFSET_2]],
   // CK23: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
   // CK23: store i64 2, ptr [[COUNT_2]],
   // CK23: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1384,9 +1384,9 @@ void foo(int arg) {
   // CK23: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
   // CK23: store i64 0, ptr [[OFFSET_4]],
   // CK23: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-  // CK23: store i64 1, ptr [[COUNT_4]],
+  // CK23: store i64 4, ptr [[COUNT_4]],
   // CK23: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-  // CK23: store i64 4, ptr [[STRIDE_4]],
+  // CK23: store i64 1, ptr [[STRIDE_4]],
   // CK23-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
   // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
   // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1430,6 +1430,7 @@ void foo(int arg) {
   // CK24: [[MUL:%.+]] = mul nuw i64 8,
   // CK24: [[SUB:%.+]] = sub nuw i64 4, [[ARG:%.+]]
   // CK24: [[LEN:%.+]] = udiv {{.+}} [[SUB]], 1
+  // CK24: [[MUL_ARG:%.+]] = mul nuw i64 40, [[ARG]]
   // CK24: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0
   // CK24: store ptr [[ARR]], ptr [[BP0]],
   // CK24: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], {{.+}} 0, {{.+}} 0
@@ -1443,7 +1444,7 @@ void foo(int arg) {
   // CK24: store i64 320, ptr [[STRIDE]],
   // CK24: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
   // CK24: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-  // CK24: store i64 [[ARG]], ptr [[OFFSET_2]],
+  // CK24: store i64 [[MUL_ARG]], ptr [[OFFSET_2]],
   // CK24: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
   // CK24: store i64 [[LEN]], ptr [[COUNT_2]],
   // CK24: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1459,9 +1460,9 @@ void foo(int arg) {
   // CK24: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
   // CK24: store i64 0, ptr [[OFFSET_4]],
   // CK24: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-  // CK24: store i64 1, ptr [[COUNT_4]],
+  // CK24: store i64 8, ptr [[COUNT_4]],
   // CK24: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-  // CK24: store i64 8, ptr [[STRIDE_4]],
+  // CK24: store i64 1, ptr [[STRIDE_4]],
   // CK24-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
   // CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
   // CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1526,7 +1527,7 @@ void foo(int arg) {
   // CK25: store i64 20, ptr [[STRIDE_2]],
   // CK25: [[DIM_3:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 2
   // CK25: [[OFFSET_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 0
-  // CK25: store i64 1, ptr [[OFFSET_3]],
+  // CK25: store i64 4, ptr [[OFFSET_3]],
   // CK25: [[COUNT_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 1
   // CK25: store i64 4, ptr [[COUNT_3]],
   // CK25: [[STRIDE_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 2
@@ -1535,9 +1536,9 @@ void foo(int arg) {
   // CK25: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
   // CK25: store i64 0, ptr [[OFFSET_4]],
   // CK25: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-  // CK25: store i64 1, ptr [[COUNT_4]],
+  // CK25: store i64 4, ptr [[COUNT_4]],
   // CK25: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-  // CK25: store i64 4, ptr [[STRIDE_4]],
+  // CK25: store i64 1, ptr [[STRIDE_4]],
   // CK25: [[PTRS:%.+]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 0
   // CK25: store ptr [[DIMS]], ptr [[PTRS]],
   // CK25: [[DIM_5:%.+]] = getelementptr inbounds [3 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS_2]], {{.+}} 0, {{.+}} 0
@@ -1549,7 +1550,7 @@ void foo(int arg) {
   // CK25: store i64 12, ptr [[STRIDE_2_1]],
   // CK25: [[DIM_6:%.+]] = getelementptr inbounds [3 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS_2]], {{.+}} 0, {{.+}} 1
   // CK25: [[OFFSET_2_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_6]], {{.+}} 0, {{.+}} 0
-  // CK25: store i64 1, ptr [[OFFSET_2_2]],
+  // CK25: store i64 4, ptr [[OFFSET_2_2]],
   // CK25: [[COUNT_2_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_6]], {{.+}} 0, {{.+}} 1
   // CK25: store i64 2, ptr [[COUNT_2_2]],
   // CK25: [[STRIDE_2_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_6]], {{.+}} 0, {{.+}} 2
@@ -1558,9 +1559,9 @@ void foo(int arg) {
   // CK25: [[OFFSET_2_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_7]], {{.+}} 0, {{.+}} 0
   // CK25: store i64 0, ptr [[OFFSET_2_3]],
   // CK25: [[COUNT_2_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_7]], {{.+}} 0, {{.+}} 1
-  // CK25: store i64 1, ptr [[COUNT_2_3]],
+  // CK25: store i64 4, ptr [[COUNT_2_3]],
   // CK25: [[STRIDE_2_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_7]], {{.+}} 0, {{.+}} 2
-  // CK25: store i64 4, ptr [[STRIDE_2_3]],
+  // CK25: store i64 1, ptr [[STRIDE_2_3]],
   // CK25: [[PTRS_2:%.+]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 2
   // CK25: store ptr [[DIMS_2]], ptr [[PTRS_2]],
   // CK25-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 3, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 344c388e794af..92ab296a7ed48 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -1507,7 +1507,7 @@ static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
   if (CurrentDim < DimSize) {
     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
       uint64_t CurOffset =
-          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
+          NonContig[CurrentDim].Offset + I * NonContig[CurrentDim].Stride;
       // we only need to transfer the first element for the last dimension
       // since we've already got a contiguous piece.
       if (CurrentDim != DimSize - 1 || I == 0) {
@@ -1578,9 +1578,17 @@ int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
       int32_t DimSize = ArgSizes[I];
-      uint64_t Size =
-          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
+      DP("Non contig descriptor:\n");
+      for (int I = 0; I < DimSize; I++)
+        DP("  Dim %" PRId32 " : Offset %" PRIu64 " Count %" PRIu64
+           " Stride %" PRIu64 "\n",
+           I, NonContig[I].Offset, NonContig[I].Count, NonContig[I].Stride);
       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
+      DP("Merged %" PRId32 " dimensions\n", MergedDim);
+      __tgt_target_non_contig &FirstMergedDim =
+          NonContig[DimSize - MergedDim - 1];
+      uint64_t Size = FirstMergedDim.Count * FirstMergedDim.Stride;
+      DP("Transfer size %" PRIu64 "\n", Size);
       Ret = targetDataNonContiguous(
           Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
diff --git a/offload/test/offloading/non_contiguous_update.cpp b/offload/test/offloading/non_contiguous_update.cpp
index 609f0f967fb17..3973174bf2c5e 100644
--- a/offload/test/offloading/non_contiguous_update.cpp
+++ b/offload/test/offloading/non_contiguous_update.cpp
@@ -7,9 +7,9 @@
 
 // Data structure definitions copied from OpenMP RTL.
 struct __tgt_target_non_contig {
-  int64_t offset;
-  int64_t width;
-  int64_t stride;
+  int64_t Offset;
+  int64_t Count;
+  int64_t Stride;
 };
 
 enum tgt_map_type { OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 };
@@ -18,21 +18,22 @@ enum tgt_map_type { OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 };
 #ifdef __cplusplus
 extern "C" {
 #endif
-void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
-                              void **args_base, void **args, int64_t *arg_sizes,
-                              int64_t *arg_types);
+  void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+                                void **args_base, void **args, int64_t *arg_sizes,
+                                int64_t *arg_types);
 #ifdef __cplusplus
 }
 #endif
 
 int main() {
+  {
   // case 1
   // int arr[3][4][5][6];
   // #pragma omp target update to(arr[0:2][1:3][1:2][:])
   // set up descriptor
   __tgt_target_non_contig non_contig[5] = {
-      {0, 2, 480}, {1, 3, 120}, {1, 2, 24}, {0, 6, 4}, {0, 1, 4}};
-  int64_t size = 4, type = OMP_TGT_MAPTYPE_NON_CONTIG;
+    {0, 2, 480}, {120, 3, 120}, {24, 2, 24}, {0, 6, 4}, {0, 4, 1}};
+  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
   void *base;
   void *begin = &non_contig;
@@ -40,9 +41,9 @@ int main() {
   int64_t *types = &type;
 
   // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the innermost dimension
-  // (unit size = 24) since the stride * count of last dimension is equal to the
-  // stride of second last dimension.
+  // optimization. Note that each element represent the merged innermost
+  // dimension (unit size = 24) since the stride * count of last dimension is
+  // equal to the stride of second last dimension.
   //
   // OOOOO OOOOO OOOOO
   // OXXOO OXXOO OOOOO
@@ -50,44 +51,78 @@ int main() {
   // OXXOO OXXOO OOOOO
   __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
                            sizes, types);
-  // DEBUG: offset 144
-  // DEBUG: offset 264
-  // DEBUG: offset 384
-  // DEBUG: offset 624
-  // DEBUG: offset 744
-  // DEBUG: offset 864
+  // DEBUG: offset 144 len 48
+  // DEBUG: offset 264 len 48
+  // DEBUG: offset 384 len 48
+  // DEBUG: offset 624 len 48
+  // DEBUG: offset 744 len 48
+  // DEBUG: offset 864 len 48
+  }
 
+  {
   // case 2
   // double darr[3][4][5];
   // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
   // set up descriptor
-  __tgt_target_non_contig non_contig_2[4] = {
-      {0, 2, 320}, {2, 2, 40}, {0, 2, 16}, {0, 1, 8}};
-  int64_t size_2 = 4, type_2 = OMP_TGT_MAPTYPE_NON_CONTIG;
+  __tgt_target_non_contig non_contig[4] = {
+    {0, 2, 320}, {80, 2, 40}, {0, 2, 16}, {0, 8, 1}};
+  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
-  void *base_2;
-  void *begin_2 = &non_contig_2;
-  int64_t *sizes_2 = &size_2;
-  int64_t *types_2 = &type_2;
+  void *base;
+  void *begin = &non_contig;
+  int64_t *sizes = &size;
+  int64_t *types = &type;
 
   // The below diagram is the visualization of the non-contiguous transfer after
   // optimization. Note that each element represent the innermost dimension
-  // (unit size = 24) since the stride * count of last dimension is equal to the
-  // stride of second last dimension.
+  // (unit size = 8).
   //
   // OOOOO OOOOO OOOOO
   // OOOOO OOOOO OOOOO
   // XOXOO OOOOO XOXOO
   // XOXOO OOOOO XOXOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base_2, &begin_2,
-                           sizes_2, types_2);
-  // DEBUG: offset 80
-  // DEBUG: offset 96
-  // DEBUG: offset 120
-  // DEBUG: offset 136
-  // DEBUG: offset 400
-  // DEBUG: offset 416
-  // DEBUG: offset 440
-  // DEBUG: offset 456
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                           sizes, types);
+  // DEBUG: offset 80 len 8
+  // DEBUG: offset 96 len 8
+  // DEBUG: offset 120 len 8
+  // DEBUG: offset 136 len 8
+  // DEBUG: offset 400 len 8
+  // DEBUG: offset 416 len 8
+  // DEBUG: offset 440 len 8
+  // DEBUG: offset 456 len 8
+  }
+
+  {
+  // case 3
+  // int darr[6][6];
+  // #pragma omp target update to(darr[1:2:2][2:3])
+  // set up descriptor
+  __tgt_target_non_contig non_contig[3] = {
+    {24, 2, 48}, {8, 3, 4}, {0, 4, 1}};
+  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
+
+  void *base;
+  void *begin = &non_contig;
+  int64_t *sizes = &size;
+  int64_t *types = &type;
+
+  // The below diagram is the visualization of the non-contiguous transfer after
+  // optimization. Note that each element represent the merged innermost
+  // dimension (unit size = 12).
+  //
+  // OOOOOO
+  // OOXXXO
+  // OOOOOO
+  // OOXXXO
+  // OOOOOO
+  // OOOOOO
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                           sizes, types);
+  // DEBUG: offset 24 len 12
+  // DEBUG: offset 72 len 12
+
+  }
+
   return 0;
 }

>From 79939604f9b531bd60bffb3d29036fd131b42750 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Wed, 25 Mar 2026 03:27:26 -0700
Subject: [PATCH 3/7] Switch to new error reporting

---
 offload/libomptarget/omptarget.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 92ab296a7ed48..91ef12285e097 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -1578,17 +1578,17 @@ int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
       int32_t DimSize = ArgSizes[I];
-      DP("Non contig descriptor:\n");
+      ODBG(ODT_DataTransfer) << "Non contig descriptor:";
       for (int I = 0; I < DimSize; I++)
-        DP("  Dim %" PRId32 " : Offset %" PRIu64 " Count %" PRIu64
-           " Stride %" PRIu64 "\n",
-           I, NonContig[I].Offset, NonContig[I].Count, NonContig[I].Stride);
+        ODBG(ODT_DataTransfer)
+            << "  Dim " << I << ": Offset " << NonContig[I].Offset << " Count "
+            << NonContig[I].Count << " Stride " << NonContig[I].Stride;
       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
-      DP("Merged %" PRId32 " dimensions\n", MergedDim);
+      ODBG(ODT_DataTransfer) << "Merged " << MergedDim << " dimensions";
       __tgt_target_non_contig &FirstMergedDim =
           NonContig[DimSize - MergedDim - 1];
       uint64_t Size = FirstMergedDim.Count * FirstMergedDim.Stride;
-      DP("Transfer size %" PRIu64 "\n", Size);
+      ODBG(ODT_DataTransfer) << "Transfer size " << Size;
       Ret = targetDataNonContiguous(
           Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);

>From 6d7a302c510fd23b95962afedf55d28444f3c4dd Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Wed, 25 Mar 2026 05:23:47 -0700
Subject: [PATCH 4/7] Fix test

---
 .../test/offloading/non_contiguous_update.cpp | 178 +++++++++---------
 1 file changed, 92 insertions(+), 86 deletions(-)

diff --git a/offload/test/offloading/non_contiguous_update.cpp b/offload/test/offloading/non_contiguous_update.cpp
index 3973174bf2c5e..011e25a4a3b59 100644
--- a/offload/test/offloading/non_contiguous_update.cpp
+++ b/offload/test/offloading/non_contiguous_update.cpp
@@ -18,110 +18,116 @@ enum tgt_map_type { OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 };
 #ifdef __cplusplus
 extern "C" {
 #endif
-  void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
-                                void **args_base, void **args, int64_t *arg_sizes,
-                                int64_t *arg_types);
+void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+                              void **args_base, void **args, int64_t *arg_sizes,
+                              int64_t *arg_types);
 #ifdef __cplusplus
 }
 #endif
 
 int main() {
   {
-  // case 1
-  // int arr[3][4][5][6];
-  // #pragma omp target update to(arr[0:2][1:3][1:2][:])
-  // set up descriptor
-  __tgt_target_non_contig non_contig[5] = {
-    {0, 2, 480}, {120, 3, 120}, {24, 2, 24}, {0, 6, 4}, {0, 4, 1}};
-  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
+    // case 1
+    // int32_t arr[3][4][5][6];
+    // #pragma omp target update to(arr[0:2][1:3][1:2][:])
+    // set up descriptor
+    __tgt_target_non_contig non_contig[5] = {{0, 2, 4 * 5 * 6 * 4},
+                                             {1 * 5 * 6 * 4, 3, 5 * 6 * 4},
+                                             {6 * 4, 2, 6 * 4},
+                                             {0, 6, 4},
+                                             {0, 4, 1}};
+    int64_t size = sizeof(non_contig) / sizeof(non_contig[0]),
+            type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
-  void *base;
-  void *begin = &non_contig;
-  int64_t *sizes = &size;
-  int64_t *types = &type;
+    void *base;
+    void *begin = &non_contig;
+    int64_t *sizes = &size;
+    int64_t *types = &type;
 
-  // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the merged innermost
-  // dimension (unit size = 24) since the stride * count of last dimension is
-  // equal to the stride of second last dimension.
-  //
-  // OOOOO OOOOO OOOOO
-  // OXXOO OXXOO OOOOO
-  // OXXOO OXXOO OOOOO
-  // OXXOO OXXOO OOOOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
-                           sizes, types);
-  // DEBUG: offset 144 len 48
-  // DEBUG: offset 264 len 48
-  // DEBUG: offset 384 len 48
-  // DEBUG: offset 624 len 48
-  // DEBUG: offset 744 len 48
-  // DEBUG: offset 864 len 48
+    // The below diagram is the visualization of the non-contiguous transfer
+    // after optimization. Note that each element represent the merged innermost
+    // dimension (unit size = 24) since the stride * count of last dimension is
+    // equal to the stride of second last dimension.
+    //
+    // OOOOO OOOOO OOOOO
+    // OXXOO OXXOO OOOOO
+    // OXXOO OXXOO OOOOO
+    // OXXOO OXXOO OOOOO
+    __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                             sizes, types);
+    // DEBUG: offset 144 len 48
+    // DEBUG: offset 264 len 48
+    // DEBUG: offset 384 len 48
+    // DEBUG: offset 624 len 48
+    // DEBUG: offset 744 len 48
+    // DEBUG: offset 864 len 48
   }
 
   {
-  // case 2
-  // double darr[3][4][5];
-  // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
-  // set up descriptor
-  __tgt_target_non_contig non_contig[4] = {
-    {0, 2, 320}, {80, 2, 40}, {0, 2, 16}, {0, 8, 1}};
-  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
+    // case 2
+    // int64_t darr[3][4][5];
+    // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
+    // set up descriptor
+    __tgt_target_non_contig non_contig[4] = {
+        {0, 2, 2 * 4 * 5 * 8}, {2 * 5 * 8, 2, 5 * 8}, {0, 2, 2 * 8}, {0, 8, 1}};
+    int64_t size = sizeof(non_contig) / sizeof(non_contig[0]),
+            type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
-  void *base;
-  void *begin = &non_contig;
-  int64_t *sizes = &size;
-  int64_t *types = &type;
+    void *base;
+    void *begin = &non_contig;
+    int64_t *sizes = &size;
+    int64_t *types = &type;
 
-  // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the innermost dimension
-  // (unit size = 8).
-  //
-  // OOOOO OOOOO OOOOO
-  // OOOOO OOOOO OOOOO
-  // XOXOO OOOOO XOXOO
-  // XOXOO OOOOO XOXOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
-                           sizes, types);
-  // DEBUG: offset 80 len 8
-  // DEBUG: offset 96 len 8
-  // DEBUG: offset 120 len 8
-  // DEBUG: offset 136 len 8
-  // DEBUG: offset 400 len 8
-  // DEBUG: offset 416 len 8
-  // DEBUG: offset 440 len 8
-  // DEBUG: offset 456 len 8
+    // The below diagram is the visualization of the non-contiguous transfer
+    // after optimization. Note that each element represent the innermost
+    // dimension (unit size = 8).
+    //
+    // OOOOO OOOOO OOOOO
+    // OOOOO OOOOO OOOOO
+    // XOXOO OOOOO XOXOO
+    // XOXOO OOOOO XOXOO
+    __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                             sizes, types);
+    // DEBUG: offset 80 len 8
+    // DEBUG: offset 96 len 8
+    // DEBUG: offset 120 len 8
+    // DEBUG: offset 136 len 8
+    // DEBUG: offset 400 len 8
+    // DEBUG: offset 416 len 8
+    // DEBUG: offset 440 len 8
+    // DEBUG: offset 456 len 8
   }
 
   {
-  // case 3
-  // int darr[6][6];
-  // #pragma omp target update to(darr[1:2:2][2:3])
-  // set up descriptor
-  __tgt_target_non_contig non_contig[3] = {
-    {24, 2, 48}, {8, 3, 4}, {0, 4, 1}};
-  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
+    // case 3
+    // int32_t darr[6][6];
+    // #pragma omp target update to(darr[1:2:2][2:3])
+    // set up descriptor
+    __tgt_target_non_contig non_contig[3] = {{1 * 6 * 4 * 1, 2, 2 * 6 * 4 * 1},
+                                             {2 * 4 * 1, 3, 1 * 4 * 1},
+                                             {0, 4, 1}};
+    int64_t size = sizeof(non_contig) / sizeof(non_contig[0]),
+            type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
-  void *base;
-  void *begin = &non_contig;
-  int64_t *sizes = &size;
-  int64_t *types = &type;
-
-  // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the merged innermost
-  // dimension (unit size = 12).
-  //
-  // OOOOOO
-  // OOXXXO
-  // OOOOOO
-  // OOXXXO
-  // OOOOOO
-  // OOOOOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
-                           sizes, types);
-  // DEBUG: offset 24 len 12
-  // DEBUG: offset 72 len 12
+    void *base;
+    void *begin = &non_contig;
+    int64_t *sizes = &size;
+    int64_t *types = &type;
 
+    // The below diagram is the visualization of the non-contiguous transfer
+    // after optimization. Note that each element represent the merged innermost
+    // dimension (unit size = 4).
+    //
+    // OOOOOO
+    // OOXXXO
+    // OOOOOO
+    // OOXXXO
+    // OOOOOO
+    // OOOOOO
+    __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                             sizes, types);
+    // DEBUG: offset 32 len 12
+    // DEBUG: offset 80 len 12
   }
 
   return 0;

>From 078f786831a67d7f99b23032106260b648312656 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Wed, 25 Mar 2026 06:03:30 -0700
Subject: [PATCH 5/7] Fix clang tests

---
 .../test/OpenMP/target_update_count_expression_codegen.c  | 4 ++--
 clang/test/OpenMP/target_update_variable_stride_codegen.c | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/test/OpenMP/target_update_count_expression_codegen.c b/clang/test/OpenMP/target_update_count_expression_codegen.c
index dc1a45a788846..e11f043f3602f 100644
--- a/clang/test/OpenMP/target_update_count_expression_codegen.c
+++ b/clang/test/OpenMP/target_update_count_expression_codegen.c
@@ -59,9 +59,9 @@ void test_contig_byte_size() {
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM]], ptr [[TMP7]], i32 0, i32 0
 // CHECK-NEXT:    store i64 0, ptr [[TMP8]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM]], ptr [[TMP7]], i32 0, i32 1
-// CHECK-NEXT:    store i64 1, ptr [[TMP9]], align 8
+// CHECK-NEXT:    store i64 4, ptr [[TMP9]], align 8
 // CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM]], ptr [[TMP7]], i32 0, i32 2
-// CHECK-NEXT:    store i64 4, ptr [[TMP10]], align 8
+// CHECK-NEXT:    store i64 1, ptr [[TMP10]], align 8
 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    store ptr [[DIMS]], ptr [[TMP11]], align 8
 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
diff --git a/clang/test/OpenMP/target_update_variable_stride_codegen.c b/clang/test/OpenMP/target_update_variable_stride_codegen.c
index afc5459787c2e..5d4a582bf2090 100644
--- a/clang/test/OpenMP/target_update_variable_stride_codegen.c
+++ b/clang/test/OpenMP/target_update_variable_stride_codegen.c
@@ -67,9 +67,9 @@ void test_constant_stride_one() {
 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM]], ptr [[TMP10]], i32 0, i32 0
 // CHECK-NEXT:    store i64 0, ptr [[TMP11]], align 8
 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM]], ptr [[TMP10]], i32 0, i32 1
-// CHECK-NEXT:    store i64 1, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store i64 4, ptr [[TMP12]], align 8
 // CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM]], ptr [[TMP10]], i32 0, i32 2
-// CHECK-NEXT:    store i64 4, ptr [[TMP13]], align 8
+// CHECK-NEXT:    store i64 1, ptr [[TMP13]], align 8
 // CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    store ptr [[DIMS]], ptr [[TMP14]], align 8
 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
@@ -105,9 +105,9 @@ void test_constant_stride_one() {
 // CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM_0]], ptr [[TMP10]], i32 0, i32 0
 // CHECK-NEXT:    store i64 0, ptr [[TMP11]], align 8
 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM_0]], ptr [[TMP10]], i32 0, i32 1
-// CHECK-NEXT:    store i64 1, ptr [[TMP12]], align 8
+// CHECK-NEXT:    store i64 4, ptr [[TMP12]], align 8
 // CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR_DIM_0]], ptr [[TMP10]], i32 0, i32 2
-// CHECK-NEXT:    store i64 4, ptr [[TMP13]], align 8
+// CHECK-NEXT:    store i64 1, ptr [[TMP13]], align 8
 // CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    store ptr [[DIMS]], ptr [[TMP14]], align 8
 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0

>From cc74561e200a72e891e71695870f6d6b96fa716c Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Wed, 25 Mar 2026 11:59:46 -0700
Subject: [PATCH 6/7] More debug printing

---
 offload/libomptarget/omptarget.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 91ef12285e097..5fe8dd705b5c5 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -1589,6 +1589,7 @@ int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
           NonContig[DimSize - MergedDim - 1];
       uint64_t Size = FirstMergedDim.Count * FirstMergedDim.Stride;
       ODBG(ODT_DataTransfer) << "Transfer size " << Size;
+      ODBG(ODT_DataTransfer) << "Base Ptr " << ArgsBase[I];
       Ret = targetDataNonContiguous(
           Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);

>From afcc6cbbe75fe6271defc92f6664c992793cee2d Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Thu, 26 Mar 2026 04:36:46 -0700
Subject: [PATCH 7/7] Fix tests

---
 .../strided_offset_multidim_update.c          | 95 +++++++++++++++++++
 .../strided_update_variable_stride_misc.c     | 28 +++---
 2 files changed, 109 insertions(+), 14 deletions(-)
 create mode 100644 offload/test/offloading/strided_offset_multidim_update.c

diff --git a/offload/test/offloading/strided_offset_multidim_update.c b/offload/test/offloading/strided_offset_multidim_update.c
new file mode 100644
index 0000000000000..bc0b45cc5699b
--- /dev/null
+++ b/offload/test/offloading/strided_offset_multidim_update.c
@@ -0,0 +1,95 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// XFAIL: intelgpu
+
+// Make sure multi-dimensional strided offset update works correctly.
+
+#include <stdio.h>
+
+#define N 6
+#define SLICE1 1 : 2 : 2
+#define SLICE2 2 : 3
+int main() {
+  int darr[N][N];
+
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      darr[i][j] = 100 + 10 * i + j;
+
+  printf("Full array\n");
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      printf("\t%d", darr[i][j]);
+    }
+    printf("\n");
+  }
+
+#pragma omp target enter data map(alloc : darr[0 : N][0 : N])
+
+  // Zero out target array.
+#pragma omp target
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      darr[i][j] = 0;
+
+  // Only copy over the slice to the device.
+#pragma omp target update to(darr[SLICE1][SLICE2])
+  // Then copy over the entire array to the host.
+#pragma omp target exit data map(from : darr[0 : N][0 : N])
+
+  printf("Only slice (to)\n");
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      printf("\t%d", darr[i][j]);
+    }
+    printf("\n");
+  }
+
+#pragma omp target enter data map(alloc : darr[0 : N][0 : N])
+
+  // Initialize on the device
+#pragma omp target
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      darr[i][j] = 100 + 10 * i + j;
+
+  // Zero out host array.
+  for (int i = 0; i < N; i++)
+    for (int j = 0; j < N; j++)
+      darr[i][j] = 0;
+
+  // Copy over only the slice to the host
+#pragma omp target update from(darr[SLICE1][SLICE2])
+#pragma omp target exit data map(delete : darr[0 : N][0 : N])
+
+  printf("Only slice (from)\n");
+  for (int i = 0; i < N; i++) {
+    for (int j = 0; j < N; j++) {
+      printf("\t%d", darr[i][j]);
+    }
+    printf("\n");
+  }
+
+  return 0;
+}
+
+//      CHECK: Full array
+// CHECK-NEXT:         100     101     102     103     104     105
+// CHECK-NEXT:         110     111     112     113     114     115
+// CHECK-NEXT:         120     121     122     123     124     125
+// CHECK-NEXT:         130     131     132     133     134     135
+// CHECK-NEXT:         140     141     142     143     144     145
+// CHECK-NEXT:         150     151     152     153     154     155
+// CHECK-NEXT: Only slice (to)
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT:         0       0       112     113     114     0
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT:         0       0       132     133     134     0
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT: Only slice (from)
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT:         0       0       112     113     114     0
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT:         0       0       132     133     134     0
+// CHECK-NEXT:         0       0       0       0       0       0
+// CHECK-NEXT:         0       0       0       0       0       0
diff --git a/offload/test/offloading/strided_update_variable_stride_misc.c b/offload/test/offloading/strided_update_variable_stride_misc.c
index 1efabb20db29b..d5579fe48887e 100644
--- a/offload/test/offloading/strided_update_variable_stride_misc.c
+++ b/offload/test/offloading/strided_update_variable_stride_misc.c
@@ -12,7 +12,7 @@ void test_1_variable_stride_one() {
 
   // Initialize data on host
   for (int i = 0; i < 10; i++) {
-    data1[i] = i;
+    data1[i] = i + 1;
   }
 
 #pragma omp target data map(to : stride_one, data1[0 : 10])
@@ -20,7 +20,7 @@ void test_1_variable_stride_one() {
 #pragma omp target
     {
       for (int i = 0; i < 10; i++) {
-        data1[i] += i;
+        data1[i] += i + 1;
       }
     }
 
@@ -38,7 +38,7 @@ void test_2_variable_stride_large() {
 
   // Initialize data on host
   for (int i = 0; i < 10; i++) {
-    data2[i] = i;
+    data2[i] = i + 1;
   }
 
 #pragma omp target data map(to : stride_large, data2[0 : 10])
@@ -46,7 +46,7 @@ void test_2_variable_stride_large() {
 #pragma omp target
     {
       for (int i = 0; i < 10; i++) {
-        data2[i] += i;
+        data2[i] += i + 1;
       }
     }
 
@@ -65,25 +65,25 @@ int main() {
 }
 
 // CHECK: Test 1: Variable stride = 1
-// CHECK-NEXT: 0.000000
-// CHECK-NEXT: 1.000000
 // CHECK-NEXT: 2.000000
-// CHECK-NEXT: 3.000000
 // CHECK-NEXT: 4.000000
-// CHECK-NEXT: 5.000000
 // CHECK-NEXT: 6.000000
-// CHECK-NEXT: 7.000000
 // CHECK-NEXT: 8.000000
-// CHECK-NEXT: 9.000000
+// CHECK-NEXT: 10.000000
+// CHECK-NEXT: 12.000000
+// CHECK-NEXT: 14.000000
+// CHECK-NEXT: 16.000000
+// CHECK-NEXT: 18.000000
+// CHECK-NEXT: 20.000000
 
 // CHECK: Test 2: Variable stride = 5
-// CHECK-NEXT: 0.000000
-// CHECK-NEXT: 1.000000
+// CHECK-NEXT: 2.000000
 // CHECK-NEXT: 2.000000
 // CHECK-NEXT: 3.000000
 // CHECK-NEXT: 4.000000
-// CHECK-NEXT: 10.000000
-// CHECK-NEXT: 6.000000
+// CHECK-NEXT: 5.000000
+// CHECK-NEXT: 12.000000
 // CHECK-NEXT: 7.000000
 // CHECK-NEXT: 8.000000
 // CHECK-NEXT: 9.000000
+// CHECK-NEXT: 10.000000