[clang] [llvm] [OpenMP] Fix non-contiguous array omp target update (PR #156889)

Thu Sep 4 07:04:19 PDT 2025

https://github.com/ivanradanov created https://github.com/llvm/llvm-project/pull/156889

The existing implementation has three issues which this patch addresses.

1. The last dimension, which represents the bytes in the type, has the wrong stride and count. For example, for a 4-byte int, it was emitted with count=1 and stride=4. The correct representation is count=4 and stride=1, because there are 4 bytes (count=4) that we need to copy and we do not skip any bytes (stride=1).

2. The size of the data copy was computed using the last dimension. However, this is incorrect in cases where some of the final dimensions get merged into one. In this case we need to take the combined size of the merged dimensions, which is (Count * Stride) of the first merged dimension.

3. The Offset into a dimension was computed as a multiple of its Stride. However, this Stride, which is in bytes, already includes the stride multiplier given by the user. This means that when the user specified 1:3:2, i.e. elements 1, 3, 5, the runtime incorrectly copied elements 2, 4, 6. Fix this by precomputing the Offset in bytes at compile time, multiplying the offset by the stride of the dimension without the user-specified multiplier.

>From 8bbf42dfef75353ac50f40cc366e29344a841e27 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Fri, 22 Aug 2025 11:50:57 -0700
Subject: [PATCH] [OpenMP] Fix non-contiguous array omp target update

The existing implementation has three issues which this patch addresses.

1. The last dimension, which represents the bytes in the type, has the wrong
stride and count. For example, for a 4-byte int, count=1 and stride=4. The
correct representation here is count=4 and stride=1 because there are 4 bytes
(count=4) that we need to copy and we do not skip any bytes (stride=1).

2. The size of the data copy was computed using the last dimension. However,
this is incorrect in cases where some of the final dimensions get merged into
one. In this case we need to take the combined size of the merged dimensions,
which is (Count * Stride) of the first merged dimension.

3. The Offset into a dimension was computed as a multiple of its Stride.
However, this Stride, which is in bytes, already includes the stride multiplier
given by the user. This means that when the user specified 1:3:2, i.e. elements
1, 3, 5, the runtime incorrectly copied elements 2, 4, 6. Fix this by
precomputing at compile time the Offset to be in bytes, by correctly
multiplying the offset by the stride of the dimension without the
user-specified multiplier.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |  20 ++--
 clang/test/OpenMP/target_update_codegen.cpp   |  43 +++----
 offload/libomptarget/omptarget.cpp            |  14 ++-
 .../test/offloading/non_contiguous_update.cpp | 107 ++++++++++++------
 4 files changed, 116 insertions(+), 68 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index f98339d472fa9..691dd62bf549a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7874,8 +7874,8 @@ class MappableExprsHandler {
     // For supporting stride in array section, we need to initialize the first
     // dimension size as 1, first offset as 0, and first count as 1
     MapValuesArrayTy CurOffsets = {llvm::ConstantInt::get(CGF.CGM.Int64Ty, 0)};
-    MapValuesArrayTy CurCounts = {llvm::ConstantInt::get(CGF.CGM.Int64Ty, 1)};
-    MapValuesArrayTy CurStrides;
+    MapValuesArrayTy CurCounts;
+    MapValuesArrayTy CurStrides = {llvm::ConstantInt::get(CGF.CGM.Int64Ty, 1)};
     MapValuesArrayTy DimSizes{llvm::ConstantInt::get(CGF.CGM.Int64Ty, 1)};
     uint64_t ElementTypeSize;
 
@@ -7899,8 +7899,8 @@ class MappableExprsHandler {
              "Should be either ConstantArray or VariableArray if not the "
              "first Component");
 
-      // Get element size if CurStrides is empty.
-      if (CurStrides.empty()) {
+      // Get element size if CurCounts is empty.
+      if (CurCounts.empty()) {
         const Type *ElementType = nullptr;
         if (CAT)
           ElementType = CAT->getElementType().getTypePtr();
@@ -7920,7 +7920,7 @@ class MappableExprsHandler {
             ElementType = ElementType->getPointeeOrArrayElementType();
           ElementTypeSize =
               Context.getTypeSizeInChars(ElementType).getQuantity();
-          CurStrides.push_back(
+          CurCounts.push_back(
               llvm::ConstantInt::get(CGF.Int64Ty, ElementTypeSize));
         }
       }
@@ -7980,7 +7980,6 @@ class MappableExprsHandler {
                                            CGF.Int64Ty,
                                            /*isSigned=*/false);
       }
-      CurOffsets.push_back(Offset);
 
       // Count
       const Expr *CountExpr = OASE->getLength();
@@ -8017,11 +8016,12 @@ class MappableExprsHandler {
       CurCounts.push_back(Count);
 
       // Stride_n' = Stride_n * (D_0 * D_1 ... * D_n-1) * Unit size
+      // Offset_n' = Offset_n * (D_0 * D_1 ... * D_n-1) * Unit size
       // Take `int arr[5][5][5]` and `arr[0:2:2][1:2:1][0:2:2]` as an example:
       //              Offset      Count     Stride
-      //    D0          0           1         4    (int)    <- dummy dimension
+      //    D0          0           4         1    (int)    <- dummy dimension
       //    D1          0           2         8    (2 * (1) * 4)
-      //    D2          1           2         20   (1 * (1 * 5) * 4)
+      //    D2          20          2         20   (1 * (1 * 5) * 4)
       //    D3          0           2         200  (2 * (1 * 5 * 4) * 4)
       const Expr *StrideExpr = OASE->getStride();
       llvm::Value *Stride =
@@ -8034,6 +8034,10 @@ class MappableExprsHandler {
         CurStrides.push_back(CGF.Builder.CreateNUWMul(DimProd, Stride));
       else
         CurStrides.push_back(DimProd);
+
+      Offset = CGF.Builder.CreateNUWMul(DimProd, Offset);
+      CurOffsets.push_back(Offset);
+
       if (DI != DimSizes.end())
         ++DI;
     }
diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp
index c8211f475c7fc..648ad58787660 100644
--- a/clang/test/OpenMP/target_update_codegen.cpp
+++ b/clang/test/OpenMP/target_update_codegen.cpp
@@ -1134,7 +1134,7 @@ void foo(int arg) {
   // CK20: store i64 {{32|64}}, ptr [[STRIDE]],
   // CK20: [[DIM_2:%.+]] = getelementptr inbounds [3 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
   // CK20: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-  // CK20: store i64 1, ptr [[OFFSET_2]],
+  // CK20: store i64 {{8|16}}, ptr [[OFFSET_2]],
   // CK20: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
   // CK20: store i64 4, ptr [[COUNT_2]],
   // CK20: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1143,9 +1143,9 @@ void foo(int arg) {
   // CK20: [[OFFSET_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 0
   // CK20: store i64 0, ptr [[OFFSET_3]],
   // CK20: [[COUNT_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 1
-  // CK20: store i64 1, ptr [[COUNT_3]],
+  // CK20: store i64 {{8|16}}, ptr [[COUNT_3]],
   // CK20: [[STRIDE_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 2
-  // CK20: store i64 {{8|16}}, ptr [[STRIDE_3]],
+  // CK20: store i64 1, ptr [[STRIDE_3]],
   // CK20-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
   // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
   // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1202,7 +1202,7 @@ struct ST {
     // CK21: store i64 {{400|800}}, ptr [[STRIDE_1]],
     // CK21: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
     // CK21: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-    // CK21: store i64 1, ptr [[OFFSET_2]],
+    // CK21: store i64 {{40|80}}, ptr [[OFFSET_2]],
     // CK21: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
     // CK21: store i64 3, ptr [[COUNT_2]],
     // CK21: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1218,9 +1218,9 @@ struct ST {
     // CK21: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
     // CK21: store i64 0, ptr [[OFFSET_4]],
     // CK21: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-    // CK21: store i64 1, ptr [[COUNT_4]],
+    // CK21: store i64 {{4|8}}, ptr [[COUNT_4]],
     // CK21: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-    // CK21: store i64 {{4|8}}, ptr [[STRIDE_4]],
+    // CK21: store i64 1, ptr [[STRIDE_4]],
     // CK21-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 2, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[GEPSZ:%.+]], ptr [[MTYPE]]{{.+}})
     // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
     // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1276,7 +1276,7 @@ struct ST {
     // CK22: store i64 200, ptr [[STRIDE]],
     // CK22: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
     // CK22: [[OFFSET:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-    // CK22: store i64 1, ptr [[OFFSET]],
+    // CK22: store i64 40, ptr [[OFFSET]],
     // CK22: [[COUNT:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
     // CK22: store i64 3, ptr [[COUNT]],
     // CK22: [[STRIDE:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1292,9 +1292,9 @@ struct ST {
     // CK22: [[OFFSET:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
     // CK22: store i64 0, ptr [[OFFSET]],
     // CK22: [[COUNT:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-    // CK22: store i64 1, ptr [[COUNT]],
+    // CK22: store i64 4, ptr [[COUNT]],
     // CK22: [[STRIDE:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-    // CK22: store i64 4, ptr [[STRIDE]],
+    // CK22: store i64 1, ptr [[STRIDE]],
     // CK22-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
     // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
     // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1357,7 +1357,7 @@ void foo(int arg) {
   // CK23: store i64 200, ptr [[STRIDE]],
   // CK23: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
   // CK23: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-  // CK23: store i64 1, ptr [[OFFSET_2]],
+  // CK23: store i64 20, ptr [[OFFSET_2]],
   // CK23: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
   // CK23: store i64 2, ptr [[COUNT_2]],
   // CK23: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1373,9 +1373,9 @@ void foo(int arg) {
   // CK23: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
   // CK23: store i64 0, ptr [[OFFSET_4]],
   // CK23: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-  // CK23: store i64 1, ptr [[COUNT_4]],
+  // CK23: store i64 4, ptr [[COUNT_4]],
   // CK23: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-  // CK23: store i64 4, ptr [[STRIDE_4]],
+  // CK23: store i64 1, ptr [[STRIDE_4]],
   // CK23-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
   // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
   // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1419,6 +1419,7 @@ void foo(int arg) {
   // CK24: [[MUL:%.+]] = mul nuw i64 8,
   // CK24: [[SUB:%.+]] = sub nuw i64 4, [[ARG:%.+]]
   // CK24: [[LEN:%.+]] = udiv {{.+}} [[SUB]], 1
+  // CK24: [[MUL_ARG:%.+]] = mul nuw i64 40, [[ARG]]
   // CK24: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0
   // CK24: store ptr [[ARR]], ptr [[BP0]],
   // CK24: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], {{.+}} 0, {{.+}} 0
@@ -1432,7 +1433,7 @@ void foo(int arg) {
   // CK24: store i64 320, ptr [[STRIDE]],
   // CK24: [[DIM_2:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 1
   // CK24: [[OFFSET_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 0
-  // CK24: store i64 [[ARG]], ptr [[OFFSET_2]],
+  // CK24: store i64 [[MUL_ARG]], ptr [[OFFSET_2]],
   // CK24: [[COUNT_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 1
   // CK24: store i64 [[LEN]], ptr [[COUNT_2]],
   // CK24: [[STRIDE_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_2]], {{.+}} 0, {{.+}} 2
@@ -1448,9 +1449,9 @@ void foo(int arg) {
   // CK24: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
   // CK24: store i64 0, ptr [[OFFSET_4]],
   // CK24: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-  // CK24: store i64 1, ptr [[COUNT_4]],
+  // CK24: store i64 8, ptr [[COUNT_4]],
   // CK24: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-  // CK24: store i64 8, ptr [[STRIDE_4]],
+  // CK24: store i64 1, ptr [[STRIDE_4]],
   // CK24-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 1, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
   // CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
   // CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
@@ -1515,7 +1516,7 @@ void foo(int arg) {
   // CK25: store i64 20, ptr [[STRIDE_2]],
   // CK25: [[DIM_3:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 2
   // CK25: [[OFFSET_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 0
-  // CK25: store i64 1, ptr [[OFFSET_3]],
+  // CK25: store i64 4, ptr [[OFFSET_3]],
   // CK25: [[COUNT_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 1
   // CK25: store i64 4, ptr [[COUNT_3]],
   // CK25: [[STRIDE_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_3]], {{.+}} 0, {{.+}} 2
@@ -1524,9 +1525,9 @@ void foo(int arg) {
   // CK25: [[OFFSET_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 0
   // CK25: store i64 0, ptr [[OFFSET_4]],
   // CK25: [[COUNT_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 1
-  // CK25: store i64 1, ptr [[COUNT_4]],
+  // CK25: store i64 4, ptr [[COUNT_4]],
   // CK25: [[STRIDE_4:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_4]], {{.+}} 0, {{.+}} 2
-  // CK25: store i64 4, ptr [[STRIDE_4]],
+  // CK25: store i64 1, ptr [[STRIDE_4]],
   // CK25: [[PTRS:%.+]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 0
   // CK25: store ptr [[DIMS]], ptr [[PTRS]],
   // CK25: [[DIM_5:%.+]] = getelementptr inbounds [3 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS_2]], {{.+}} 0, {{.+}} 0
@@ -1538,7 +1539,7 @@ void foo(int arg) {
   // CK25: store i64 12, ptr [[STRIDE_2_1]],
   // CK25: [[DIM_6:%.+]] = getelementptr inbounds [3 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS_2]], {{.+}} 0, {{.+}} 1
   // CK25: [[OFFSET_2_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_6]], {{.+}} 0, {{.+}} 0
-  // CK25: store i64 1, ptr [[OFFSET_2_2]],
+  // CK25: store i64 4, ptr [[OFFSET_2_2]],
   // CK25: [[COUNT_2_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_6]], {{.+}} 0, {{.+}} 1
   // CK25: store i64 2, ptr [[COUNT_2_2]],
   // CK25: [[STRIDE_2_2:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_6]], {{.+}} 0, {{.+}} 2
@@ -1547,9 +1548,9 @@ void foo(int arg) {
   // CK25: [[OFFSET_2_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_7]], {{.+}} 0, {{.+}} 0
   // CK25: store i64 0, ptr [[OFFSET_2_3]],
   // CK25: [[COUNT_2_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_7]], {{.+}} 0, {{.+}} 1
-  // CK25: store i64 1, ptr [[COUNT_2_3]],
+  // CK25: store i64 4, ptr [[COUNT_2_3]],
   // CK25: [[STRIDE_2_3:%.+]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DIM_7]], {{.+}} 0, {{.+}} 2
-  // CK25: store i64 4, ptr [[STRIDE_2_3]],
+  // CK25: store i64 1, ptr [[STRIDE_2_3]],
   // CK25: [[PTRS_2:%.+]] = getelementptr inbounds [3 x ptr], ptr %.offload_ptrs, i32 0, i32 2
   // CK25: store ptr [[DIMS_2]], ptr [[PTRS_2]],
   // CK25-DAG: call void @__tgt_target_data_update_mapper(ptr @{{.+}}, i64 -1, i32 3, ptr [[GEPBP:%.+]], ptr [[GEPP:%.+]], ptr [[MSIZE]], ptr [[MTYPE]]{{.+}})
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 32e89cc75efc9..f2e01993938b0 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -1221,7 +1221,7 @@ static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
   if (CurrentDim < DimSize) {
     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
       uint64_t CurOffset =
-          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
+          NonContig[CurrentDim].Offset + I * NonContig[CurrentDim].Stride;
       // we only need to transfer the first element for the last dimension
       // since we've already got a contiguous piece.
       if (CurrentDim != DimSize - 1 || I == 0) {
@@ -1293,9 +1293,17 @@ int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
       int32_t DimSize = ArgSizes[I];
-      uint64_t Size =
-          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
+      DP("Non contig descriptor:\n");
+      for (int I = 0; I < DimSize; I++)
+        DP("  Dim %" PRId32 " : Offset %" PRIu64 " Count %" PRIu64
+           " Stride %" PRIu64 "\n",
+           I, NonContig[I].Offset, NonContig[I].Count, NonContig[I].Stride);
       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
+      DP("Merged %" PRId32 " dimensions\n", MergedDim);
+      __tgt_target_non_contig &FirstMergedDim =
+          NonContig[DimSize - MergedDim - 1];
+      uint64_t Size = FirstMergedDim.Count * FirstMergedDim.Stride;
+      DP("Transfer size %" PRIu64 "\n", Size);
       Ret = targetDataNonContiguous(
           Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
diff --git a/offload/test/offloading/non_contiguous_update.cpp b/offload/test/offloading/non_contiguous_update.cpp
index 609f0f967fb17..3973174bf2c5e 100644
--- a/offload/test/offloading/non_contiguous_update.cpp
+++ b/offload/test/offloading/non_contiguous_update.cpp
@@ -7,9 +7,9 @@
 
 // Data structure definitions copied from OpenMP RTL.
 struct __tgt_target_non_contig {
-  int64_t offset;
-  int64_t width;
-  int64_t stride;
+  int64_t Offset;
+  int64_t Count;
+  int64_t Stride;
 };
 
 enum tgt_map_type { OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 };
@@ -18,21 +18,22 @@ enum tgt_map_type { OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000 };
 #ifdef __cplusplus
 extern "C" {
 #endif
-void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
-                              void **args_base, void **args, int64_t *arg_sizes,
-                              int64_t *arg_types);
+  void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+                                void **args_base, void **args, int64_t *arg_sizes,
+                                int64_t *arg_types);
 #ifdef __cplusplus
 }
 #endif
 
 int main() {
+  {
   // case 1
   // int arr[3][4][5][6];
   // #pragma omp target update to(arr[0:2][1:3][1:2][:])
   // set up descriptor
   __tgt_target_non_contig non_contig[5] = {
-      {0, 2, 480}, {1, 3, 120}, {1, 2, 24}, {0, 6, 4}, {0, 1, 4}};
-  int64_t size = 4, type = OMP_TGT_MAPTYPE_NON_CONTIG;
+    {0, 2, 480}, {120, 3, 120}, {24, 2, 24}, {0, 6, 4}, {0, 4, 1}};
+  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
   void *base;
   void *begin = &non_contig;
@@ -40,9 +41,9 @@ int main() {
   int64_t *types = &type;
 
   // The below diagram is the visualization of the non-contiguous transfer after
-  // optimization. Note that each element represent the innermost dimension
-  // (unit size = 24) since the stride * count of last dimension is equal to the
-  // stride of second last dimension.
+  // optimization. Note that each element represents the merged innermost
+  // dimension (unit size = 24) since the stride * count of last dimension is
+  // equal to the stride of second last dimension.
   //
   // OOOOO OOOOO OOOOO
   // OXXOO OXXOO OOOOO
@@ -50,44 +51,78 @@ int main() {
   // OXXOO OXXOO OOOOO
   __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
                            sizes, types);
-  // DEBUG: offset 144
-  // DEBUG: offset 264
-  // DEBUG: offset 384
-  // DEBUG: offset 624
-  // DEBUG: offset 744
-  // DEBUG: offset 864
+  // DEBUG: offset 144 len 48
+  // DEBUG: offset 264 len 48
+  // DEBUG: offset 384 len 48
+  // DEBUG: offset 624 len 48
+  // DEBUG: offset 744 len 48
+  // DEBUG: offset 864 len 48
+  }
 
+  {
   // case 2
   // double darr[3][4][5];
   // #pragma omp target update to(darr[0:2:2][2:2][:2:2])
   // set up descriptor
-  __tgt_target_non_contig non_contig_2[4] = {
-      {0, 2, 320}, {2, 2, 40}, {0, 2, 16}, {0, 1, 8}};
-  int64_t size_2 = 4, type_2 = OMP_TGT_MAPTYPE_NON_CONTIG;
+  __tgt_target_non_contig non_contig[4] = {
+    {0, 2, 320}, {80, 2, 40}, {0, 2, 16}, {0, 8, 1}};
+  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
 
-  void *base_2;
-  void *begin_2 = &non_contig_2;
-  int64_t *sizes_2 = &size_2;
-  int64_t *types_2 = &type_2;
+  void *base;
+  void *begin = &non_contig;
+  int64_t *sizes = &size;
+  int64_t *types = &type;
 
   // The below diagram is the visualization of the non-contiguous transfer after
   // optimization. Note that each element represent the innermost dimension
-  // (unit size = 24) since the stride * count of last dimension is equal to the
-  // stride of second last dimension.
+  // (unit size = 8).
   //
   // OOOOO OOOOO OOOOO
   // OOOOO OOOOO OOOOO
   // XOXOO OOOOO XOXOO
   // XOXOO OOOOO XOXOO
-  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base_2, &begin_2,
-                           sizes_2, types_2);
-  // DEBUG: offset 80
-  // DEBUG: offset 96
-  // DEBUG: offset 120
-  // DEBUG: offset 136
-  // DEBUG: offset 400
-  // DEBUG: offset 416
-  // DEBUG: offset 440
-  // DEBUG: offset 456
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                           sizes, types);
+  // DEBUG: offset 80 len 8
+  // DEBUG: offset 96 len 8
+  // DEBUG: offset 120 len 8
+  // DEBUG: offset 136 len 8
+  // DEBUG: offset 400 len 8
+  // DEBUG: offset 416 len 8
+  // DEBUG: offset 440 len 8
+  // DEBUG: offset 456 len 8
+  }
+
+  {
+  // case 3
+  // int darr[6][6];
+  // #pragma omp target update to(darr[1:2:2][2:3])
+  // set up descriptor
+  __tgt_target_non_contig non_contig[3] = {
+    {24, 2, 48}, {8, 3, 4}, {0, 4, 1}};
+  int64_t size = sizeof(non_contig) / sizeof(non_contig[0]), type = OMP_TGT_MAPTYPE_NON_CONTIG;
+
+  void *base;
+  void *begin = &non_contig;
+  int64_t *sizes = &size;
+  int64_t *types = &type;
+
+  // The below diagram is the visualization of the non-contiguous transfer after
+  // optimization. Note that each element represents the merged innermost
+  // dimension (unit size = 12).
+  //
+  // OOOOOO
+  // OOXXXO
+  // OOOOOO
+  // OOXXXO
+  // OOOOOO
+  // OOOOOO
+  __tgt_target_data_update(/*device_id*/ -1, /*arg_num*/ 1, &base, &begin,
+                           sizes, types);
+  // DEBUG: offset 24 len 12
+  // DEBUG: offset 72 len 12
+
+  }
+
   return 0;
 }