[clang] [flang] [lldb] [llvm] [mlir] [mlir][tosa][tosa-to-linalg] tosa.cast: fix result mismatch when casting f64/f32 max value to i64/i32 (PR #130116)
Alaa Ali via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 6 22:30:37 PST 2025
https://github.com/alaa-ali updated https://github.com/llvm/llvm-project/pull/130116
From 6a1342bd7eefe9915bc2f99b2329707262a30bb4 Mon Sep 17 00:00:00 2001
From: Alaa Ali <alaaali at ah-alaaali-l.dhcp.mathworks.com>
Date: Thu, 6 Mar 2025 03:36:47 -0500
Subject: [PATCH 01/23] tosa.cast: fix result mismatch when casting f64/f32
 max value to i64/i32
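The root cause: the destination type's signed max is not exactly representable
in the source float type, so casting it to float rounds up to one past the
integer range, and an input equal to that rounded bound survives the
float-side clamp yet still overflows the float-to-int conversion. A minimal
standalone C++ sketch of the f32/i32 case (illustration only, not part of the
patch):
```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // INT32_MAX (2147483647) has no exact f32 representation; the cast
  // rounds up to 2147483648.0f == 2^31, one past the i32 range.
  float intMaxFP = static_cast<float>(INT32_MAX);
  std::printf("%.1f\n", intMaxFP); // prints 2147483648.0
  // A value equal to intMaxFP passes a "min(x, intMaxFP)" clamp
  // unchanged, so the conversion below would overflow: UB in C++,
  // poison for MLIR's arith.fptosi, INT_MIN (0x80000000) on x86.
  // int32_t bad = static_cast<int32_t>(intMaxFP);
  return 0;
}
```
Selecting the destination type's signed min on overflow, as this patch does,
makes the lowering produce that INT_MIN result explicitly instead of relying
on the float-side clamp.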
---
.../Conversion/TosaToLinalg/TosaToLinalg.cpp | 17 ++++++++---------
.../TosaToLinalg/tosa-to-linalg.mlir | 19 ++++++++++---------
2 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 8732ddafa24d4..8085f1104a4cb 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -618,12 +618,8 @@ static Value createLinalgBodyCalculationForElementwiseOp(
loc, rewriter.getIntegerAttr(
getElementTypeOrSelf(dstTy),
APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth())));
- auto intMax = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getIntegerAttr(
- getElementTypeOrSelf(dstTy),
- APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())));
auto maxClamped =
- rewriter.create<arith::SelectOp>(loc, overflow, intMax, conv);
+ rewriter.create<arith::SelectOp>(loc, overflow, intMin, conv);
return rewriter.create<arith::SelectOp>(loc, underflow, intMin,
maxClamped);
}
@@ -647,8 +643,11 @@ static Value createLinalgBodyCalculationForElementwiseOp(
APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())
.getSExtValue()));
+ auto overflow = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
+ Value maxClampedFP = rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
+
Value clamped =
- clampFloatHelper(loc, rounded, intMinFP, intMaxFP, rewriter);
+ clampFloatHelper(loc, maxClampedFP, intMinFP, intMaxFP, rewriter);
return rewriter.create<arith::FPToSIOp>(loc, dstTy, clamped);
}
@@ -664,17 +663,17 @@ static Value createLinalgBodyCalculationForElementwiseOp(
.getSExtValue()) +
1.0f));
- auto intMax = rewriter.create<arith::ConstantOp>(
+ auto intMin = rewriter.create<arith::ConstantOp>(
loc, rewriter.getIntegerAttr(
getElementTypeOrSelf(dstTy),
- APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())));
+ APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth())));
auto minClampedFP =
rewriter.create<arith::MaximumFOp>(loc, rounded, intMinFP);
auto minClamped =
rewriter.create<arith::FPToSIOp>(loc, dstTy, minClampedFP);
auto overflow = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::UGE, rounded, intMaxPlusOneFP);
- return rewriter.create<arith::SelectOp>(loc, overflow, intMax,
+ return rewriter.create<arith::SelectOp>(loc, overflow, intMin,
minClamped);
}
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 6ca260a5324a9..a10053c31a8e6 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -541,13 +541,13 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () {
// CHECK: linalg.generic
// CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f32
- // CHECK: [[CSTMIN:%.+]] = arith.constant -2.14748365E+9 : f32
+ // CHECK: [[CSTMINF:%.+]] = arith.constant -2.14748365E+9 : f32
// CHECK: [[CSTMAXP1:%.+]] = arith.constant 2.14748365E+9 : f32
- // CHECK: [[CSTMAX:%.+]] = arith.constant 2147483647 : i32
- // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMIN]] : f32
+ // CHECK: [[CSTMIN:%.+]] = arith.constant -2147483648 : i32
+ // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMINF]] : f32
// CHECK: [[CONV:%.+]] = arith.fptosi [[MAX]] : f32 to i32
// CHECK: [[CMP:%.+]] = arith.cmpf uge, [[ROUND]], [[CSTMAXP1]] : f32
- // CHECK: arith.select [[CMP]], [[CSTMAX]], [[CONV]] : i32
+ // CHECK: arith.select [[CMP]], [[CSTMIN]], [[CONV]] : i32
%20 = tosa.cast %0 : (tensor<1xf32>) -> tensor<1xi32>
// CHECK: linalg.generic
@@ -591,7 +591,9 @@ func.func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
// CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f16
// CHECK: [[CSTMIN:%.+]] = arith.constant -1.280000e+02 : f16
// CHECK: [[CSTMAX:%.+]] = arith.constant 1.270000e+02 : f16
- // CHECK: [[MIN:%.+]] = arith.minimumf [[ROUND]], [[CSTMAX]] : f16
+ // CHECK: [[OVERFLOW:%.+]] = arith.cmpf ugt, [[ROUND]], [[CSTMAX]] : f16
+ // CHECK: [[CLAMPMAX:%.+]] = arith.select [[OVERFLOW]], [[CSTMIN]], [[ROUND]] : f16
+ // CHECK: [[MIN:%.+]] = arith.minimumf [[CLAMPMAX]], [[CSTMAX]] : f16
// CHECK: [[CLAMP:%.+]] = arith.maximumf [[MIN]], [[CSTMIN]] : f16
// CHECK: arith.fptosi [[CLAMP]] : f16 to i8
%1 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xi8>
@@ -604,8 +606,7 @@ func.func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
// CHECK: [[OVERFLOW:%.+]] = arith.cmpf ueq, [[ROUND]], [[POSINF]] : f16
// CHECK: [[UNDERFLOW:%.+]] = arith.cmpf ueq, [[ROUND]], [[NEGINF]] : f16
// CHECK: [[MININT:%.+]] = arith.constant -2147483648 : i32
- // CHECK: [[MAXINT:%.+]] = arith.constant 2147483647 : i32
- // CHECK: [[CLAMPPOSINF:%.+]] = arith.select [[OVERFLOW]], [[MAXINT]], [[CONV]] : i32
+ // CHECK: [[CLAMPPOSINF:%.+]] = arith.select [[OVERFLOW]], [[MININT]], [[CONV]] : i32
// CHECK: arith.select [[UNDERFLOW]], [[MININT]], [[CLAMPPOSINF]] : i32
%2 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xi32>
return
@@ -1980,11 +1981,11 @@ func.func @test_dynamic_fft2d(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>
// CHECK: %[[ROUND_EVEN:.*]] = math.roundeven %[[IN]] : f32
// CHECK: %[[FP_INT_MIN:.*]] = arith.constant -9.22337203E+18 : f32
// CHECK: %[[FP_INT_MAX_PLUS_ONE:.*]] = arith.constant 9.22337203E+18 : f32
-// CHECK: %[[INT_MAX:.*]] = arith.constant 9223372036854775807 : i64
+// CHECK: %[[INT_MIN:.*]] = arith.constant -9223372036854775808 : i64
// CHECK: %[[MAX:.*]] = arith.maximumf %[[ROUND_EVEN]], %[[FP_INT_MIN]] : f32
// CHECK: %[[FPTOSI:.*]] = arith.fptosi %[[MAX]] : f32 to i64
// CHECK: %[[CMPF:.*]] = arith.cmpf uge, %[[ROUND_EVEN]], %[[FP_INT_MAX_PLUS_ONE]] : f32
-// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[INT_MAX]], %[[FPTOSI]] : i64
+// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[INT_MIN]], %[[FPTOSI]] : i64
// CHECK: linalg.yield %[[SELECT]] : i64
// CHECK: } -> tensor<1xi64>
// CHECK: return %[[RESULT]] : tensor<1xi64>
From f8585ea09f1221b3b92572500da76672407523e3 Mon Sep 17 00:00:00 2001
From: Alaa Ali <alaaali at ah-alaaali-l.dhcp.mathworks.com>
Date: Thu, 6 Mar 2025 03:36:47 -0500
Subject: [PATCH 02/23] tosa.cast: fix result mismatch when casting f64/f32
 max value to i64/i32
---
.../Conversion/TosaToLinalg/TosaToLinalg.cpp | 17 ++++++++---------
.../TosaToLinalg/tosa-to-linalg.mlir | 19 ++++++++++---------
2 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index a99cf293b9eac..8854b4690bdf5 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -618,12 +618,8 @@ static Value createLinalgBodyCalculationForElementwiseOp(
loc, rewriter.getIntegerAttr(
getElementTypeOrSelf(dstTy),
APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth())));
- auto intMax = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getIntegerAttr(
- getElementTypeOrSelf(dstTy),
- APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())));
auto maxClamped =
- rewriter.create<arith::SelectOp>(loc, overflow, intMax, conv);
+ rewriter.create<arith::SelectOp>(loc, overflow, intMin, conv);
return rewriter.create<arith::SelectOp>(loc, underflow, intMin,
maxClamped);
}
@@ -647,8 +643,11 @@ static Value createLinalgBodyCalculationForElementwiseOp(
APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())
.getSExtValue()));
+ auto overflow = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
+ Value maxClampedFP = rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
+
Value clamped =
- clampFloatHelper(loc, rounded, intMinFP, intMaxFP, rewriter);
+ clampFloatHelper(loc, maxClampedFP, intMinFP, intMaxFP, rewriter);
return rewriter.create<arith::FPToSIOp>(loc, dstTy, clamped);
}
@@ -664,17 +663,17 @@ static Value createLinalgBodyCalculationForElementwiseOp(
.getSExtValue()) +
1.0f));
- auto intMax = rewriter.create<arith::ConstantOp>(
+ auto intMin = rewriter.create<arith::ConstantOp>(
loc, rewriter.getIntegerAttr(
getElementTypeOrSelf(dstTy),
- APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())));
+ APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth())));
auto minClampedFP =
rewriter.create<arith::MaximumFOp>(loc, rounded, intMinFP);
auto minClamped =
rewriter.create<arith::FPToSIOp>(loc, dstTy, minClampedFP);
auto overflow = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::UGE, rounded, intMaxPlusOneFP);
- return rewriter.create<arith::SelectOp>(loc, overflow, intMax,
+ return rewriter.create<arith::SelectOp>(loc, overflow, intMin,
minClamped);
}
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 9ba9965315fd3..bd6381bedf65c 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -541,13 +541,13 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () {
// CHECK: linalg.generic
// CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f32
- // CHECK: [[CSTMIN:%.+]] = arith.constant -2.14748365E+9 : f32
+ // CHECK: [[CSTMINF:%.+]] = arith.constant -2.14748365E+9 : f32
// CHECK: [[CSTMAXP1:%.+]] = arith.constant 2.14748365E+9 : f32
- // CHECK: [[CSTMAX:%.+]] = arith.constant 2147483647 : i32
- // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMIN]] : f32
+ // CHECK: [[CSTMIN:%.+]] = arith.constant -2147483648 : i32
+ // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMINF]] : f32
// CHECK: [[CONV:%.+]] = arith.fptosi [[MAX]] : f32 to i32
// CHECK: [[CMP:%.+]] = arith.cmpf uge, [[ROUND]], [[CSTMAXP1]] : f32
- // CHECK: arith.select [[CMP]], [[CSTMAX]], [[CONV]] : i32
+ // CHECK: arith.select [[CMP]], [[CSTMIN]], [[CONV]] : i32
%20 = tosa.cast %0 : (tensor<1xf32>) -> tensor<1xi32>
// CHECK: linalg.generic
@@ -591,7 +591,9 @@ func.func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
// CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f16
// CHECK: [[CSTMIN:%.+]] = arith.constant -1.280000e+02 : f16
// CHECK: [[CSTMAX:%.+]] = arith.constant 1.270000e+02 : f16
- // CHECK: [[MIN:%.+]] = arith.minimumf [[ROUND]], [[CSTMAX]] : f16
+ // CHECK: [[OVERFLOW:%.+]] = arith.cmpf ugt, [[ROUND]], [[CSTMAX]] : f16
+ // CHECK: [[CLAMPMAX:%.+]] = arith.select [[OVERFLOW]], [[CSTMIN]], [[ROUND]] : f16
+ // CHECK: [[MIN:%.+]] = arith.minimumf [[CLAMPMAX]], [[CSTMAX]] : f16
// CHECK: [[CLAMP:%.+]] = arith.maximumf [[MIN]], [[CSTMIN]] : f16
// CHECK: arith.fptosi [[CLAMP]] : f16 to i8
%1 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xi8>
@@ -604,8 +606,7 @@ func.func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
// CHECK: [[OVERFLOW:%.+]] = arith.cmpf ueq, [[ROUND]], [[POSINF]] : f16
// CHECK: [[UNDERFLOW:%.+]] = arith.cmpf ueq, [[ROUND]], [[NEGINF]] : f16
// CHECK: [[MININT:%.+]] = arith.constant -2147483648 : i32
- // CHECK: [[MAXINT:%.+]] = arith.constant 2147483647 : i32
- // CHECK: [[CLAMPPOSINF:%.+]] = arith.select [[OVERFLOW]], [[MAXINT]], [[CONV]] : i32
+ // CHECK: [[CLAMPPOSINF:%.+]] = arith.select [[OVERFLOW]], [[MININT]], [[CONV]] : i32
// CHECK: arith.select [[UNDERFLOW]], [[MININT]], [[CLAMPPOSINF]] : i32
%2 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xi32>
return
@@ -1980,11 +1981,11 @@ func.func @test_dynamic_fft2d(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>
// CHECK: %[[ROUND_EVEN:.*]] = math.roundeven %[[IN]] : f32
// CHECK: %[[FP_INT_MIN:.*]] = arith.constant -9.22337203E+18 : f32
// CHECK: %[[FP_INT_MAX_PLUS_ONE:.*]] = arith.constant 9.22337203E+18 : f32
-// CHECK: %[[INT_MAX:.*]] = arith.constant 9223372036854775807 : i64
+// CHECK: %[[INT_MIN:.*]] = arith.constant -9223372036854775808 : i64
// CHECK: %[[MAX:.*]] = arith.maximumf %[[ROUND_EVEN]], %[[FP_INT_MIN]] : f32
// CHECK: %[[FPTOSI:.*]] = arith.fptosi %[[MAX]] : f32 to i64
// CHECK: %[[CMPF:.*]] = arith.cmpf uge, %[[ROUND_EVEN]], %[[FP_INT_MAX_PLUS_ONE]] : f32
-// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[INT_MAX]], %[[FPTOSI]] : i64
+// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[INT_MIN]], %[[FPTOSI]] : i64
// CHECK: linalg.yield %[[SELECT]] : i64
// CHECK: } -> tensor<1xi64>
// CHECK: return %[[RESULT]] : tensor<1xi64>
From 267403442264959f6b06e227ff450c385f4b3ef2 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Thu, 6 Mar 2025 22:46:41 -0500
Subject: [PATCH 03/23] Reland "[HIP] Use original file path for CUID"
(#111885)
This patch fixes the buildbot failures of lit tests on macOS. Since
clang driver options depend on the toolchain, we cannot hardcode the CUID
hash. On macOS there is an extra -mlinker-version= option.
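For reference, a standalone sketch of the hash-mode CUID computation after
this change (a hypothetical helper reduced from CUIDOptions::getCUID): the
hash covers the input path exactly as written plus every non-input driver
argument, which is why a toolchain-injected flag changes the result and the
test cannot hardcode the hash.
```cpp
#include <string>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/MD5.h"

// Hypothetical standalone reduction of the driver's hash-mode CUID.
static std::string hashCUID(llvm::StringRef InputFile,
                            llvm::ArrayRef<std::string> NonInputArgs) {
  llvm::MD5 Hasher;
  llvm::MD5::MD5Result Hash;
  Hasher.update(InputFile); // the path as written; no real_path expansion
  for (const std::string &A : NonInputArgs)
    Hasher.update(A);
  Hasher.final(Hash);
  llvm::SmallString<32> Digest;
  llvm::MD5::stringifyResult(Hash, Digest);
  return Digest.str().str();
}
```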
---
clang/lib/Driver/Driver.cpp | 5 +----
clang/test/Driver/hip-cuid-hash.hip | 15 +++++++++++----
2 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eca96c1cce7f7..e998c94aeacd1 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -226,10 +226,7 @@ std::string CUIDOptions::getCUID(StringRef InputFile,
else if (UseCUID == Kind::Hash) {
llvm::MD5 Hasher;
llvm::MD5::MD5Result Hash;
- SmallString<256> RealPath;
- llvm::sys::fs::real_path(InputFile, RealPath,
- /*expand_tilde=*/true);
- Hasher.update(RealPath);
+ Hasher.update(InputFile);
for (auto *A : Args) {
if (A->getOption().matches(options::OPT_INPUT))
continue;
diff --git a/clang/test/Driver/hip-cuid-hash.hip b/clang/test/Driver/hip-cuid-hash.hip
index 103a1cbf26d50..a4167d664537e 100644
--- a/clang/test/Driver/hip-cuid-hash.hip
+++ b/clang/test/Driver/hip-cuid-hash.hip
@@ -1,13 +1,20 @@
// Check CUID generated by hash.
// The same CUID is generated for the same file with the same options.
+// This test requires relative paths for input files. Since the test may be
+// done out of source tree, create the local directory structure and copy the
+// input file from the source tree into that directory.
+// RUN: mkdir -p %t/Inputs/hip_multiple_inputs
+// RUN: cp %S/Inputs/hip_multiple_inputs/a.cu %t/Inputs/hip_multiple_inputs/a.cu
+// RUN: cd %t
+
// RUN: %clang -### -x hip --target=x86_64-unknown-linux-gnu --no-offload-new-driver \
// RUN: --offload-arch=gfx906 -c -nogpuinc -nogpulib -fuse-cuid=hash \
-// RUN: %S/Inputs/hip_multiple_inputs/a.cu >%t.out 2>&1
+// RUN: Inputs/hip_multiple_inputs/a.cu >%t.out 2>&1
// RUN: %clang -### -x hip --target=x86_64-unknown-linux-gnu --no-offload-new-driver \
// RUN: --offload-arch=gfx906 -c -nogpuinc -nogpulib -fuse-cuid=hash \
-// RUN: %S/Inputs/hip_multiple_inputs/a.cu >>%t.out 2>&1
+// RUN: Inputs/hip_multiple_inputs/a.cu >>%t.out 2>&1
// RUN: FileCheck %s -check-prefixes=SAME -input-file %t.out
@@ -16,11 +23,11 @@
// RUN: %clang -### -x hip --target=x86_64-unknown-linux-gnu -DX=1 --no-offload-new-driver \
// RUN: --offload-arch=gfx906 -c -nogpuinc -nogpulib -fuse-cuid=hash \
-// RUN: %S/Inputs/hip_multiple_inputs/a.cu >%t.out 2>&1
+// RUN: Inputs/hip_multiple_inputs/a.cu >%t.out 2>&1
// RUN: %clang -### -x hip --target=x86_64-unknown-linux-gnu -DX=2 --no-offload-new-driver \
// RUN: --offload-arch=gfx906 -c -nogpuinc -nogpulib -fuse-cuid=hash \
-// RUN: %S/Inputs/../Inputs/hip_multiple_inputs/a.cu >>%t.out 2>&1
+// RUN: Inputs/../Inputs/hip_multiple_inputs/a.cu >>%t.out 2>&1
// RUN: FileCheck %s -check-prefixes=DIFF -input-file %t.out
From 6a4de5921292117444a108afb0bcca8f73c00cce Mon Sep 17 00:00:00 2001
From: Alaa Ali <alaaali at ah-alaaali-l.dhcp.mathworks.com>
Date: Thu, 6 Mar 2025 22:46:57 -0500
Subject: [PATCH 04/23] fix code formatting errors
---
mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 8854b4690bdf5..17ebc7dc32372 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -643,8 +643,10 @@ static Value createLinalgBodyCalculationForElementwiseOp(
APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())
.getSExtValue()));
- auto overflow = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
- Value maxClampedFP = rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
+ auto overflow = rewriter.create<arith::CmpFOp>(
+ loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
+ Value maxClampedFP =
+ rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
Value clamped =
clampFloatHelper(loc, maxClampedFP, intMinFP, intMaxFP, rewriter);
From f3dc358953a13caf7521fc615a08f6317930351c Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Thu, 6 Mar 2025 22:47:11 -0500
Subject: [PATCH 05/23] [MC] output inlined-at debug info (#106230)
Currently, MC prints the source locations of instructions as assembly
comments when debug info is available; however, it does not include
inlined-at locations when a function is inlined.
For example, suppose function foo is defined in header file a.h and is called
multiple times in b.cpp. If foo is inlined, the current assembly only
shows its instructions with their line numbers in a.h. With inlined-at
locations, the assembly also shows where foo is called in b.cpp.
This patch adds inlined-at locations to the comments by using
DebugLoc::print. It makes the printed source location info consistent
with that printed by machine passes.
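The rendering itself is a one-liner around DebugLoc::print, which walks the
inlined-at chain; a standalone sketch (the in-tree change does this inside a
lambda in DwarfDebug::beginInstruction and threads the string down to the
.loc emission):
```cpp
#include <string>

#include "llvm/ADT/SmallString.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/raw_ostream.h"

// Standalone sketch: DebugLoc::print emits "file:line:col" and then
// " @[ caller-file:line:col ]" per inlining level, e.g.
// "./a.h:5:12 @[ a.hip:12:8 ]" as checked in the AMDGPU test below.
static std::string locationComment(const llvm::DebugLoc &DL) {
  llvm::SmallString<128> LocationString;
  llvm::raw_svector_ostream OS(LocationString);
  DL.print(OS);
  return LocationString.str().str();
}
```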
---
llvm/include/llvm/MC/MCObjectStreamer.h | 4 +-
llvm/include/llvm/MC/MCStreamer.h | 3 +-
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 23 ++++---
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 2 +-
llvm/lib/MC/MCAsmStreamer.cpp | 18 +++--
llvm/lib/MC/MCObjectStreamer.cpp | 5 +-
llvm/lib/MC/MCStreamer.cpp | 2 +-
.../test/CodeGen/AMDGPU/dbg-info-inline-at.ll | 66 +++++++++++++++++++
llvm/test/CodeGen/XCore/dwarf_debug.ll | 2 +-
llvm/test/DebugInfo/X86/inline-seldag-test.ll | 4 +-
10 files changed, 104 insertions(+), 25 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/dbg-info-inline-at.ll
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index d6a957322ea11..ee747253fd869 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -141,8 +141,8 @@ class MCObjectStreamer : public MCStreamer {
SMLoc Loc) override;
void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column,
unsigned Flags, unsigned Isa,
- unsigned Discriminator,
- StringRef FileName) override;
+ unsigned Discriminator, StringRef FileName,
+ StringRef Comment = {}) override;
void emitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel,
const MCSymbol *Label,
unsigned PointerSize) override;
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 1dc6ddfd7566b..9d63c1e66bdae 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -887,7 +887,8 @@ class MCStreamer {
virtual void emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa, unsigned Discriminator,
- StringRef FileName);
+ StringRef FileName,
+ StringRef Comment = {});
/// This implements the '.loc_label Name' directive.
virtual void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 60d911d0383ed..7ad7627f32a06 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2057,6 +2057,14 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
}
}
+ auto RecordSourceLine = [&](auto &DL, auto Flags) {
+ SmallString<128> LocationString;
+ raw_svector_ostream OS(LocationString);
+ DL.print(OS);
+
+ const MDNode *Scope = DL.getScope();
+ recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags, LocationString);
+ };
// When we emit a line-0 record, we don't update PrevInstLoc; so look at
// the last line number actually emitted, to see if it was line 0.
unsigned LastAsmLine =
@@ -2084,8 +2092,7 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
// But we might be coming back to it after a line 0 record.
if ((LastAsmLine == 0 && DL.getLine() != 0) || Flags) {
// Reinstate the source location but not marked as a statement.
- const MDNode *Scope = DL.getScope();
- recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+ RecordSourceLine(DL, Flags);
}
return;
}
@@ -2136,8 +2143,7 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
if (DL.getLine() && (DL.getLine() != OldLine || ForceIsStmt))
Flags |= DWARF2_FLAG_IS_STMT;
- const MDNode *Scope = DL.getScope();
- recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+ RecordSourceLine(DL, Flags);
// If we're not at line 0, remember this location.
if (DL.getLine())
@@ -2272,7 +2278,8 @@ findPrologueEndLoc(const MachineFunction *MF) {
static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col,
const MDNode *S, unsigned Flags, unsigned CUID,
uint16_t DwarfVersion,
- ArrayRef<std::unique_ptr<DwarfCompileUnit>> DCUs) {
+ ArrayRef<std::unique_ptr<DwarfCompileUnit>> DCUs,
+ StringRef Comment = {}) {
StringRef Fn;
unsigned FileNo = 1;
unsigned Discriminator = 0;
@@ -2286,7 +2293,7 @@ static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col,
.getOrCreateSourceID(Scope->getFile());
}
Asm.OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
- Discriminator, Fn);
+ Discriminator, Fn, Comment);
}
const MachineInstr *
@@ -2617,10 +2624,10 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
// Register a source line with debug info. Returns the unique label that was
// emitted and which provides correspondence to the source line list.
void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
- unsigned Flags) {
+ unsigned Flags, StringRef Location) {
::recordSourceLine(*Asm, Line, Col, S, Flags,
Asm->OutStreamer->getContext().getDwarfCompileUnitID(),
- getDwarfVersion(), getUnits());
+ getDwarfVersion(), getUnits(), Location);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 9662c617d730e..58e6d39f76ae0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -679,7 +679,7 @@ class DwarfDebug : public DebugHandlerBase {
/// label that was emitted and which provides correspondence to the
/// source line list.
void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
- unsigned Flags);
+ unsigned Flags, StringRef Location = {});
/// Populate LexicalScope entries with variables' info.
void collectEntityInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP,
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index f1b34ca65abd9..fe6bb8c965147 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -294,8 +294,8 @@ class MCAsmStreamer final : public MCStreamer {
unsigned CUID = 0) override;
void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column,
unsigned Flags, unsigned Isa,
- unsigned Discriminator,
- StringRef FileName) override;
+ unsigned Discriminator, StringRef FileName,
+ StringRef Location = {}) override;
virtual void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) override;
MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override;
@@ -1688,7 +1688,8 @@ void MCAsmStreamer::emitDwarfFile0Directive(
void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa, unsigned Discriminator,
- StringRef FileName) {
+ StringRef FileName,
+ StringRef Comment) {
// If target doesn't support .loc/.file directive, we need to record the lines
// same way like we do in object mode.
if (MAI->isAIX()) {
@@ -1696,7 +1697,7 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
// first one gets a line entry.
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
- Discriminator, FileName);
+ Discriminator, FileName, Comment);
return;
}
@@ -1727,12 +1728,15 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
if (IsVerboseAsm) {
OS.PadToColumn(MAI->getCommentColumn());
- OS << MAI->getCommentString() << ' ' << FileName << ':'
- << Line << ':' << Column;
+ OS << MAI->getCommentString() << ' ';
+ if (Comment.empty())
+ OS << FileName << ':' << Line << ':' << Column;
+ else
+ OS << Comment;
}
EmitEOL();
this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
- Discriminator, FileName);
+ Discriminator, FileName, Comment);
}
void MCAsmStreamer::emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) {
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 9aee1abcd0d67..41c08809301ee 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -418,13 +418,14 @@ void MCObjectStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa,
unsigned Discriminator,
- StringRef FileName) {
+ StringRef FileName,
+ StringRef Comment) {
// In case we see two .loc directives in a row, make sure the
// first one gets a line entry.
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
- Discriminator, FileName);
+ Discriminator, FileName, Comment);
}
static const MCExpr *buildSymbolDiff(MCObjectStreamer &OS, const MCSymbol *A,
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 99b7651a9ab31..f040954efb6b5 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -238,7 +238,7 @@ void MCStreamer::emitCFIMTETaggedFrame() {
void MCStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa, unsigned Discriminator,
- StringRef FileName) {
+ StringRef FileName, StringRef Comment) {
getContext().setCurrentDwarfLoc(FileNo, Line, Column, Flags, Isa,
Discriminator);
}
diff --git a/llvm/test/CodeGen/AMDGPU/dbg-info-inline-at.ll b/llvm/test/CodeGen/AMDGPU/dbg-info-inline-at.ll
new file mode 100644
index 0000000000000..ed609f85918f9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dbg-info-inline-at.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s
+
+define amdgpu_kernel void @_Z3fooPiiii(ptr addrspace(1) nocapture noundef writeonly %c.coerce, i32 noundef %a, i32 noundef %b, i32 noundef %d) !dbg !9 {
+; CHECK-LABEL: _Z3fooPiiii:
+; CHECK: .Lfunc_begin0:
+; CHECK-NEXT: .file 0 "test" "a.hip" md5 0x004a28df8cfd98cdd2c71d5d814d9c6b
+; CHECK-NEXT: .cfi_sections .debug_frame
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: ; %bb.0: ; %entry
+; CHECK-NEXT: .file 1 "." "a.h"
+; CHECK-NEXT: .loc 1 5 12 prologue_end ; ./a.h:5:12 @[ a.hip:12:8 ]
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_add_i32 s1, s1, s0
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .loc 1 5 12 is_stmt 0 ; ./a.h:5:12 @[ a.hip:13:9 ]
+; CHECK-NEXT: s_add_i32 s0, s2, s0
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .file 2 "a.hip"
+; CHECK-NEXT: .loc 2 13 6 is_stmt 1 ; a.hip:13:6
+; CHECK-NEXT: s_mul_i32 s0, s0, s1
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: global_store_dword v0, v1, s[4:5]
+; CHECK-NEXT: .loc 2 14 1 ; a.hip:14:1
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .Ltmp2:
+entry:
+ %add.i = add nsw i32 %b, %a, !dbg !13
+ %add.i3 = add nsw i32 %d, %a, !dbg !17
+ %mul = mul nsw i32 %add.i3, %add.i, !dbg !19
+ store i32 %mul, ptr addrspace(1) %c.coerce, align 4, !dbg !19, !tbaa !20
+ ret void, !dbg !24
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "a.hip", directory: "test", checksumkind: CSK_MD5, checksum: "004a28df8cfd98cdd2c71d5d814d9c6b")
+!2 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!3 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+!4 = !{i32 7, !"Dwarf Version", i32 5}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 8, !"PIC Level", i32 2}
+!8 = !{!"clang version 20.0.0"}
+!9 = distinct !DISubprogram(name: "foo", scope: !10, file: !10, line: 11, type: !11, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DIFile(filename: "a.hip", directory: "test")
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 5, column: 12, scope: !14, inlinedAt: !16)
+!14 = distinct !DISubprogram(name: "bar", scope: !15, file: !15, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = !DIFile(filename: "./a.h", directory: "test")
+!16 = distinct !DILocation(line: 12, column: 8, scope: !9)
+!17 = !DILocation(line: 5, column: 12, scope: !14, inlinedAt: !18)
+!18 = distinct !DILocation(line: 13, column: 9, scope: !9)
+!19 = !DILocation(line: 13, column: 6, scope: !9)
+!20 = !{!21, !21, i64 0}
+!21 = !{!"int", !22, i64 0}
+!22 = !{!"omnipotent char", !23, i64 0}
+!23 = !{!"Simple C++ TBAA"}
+!24 = !DILocation(line: 14, column: 1, scope: !9)
diff --git a/llvm/test/CodeGen/XCore/dwarf_debug.ll b/llvm/test/CodeGen/XCore/dwarf_debug.ll
index 43851130c08f4..084d0fca86320 100644
--- a/llvm/test/CodeGen/XCore/dwarf_debug.ll
+++ b/llvm/test/CodeGen/XCore/dwarf_debug.ll
@@ -6,7 +6,7 @@
; CHECK-LABEL: f
; CHECK: entsp [[S:[0-9]+]]
; ...the prologue...
-; CHECK: .loc 1 2 0 prologue_end # test.c:2:0
+; CHECK: .loc 1 2 0 prologue_end # test.c:2
; CHECK: add r0, r0, 1
; CHECK: retsp [[S]]
define i32 @f(i32 %a) !dbg !4 {
diff --git a/llvm/test/DebugInfo/X86/inline-seldag-test.ll b/llvm/test/DebugInfo/X86/inline-seldag-test.ll
index 6417612a54580..6a62d2bab884b 100644
--- a/llvm/test/DebugInfo/X86/inline-seldag-test.ll
+++ b/llvm/test/DebugInfo/X86/inline-seldag-test.ll
@@ -18,8 +18,8 @@
; Make sure the condition test is attributed to the inline function, not the
; location of the test's operands within the caller.
-; ASM: # inline-seldag-test.c:2:0
-; ASM-NOT: .loc
+; ASM: # inline-seldag-test.c:4:0
+; ASM: .loc 1 2 0 # inline-seldag-test.c:2 @[ inline-seldag-test.c:6:7 ]
; ASM: testl
; Function Attrs: nounwind uwtable
From e15545cad8297ec7555f26e5ae74a9f0511203e7 Mon Sep 17 00:00:00 2001
From: Thirumalai Shaktivel
<74826228+Thirumalai-Shaktivel at users.noreply.github.com>
Date: Fri, 7 Mar 2025 09:24:32 +0530
Subject: [PATCH 06/23] [Flang][OpenMP] Allow copyprivate and nowait on the
directive clauses (#127769)
Issue:
- The single construct used to raise a semantic error when the copyprivate
and nowait clauses were used on the single directive.
- Also, the restriction that copyprivate may not be combined with nowait
has been removed in OpenMP 6.0.
Fix:
- Allow copyprivate and nowait on both the single and end single directives.
- Allow at most one nowait clause.
- Emit a warning when the same list item is used in a copyprivate clause
on the end single directive.
From the reference guide (OpenMP 5.2, 2.10.2):
```
!$omp single [clause[ [,]clause] ... ]
loosely-structured-block
!$omp end single [end-clause[ [,]end-clause] ...]
clause:
copyprivate (list)
nowait
[...]
end-clause:
copyprivate (list)
nowait
```
Towards: https://github.com/llvm/llvm-project/issues/110008
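The bookkeeping behind the new diagnostics can be summarized by a small
decision helper; a hypothetical C++ reduction of the logic added to
OmpStructureChecker (names are illustrative):
```cpp
#include <set>
#include <string>

enum class Diag { None, WarnReusedOnEnd, ErrDuplicate };

// Hypothetical reduction: `begin`/`end` hold symbols already named in
// COPYPRIVATE clauses on the SINGLE and END SINGLE directives.
Diag checkCopyprivate(std::set<std::string> &begin,
                      std::set<std::string> &end,
                      const std::string &sym, bool onEndDir) {
  if (begin.count(sym)) // END SINGLE reusing it: warn; same directive: error
    return onEndDir ? Diag::WarnReusedOnEnd : Diag::ErrDuplicate;
  if (end.count(sym))   // duplicate within END SINGLE: error
    return Diag::ErrDuplicate;
  (onEndDir ? end : begin).insert(sym);
  return Diag::None;
}
```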
---
flang/lib/Semantics/check-omp-structure.cpp | 82 +++++++++++++------
.../Semantics/OpenMP/clause-validity01.f90 | 7 +-
flang/test/Semantics/OpenMP/single03.f90 | 54 ++++++++++++
flang/test/Semantics/OpenMP/single04.f90 | 81 ++++++++++++++++++
.../test/Semantics/OpenMP/threadprivate04.f90 | 1 -
llvm/include/llvm/Frontend/OpenMP/OMP.td | 4 +-
6 files changed, 199 insertions(+), 30 deletions(-)
create mode 100644 flang/test/Semantics/OpenMP/single03.f90
create mode 100644 flang/test/Semantics/OpenMP/single04.f90
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index c6ed211549401..f5e3623282819 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1203,6 +1203,64 @@ void OmpStructureChecker::Enter(const parser::OpenMPBlockConstruct &x) {
deviceConstructFound_ = true;
}
+ if (GetContext().directive == llvm::omp::Directive::OMPD_single) {
+ std::set<Symbol *> singleCopyprivateSyms;
+ std::set<Symbol *> endSingleCopyprivateSyms;
+ bool foundNowait{false};
+ parser::CharBlock NowaitSource;
+
+ auto catchCopyPrivateNowaitClauses = [&](const auto &dir, bool endDir) {
+ for (auto &clause : std::get<parser::OmpClauseList>(dir.t).v) {
+ if (clause.Id() == llvm::omp::Clause::OMPC_copyprivate) {
+ for (const auto &ompObject : GetOmpObjectList(clause)->v) {
+ const auto *name{parser::Unwrap<parser::Name>(ompObject)};
+ if (Symbol * symbol{name->symbol}) {
+ if (singleCopyprivateSyms.count(symbol)) {
+ if (endDir) {
+ context_.Warn(common::UsageWarning::OpenMPUsage, name->source,
+ "The COPYPRIVATE clause with '%s' is already used on the SINGLE directive"_warn_en_US,
+ name->ToString());
+ } else {
+ context_.Say(name->source,
+ "'%s' appears in more than one COPYPRIVATE clause on the SINGLE directive"_err_en_US,
+ name->ToString());
+ }
+ } else if (endSingleCopyprivateSyms.count(symbol)) {
+ context_.Say(name->source,
+ "'%s' appears in more than one COPYPRIVATE clause on the END SINGLE directive"_err_en_US,
+ name->ToString());
+ } else {
+ if (endDir) {
+ endSingleCopyprivateSyms.insert(symbol);
+ } else {
+ singleCopyprivateSyms.insert(symbol);
+ }
+ }
+ }
+ }
+ } else if (clause.Id() == llvm::omp::Clause::OMPC_nowait) {
+ if (foundNowait) {
+ context_.Say(clause.source,
+ "At most one NOWAIT clause can appear on the SINGLE directive"_err_en_US);
+ } else {
+ foundNowait = !endDir;
+ }
+ if (!NowaitSource.ToString().size()) {
+ NowaitSource = clause.source;
+ }
+ }
+ }
+ };
+ catchCopyPrivateNowaitClauses(beginBlockDir, false);
+ catchCopyPrivateNowaitClauses(endBlockDir, true);
+ unsigned version{context_.langOptions().OpenMPVersion};
+ if (version <= 52 && NowaitSource.ToString().size() &&
+ (singleCopyprivateSyms.size() || endSingleCopyprivateSyms.size())) {
+ context_.Say(NowaitSource,
+ "NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive"_err_en_US);
+ }
+ }
+
switch (beginDir.v) {
case llvm::omp::Directive::OMPD_target:
if (CheckTargetBlockOnlyTeams(block)) {
@@ -2903,12 +2961,6 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) {
// clause
CheckMultListItems();
- // 2.7.3 Single Construct Restriction
- if (GetContext().directive == llvm::omp::Directive::OMPD_end_single) {
- CheckNotAllowedIfClause(
- llvm::omp::Clause::OMPC_copyprivate, {llvm::omp::Clause::OMPC_nowait});
- }
-
auto testThreadprivateVarErr = [&](Symbol sym, parser::Name name,
llvmOmpClause clauseTy) {
if (sym.test(Symbol::Flag::OmpThreadprivate))
@@ -3549,15 +3601,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Private &x) {
void OmpStructureChecker::Enter(const parser::OmpClause::Nowait &x) {
CheckAllowedClause(llvm::omp::Clause::OMPC_nowait);
- if (llvm::omp::noWaitClauseNotAllowedSet.test(GetContext().directive)) {
- context_.Say(GetContext().clauseSource,
- "%s clause is not allowed on the OMP %s directive,"
- " use it on OMP END %s directive "_err_en_US,
- parser::ToUpperCaseLetters(
- getClauseName(llvm::omp::Clause::OMPC_nowait).str()),
- parser::ToUpperCaseLetters(GetContext().directiveSource.ToString()),
- parser::ToUpperCaseLetters(GetContext().directiveSource.ToString()));
- }
}
bool OmpStructureChecker::IsDataRefTypeParamInquiry(
@@ -4288,15 +4331,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Copyprivate &x) {
CheckIntentInPointer(symbols, llvm::omp::Clause::OMPC_copyprivate);
CheckCopyingPolymorphicAllocatable(
symbols, llvm::omp::Clause::OMPC_copyprivate);
- if (GetContext().directive == llvm::omp::Directive::OMPD_single) {
- context_.Say(GetContext().clauseSource,
- "%s clause is not allowed on the OMP %s directive,"
- " use it on OMP END %s directive "_err_en_US,
- parser::ToUpperCaseLetters(
- getClauseName(llvm::omp::Clause::OMPC_copyprivate).str()),
- parser::ToUpperCaseLetters(GetContext().directiveSource.ToString()),
- parser::ToUpperCaseLetters(GetContext().directiveSource.ToString()));
- }
}
void OmpStructureChecker::Enter(const parser::OmpClause::Lastprivate &x) {
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index e8114154a809b..5e0d91914c441 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -330,11 +330,12 @@
!$omp parallel
b = 1
!ERROR: LASTPRIVATE clause is not allowed on the SINGLE directive
- !ERROR: NOWAIT clause is not allowed on the OMP SINGLE directive, use it on OMP END SINGLE directive
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
!$omp single private(a) lastprivate(c) nowait
a = 3.14
- !ERROR: Clause NOWAIT is not allowed if clause COPYPRIVATE appears on the END SINGLE directive
!ERROR: COPYPRIVATE variable 'a' may not appear on a PRIVATE or FIRSTPRIVATE clause on a SINGLE construct
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
!ERROR: At most one NOWAIT clause can appear on the END SINGLE directive
!$omp end single copyprivate(a) nowait nowait
c = 2
@@ -351,7 +352,6 @@
a = 1.0
!ERROR: COPYPRIVATE clause is not allowed on the END WORKSHARE directive
!$omp end workshare nowait copyprivate(a)
- !ERROR: NOWAIT clause is not allowed on the OMP WORKSHARE directive, use it on OMP END WORKSHARE directive
!$omp workshare nowait
!$omp end workshare
!$omp end parallel
@@ -420,7 +420,6 @@
!$omp parallel
!ERROR: No ORDERED clause with a parameter can be specified on the DO SIMD directive
!ERROR: NOGROUP clause is not allowed on the DO SIMD directive
- !ERROR: NOWAIT clause is not allowed on the OMP DO SIMD directive, use it on OMP END DO SIMD directive
!$omp do simd ordered(2) NOGROUP nowait
do i = 1, N
do j = 1, N
diff --git a/flang/test/Semantics/OpenMP/single03.f90 b/flang/test/Semantics/OpenMP/single03.f90
new file mode 100644
index 0000000000000..dc2c2fd27eb04
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/single03.f90
@@ -0,0 +1,54 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=52
+!
+! OpenMP Version 5.2
+!
+! 2.10.2 single Construct
+! Copyprivate and Nowait clauses are allowed in both clause and end clause
+
+subroutine omp_single
+ integer, save :: i
+ integer :: j
+ i = 10; j = 11
+
+ !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp single copyprivate(i) nowait
+ print *, "omp single", i
+ !$omp end single
+
+ !$omp parallel private(i)
+ !$omp single copyprivate(i)
+ print *, "omp single", i
+ !$omp end single
+ !$omp end parallel
+
+ !$omp parallel
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp single nowait
+ print *, "omp single", i
+ !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(i)
+
+ !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp single copyprivate(i)
+ print *, "omp single", i
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp end single nowait
+
+ !ERROR: COPYPRIVATE variable 'j' may not appear on a PRIVATE or FIRSTPRIVATE clause on a SINGLE construct
+ !$omp single private(j) copyprivate(j)
+ print *, "omp single", j
+ !ERROR: COPYPRIVATE variable 'j' may not appear on a PRIVATE or FIRSTPRIVATE clause on a SINGLE construct
+ !WARNING: The COPYPRIVATE clause with 'j' is already used on the SINGLE directive
+ !$omp end single copyprivate(j)
+
+ !$omp single nowait
+ print *, "omp single", j
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
+ !$omp end single nowait
+ !$omp end parallel
+
+ !$omp single nowait
+ print *, "omp single", i
+ !$omp end single
+end subroutine omp_single
diff --git a/flang/test/Semantics/OpenMP/single04.f90 b/flang/test/Semantics/OpenMP/single04.f90
new file mode 100644
index 0000000000000..9505745c600e9
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/single04.f90
@@ -0,0 +1,81 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=52
+!
+! OpenMP Version 5.2
+!
+! 2.10.2 single Construct
+! Valid and invalid testcases for copyprivate and nowait clause on the single directive
+
+program single
+ ! Valid testcases
+ !$omp single
+ print *, x
+ !$omp end single
+
+ !$omp single nowait
+ print *, x
+ !$omp end single
+
+ !$omp single copyprivate(x, y, z)
+ print *, x
+ !$omp end single
+
+ !$omp single
+ print *, x
+ !$omp end single copyprivate(x)
+
+ ! Invalid testcases
+ !$omp single
+ print *, x
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp end single copyprivate(x) nowait
+
+ !ERROR: 'x' appears in more than one COPYPRIVATE clause on the SINGLE directive
+ !$omp single copyprivate(x) copyprivate(x)
+ print *, x
+ !$omp end single
+
+ !$omp single
+ print *, x
+ !ERROR: 'x' appears in more than one COPYPRIVATE clause on the END SINGLE directive
+ !$omp end single copyprivate(x) copyprivate(x)
+
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
+ !$omp single nowait nowait
+ print *, x
+ !$omp end single
+
+ !$omp single
+ print *, x
+ !ERROR: At most one NOWAIT clause can appear on the END SINGLE directive
+ !$omp end single nowait nowait
+
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp single copyprivate(x) nowait
+ print *, x
+ !WARNING: The COPYPRIVATE clause with 'x' is already used on the SINGLE directive
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
+ !$omp end single copyprivate(x) nowait
+
+ !$omp single copyprivate(x)
+ print *, x
+ !WARNING: The COPYPRIVATE clause with 'x' is already used on the SINGLE directive
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp end single copyprivate(x) nowait
+
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp single copyprivate(x, y) nowait
+ print *, x
+ !WARNING: The COPYPRIVATE clause with 'x' is already used on the SINGLE directive
+ !ERROR: 'z' appears in more than one COPYPRIVATE clause on the END SINGLE directive
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
+ !$omp end single copyprivate(x, z) copyprivate(z) nowait
+
+ !ERROR: NOWAIT clause must not be used with COPYPRIVATE clause on the SINGLE directive
+ !$omp single copyprivate(x) nowait copyprivate(y) copyprivate(z)
+ print *, x
+ !WARNING: The COPYPRIVATE clause with 'x' is already used on the SINGLE directive
+ !WARNING: The COPYPRIVATE clause with 'y' is already used on the SINGLE directive
+ !WARNING: The COPYPRIVATE clause with 'z' is already used on the SINGLE directive
+ !ERROR: At most one NOWAIT clause can appear on the SINGLE directive
+ !$omp end single copyprivate(x, y, z) nowait
+end program
diff --git a/flang/test/Semantics/OpenMP/threadprivate04.f90 b/flang/test/Semantics/OpenMP/threadprivate04.f90
index 3d8c7fb8de8fa..d261f33b4dbd7 100644
--- a/flang/test/Semantics/OpenMP/threadprivate04.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate04.f90
@@ -14,7 +14,6 @@ program main
!$omp parallel num_threads(x1)
!$omp end parallel
- !ERROR: COPYPRIVATE clause is not allowed on the OMP SINGLE directive, use it on OMP END SINGLE directive
!$omp single copyprivate(x2, /blk1/)
!$omp end single
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 8a2f30a7995dc..c5d03d554616e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -1023,9 +1023,11 @@ def OMP_Single : Directive<"single"> {
VersionedClause<OMPC_Allocate>,
VersionedClause<OMPC_CopyPrivate>,
VersionedClause<OMPC_FirstPrivate>,
- VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_Private>,
];
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_NoWait>,
+ ];
let association = AS_Block;
let category = CA_Executable;
}
From 9543e9e9270e01f2c7311b571246c6ea105bcdb0 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy at amd.com>
Date: Fri, 7 Mar 2025 05:44:39 +0100
Subject: [PATCH 07/23] [flang][OpenMP] Handle pre-determined `lastprivate` for
 `simd` (#129507)
This PR tries to fix `lastprivate` update issues in composite
constructs. In particular, pre-determined `lastprivate` symbols are
attached to the wrong leaf of the composite construct (the outermost
one). When using delayed privatization (which should become the default mode
in the future), this results in trying to update the `lastprivate` symbol
in the wrong construct (outside the `omp.loop_nest` op).
For example, given the following input:
```fortran
!$omp target teams distribute parallel do simd collapse(2) private(y_max)
do i=x_min,x_max
do j=y_min,y_max
enddo
enddo
```
Without the fixes introduced in this PR, the `DataSharingProcessor`
tries to generate the `lastprivate` update ops in the `parallel` op
since this is the op for which the DSP instance is created.
The fix consists of 2 main parts:
1. Instead of creating a single DSP instance, one instance is created
for the leaf constructs that might need privatization (whether for
explicit, implicit, or pre-determined symbols).
2. When generating the `lastprivate` comparison ops, we don't directly
use the SSA values of the UBs and steps. Instead, we regenerate these
SSA values from the original loop-bound expressions. We have to do
this to avoid using `host_eval` values in the `lastprivate` comparison
logic, which is illegal.
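For reference, the last-iteration test those comparison ops implement, modeled
as a scalar C++ predicate (a hypothetical standalone form of the
`v = iv + step; cmp = step < 0 ? v < ub : v > ub` scheme shown in
insertLastPrivateCompare below):
```cpp
#include <cstdint>

// Hypothetical scalar model of the per-loop last-iteration check: after
// advancing the induction variable once more, have we stepped past ub?
static bool isLastIteration(int64_t iv, int64_t ub, int64_t step) {
  int64_t v = iv + step;
  return step < 0 ? v < ub : v > ub;
}
```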
---
flang/lib/Lower/OpenMP/ClauseFinder.h | 76 +++++++++++++++++++
flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 70 +----------------
flang/lib/Lower/OpenMP/ClauseProcessor.h | 37 +--------
.../lib/Lower/OpenMP/DataSharingProcessor.cpp | 10 ++-
flang/lib/Lower/OpenMP/OpenMP.cpp | 35 +++++----
flang/lib/Lower/OpenMP/Utils.cpp | 75 ++++++++++++++++++
flang/lib/Lower/OpenMP/Utils.h | 5 ++
.../OpenMP/distribute-parallel-do-simd.f90 | 71 +++++++++++++----
flang/test/Lower/OpenMP/lastprivate-iv.f90 | 20 +++--
flang/test/Lower/OpenMP/lastprivate-simd.f90 | 2 +-
.../Lower/OpenMP/parallel-wsloop-lastpriv.f90 | 75 +++++++++++-------
11 files changed, 309 insertions(+), 167 deletions(-)
create mode 100644 flang/lib/Lower/OpenMP/ClauseFinder.h
diff --git a/flang/lib/Lower/OpenMP/ClauseFinder.h b/flang/lib/Lower/OpenMP/ClauseFinder.h
new file mode 100644
index 0000000000000..3b77f2ca1d4cb
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/ClauseFinder.h
@@ -0,0 +1,76 @@
+//===-- Lower/OpenMP/ClauseFinder.h --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_CLAUSEFINDER_H
+#define FORTRAN_LOWER_CLAUSEFINDER_H
+
+#include "Clauses.h"
+
+namespace Fortran {
+namespace lower {
+namespace omp {
+
+class ClauseFinder {
+ using ClauseIterator = List<Clause>::const_iterator;
+
+public:
+ /// Utility to find a clause within a range in the clause list.
+ template <typename T>
+ static ClauseIterator findClause(ClauseIterator begin, ClauseIterator end) {
+ for (ClauseIterator it = begin; it != end; ++it) {
+ if (std::get_if<T>(&it->u))
+ return it;
+ }
+
+ return end;
+ }
+
+ /// Return the first instance of the given clause found in the clause list or
+ /// `nullptr` if not present. If more than one instance is expected, use
+ /// `findRepeatableClause` instead.
+ template <typename T>
+ static const T *findUniqueClause(const List<Clause> &clauses,
+ const parser::CharBlock **source = nullptr) {
+ ClauseIterator it = findClause<T>(clauses.begin(), clauses.end());
+ if (it != clauses.end()) {
+ if (source)
+ *source = &it->source;
+ return &std::get<T>(it->u);
+ }
+ return nullptr;
+ }
+
+ /// Call `callbackFn` for each occurrence of the given clause. Return `true`
+ /// if at least one instance was found.
+ template <typename T>
+ static bool findRepeatableClause(
+ const List<Clause> &clauses,
+ std::function<void(const T &, const parser::CharBlock &source)>
+ callbackFn) {
+ bool found = false;
+ ClauseIterator nextIt, endIt = clauses.end();
+ for (ClauseIterator it = clauses.begin(); it != endIt; it = nextIt) {
+ nextIt = findClause<T>(it, endIt);
+
+ if (nextIt != endIt) {
+ callbackFn(std::get<T>(nextIt->u), nextIt->source);
+ found = true;
+ ++nextIt;
+ }
+ }
+ return found;
+ }
+};
+} // namespace omp
+} // namespace lower
+} // namespace Fortran
+
+#endif // FORTRAN_LOWER_CLAUSEFINDER_H
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index e21d299570b86..98a2bb7583d98 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -12,6 +12,7 @@
#include "ClauseProcessor.h"
#include "Clauses.h"
+#include "Utils.h"
#include "flang/Lower/PFTBuilder.h"
#include "flang/Parser/tools.h"
@@ -201,24 +202,6 @@ static void addUseDeviceClause(
useDeviceSyms.push_back(object.sym());
}
-static void convertLoopBounds(lower::AbstractConverter &converter,
- mlir::Location loc,
- mlir::omp::LoopRelatedClauseOps &result,
- std::size_t loopVarTypeSize) {
- fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
- // The types of lower bound, upper bound, and step are converted into the
- // type of the loop variable if necessary.
- mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
- for (unsigned it = 0; it < (unsigned)result.loopLowerBounds.size(); it++) {
- result.loopLowerBounds[it] = firOpBuilder.createConvert(
- loc, loopVarType, result.loopLowerBounds[it]);
- result.loopUpperBounds[it] = firOpBuilder.createConvert(
- loc, loopVarType, result.loopUpperBounds[it]);
- result.loopSteps[it] =
- firOpBuilder.createConvert(loc, loopVarType, result.loopSteps[it]);
- }
-}
-
//===----------------------------------------------------------------------===//
// ClauseProcessor unique clauses
//===----------------------------------------------------------------------===//
@@ -240,55 +223,8 @@ bool ClauseProcessor::processCollapse(
mlir::Location currentLocation, lower::pft::Evaluation &eval,
mlir::omp::LoopRelatedClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const {
- bool found = false;
- fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-
- // Collect the loops to collapse.
- lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation();
- if (doConstructEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
- TODO(currentLocation, "Do Concurrent in Worksharing loop construct");
- }
-
- std::int64_t collapseValue = 1l;
- if (auto *clause = findUniqueClause<omp::clause::Collapse>()) {
- collapseValue = evaluate::ToInt64(clause->v).value();
- found = true;
- }
-
- std::size_t loopVarTypeSize = 0;
- do {
- lower::pft::Evaluation *doLoop =
- &doConstructEval->getFirstNestedEvaluation();
- auto *doStmt = doLoop->getIf<parser::NonLabelDoStmt>();
- assert(doStmt && "Expected do loop to be in the nested evaluation");
- const auto &loopControl =
- std::get<std::optional<parser::LoopControl>>(doStmt->t);
- const parser::LoopControl::Bounds *bounds =
- std::get_if<parser::LoopControl::Bounds>(&loopControl->u);
- assert(bounds && "Expected bounds for worksharing do loop");
- lower::StatementContext stmtCtx;
- result.loopLowerBounds.push_back(fir::getBase(
- converter.genExprValue(*semantics::GetExpr(bounds->lower), stmtCtx)));
- result.loopUpperBounds.push_back(fir::getBase(
- converter.genExprValue(*semantics::GetExpr(bounds->upper), stmtCtx)));
- if (bounds->step) {
- result.loopSteps.push_back(fir::getBase(
- converter.genExprValue(*semantics::GetExpr(bounds->step), stmtCtx)));
- } else { // If `step` is not present, assume it as `1`.
- result.loopSteps.push_back(firOpBuilder.createIntegerConstant(
- currentLocation, firOpBuilder.getIntegerType(32), 1));
- }
- iv.push_back(bounds->name.thing.symbol);
- loopVarTypeSize = std::max(loopVarTypeSize,
- bounds->name.thing.symbol->GetUltimate().size());
- collapseValue--;
- doConstructEval =
- &*std::next(doConstructEval->getNestedEvaluations().begin());
- } while (collapseValue > 0);
-
- convertLoopBounds(converter, currentLocation, result, loopVarTypeSize);
-
- return found;
+ return collectLoopRelatedInfo(converter, currentLocation, eval, clauses,
+ result, iv);
}
bool ClauseProcessor::processDevice(lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 889a09a8f0cd8..c2a136d620b29 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -12,6 +12,7 @@
#ifndef FORTRAN_LOWER_CLAUSEPROCESSOR_H
#define FORTRAN_LOWER_CLAUSEPROCESSOR_H
+#include "ClauseFinder.h"
#include "Clauses.h"
#include "ReductionProcessor.h"
#include "Utils.h"
@@ -148,10 +149,6 @@ class ClauseProcessor {
private:
using ClauseIterator = List<Clause>::const_iterator;
- /// Utility to find a clause within a range in the clause list.
- template <typename T>
- static ClauseIterator findClause(ClauseIterator begin, ClauseIterator end);
-
/// Return the first instance of the given clause found in the clause list or
/// `nullptr` if not present. If more than one instance is expected, use
/// `findRepeatableClause` instead.
@@ -199,45 +196,17 @@ void ClauseProcessor::processTODO(mlir::Location currentLocation,
(checkUnhandledClause(it->id, std::get_if<Ts>(&it->u)), ...);
}
-template <typename T>
-ClauseProcessor::ClauseIterator
-ClauseProcessor::findClause(ClauseIterator begin, ClauseIterator end) {
- for (ClauseIterator it = begin; it != end; ++it) {
- if (std::get_if<T>(&it->u))
- return it;
- }
-
- return end;
-}
-
template <typename T>
const T *
ClauseProcessor::findUniqueClause(const parser::CharBlock **source) const {
- ClauseIterator it = findClause<T>(clauses.begin(), clauses.end());
- if (it != clauses.end()) {
- if (source)
- *source = &it->source;
- return &std::get<T>(it->u);
- }
- return nullptr;
+ return ClauseFinder::findUniqueClause<T>(clauses, source);
}
template <typename T>
bool ClauseProcessor::findRepeatableClause(
std::function<void(const T &, const parser::CharBlock &source)> callbackFn)
const {
- bool found = false;
- ClauseIterator nextIt, endIt = clauses.end();
- for (ClauseIterator it = clauses.begin(); it != endIt; it = nextIt) {
- nextIt = findClause<T>(it, endIt);
-
- if (nextIt != endIt) {
- callbackFn(std::get<T>(nextIt->u), nextIt->source);
- found = true;
- ++nextIt;
- }
- }
- return found;
+ return ClauseFinder::findRepeatableClause<T>(clauses, callbackFn);
}
template <typename T>
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 781b0dfceff9e..b88454c45da85 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -257,6 +257,11 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
return;
if (mlir::isa<mlir::omp::WsloopOp>(op) || mlir::isa<mlir::omp::SimdOp>(op)) {
+ mlir::omp::LoopRelatedClauseOps result;
+ llvm::SmallVector<const semantics::Symbol *> iv;
+ collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval,
+ clauses, result, iv);
+
// Update the original variable just before exiting the worksharing
// loop. Conversion as follows:
//
@@ -280,9 +285,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
mlir::Value cmpOp;
llvm::SmallVector<mlir::Value> vs;
vs.reserve(loopOp.getIVs().size());
- for (auto [iv, ub, step] :
- llvm::zip_equal(loopOp.getIVs(), loopOp.getLoopUpperBounds(),
- loopOp.getLoopSteps())) {
+ for (auto [iv, ub, step] : llvm::zip_equal(
+ loopOp.getIVs(), result.loopUpperBounds, result.loopSteps)) {
// v = iv + step
// cmp = step < 0 ? v < ub : v > ub
mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index b1568cc12a05a..ca161bc2ba337 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1208,27 +1208,27 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info,
if (privatize) {
// DataSharingProcessor::processStep2() may create operations before/after
// the one passed as argument. We need to treat loop wrappers and their
- // nested loop as a unit, so we need to pass the top level wrapper (if
+ // nested loop as a unit, so we need to pass the bottom level wrapper (if
// present). Otherwise, these operations will be inserted within a
// wrapper region.
- mlir::Operation *privatizationTopLevelOp = &op;
+ mlir::Operation *privatizationBottomLevelOp = &op;
if (auto loopNest = llvm::dyn_cast<mlir::omp::LoopNestOp>(op)) {
llvm::SmallVector<mlir::omp::LoopWrapperInterface> wrappers;
loopNest.gatherWrappers(wrappers);
if (!wrappers.empty())
- privatizationTopLevelOp = &*wrappers.back();
+ privatizationBottomLevelOp = &*wrappers.front();
}
if (!info.dsp) {
assert(tempDsp.has_value());
- tempDsp->processStep2(privatizationTopLevelOp, isLoop);
+ tempDsp->processStep2(privatizationBottomLevelOp, isLoop);
} else {
if (isLoop && regionArgs.size() > 0) {
for (const auto ®ionArg : regionArgs) {
info.dsp->pushLoopIV(info.converter.getSymbolAddress(*regionArg));
}
}
- info.dsp->processStep2(privatizationTopLevelOp, isLoop);
+ info.dsp->processStep2(privatizationBottomLevelOp, isLoop);
}
}
}
@@ -2741,18 +2741,20 @@ static void genCompositeDistributeParallelDoSimd(
genParallelClauses(converter, semaCtx, stmtCtx, parallelItem->clauses, loc,
parallelClauseOps, parallelReductionSyms);
- DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval,
- /*shouldCollectPreDeterminedSymbols=*/true,
- /*useDelayedPrivatization=*/true, symTable);
- dsp.processStep1(¶llelClauseOps);
+ DataSharingProcessor parallelItemDSP(
+ converter, semaCtx, parallelItem->clauses, eval,
+ /*shouldCollectPreDeterminedSymbols=*/false,
+ /*useDelayedPrivatization=*/true, symTable);
+ parallelItemDSP.processStep1(¶llelClauseOps);
EntryBlockArgs parallelArgs;
- parallelArgs.priv.syms = dsp.getDelayedPrivSymbols();
+ parallelArgs.priv.syms = parallelItemDSP.getDelayedPrivSymbols();
parallelArgs.priv.vars = parallelClauseOps.privateVars;
parallelArgs.reduction.syms = parallelReductionSyms;
parallelArgs.reduction.vars = parallelClauseOps.reductionVars;
genParallelOp(converter, symTable, semaCtx, eval, loc, queue, parallelItem,
- parallelClauseOps, parallelArgs, &dsp, /*isComposite=*/true);
+ parallelClauseOps, parallelArgs, ¶llelItemDSP,
+ /*isComposite=*/true);
// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
@@ -2769,6 +2771,11 @@ static void genCompositeDistributeParallelDoSimd(
genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
simdReductionSyms);
+ DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
+ /*shouldCollectPreDeterminedSymbols=*/true,
+ /*useDelayedPrivatization=*/true, symTable);
+ simdItemDSP.processStep1(&simdClauseOps);
+
mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc,
@@ -2790,7 +2797,8 @@ static void genCompositeDistributeParallelDoSimd(
wsloopOp.setComposite(/*val=*/true);
EntryBlockArgs simdArgs;
- // TODO: Add private syms and vars.
+ simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols();
+ simdArgs.priv.vars = simdClauseOps.privateVars;
simdArgs.reduction.syms = simdReductionSyms;
simdArgs.reduction.vars = simdClauseOps.reductionVars;
auto simdOp =
@@ -2802,7 +2810,8 @@ static void genCompositeDistributeParallelDoSimd(
{{distributeOp, distributeArgs},
{wsloopOp, wsloopArgs},
{simdOp, simdArgs}},
- llvm::omp::Directive::OMPD_distribute_parallel_do_simd, dsp);
+ llvm::omp::Directive::OMPD_distribute_parallel_do_simd,
+ simdItemDSP);
}
static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 48bcf492fd368..744c3bd04a0a7 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -14,6 +14,7 @@
#include "Clauses.h"
+#include "ClauseFinder.h"
#include <flang/Lower/AbstractConverter.h>
#include <flang/Lower/ConvertType.h>
#include <flang/Lower/DirectivesCommon.h>
@@ -595,6 +596,80 @@ void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp,
}
}
+static void convertLoopBounds(lower::AbstractConverter &converter,
+ mlir::Location loc,
+ mlir::omp::LoopRelatedClauseOps &result,
+ std::size_t loopVarTypeSize) {
+ fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+ // The types of lower bound, upper bound, and step are converted into the
+ // type of the loop variable if necessary.
+ mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
+ for (unsigned it = 0; it < (unsigned)result.loopLowerBounds.size(); it++) {
+ result.loopLowerBounds[it] = firOpBuilder.createConvert(
+ loc, loopVarType, result.loopLowerBounds[it]);
+ result.loopUpperBounds[it] = firOpBuilder.createConvert(
+ loc, loopVarType, result.loopUpperBounds[it]);
+ result.loopSteps[it] =
+ firOpBuilder.createConvert(loc, loopVarType, result.loopSteps[it]);
+ }
+}
+
+bool collectLoopRelatedInfo(
+ lower::AbstractConverter &converter, mlir::Location currentLocation,
+ lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
+ mlir::omp::LoopRelatedClauseOps &result,
+ llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
+ bool found = false;
+ fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+
+ // Collect the loops to collapse.
+ lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation();
+ if (doConstructEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
+ TODO(currentLocation, "Do Concurrent in Worksharing loop construct");
+ }
+
+ std::int64_t collapseValue = 1l;
+ if (auto *clause =
+ ClauseFinder::findUniqueClause<omp::clause::Collapse>(clauses)) {
+ collapseValue = evaluate::ToInt64(clause->v).value();
+ found = true;
+ }
+
+ std::size_t loopVarTypeSize = 0;
+ do {
+ lower::pft::Evaluation *doLoop =
+ &doConstructEval->getFirstNestedEvaluation();
+ auto *doStmt = doLoop->getIf<parser::NonLabelDoStmt>();
+ assert(doStmt && "Expected do loop to be in the nested evaluation");
+ const auto &loopControl =
+ std::get<std::optional<parser::LoopControl>>(doStmt->t);
+ const parser::LoopControl::Bounds *bounds =
+ std::get_if<parser::LoopControl::Bounds>(&loopControl->u);
+ assert(bounds && "Expected bounds for worksharing do loop");
+ lower::StatementContext stmtCtx;
+ result.loopLowerBounds.push_back(fir::getBase(
+ converter.genExprValue(*semantics::GetExpr(bounds->lower), stmtCtx)));
+ result.loopUpperBounds.push_back(fir::getBase(
+ converter.genExprValue(*semantics::GetExpr(bounds->upper), stmtCtx)));
+ if (bounds->step) {
+ result.loopSteps.push_back(fir::getBase(
+ converter.genExprValue(*semantics::GetExpr(bounds->step), stmtCtx)));
+    } else { // If `step` is not present, assume it is `1`.
+ result.loopSteps.push_back(firOpBuilder.createIntegerConstant(
+ currentLocation, firOpBuilder.getIntegerType(32), 1));
+ }
+ iv.push_back(bounds->name.thing.symbol);
+ loopVarTypeSize = std::max(loopVarTypeSize,
+ bounds->name.thing.symbol->GetUltimate().size());
+ collapseValue--;
+ doConstructEval =
+ &*std::next(doConstructEval->getNestedEvaluations().begin());
+ } while (collapseValue > 0);
+
+ convertLoopBounds(converter, currentLocation, result, loopVarTypeSize);
+
+ return found;
+}
} // namespace omp
} // namespace lower
} // namespace Fortran
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 3943eb633b04e..30b4613837b9a 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -163,6 +163,11 @@ void genObjectList(const ObjectList &objects,
void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp,
mlir::Location loc);
+bool collectLoopRelatedInfo(
+ lower::AbstractConverter &converter, mlir::Location currentLocation,
+ lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
+ mlir::omp::LoopRelatedClauseOps &result,
+ llvm::SmallVectorImpl<const semantics::Symbol *> &iv);
} // namespace omp
} // namespace lower
} // namespace Fortran
diff --git a/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90 b/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90
index bea7f037cecf3..142bc02ae8c1d 100644
--- a/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90
+++ b/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90
@@ -8,10 +8,10 @@
subroutine distribute_parallel_do_simd_num_threads()
!$omp teams
- ! CHECK: omp.parallel num_threads({{.*}}) private({{.*}}) {
+ ! CHECK: omp.parallel num_threads({{.*}}) {
! CHECK: omp.distribute {
! CHECK-NEXT: omp.wsloop {
- ! CHECK-NEXT: omp.simd {
+ ! CHECK-NEXT: omp.simd private({{.*}}) {
! CHECK-NEXT: omp.loop_nest
!$omp distribute parallel do simd num_threads(10)
do index_ = 1, 10
@@ -25,10 +25,10 @@ end subroutine distribute_parallel_do_simd_num_threads
subroutine distribute_parallel_do_simd_dist_schedule()
!$omp teams
- ! CHECK: omp.parallel private({{.*}}) {
+ ! CHECK: omp.parallel {
! CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size({{.*}}) {
! CHECK-NEXT: omp.wsloop {
- ! CHECK-NEXT: omp.simd {
+ ! CHECK-NEXT: omp.simd private({{.*}}) {
! CHECK-NEXT: omp.loop_nest
!$omp distribute parallel do simd dist_schedule(static, 4)
do index_ = 1, 10
@@ -42,10 +42,10 @@ end subroutine distribute_parallel_do_simd_dist_schedule
subroutine distribute_parallel_do_simd_schedule()
!$omp teams
- ! CHECK: omp.parallel private({{.*}}) {
+ ! CHECK: omp.parallel {
! CHECK: omp.distribute {
! CHECK-NEXT: omp.wsloop schedule(static = {{.*}}) {
- ! CHECK-NEXT: omp.simd {
+ ! CHECK-NEXT: omp.simd private({{.*}}) {
! CHECK-NEXT: omp.loop_nest
!$omp distribute parallel do simd schedule(static, 4)
do index_ = 1, 10
@@ -59,10 +59,10 @@ end subroutine distribute_parallel_do_simd_schedule
subroutine distribute_parallel_do_simd_simdlen()
!$omp teams
- ! CHECK: omp.parallel private({{.*}}) {
+ ! CHECK: omp.parallel {
! CHECK: omp.distribute {
! CHECK-NEXT: omp.wsloop {
- ! CHECK-NEXT: omp.simd simdlen(4) {
+ ! CHECK-NEXT: omp.simd simdlen(4) private({{.*}}) {
! CHECK-NEXT: omp.loop_nest
!$omp distribute parallel do simd simdlen(4)
do index_ = 1, 10
@@ -83,14 +83,14 @@ subroutine distribute_parallel_do_simd_private()
! CHECK: omp.teams {
!$omp teams
- ! CHECK: omp.parallel private(@{{.*}} %[[X]]#0 -> %[[X_ARG:[^,]+]],
- ! CHECK-SAME: @{{.*}} %[[INDEX]]#0 -> %[[INDEX_ARG:.*]] : !fir.ref<i64>, !fir.ref<i32>) {
- ! CHECK: %[[X_PRIV:.*]]:2 = hlfir.declare %[[X_ARG]]
- ! CHECK: %[[INDEX_PRIV:.*]]:2 = hlfir.declare %[[INDEX_ARG]]
+ ! CHECK: omp.parallel {
! CHECK: omp.distribute {
! CHECK-NEXT: omp.wsloop {
- ! CHECK-NEXT: omp.simd {
+ ! CHECK-NEXT: omp.simd private(@{{.*}} %[[X]]#0 -> %[[X_ARG:[^,]+]],
+ ! CHECK-SAME: @{{.*}} %[[INDEX]]#0 -> %[[INDEX_ARG:.*]] : !fir.ref<i64>, !fir.ref<i32>) {
! CHECK-NEXT: omp.loop_nest
+ ! CHECK: %[[X_PRIV:.*]]:2 = hlfir.declare %[[X_ARG]]
+ ! CHECK: %[[INDEX_PRIV:.*]]:2 = hlfir.declare %[[INDEX_ARG]]
!$omp distribute parallel do simd private(x)
do index_ = 1, 10
end do
@@ -98,3 +98,48 @@ subroutine distribute_parallel_do_simd_private()
!$omp end teams
end subroutine distribute_parallel_do_simd_private
+
+! CHECK-LABEL: func.func @_QPlastprivate_cond_in_composite_construct
+subroutine lastprivate_cond_in_composite_construct(x_min, x_max, y_min, y_max)
+implicit none
+integer :: x_min,x_max,y_min,y_max
+integer :: i,j
+
+! CHECK: omp.target {{.*}} {
+! CHECK: %[[X_MAX_MAPPED:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}x_max"}
+! CHECK: omp.teams {
+! CHECK: omp.parallel {
+! CHECK: omp.distribute {
+! CHECK: omp.wsloop {
+! CHECK: omp.simd private({{.*}}) {
+! CHECK: omp.loop_nest (%[[I_IV:.*]], %[[J_IV:.*]]) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) {
+! CHECK: %[[Y_MAX_PRIV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}y_max"}
+
+! CHECK: %[[I_UB:.*]] = fir.load %[[X_MAX_MAPPED]]#0 : !fir.ref<i32>
+! CHECK: %[[I_STEP:.*]] = arith.constant 1 : i32
+! CHECK: %[[J_UB:.*]] = fir.load %[[Y_MAX_PRIV]]#0 : !fir.ref<i32>
+! CHECK: %[[J_STEP:.*]] = arith.constant 1 : i32
+
+! CHECK: %[[VAL_55:.*]] = arith.addi %[[I_IV]], %[[I_STEP]] : i32
+! CHECK: %[[VAL_56:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_57:.*]] = arith.cmpi slt, %[[I_STEP]], %[[VAL_56]] : i32
+! CHECK: %[[VAL_58:.*]] = arith.cmpi slt, %[[VAL_55]], %[[I_UB]] : i32
+! CHECK: %[[VAL_59:.*]] = arith.cmpi sgt, %[[VAL_55]], %[[I_UB]] : i32
+! CHECK: %[[VAL_60:.*]] = arith.select %[[VAL_57]], %[[VAL_58]], %[[VAL_59]] : i1
+
+! CHECK: %[[VAL_61:.*]] = arith.addi %[[J_IV]], %[[J_STEP]] : i32
+! CHECK: %[[VAL_62:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_63:.*]] = arith.cmpi slt, %[[J_STEP]], %[[VAL_62]] : i32
+! CHECK: %[[VAL_64:.*]] = arith.cmpi slt, %[[VAL_61]], %[[J_UB]] : i32
+! CHECK: %[[VAL_65:.*]] = arith.cmpi sgt, %[[VAL_61]], %[[J_UB]] : i32
+! CHECK: %[[VAL_66:.*]] = arith.select %[[VAL_63]], %[[VAL_64]], %[[VAL_65]] : i1
+
+! CHECK: %[[LASTPRIV_CMP:.*]] = arith.andi %[[VAL_60]], %[[VAL_66]] : i1
+! CHECK: fir.if %[[LASTPRIV_CMP]] {
+
+!$omp target teams distribute parallel do simd collapse(2) private(y_max)
+ do i=x_min,x_max
+ do j=y_min,y_max
+ enddo
+ enddo
+end subroutine
diff --git a/flang/test/Lower/OpenMP/lastprivate-iv.f90 b/flang/test/Lower/OpenMP/lastprivate-iv.f90
index 7918b47a400b9..e90c7e253ac06 100644
--- a/flang/test/Lower/OpenMP/lastprivate-iv.f90
+++ b/flang/test/Lower/OpenMP/lastprivate-iv.f90
@@ -14,11 +14,13 @@
!CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
!CHECK: %[[I:.*]]:2 = hlfir.declare %[[I_MEM]] {uniq_name = "_QFlastprivate_iv_incEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: hlfir.assign %[[IV]] to %[[I]]#1 : i32, !fir.ref<i32>
-!CHECK: %[[V:.*]] = arith.addi %[[IV]], %[[STEP]] : i32
+!CHECK: %[[UB_2:.*]] = arith.constant 10 : i32
+!CHECK: %[[STEP_2:.*]] = arith.constant 3 : i32
+!CHECK: %[[V:.*]] = arith.addi %[[IV]], %[[STEP_2]] : i32
!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[STEP_NEG:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]] : i32
-!CHECK: %[[V_LT:.*]] = arith.cmpi slt, %[[V]], %[[UB]] : i32
-!CHECK: %[[V_GT:.*]] = arith.cmpi sgt, %[[V]], %[[UB]] : i32
+!CHECK: %[[STEP_NEG:.*]] = arith.cmpi slt, %[[STEP_2]], %[[C0]] : i32
+!CHECK: %[[V_LT:.*]] = arith.cmpi slt, %[[V]], %[[UB_2]] : i32
+!CHECK: %[[V_GT:.*]] = arith.cmpi sgt, %[[V]], %[[UB_2]] : i32
!CHECK: %[[CMP:.*]] = arith.select %[[STEP_NEG]], %[[V_LT]], %[[V_GT]] : i1
!CHECK: fir.if %[[CMP]] {
!CHECK: hlfir.assign %[[V]] to %[[I]]#1 : i32, !fir.ref<i32>
@@ -48,11 +50,13 @@ subroutine lastprivate_iv_inc()
!CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
!CHECK: %[[I:.*]]:2 = hlfir.declare %[[I_MEM]] {uniq_name = "_QFlastprivate_iv_decEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: hlfir.assign %[[IV]] to %[[I]]#1 : i32, !fir.ref<i32>
-!CHECK: %[[V:.*]] = arith.addi %[[IV]], %[[STEP]] : i32
+!CHECK: %[[UB_2:.*]] = arith.constant 1 : i32
+!CHECK: %[[STEP_2:.*]] = arith.constant -3 : i32
+!CHECK: %[[V:.*]] = arith.addi %[[IV]], %[[STEP_2]] : i32
!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[STEP_NEG:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]] : i32
-!CHECK: %[[V_LT:.*]] = arith.cmpi slt, %[[V]], %[[UB]] : i32
-!CHECK: %[[V_GT:.*]] = arith.cmpi sgt, %[[V]], %[[UB]] : i32
+!CHECK: %[[STEP_NEG:.*]] = arith.cmpi slt, %[[STEP_2]], %[[C0]] : i32
+!CHECK: %[[V_LT:.*]] = arith.cmpi slt, %[[V]], %[[UB_2]] : i32
+!CHECK: %[[V_GT:.*]] = arith.cmpi sgt, %[[V]], %[[UB_2]] : i32
!CHECK: %[[CMP:.*]] = arith.select %[[STEP_NEG]], %[[V_LT]], %[[V_GT]] : i1
!CHECK: fir.if %[[CMP]] {
!CHECK: hlfir.assign %[[V]] to %[[I]]#1 : i32, !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/lastprivate-simd.f90 b/flang/test/Lower/OpenMP/lastprivate-simd.f90
index df42b35637de7..c542f1f99dc25 100644
--- a/flang/test/Lower/OpenMP/lastprivate-simd.f90
+++ b/flang/test/Lower/OpenMP/lastprivate-simd.f90
@@ -38,7 +38,7 @@ subroutine simd_ivs
! CHECK: %[[IDO2_PRIV_DECL:.*]]:2 = hlfir.declare %[[IDO2_PRIV_ARG]] {uniq_name = "{{.*}}Eido2"}
! CHECK: %[[IDO3_PRIV_DECL:.*]]:2 = hlfir.declare %[[IDO3_PRIV_ARG]] {uniq_name = "{{.*}}Eido3"}
-! CHECK: fir.if %33 {
+! CHECK: fir.if %{{.*}} {
! CHECK: hlfir.assign %{{.*}} to %[[IDO1_PRIV_DECL]]#1
! CHECK: hlfir.assign %{{.*}} to %[[IDO2_PRIV_DECL]]#1
! CHECK: hlfir.assign %{{.*}} to %[[IDO3_PRIV_DECL]]#1
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
index 72482fc184861..14b83ce60a674 100644
--- a/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-lastpriv.f90
@@ -23,11 +23,13 @@ subroutine omp_do_lastprivate(a)
! CHECK-NEXT: hlfir.assign %[[ARG1]] to %[[I_PVT_DECL]]#1 : i32, !fir.ref<i32>
! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
- ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP]] : i32
+ ! CHECK: %[[UB_2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP_2:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP_2]] : i32
! CHECK: %[[ZERO:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP_DIR:.*]] = arith.cmpi slt, %[[STEP]], %[[ZERO]] : i32
- ! CHECK: %[[LT_UB:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB]] : i32
- ! CHECK: %[[GT_UB:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB]] : i32
+ ! CHECK: %[[STEP_DIR:.*]] = arith.cmpi slt, %[[STEP_2]], %[[ZERO]] : i32
+ ! CHECK: %[[LT_UB:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB_2]] : i32
+ ! CHECK: %[[GT_UB:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB_2]] : i32
! CHECK: %[[SEL:.*]] = arith.select %[[STEP_DIR]], %[[LT_UB]], %[[GT_UB]] : i1
! CHECK: fir.if %[[SEL]] {
! CHECK: hlfir.assign %[[NEXT_ARG1]] to %[[I_PVT_DECL]]#1 : i32, !fir.ref<i32>
@@ -67,11 +69,13 @@ subroutine omp_do_lastprivate2(a, n)
! CHECK: hlfir.assign %[[ARG2]] to %[[I_PVT_DECL]]#1 : i32, !fir.ref<i32>
! CHECK: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
- ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP]] : i32
+ ! CHECK: %[[UB_2:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP_2:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP_2]] : i32
! CHECK: %[[ZERO:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP_DIR:.*]] = arith.cmpi slt, %[[STEP]], %[[ZERO]] : i32
- ! CHECK: %[[LT_UB:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB]] : i32
- ! CHECK: %[[GT_UB:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB]] : i32
+ ! CHECK: %[[STEP_DIR:.*]] = arith.cmpi slt, %[[STEP_2]], %[[ZERO]] : i32
+ ! CHECK: %[[LT_UB:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB_2]] : i32
+ ! CHECK: %[[GT_UB:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB_2]] : i32
! CHECK: %[[SEL:.*]] = arith.select %[[STEP_DIR]], %[[LT_UB]], %[[GT_UB]] : i1
! CHECK: fir.if %[[SEL]] {
! CHECK: hlfir.assign %[[NEXT_ARG2]] to %[[I_PVT_DECL]]#1 : i32, !fir.ref<i32>
@@ -112,17 +116,24 @@ subroutine omp_do_lastprivate_collapse2(a)
! CHECK-NEXT: hlfir.assign %[[ARG1]] to %[[I_PVT_DECL]]#1 : i32, !fir.ref<i32>
! CHECK-NEXT: hlfir.assign %[[ARG2]] to %[[J_PVT_DECL]]#1 : i32, !fir.ref<i32>
! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
- ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP1]] : i32
+
+ ! CHECK: %[[UB1_2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP1_2:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[UB2_2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP2_2:.*]] = arith.constant 1 : i32
+
+ ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP1_2]] : i32
! CHECK: %[[ZERO1:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP1_END:.*]] = arith.cmpi slt, %[[STEP1]], %[[ZERO1]] : i32
- ! CHECK: %[[LT_UB1:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB1]] : i32
- ! CHECK: %[[GT_UB1:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB1]] : i32
+ ! CHECK: %[[STEP1_END:.*]] = arith.cmpi slt, %[[STEP1_2]], %[[ZERO1]] : i32
+ ! CHECK: %[[LT_UB1:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB1_2]] : i32
+ ! CHECK: %[[GT_UB1:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB1_2]] : i32
! CHECK: %[[SEL1:.*]] = arith.select %[[STEP1_END]], %[[LT_UB1]], %[[GT_UB1]] : i1
- ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP2]] : i32
+
+ ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP2_2]] : i32
! CHECK: %[[ZERO2:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP2_END:.*]] = arith.cmpi slt, %[[STEP2]], %[[ZERO2]] : i32
- ! CHECK: %[[LT_UB2:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB2]] : i32
- ! CHECK: %[[GT_UB2:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB2]] : i32
+ ! CHECK: %[[STEP2_END:.*]] = arith.cmpi slt, %[[STEP2_2]], %[[ZERO2]] : i32
+ ! CHECK: %[[LT_UB2:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB2_2]] : i32
+ ! CHECK: %[[GT_UB2:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB2_2]] : i32
! CHECK: %[[SEL2:.*]] = arith.select %[[STEP2_END]], %[[LT_UB2]], %[[GT_UB2]] : i1
! CHECK: %[[AND:.*]] = arith.andi %[[SEL1]], %[[SEL2]] : i1
! CHECK: fir.if %[[AND]] {
@@ -173,24 +184,32 @@ subroutine omp_do_lastprivate_collapse3(a)
! CHECK-NEXT: hlfir.assign %[[ARG2]] to %[[J_PVT_DECL]]#1 : i32, !fir.ref<i32>
! CHECK-NEXT: hlfir.assign %[[ARG3]] to %[[K_PVT_DECL]]#1 : i32, !fir.ref<i32>
! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
- ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP1]] : i32
+
+ ! CHECK: %[[UB1_2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP1_2:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[UB2_2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP2_2:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[UB3_2:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[STEP3_2:.*]] = arith.constant 1 : i32
+
+ ! CHECK: %[[NEXT_ARG1:.*]] = arith.addi %[[ARG1]], %[[STEP1_2]] : i32
! CHECK: %[[ZERO1:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP1_END:.*]] = arith.cmpi slt, %[[STEP1]], %[[ZERO1]] : i32
- ! CHECK: %[[LT_UB1:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB1]] : i32
- ! CHECK: %[[GT_UB1:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB1]] : i32
+ ! CHECK: %[[STEP1_END:.*]] = arith.cmpi slt, %[[STEP1_2]], %[[ZERO1]] : i32
+ ! CHECK: %[[LT_UB1:.*]] = arith.cmpi slt, %[[NEXT_ARG1]], %[[UB1_2]] : i32
+ ! CHECK: %[[GT_UB1:.*]] = arith.cmpi sgt, %[[NEXT_ARG1]], %[[UB1_2]] : i32
! CHECK: %[[SEL1:.*]] = arith.select %[[STEP1_END]], %[[LT_UB1]], %[[GT_UB1]] : i1
- ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP2]] : i32
+ ! CHECK: %[[NEXT_ARG2:.*]] = arith.addi %[[ARG2]], %[[STEP2_2]] : i32
! CHECK: %[[ZERO2:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP2_END:.*]] = arith.cmpi slt, %[[STEP2]], %[[ZERO2]] : i32
- ! CHECK: %[[LT_UB2:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB2]] : i32
- ! CHECK: %[[GT_UB2:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB2]] : i32
+ ! CHECK: %[[STEP2_END:.*]] = arith.cmpi slt, %[[STEP2_2]], %[[ZERO2]] : i32
+ ! CHECK: %[[LT_UB2:.*]] = arith.cmpi slt, %[[NEXT_ARG2]], %[[UB2_2]] : i32
+ ! CHECK: %[[GT_UB2:.*]] = arith.cmpi sgt, %[[NEXT_ARG2]], %[[UB2_2]] : i32
! CHECK: %[[SEL2:.*]] = arith.select %[[STEP2_END]], %[[LT_UB2]], %[[GT_UB2]] : i1
! CHECK: %[[AND1:.*]] = arith.andi %[[SEL1]], %[[SEL2]] : i1
- ! CHECK: %[[NEXT_ARG3:.*]] = arith.addi %[[ARG3]], %[[STEP3]] : i32
+ ! CHECK: %[[NEXT_ARG3:.*]] = arith.addi %[[ARG3]], %[[STEP3_2]] : i32
! CHECK: %[[ZERO3:.*]] = arith.constant 0 : i32
- ! CHECK: %[[STEP3_END:.*]] = arith.cmpi slt, %[[STEP3]], %[[ZERO3]] : i32
- ! CHECK: %[[LT_UB3:.*]] = arith.cmpi slt, %[[NEXT_ARG3]], %[[UB3]] : i32
- ! CHECK: %[[GT_UB3:.*]] = arith.cmpi sgt, %[[NEXT_ARG3]], %[[UB3]] : i32
+ ! CHECK: %[[STEP3_END:.*]] = arith.cmpi slt, %[[STEP3_2]], %[[ZERO3]] : i32
+ ! CHECK: %[[LT_UB3:.*]] = arith.cmpi slt, %[[NEXT_ARG3]], %[[UB3_2]] : i32
+ ! CHECK: %[[GT_UB3:.*]] = arith.cmpi sgt, %[[NEXT_ARG3]], %[[UB3_2]] : i32
! CHECK: %[[SEL3:.*]] = arith.select %[[STEP3_END]], %[[LT_UB3]], %[[GT_UB3]] : i1
! CHECK: %[[AND2:.*]] = arith.andi %[[AND1]], %[[SEL3]] : i1
! CHECK: fir.if %[[AND2]] {
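
To summarize the lastprivate change above: for each collapsed loop, the
copy-out condition is now rebuilt from the bounds gathered by
`collectLoopRelatedInfo` instead of read off the wrapper op's operands. A
minimal sketch of the emitted check, with illustrative SSA names (the real
IR uses the privatized declarations shown in the CHECK lines):

```mlir
// One collapsed loop's contribution to the lastprivate condition.
// %iv, %ub, %step are the collected induction variable, upper bound, and step.
%v        = arith.addi %iv, %step : i32
%c0       = arith.constant 0 : i32
%step_neg = arith.cmpi slt, %step, %c0 : i32
%v_lt     = arith.cmpi slt, %v, %ub : i32
%v_gt     = arith.cmpi sgt, %v, %ub : i32
%is_last  = arith.select %step_neg, %v_lt, %v_gt : i1
// Under collapse(N), the per-loop conditions are combined with arith.andi,
// and the copy-out runs under fir.if on the combined value.
```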
>From 95767a9903208e545badd920a1a16e5476ae09f9 Mon Sep 17 00:00:00 2001
From: lonely eagle <2020382038 at qq.com>
Date: Fri, 7 Mar 2025 13:00:05 +0800
Subject: [PATCH 08/23] [mlir][nvgpu] separate ops, types, attribute
definitions in NVGPU dialect. (#129846)
This splits the Ops, Types, and Attribute definitions of the NVGPU
dialect into separate files. Downstream projects that extend NVGPU with
their own Ops still need the types and attributes, so the split lets
them include those definitions without also pulling in the NVGPU Op
definitions.
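
As a sketch of the intended use (MLIR adapted from the existing dialect
documentation; the point is that a downstream op definition can reference
the token type without including NVGPUOps.td):

```mlir
// The async token is a standalone type: producing and consuming it only
// requires the type definition, not the full set of NVGPU op definitions.
%cp  = nvgpu.device_async_copy %src[%c0], %dst[%c0], 4
         : memref<16xf32> to memref<16xf32, 3>
%tok = nvgpu.device_async_create_group %cp
nvgpu.device_async_wait %tok
```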
---
.../mlir/Dialect/NVGPU/IR/CMakeLists.txt | 13 +-
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 727 +-----------------
.../mlir/Dialect/NVGPU/IR/NVGPUDialect.h | 4 +-
.../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 638 +++++++++++++++
.../mlir/Dialect/NVGPU/IR/NVGPUTypes.td | 117 +++
mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt | 2 +-
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 8 +-
mlir/python/mlir/dialects/NVGPUOps.td | 2 +-
8 files changed, 776 insertions(+), 735 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
create mode 100644 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
index 13d754ca06316..ecdaae7f24d93 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
@@ -1,5 +1,10 @@
add_mlir_dialect(NVGPU nvgpu)
-add_mlir_doc(NVGPU NVGPU Dialects/ -gen-dialect-doc)
+add_mlir_doc(NVGPUOps NVGPU Dialects/ -gen-dialect-doc)
+
+set(LLVM_TARGET_DEFINITIONS NVGPUOps.td)
+mlir_tablegen(NVGPUOps.h.inc -gen-op-decls)
+mlir_tablegen(NVGPUOps.cpp.inc -gen-op-defs)
+add_public_tablegen_target(MLIRNVGPUOpsIncGen)
set(LLVM_TARGET_DEFINITIONS NVGPU.td)
mlir_tablegen(NVGPUEnums.h.inc -gen-enum-decls)
@@ -11,7 +16,7 @@ mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
add_public_tablegen_target(MLIRNVGPUAttributesIncGen)
-set(LLVM_TARGET_DEFINITIONS NVGPU.td)
-mlir_tablegen(NVGPUAttrTypes.h.inc -gen-typedef-decls)
-mlir_tablegen(NVGPUAttrTypes.cpp.inc -gen-typedef-decls)
+set(LLVM_TARGET_DEFINITIONS NVGPUTypes.td)
+mlir_tablegen(NVGPUTypeDefs.h.inc -gen-typedef-decls)
+mlir_tablegen(NVGPUTypeDefs.cpp.inc -gen-typedef-defs)
add_public_tablegen_target(MLIRNVGPUTypesIncGen)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index f48fa9976da12..7f7a54cb0c57e 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -1,24 +1,13 @@
-//===-- NVGPU.td - NVGPU dialect operation definitions *- tablegen -*------===//
+//===-- NVGPU.td - Attribute defs for NVGPU dialect *- tablegen -*---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// This file defines the basic operations for the NVGPU dialect.
-//
-// This NVGPU provides a bridge between the target agnostic GPU and Vector
-// dialects and lower level NVVM dialect. This allow representing PTX specific
-// operations while using MLIR high level concepts like memref and 2-D vector.
-//
-// Ops semantic are going to be based on vendor specific PTX defintion:
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
-//
-//===----------------------------------------------------------------------===//
-#ifndef NVGPU
-#define NVGPU
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPU_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPU_TD
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -127,712 +116,4 @@ def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
-//===----------------------------------------------------------------------===//
-// NVGPU Type Definitions
-//===----------------------------------------------------------------------===//
-
-class NVGPU_Type<string name, string typeMnemonic,
- list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
- let mnemonic = typeMnemonic;
-}
-
-def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
- "device.async.token", []> {
- let summary = "device async token type";
- let description = [{
- `nvgpu.device.async.token` is a type returned by an asynchronous operation
- that runs on the GPU (device). It is used to establish an SSA-based link
- between the async operation (e.g. DeviceAsyncCopy) and operations that
- group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
- DeviceAsyncWaitOp).
- }];
-}
-
-def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
- let summary = "mbarrier barrier type";
- let description = [{
- This is the type for one or more mbarrier object in shared memory that is
- used to synchronize a variable number of threads.
-
- If `num_barriers` is not set, the number of mbarrier objects is 1.
-
- A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
- can be initiated and invalidated.
-
- [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
- }];
- let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
- let assemblyFormat = "`<` struct(params) `>`";
- let builders = [
- TypeBuilder<(ins "Attribute":$memorySpace), [{
- return $_get($_ctxt, memorySpace, 1);
- }]>
- ];
-}
-
-def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
-
-// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
-def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
- let summary = "TensorMap descriptor";
- let parameters = (ins "MemRefType":$tensor,
- EnumParameter<TensorMapSwizzleKind>:$swizzle,
- EnumParameter<TensorMapL2PromoKind>:$l2promo,
- EnumParameter<TensorMapOOBKind>:$oob,
- EnumParameter<TensorMapInterleaveKind>:$interleave);
- let description = [{
- `nvgpu.tma.descriptor` is a type that represents a TMA descriptor. It is
- 128-byte object either in constant space or kernel paramater.
- }];
- let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
- let summary = "Warpgroup matrix descriptor type";
- let description = [{
- The descriptor specifies the properties of the matrix in shared memory that
- is a multiplicand in the matrix multiply and accumulate operation.
-
- The descriptor is a 64-bit value contained in a register with the following:
- ```
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- | 0-13 |14-15| 16-29 |30-31| 32-45 |46-48|49-51| 52-61 |62-63|
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- | 14bits |2bits| 14bits |2bits| 14bits |2bits|3bits| 10bits |2bits|
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- | BaseAddr| 0 | LeadingDim| 0 | Stride | 0 |Offst| 0 |Swzle|
- +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
- ```
-
- [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)
-
- }];
- let parameters = (ins "MemRefType":$tensor);
- let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
- let parameters = (ins "VectorType":$fragmented);
- let assemblyFormat = "`<` struct(params) `>`";
- let description = [{
- This type represents the result matrix obtained from `nvgpu.warpgroup.mma`.
- The `$fragmented` type signifies the distributed or fragmented result
- vector that is collectively owned by all the threads in the warp-group
- that executed `nvgpu.warpgroup.mma`.
- [See the details of register fragment layout for accumulator matrix D]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
- }];
-}
-
-//===----------------------------------------------------------------------===//
-// NVGPU Op Definitions
-//===----------------------------------------------------------------------===//
-
-class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
- Op<NVGPU_Dialect, mnemonic, traits> {}
-
-def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
- MemoryEffects<[MemRead]>,
- PredOpTrait<"srcMemref and res have same element type",
- TCresVTEtIsSameAsOp<0, 0>>]> {
- let description = [{
- The `nvgpu.ldmatrix` op represents loading a matrix fragment from
- memory to registers. The source and result type must be compatible
- with lowering to the `nvvm.ldmatrix` instruction. This op represents
- the distributed version of a `vector.transfer_read` as an intermediate
- step between lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
-
- This operation is meant to follow the semantic of described here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
-
- Example:
- ```mlir
- %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
- memref<?x?xf16, 3> -> vector<4x2xf16>
- ```
- }];
-
- let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
- Variadic<Index>:$indices, BoolAttr:$transpose,
- I32Attr:$numTiles);
- let results = (outs AnyVectorOfNonZeroRank:$res);
- let assemblyFormat = [{
- $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
- }];
-
- let hasVerifier = 1;
-}
-
-class NVGPU_MmaSyncOp<string mnemonic> :
- NVGPU_Op<mnemonic, [Pure,
- PredOpTrait<"matrixA and matrixB have same element type",
- TCopVTEtIsSameAs<0, 1>>]> {
- code extraBaseClassDeclaration = [{
- std::array<int64_t, 3> getMmaShapeAsArray() {
- ArrayAttr mmaShape = this->getMmaShape();
- assert(mmaShape.size() == 3 && "mmaShape should be three integers");
- return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
- ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
- ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
- }
- }];
-
- let hasVerifier = 1;
-}
-
-def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
- let description = [{
- The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
- accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
- The operands and results vector sizes are thread-level onwership to
- the warp-level mma operation shape. `mmaShape` attribute holds the
- warp-level matrix-multiply shape.
-
- The `nvgpu.mma.sync` op serves as an intermediate point between lowering from
- `vector.contract` to `nvvm.mma.sync`.
-
- This operation is meant to follow the semantic of described here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
-
- Example:
-
- ```mlir
- %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
- (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
- ```
- }];
- let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
- AnyVectorOfNonZeroRank:$matrixB,
- AnyVectorOfNonZeroRank:$matrixC,
- I64ArrayAttr:$mmaShape,
- OptionalAttr<UnitAttr>:$tf32Enabled);
-
- let results = (outs AnyVectorOfNonZeroRank:$res);
-
- let builders = [
- OpBuilder<(ins "Value":$matrixA,
- "Value":$matrixB,
- "Value":$matrixC,
- "ArrayAttr":$mmaShape)>,
- OpBuilder<(ins "Value":$matrixA,
- "Value":$matrixB,
- "Value":$matrixC,
- "ArrayRef<int64_t>":$mmaShape,
- CArg<"bool", "false">:$tf32Enabled)>
- ];
-
- let assemblyFormat = [{
- `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
- `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
- }];
-
- let extraClassDeclaration = extraBaseClassDeclaration;
-}
-
-def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
- BuildableType<"::mlir::VectorType::get("
- "{2},$_builder.getI16Type())">;
-
-def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
- let description = [{
- The `nvgu.mma.sp.sync` operation performs a warp-distributed MMA operation
- where operand A is "structured sparse". In this case, the `matrixA` operand
- represents the (warp-distributed) non-zero values of operand A, and the
- `sparse_metadata` operand provides the indices.
-
- The full description of the sparsity storage format and distribution scheme is
- described in the PTX docs. This operation is meant to follow the semantic
- described in the PTX documentation here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
-
- The way the indices are distributed among the threads in a warp is controlled
- by the optional `sparsity_selector` operand, which is `0` by default. For
- more information, please consult the PTX documentation linked above.
-
- Example (targetingthe f16 16x8x32 `mma.sp` PTX instruction):
-
- ```mlir
- nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
- (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
- ```
- }];
-
- let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
- AnyVectorOfNonZeroRank:$matrixB,
- AnyVectorOfNonZeroRank:$matrixC,
- NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
- I64ArrayAttr:$mmaShape,
- DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
- OptionalAttr<UnitAttr>:$tf32Enabled
- );
-
- let results = (outs AnyVectorOfNonZeroRank:$res);
-
- let builders = [
- OpBuilder<(ins "Value":$matrixA,
- "Value":$matrixB,
- "Value":$matrixC,
- "Value":$sparseMetadata,
- "ArrayRef<int64_t>":$mmaShape)>
- ];
-
- let assemblyFormat = [{
- `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
- `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
- }];
-
- let extraClassDeclaration = extraBaseClassDeclaration;
-}
-
-def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
- AttrSizedOperandSegments]> {
- let summary = "device-side asynchronous copy";
- let description = [{
- The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
- elements from source (global memory) to the destination (shared memory)
- without blocking the thread. The async copy is added to a group.
-
- This op is meant to be used with `nvgpu.device_async_create_group` and
- `nvgpu.device_async_wait` to synchronize copies as explained in those ops
- descriptions.
-
- `bypassL1` attribute is hint to the hardware to bypass the L1 cache during
- async copy, this hint may be ignored by the hardware.
-
- `dstElements` attribute is the total number of elements written to
- destination (shared memory).
-
- `srcElements` argument is the total number of elements read from
- source (global memory).
-
- `srcElements` is an optional argument and when present the op only reads
- `srcElements` number of elements from the source (global memory) and zero fills
- the rest of the elements in the destination (shared memory).
-
- In order to do a copy and wait for the result we need the following
- combination:
- ```
- // copy 1.
- %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
- // copy 2.
- %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
- // group 1 contains copy 1 and copy 2.
- %token1 = nvgpu.device_async_create_group %cp1, %cp2
- // copy 3.
- %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
- // group 2 contains copy 3.
- %token2 = nvgpu.device_async_create_group %cp3
- // after the wait copy 1 and copy 2 are complete.
- nvgpu.device_async_wait %token1
- // after the wait copy 3 is complete.
- nvgpu.device_async_wait %token2
- ```
-
- Example:
-
- ```mlir
- %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
- memref<4x5xf32> to memref<2x7x5xf32, 3>
- ```
- }];
- let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
- let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
- Variadic<Index>:$dstIndices,
- Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
- Variadic<Index>:$srcIndices,
- IndexAttr:$dstElements,
- Optional<Index>:$srcElements,
- OptionalAttr<UnitAttr>:$bypassL1);
- let assemblyFormat = [{
- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
- attr-dict `:` type($src) `to` type($dst)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
- let summary = "device side asynchronous create group operation";
- let description = [{
- The `nvgpu.device_async_create_group` op creates a group of memory accesses
- containing all the pending `device_async_copy` operations associated with
- argument tokens. Each token can only be part of one group.
-
- It returns a token that can be use to wait until the group fully completes.
-
- This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
- as explained in those ops descriptions.
-
- Groups are executed in the order they are created.
-
- Example:
-
- ```mlir
- %0 = nvgpu.device_async_create_group
- ```
- }];
- let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
- let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
- let assemblyFormat = [{
- $inputTokens attr-dict
- }];
-}
-
-def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
- let summary = "Wait for async gpu ops to complete.";
- let description = [{
- The `nvgpu.device_async_wait` op will block the execution thread until the group
- associated with the source token is fully completed.
-
- The optional `$numGroups` attribute gives an upper bound of the number of
- groups uncompleted when the wait can unblock the thread. For example, if
- 16 async groups are pushe and `$numGroups` is set to 12, then the thread
- will unblock when 12 groups or fewer are in flight (4 groups have
- completed).
-
- Example:
-
- ```mlir
- nvgpu.device_async_wait %0
- ```
- }];
- let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
- OptionalAttr<I32Attr>:$numGroups);
- let assemblyFormat = [{
- $asyncDependencies attr-dict
- }];
-}
-
-def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
- let summary = "Creates a `nvgpu.mbarrier` object.";
- let description = [{
- The Op generates one or more `mbarrier` object, which is a barrier created in
- shared memory and supports various synchronization behaviors for threads.
-
- The `mbarrier` object has the following type and alignment requirements:
- Type: .b64, Alignment: 8, Memory space: .shared
-
- Example:
- ```mlir
- %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins);
- let results = (outs NVGPU_MBarrierGroup:$barriers);
- let assemblyFormat = [{
- attr-dict `->` type($barriers)
- }];
-}
-
-def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
- let summary = "Initialize the `nvgpu.mbarrier`.";
- let description = [{
- The Op initializes the `mbarrier` object with the given number of threads.
-
- Example:
- ```mlir
- %num_threads = gpu.block_dim x
- %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
-}
-
-def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
- let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
- let description = [{
- Checks whether the mbarrier object has completed the phase. It is is a
- non-blocking instruction which tests for the completion of the phase.
-
- Example:
- ```mlir
- %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
- let results = (outs I1:$waitComplete);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
-}
-
-def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
- let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
- let description = [{
- The Op performs arrive-on operation on the `mbarrier` object and returns a
- `nvgpu.mbarrier.token`.
-
- For more information, see
- https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
-
- Example:
- ```mlir
- %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
- let results = (outs NVGPU_MBarrierToken:$token);
-let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
-}
-
-def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
- let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
- let description = [{
- The Op performs arrive-on operation on the `mbarrier` object and returns a
- `nvgpu.mbarrier.token`.
-
- The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
-
- Example:
- ```mlir
- %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
- Index:$count);
- let results = (outs NVGPU_MBarrierToken:$token);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
-}
-
-def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
- let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
- let description = [{
- A thread executing the Op performs an expect-tx operation on the mbarrier
- object at the location specified by the address operand $barrier. The
- expect-tx operation, with an $txcount argument, increases the tx-count of
- an mbarrier object by the value specified by $txcount. This makes the
- current phase of the mbarrier object to expect and track the completion of
- additional asynchronous transactions.
-
- The `$txCount` specifies the number of element to the expect-tx operation.
-
- Example:
- ```mlir
- nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
-}
-
-def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
- let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
- let description = [{
- Checks whether the mbarrier object has completed the phase. It is is a
- potentially blocking instruction which tests for the completion of the
- phase. Suspended thread resumes execution when the specified phase completes
- OR before the phase completes following a system-dependent time limit.
-
- The `$phaseParity` specifies either even phase (0) or odd phase (1) to
- wait.
-
- Example:
- ```mlir
- nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
- ```
- }];
- let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
- let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
-}
-
-def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
- let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
- let description = [{
- The Op brings the cache line containing the given `$tmaDescriptor` for
- subsequent use by the `tma.async.load` instruction.
- }];
- let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
- let assemblyFormat = [{
- $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
- }];
-}
-
-def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
- let summary = "TMA asynchronous load";
- let description = [{
- The Op loads a tile memory region from global memory to shared memory by
- Tensor Memory Access (TMA).
-
- `$tensorMapDescriptor` is tensor map descriptor which has information about
- tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
-
- The Op uses `$barrier` mbarrier based completion mechanism.
- }];
- let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
- NVGPU_MBarrierGroup:$barriers,
- NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
- Variadic<Index>:$coordinates,
- Index:$mbarId,
- Optional<I16>:$multicastMask,
- Optional<I1>:$predicate);
- let assemblyFormat = [{
- $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
- `to` $dst
- (`multicast_mask` `=` $multicastMask^ )?
- (`,` `predicate` `=` $predicate^)?
- attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
- `->` type($dst)
- }];
- let hasVerifier = 1;
-
-}
-
-def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
- let summary = "TMA asynchronous store";
- let description = [{
- The Op store a tile memory region from global memory to shared memory by
- Tensor Memory Access (TMA).
-
- `$tensorMapDescriptor` is tensor map descriptor which has information about
- tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
- }];
- let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
- Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
- Variadic<Index>:$coordinates,
- Optional<I1>:$predicate);
- let assemblyFormat = [{
- $src `to` $tensorMapDescriptor `[` $coordinates `]`
- (`,` `predicate` `=` $predicate^)?
- attr-dict `:` type($src)
- `->` type($tensorMapDescriptor)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
- let summary = "TMA create descriptor";
- let description = [{
- The Op creates a tensor map descriptor object representing tiled memory
- region. To do that it calls CUDA Driver's `cuTensorMapEncodeTiled`. The
- descriptor is used by Tensor Memory Access (TMA).
-
- The `tensor` is the source tensor to be tiled.
-
- The `boxDimensions` is the size of the tiled memory region in each dimension.
-
- For more information see below:
- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
- }];
-
- let arguments = (ins AnyUnrankedMemRef:$tensor,
- Variadic<Index>:$boxDimensions);
- let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
- let assemblyFormat = [{
- $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
- let summary = "Generate a warpgroup matrix descriptor";
- let description = [{
- This Op builds a `nvgpu.warpgroup.descriptor` that is used by
- `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
- accumulate.
-
- The descriptor specifies the properties of the matrix in shared memory that
- is a multiplicand in the matrix multiply and accumulate operation.
- }];
- let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
- let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
- NVGPU_TensorMapDescriptor:$tensorMap);
- let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
- let description = [{
- The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
- matrix-multiply-and-accumulate (mma) operation that results in
- `nvvm.wgmma.mma_async`.
-
- The operands are `descriptorA` and `descriptorB` that are wgmma matrix
- descriptors that shows the properties of the matrix in shared memory. The
- results are thread-level ownership to the warpgroup-level mma operation
- shape. The shape is deduced from the descriptor types and output vector.
-
- The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
- the given shape. As `nvvm.wgmma.async` Op, or its corresponding PTX
- instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` and
- surrounds them between `wgmma.fence.aligned` and
- `wgmma.commit.group.sync.aligned`, `wgmma.wait.group.sync.aligned` Ops.
-
- Example:
- ```mlir
- %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2:
- !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
- !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
- ->
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
- !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
- ```
- }];
-
- let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
- NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
- DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
- OptionalAttr<UnitAttr>:$transposeA,
- OptionalAttr<UnitAttr>:$transposeB,
- NVGPU_WarpgroupAccumulator:$matrixC);
- let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
- let assemblyFormat = [{
- $descriptorA`,` $descriptorB`,` $matrixC attr-dict
- `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
- let description = [{
- The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result
- in $matrixD to given memref.
-
- [See the details of register fragment layout for accumulator matrix D]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
-
- Note that, the op must be run with warp group.
- }];
-
- let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
- Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
-
- let assemblyFormat = [{
- $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
- }];
- let hasVerifier = 1;
-}
-
-def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {
- let summary = "Initializes the accumulator matrix";
-
- let description = [{
- This Op generates and initializes the accumulator matrix for
- `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
- }];
- let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
- let assemblyFormat = "attr-dict `->` type($matrixC)";
- let hasVerifier = 1;
-}
-
-def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
- SameOperandsAndResultType]> {
- let summary = "The reciprocal calculation for vector types";
- let description = [{
- Reciprocal calculation for `vector` types using `nvvm.rcp` OPs.
-
- Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
-
- The input and output must be of the same vector type and shape.
- }];
- let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
- DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
- UnitAttr:$ftz);
- let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
- let assemblyFormat = [{
- $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
- attr-dict `:` type($out)
- }];
- let hasVerifier = 1;
-}
-#endif // NVGPU
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPU_TD
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index db4c63b3390eb..61a57fb60bda4 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -53,11 +53,11 @@ constexpr unsigned kMaxTMALastdimByte = 128;
#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc"
#define GET_TYPEDEF_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.h.inc"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPU.h.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.h.inc"
#endif // MLIR_DIALECT_NVGPU_NVGPUDIALECT_H_
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
new file mode 100644
index 0000000000000..eb0fb90d271ed
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -0,0 +1,638 @@
+//===-- NVGPUOps.td - NVGPU dialect operation definitions *- tablegen -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the basic operations for the NVGPU dialect.
+//
+// The NVGPU dialect provides a bridge between the target-agnostic GPU and
+// Vector dialects and the lower-level NVVM dialect. It allows representing
+// PTX-specific operations while using MLIR high-level concepts like memref
+// and 2-D vector.
+//
+// Op semantics are based on the vendor-specific PTX definition:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
+
+include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
+
+//===----------------------------------------------------------------------===//
+// NVGPU Op Definitions
+//===----------------------------------------------------------------------===//
+
+class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
+ Op<NVGPU_Dialect, mnemonic, traits> {}
+
+def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
+ MemoryEffects<[MemRead]>,
+ PredOpTrait<"srcMemref and res have same element type",
+ TCresVTEtIsSameAsOp<0, 0>>]> {
+ let description = [{
+ The `nvgpu.ldmatrix` op represents loading a matrix fragment from
+ memory to registers. The source and result type must be compatible
+ with lowering to the `nvvm.ldmatrix` instruction. This op represents
+ the distributed version of a `vector.transfer_read` as an intermediate
+ step when lowering from `vector.transfer_read` to `nvvm.ldmatrix`.
+
+ This operation is meant to follow the semantics described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
+
+ Example:
+ ```mlir
+ %0 = nvgpu.ldmatrix %sm[%c0, %c0] {numTiles = 4 : i32, transpose = false} :
+ memref<?x?xf16, 3> -> vector<4x2xf16>
+ ```
+ }];
+
+ let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
+ Variadic<Index>:$indices, BoolAttr:$transpose,
+ I32Attr:$numTiles);
+ let results = (outs AnyVectorOfNonZeroRank:$res);
+ let assemblyFormat = [{
+ $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
+ }];
+
+ let hasVerifier = 1;
+}
+
+class NVGPU_MmaSyncOp<string mnemonic> :
+ NVGPU_Op<mnemonic, [Pure,
+ PredOpTrait<"matrixA and matrixB have same element type",
+ TCopVTEtIsSameAs<0, 1>>]> {
+ code extraBaseClassDeclaration = [{
+ std::array<int64_t, 3> getMmaShapeAsArray() {
+ ArrayAttr mmaShape = this->getMmaShape();
+ assert(mmaShape.size() == 3 && "mmaShape should be three integers");
+ return {::llvm::cast<IntegerAttr>(mmaShape[0]).getInt(),
+ ::llvm::cast<IntegerAttr>(mmaShape[1]).getInt(),
+ ::llvm::cast<IntegerAttr>(mmaShape[2]).getInt()};
+ }
+ }];
+
+ let hasVerifier = 1;
+}
+
+def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
+ let description = [{
+ The `nvgpu.mma.sync` op represents the warp-level matrix-multiply-and-
+ accumulate (mma) operation that is compatible with `nvvm.mma.sync`.
+ The operand and result vector sizes represent the thread-level ownership of
+ the warp-level mma operation shape. The `mmaShape` attribute holds the
+ warp-level matrix-multiply shape.
+
+ The `nvgpu.mma.sync` op serves as an intermediate step when lowering from
+ `vector.contract` to `nvvm.mma.sync`.
+
+ This operation is meant to follow the semantics described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
+
+ Example:
+
+ ```mlir
+ %res = nvgpu.mma.sync (%matrixA, %matrixB, %matrixC) {mmaShape = [16, 8, 16]} :
+ (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
+ ```
+ }];
+ let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+ AnyVectorOfNonZeroRank:$matrixB,
+ AnyVectorOfNonZeroRank:$matrixC,
+ I64ArrayAttr:$mmaShape,
+ OptionalAttr<UnitAttr>:$tf32Enabled);
+
+ let results = (outs AnyVectorOfNonZeroRank:$res);
+
+ let builders = [
+ OpBuilder<(ins "Value":$matrixA,
+ "Value":$matrixB,
+ "Value":$matrixC,
+ "ArrayAttr":$mmaShape)>,
+ OpBuilder<(ins "Value":$matrixA,
+ "Value":$matrixB,
+ "Value":$matrixC,
+ "ArrayRef<int64_t>":$mmaShape,
+ CArg<"bool", "false">:$tf32Enabled)>
+ ];
+
+ let assemblyFormat = [{
+ `(` $matrixA`,` $matrixB`,` $matrixC `)` attr-dict
+ `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
+ }];
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
+def NVGPU_MmaSparseSyncMetadataType : FixedVectorOfLengthAndType<[2], [I16]>,
+ BuildableType<"::mlir::VectorType::get("
+ "{2},$_builder.getI16Type())">;
+
+def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
+ let description = [{
+ The `nvgpu.mma.sp.sync` operation performs a warp-distributed MMA operation
+ where operand A is "structured sparse". In this case, the `matrixA` operand
+ represents the (warp-distributed) non-zero values of operand A, and the
+ `sparse_metadata` operand provides the indices.
+
+ The full description of the sparsity storage format and distribution scheme
+ is given in the PTX docs. This operation is meant to follow the semantics
+ described in the PTX documentation here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma
+
+ The way the indices are distributed among the threads in a warp is controlled
+ by the optional `sparsity_selector` operand, which is `0` by default. For
+ more information, please consult the PTX documentation linked above.
+
+ Example (targeting the f16 16x8x32 `mma.sp` PTX instruction):
+
+ ```mlir
+ nvgpu.mma.sp.sync (%a, %b, %c) metadata (%meta) {mmaShape = [16, 8, 32]} :
+ (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
+ ```
+ }];
+
+ let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+ AnyVectorOfNonZeroRank:$matrixB,
+ AnyVectorOfNonZeroRank:$matrixC,
+ NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
+ I64ArrayAttr:$mmaShape,
+ DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
+ OptionalAttr<UnitAttr>:$tf32Enabled
+ );
+
+ let results = (outs AnyVectorOfNonZeroRank:$res);
+
+ let builders = [
+ OpBuilder<(ins "Value":$matrixA,
+ "Value":$matrixB,
+ "Value":$matrixC,
+ "Value":$sparseMetadata,
+ "ArrayRef<int64_t>":$mmaShape)>
+ ];
+
+ let assemblyFormat = [{
+ `(` $matrixA`,` $matrixB`,` $matrixC `)` `metadata` `(` $sparseMetadata `)` attr-dict
+ `:` `(` type($matrixA) `,` type($matrixB) `,` type($matrixC) `)` `->` type($res)
+ }];
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
+def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
+ AttrSizedOperandSegments]> {
+ let summary = "device-side asynchronous copy";
+ let description = [{
+ The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
+ elements from the source (global memory) to the destination (shared memory)
+ without blocking the thread. The async copy is added to a group.
+
+ This op is meant to be used with `nvgpu.device_async_create_group` and
+ `nvgpu.device_async_wait` to synchronize copies as explained in those ops'
+ descriptions.
+
+ The `bypassL1` attribute is a hint to the hardware to bypass the L1 cache
+ during the async copy; the hardware may ignore this hint.
+
+ The `dstElements` attribute is the total number of elements written to
+ the destination (shared memory).
+
+ The `srcElements` argument is the total number of elements read from
+ the source (global memory).
+
+ `srcElements` is optional; when present, the op reads only `srcElements`
+ elements from the source (global memory) and zero-fills the rest of the
+ elements in the destination (shared memory).
+
+ To perform a copy and wait for the result, we need the following
+ combination:
+ ```mlir
+ // copy 1.
+ %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ // copy 2.
+ %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ // group 1 contains copy 1 and copy 2.
+ %token1 = nvgpu.device_async_create_group %cp1, %cp2
+ // copy 3.
+ %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ // group 2 contains copy 3.
+ %token2 = nvgpu.device_async_create_group %cp3
+ // after the wait copy 1 and copy 2 are complete.
+ nvgpu.device_async_wait %token1
+ // after the wait copy 3 is complete.
+ nvgpu.device_async_wait %token2
+ ```
+
+ Example:
+
+ ```mlir
+ %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
+ memref<4x5xf32> to memref<2x7x5xf32, 3>
+ ```
+ }];
+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+ let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
+ Variadic<Index>:$dstIndices,
+ Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
+ Variadic<Index>:$srcIndices,
+ IndexAttr:$dstElements,
+ Optional<Index>:$srcElements,
+ OptionalAttr<UnitAttr>:$bypassL1);
+ let assemblyFormat = [{
+ $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $dstElements (`,` $srcElements^)?
+ attr-dict `:` type($src) `to` type($dst)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
+ let summary = "device side asynchronous create group operation";
+ let description = [{
+ The `nvgpu.device_async_create_group` op creates a group of memory accesses
+ containing all the pending `device_async_copy` operations associated with
+ the argument tokens. Each token can only be part of one group.
+
+ It returns a token that can be used to wait until the group fully completes.
+
+ This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
+ as explained in those ops' descriptions.
+
+ Groups are executed in the order they are created.
+
+ Example:
+
+ ```mlir
+ %0 = nvgpu.device_async_create_group
+ ```
+ }];
+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+ let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
+ let assemblyFormat = [{
+ $inputTokens attr-dict
+ }];
+}
+
+def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
+ let summary = "Wait for async gpu ops to complete.";
+ let description = [{
+ The `nvgpu.device_async_wait` op will block the execution thread until the group
+ associated with the source token is fully completed.
+
+ The optional `$numGroups` attribute gives an upper bound on the number of
+ uncompleted groups allowed when the wait unblocks the thread. For example,
+ if 16 async groups are pushed and `$numGroups` is set to 12, then the
+ thread will unblock when 12 groups or fewer are in flight (4 groups have
+ completed).
+
+ Example:
+
+ ```mlir
+ nvgpu.device_async_wait %0
+ ```
+ }];
+ let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
+ OptionalAttr<I32Attr>:$numGroups);
+ let assemblyFormat = [{
+ $asyncDependencies attr-dict
+ }];
+}
+
+def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
+ let summary = "Creates a `nvgpu.mbarrier` object.";
+ let description = [{
+ The Op generates one or more `mbarrier` objects. An `mbarrier` is a barrier
+ created in shared memory that supports various synchronization behaviors
+ for threads.
+
+ The `mbarrier` object has the following type and alignment requirements:
+ Type: .b64, Alignment: 8, Memory space: .shared
+
+ Example:
+ ```mlir
+ %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins);
+ let results = (outs NVGPU_MBarrierGroup:$barriers);
+ let assemblyFormat = [{
+ attr-dict `->` type($barriers)
+ }];
+}
+
+def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
+ let summary = "Initialize the `nvgpu.mbarrier`.";
+ let description = [{
+ The Op initializes the `mbarrier` object with the given number of threads.
+
+ Example:
+ ```mlir
+ %num_threads = gpu.block_dim x
+ %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId, Optional<I1>:$predicate);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
+}
+
+def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
+ let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
+ let description = [{
+ Checks whether the mbarrier object has completed the current phase. It is a
+ non-blocking instruction that tests for the completion of the phase.
+
+ Example:
+ ```mlir
+ %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
+ let results = (outs I1:$waitComplete);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
+}
+
+def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
+ let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
+ let description = [{
+ The Op performs an arrive-on operation on the `mbarrier` object and returns
+ an `nvgpu.mbarrier.token`.
+
+ For more information, see
+ https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
+
+ Example:
+ ```mlir
+ %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
+ let results = (outs NVGPU_MBarrierToken:$token);
+ let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
+ let summary = "Performs a non-blocking arrive operation on the `nvgpu.mbarrier`.";
+ let description = [{
+ The Op performs an arrive-on operation on the `mbarrier` object and returns
+ an `nvgpu.mbarrier.token`.
+
+ The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
+
+ Example:
+ ```mlir
+ %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
+ Index:$count);
+ let results = (outs NVGPU_MBarrierToken:$token);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
+ let summary = "Performs expect_tx operation on the `nvgpu.mbarrier.arrive`";
+ let description = [{
+ A thread executing the Op performs an expect-tx operation on the mbarrier
+ object at the location specified by the `$barriers` operand. The expect-tx
+ operation, with a `$txcount` argument, increases the tx-count of an
+ mbarrier object by the value specified by `$txcount`. This makes the
+ current phase of the mbarrier object expect and track the completion of
+ additional asynchronous transactions.
+
+ The `$txcount` operand specifies the number of elements for the expect-tx
+ operation.
+
+ Example:
+ ```mlir
+ nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId, Optional<I1>:$predicate);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type($barriers)";
+}
+
+def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
+ let summary = "Waits for the `nvgpu.mbarrier` to complete its current phase.";
+ let description = [{
+ Checks whether the mbarrier object has completed the current phase. It is a
+ potentially blocking instruction that tests for the completion of the
+ phase. The suspended thread resumes execution when the specified phase
+ completes, or potentially earlier once a system-dependent time limit expires.
+
+ The `$phaseParity` specifies either even phase (0) or odd phase (1) to
+ wait.
+
+ Example:
+ ```mlir
+ nvgpu.mbarrier.try_wait.parity %barrier, %phaseParity, %ticks : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, I1:$phaseParity, Index:$ticks, Index:$mbarId);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phaseParity `,` $ticks attr-dict `:` type($barriers)";
+}
+
+def NVGPU_TmaPrefetchOp : NVGPU_Op<"tma.prefetch.descriptor", []> {
+ let summary = "Prefetch given `nvgpu.tensormap.descriptor` ";
+ let description = [{
+ The Op prefetches the cache line containing the given `$tensorMapDescriptor`
+ for subsequent use by the `tma.async.load` instruction.
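+
+ A possible use is sketched below; the SSA names and the concrete descriptor
+ type parameters are illustrative only:
+ ```mlir
+ nvgpu.tma.prefetch.descriptor %tensorMap : !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+ ```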
+ }];
+ let arguments = (ins NVGPU_TensorMapDescriptor:$tensorMapDescriptor, Optional<I1>:$predicate);
+ let assemblyFormat = [{
+ $tensorMapDescriptor (`,` `predicate` `=` $predicate^)? attr-dict `:` type($tensorMapDescriptor)
+ }];
+}
+
+def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", [AttrSizedOperandSegments]> {
+ let summary = "TMA asynchronous load";
+ let description = [{
+ The Op loads a tile memory region from global memory to shared memory by
+ Tensor Memory Access (TMA).
+
+ `$tensorMapDescriptor` is a tensor map descriptor that carries information
+ about the tile shape. The descriptor is created by
+ `nvgpu.tma.create.descriptor`.
+
+ The Op uses the mbarrier-based completion mechanism via `$barriers`.
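+
+ A usage sketch follows; the SSA names, coordinates, and concrete types are
+ illustrative only:
+ ```mlir
+ nvgpu.tma.async.load %tensorMap[%c0, %c0], %barriers[%c0] to %dst
+ : !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
+ -> memref<128x64xf16, 3>
+ ```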
+ }];
+ let arguments = (ins Arg<AnyMemRef, "", [MemWriteAt<0, FullEffect>]>:$dst,
+ NVGPU_MBarrierGroup:$barriers,
+ NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+ Variadic<Index>:$coordinates,
+ Index:$mbarId,
+ Optional<I16>:$multicastMask,
+ Optional<I1>:$predicate);
+ let assemblyFormat = [{
+ $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]`
+ `to` $dst
+ (`multicast_mask` `=` $multicastMask^ )?
+ (`,` `predicate` `=` $predicate^)?
+ attr-dict `:` type($tensorMapDescriptor) `,` type($barriers)
+ `->` type($dst)
+ }];
+ let hasVerifier = 1;
+
+}
+
+def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
+ let summary = "TMA asynchronous store";
+ let description = [{
+ The Op stores a tile memory region from shared memory to global memory by
+ Tensor Memory Access (TMA).
+
+ `$tensorMapDescriptor` is a tensor map descriptor that carries information
+ about the tile shape. The descriptor is created by
+ `nvgpu.tma.create.descriptor`.
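+
+ A usage sketch (SSA names and types are illustrative only):
+ ```mlir
+ nvgpu.tma.async.store %src to %tensorMap[%c0, %c0]
+ : memref<128x64xf16, 3> -> !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+ ```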
+ }];
+ let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
+ Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
+ Variadic<Index>:$coordinates,
+ Optional<I1>:$predicate);
+ let assemblyFormat = [{
+ $src `to` $tensorMapDescriptor `[` $coordinates `]`
+ (`,` `predicate` `=` $predicate^)?
+ attr-dict `:` type($src)
+ `->` type($tensorMapDescriptor)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
+ let summary = "TMA create descriptor";
+ let description = [{
+ The Op creates a tensor map descriptor object representing a tiled memory
+ region. To do so, it calls the CUDA Driver's `cuTensorMapEncodeTiled`. The
+ descriptor is used by Tensor Memory Access (TMA).
+
+ The `tensor` is the source tensor to be tiled.
+
+ The `boxDimensions` is the size of the tiled memory region in each dimension.
+
+ For more information, see:
+ https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
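+
+ A usage sketch (the box sizes and element types are illustrative only):
+ ```mlir
+ %tensorMap = nvgpu.tma.create.descriptor %src box [%c128, %c64]
+ : memref<*xf16> -> !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+ ```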
+ }];
+
+ let arguments = (ins AnyUnrankedMemRef:$tensor,
+ Variadic<Index>:$boxDimensions);
+ let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
+ let assemblyFormat = [{
+ $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupGenerateDescriptorOp : NVGPU_Op<"warpgroup.generate.descriptor", []> {
+ let summary = "Generate a warpgroup matrix descriptor";
+ let description = [{
+ This Op builds a `nvgpu.warpgroup.descriptor` that is used by
+ `nvgpu.warpgroup.mma` to perform warpgroup-level matrix multiply and
+ accumulate.
+
+ The descriptor specifies the properties of the matrix in shared memory that
+ is a multiplicand in the matrix multiply and accumulate operation.
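+
+ A usage sketch (SSA names and types are illustrative only):
+ ```mlir
+ %descA = nvgpu.warpgroup.generate.descriptor %matA, %tensorMap
+ : memref<128x64xf16, 3>, !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+ -> !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
+ ```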
+ }];
+ let results = (outs NVGPU_WarpgroupMatrixDescriptor:$descriptor);
+ let arguments = (ins Arg<AnyMemRef, "", [MemRead]>:$tensor,
+ NVGPU_TensorMapDescriptor:$tensorMap);
+ let assemblyFormat = [{$tensor `,` $tensorMap attr-dict `:` type($tensor) `,` type($tensorMap) `->` type($descriptor)}];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
+ let description = [{
+ The `nvgpu.warpgroup.mma` op performs the warpgroup-level (4 warps)
+ matrix-multiply-and-accumulate (mma) operation that results in
+ `nvvm.wgmma.mma_async`.
+
+ The operands are `descriptorA` and `descriptorB`, which are wgmma matrix
+ descriptors that show the properties of the matrix in shared memory. The
+ results represent thread-level ownership of the warpgroup-level mma
+ operation shape. The shape is deduced from the descriptor types and the
+ output vector.
+
+ The Op encapsulates multiple `nvvm.wgmma.mma_async` operations to complete
+ the given shape. Because the `nvvm.wgmma.async` Op, or its corresponding
+ PTX instruction, is asynchronous, this Op groups the `nvvm.wgmma.async` Ops
+ and surrounds them with `wgmma.fence.aligned`,
+ `wgmma.commit.group.sync.aligned`, and `wgmma.wait.group.sync.aligned` Ops.
+
+ Example:
+ ```mlir
+ %r1,%r2 = nvgpu.warpgroup.mma %descA, %descB, %acc1, %acc2:
+ !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>,
+ !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>,
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+ ->
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>,
+ !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+ ```
+ }];
+
+ let arguments = (ins NVGPU_WarpgroupMatrixDescriptor:$descriptorA,
+ NVGPU_WarpgroupMatrixDescriptor:$descriptorB,
+ DefaultValuedOptionalAttr<I64Attr, "1">:$waitGroup,
+ OptionalAttr<UnitAttr>:$transposeA,
+ OptionalAttr<UnitAttr>:$transposeB,
+ NVGPU_WarpgroupAccumulator:$matrixC);
+ let results = (outs NVGPU_WarpgroupAccumulator:$matrixD);
+ let assemblyFormat = [{
+ $descriptorA`,` $descriptorB`,` $matrixC attr-dict
+ `:` type($descriptorA) `,` type($descriptorB) `,` type($matrixC) `->` type($matrixD)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
+ let description = [{
+ The `nvgpu.warpgroup.mma.store` op stores the fragmented result held in
+ `$matrixD` to the given memref.
+
+ [See the details of register fragment layout for accumulator matrix D]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
+
+ Note that the op must be run by a warp group.
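+
+ A usage sketch (SSA names and types are illustrative only):
+ ```mlir
+ nvgpu.warpgroup.mma.store %matrixD, %sharedMem
+ : !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>> to memref<64x128xf32, 3>
+ ```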
+ }];
+
+ let arguments = (ins NVGPU_WarpgroupAccumulator:$matrixD,
+ Arg<AnyMemRef, "", [MemWrite]>:$dstMemref);
+
+ let assemblyFormat = [{
+ $matrixD `,` $dstMemref attr-dict `:` type($matrixD) `to` type($dstMemref)
+ }];
+ let hasVerifier = 1;
+}
+
+def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {
+ let summary = "Initializes the accumulator matrix";
+
+ let description = [{
+ This Op generates and initializes the accumulator matrix for
+ `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
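+
+ A usage sketch (the accumulator shape is illustrative only):
+ ```mlir
+ %matrixC = nvgpu.warpgroup.mma.init.accumulator -> !nvgpu.warpgroup.accumulator<fragmented = vector<64x128xf32>>
+ ```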
+ }];
+ let results = (outs NVGPU_WarpgroupAccumulator:$matrixC);
+ let assemblyFormat = "attr-dict `->` type($matrixC)";
+ let hasVerifier = 1;
+}
+
+def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
+ SameOperandsAndResultType]> {
+ let summary = "The reciprocal calculation for vector types";
+ let description = [{
+ Reciprocal calculation for `vector` types using `nvvm.rcp` Ops.
+
+ Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
+
+ The input and output must be of the same vector type and shape.
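+
+ A usage sketch (the vector shape is illustrative only):
+ ```mlir
+ %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<32x16xf32>
+ ```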
+ }];
+ let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
+ DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
+ UnitAttr:$ftz);
+ let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
+ let assemblyFormat = [{
+ $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
+ attr-dict `:` type($out)
+ }];
+ let hasVerifier = 1;
+}
+
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
new file mode 100644
index 0000000000000..8836a1a9dfcd8
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
@@ -0,0 +1,117 @@
+//===- NVGPUTypes.td - NVGPU types -------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVGPU dialect types.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
+#define MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
+
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+
+//===----------------------------------------------------------------------===//
+// NVGPU Type Definitions
+//===----------------------------------------------------------------------===//
+
+class NVGPU_Type<string name, string typeMnemonic,
+ list<Trait> traits = []> : TypeDef<NVGPU_Dialect, name, traits> {
+ let mnemonic = typeMnemonic;
+}
+
+def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
+ "device.async.token", []> {
+ let summary = "device async token type";
+ let description = [{
+ `nvgpu.device.async.token` is a type returned by an asynchronous operation
+ that runs on the GPU (device). It is used to establish an SSA-based link
+ between the async operation (e.g. DeviceAsyncCopy) and operations that
+ group or synchronize the async operations (e.g. DeviceAsyncCreateGroupOp,
+ DeviceAsyncWaitOp).
+ }];
+}
+
+def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
+ let summary = "mbarrier barrier type";
+ let description = [{
+ This is the type for one or more mbarrier objects in shared memory that are
+ used to synchronize a variable number of threads.
+
+ If `num_barriers` is not set, the number of mbarrier objects is 1.
+
+ An mbarrier object is 64 bits wide with 8-byte alignment. The mbarrier
+ object can be initialized and invalidated.
+
+ [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
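+
+ For instance, a group of 4 mbarrier objects in workgroup memory may be
+ written as follows (illustrative):
+ ```mlir
+ !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 4>
+ ```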
+ }];
+ let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
+ let assemblyFormat = "`<` struct(params) `>`";
+ let builders = [
+ TypeBuilder<(ins "Attribute":$memorySpace), [{
+ return $_get($_ctxt, memorySpace, 1);
+ }]>
+ ];
+}
+
+def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
+def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
+ let summary = "TensorMap descriptor";
+ let parameters = (ins "MemRefType":$tensor,
+ EnumParameter<TensorMapSwizzleKind>:$swizzle,
+ EnumParameter<TensorMapL2PromoKind>:$l2promo,
+ EnumParameter<TensorMapOOBKind>:$oob,
+ EnumParameter<TensorMapInterleaveKind>:$interleave);
+ let description = [{
+ `nvgpu.tensormap.descriptor` is a type that represents a TMA descriptor. It
+ is a 128-byte object residing either in constant space or in a kernel
+ parameter.
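+
+ An illustrative instance of the type (the parameter values are examples
+ only):
+ ```mlir
+ !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+ ```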
+ }];
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_WarpgroupMatrixDescriptor : NVGPU_Type<"WarpgroupMatrixDescriptor", "warpgroup.descriptor", []> {
+ let summary = "Warpgroup matrix descriptor type";
+ let description = [{
+ The descriptor specifies the properties of the matrix in shared memory that
+ is a multiplicand in the matrix multiply and accumulate operation.
+
+ The descriptor is a 64-bit value contained in a register, with the following layout:
+ ```
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ | 0-13 |14-15| 16-29 |30-31| 32-45 |46-48|49-51| 52-61 |62-63|
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ | 14bits |2bits| 14bits |2bits| 14bits |2bits|3bits| 10bits |2bits|
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ | BaseAddr| 0 | LeadingDim| 0 | Stride | 0 |Offst| 0 |Swzle|
+ +---------+-----+-----------+-----+-----------+-----+-----+-----------+-----+
+ ```
+
+ [See the PTX ISA for more details](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-shared-memory-layout-matrix-descriptor)
+
+ }];
+ let parameters = (ins "MemRefType":$tensor);
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_WarpgroupAccumulator : NVGPU_Type<"WarpgroupAccumulator", "warpgroup.accumulator", []> {
+ let parameters = (ins "VectorType":$fragmented);
+ let assemblyFormat = "`<` struct(params) `>`";
+ let description = [{
+ This type represents the result matrix obtained from `nvgpu.warpgroup.mma`.
+ The `$fragmented` type signifies the distributed or fragmented result
+ vector that is collectively owned by all the threads in the warp-group
+ that executed `nvgpu.warpgroup.mma`.
+ [See the details of register fragment layout for accumulator matrix D]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d)
+ }];
+}
+
+#endif // MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
diff --git a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
index 4d47ce4746dbb..10aa502ee67f8 100644
--- a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
@@ -5,7 +5,7 @@ add_mlir_dialect_library(MLIRNVGPUDialect
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/NVGPU
DEPENDS
- MLIRNVGPUIncGen
+ MLIRNVGPUOpsIncGen
MLIRNVGPUEnumsIncGen
MLIRNVGPUAttributesIncGen
MLIRNVGPUTypesIncGen
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index ba86e8d6ceaf9..abbdb6a0f53ec 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -35,7 +35,7 @@ using namespace mlir::nvgpu;
void nvgpu::NVGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"
>();
addAttributes<
#define GET_ATTRDEF_LIST
@@ -43,7 +43,7 @@ void nvgpu::NVGPUDialect::initialize() {
>();
addOperations<
#define GET_OP_LIST
-#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
>();
}
@@ -681,7 +681,7 @@ LogicalResult RcpOp::verify() {
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
#define GET_TYPEDEF_CLASSES
-#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
+#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"
diff --git a/mlir/python/mlir/dialects/NVGPUOps.td b/mlir/python/mlir/dialects/NVGPUOps.td
index ae54822cd9070..cdf651901e074 100644
--- a/mlir/python/mlir/dialects/NVGPUOps.td
+++ b/mlir/python/mlir/dialects/NVGPUOps.td
@@ -9,6 +9,6 @@
#ifndef PYTHON_BINDINGS_NVGPU_OPS
#define PYTHON_BINDINGS_NVGPU_OPS
-include "mlir/Dialect/NVGPU/IR/NVGPU.td"
+include "mlir/Dialect/NVGPU/IR/NVGPUOps.td"
#endif
>From 3664b4e2d5800eca5c8253a3915b87fa12ea7ebc Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano at gmail.com>
Date: Thu, 6 Mar 2025 21:08:25 -0800
Subject: [PATCH 09/23] [clang-format] Remove special handling of C++ access
specifiers in C (#129983)
This effectively reverts d1aed486efc6d35a81ca4acbabb4203c4b91cda9
because of
#129426.
---
clang/lib/Format/UnwrappedLineFormatter.cpp | 30 ++--------
clang/lib/Format/UnwrappedLineParser.cpp | 64 +--------------------
clang/unittests/Format/FormatTest.cpp | 49 ++++++++--------
3 files changed, 33 insertions(+), 110 deletions(-)
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index 14e984529d640..000a5105ca407 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -116,36 +116,18 @@ class LevelIndentTracker {
Style.isCSharp()) {
return 0;
}
-
- auto IsAccessModifier = [&](const FormatToken &RootToken) {
- if (Line.Type == LT_AccessModifier || RootToken.isObjCAccessSpecifier())
- return true;
-
- const auto *Next = RootToken.Next;
-
- // Handle Qt signals.
- if (RootToken.isOneOf(Keywords.kw_signals, Keywords.kw_qsignals) &&
- Next && Next->is(tok::colon)) {
- return true;
- }
-
- if (Next && Next->isOneOf(Keywords.kw_slots, Keywords.kw_qslots) &&
- Next->Next && Next->Next->is(tok::colon)) {
- return true;
- }
-
- // Handle malformed access specifier e.g. 'private' without trailing ':'.
- return !Next && RootToken.isAccessSpecifier(false);
- };
-
- if (IsAccessModifier(*Line.First)) {
+ const auto &RootToken = *Line.First;
+ if (Line.Type == LT_AccessModifier ||
+ RootToken.isAccessSpecifier(/*ColonRequired=*/false) ||
+ RootToken.isObjCAccessSpecifier() ||
+ (RootToken.isOneOf(Keywords.kw_signals, Keywords.kw_qsignals) &&
+ RootToken.Next && RootToken.Next->is(tok::colon))) {
// The AccessModifierOffset may be overridden by IndentAccessModifiers,
// in which case we take a negative value of the IndentWidth to simulate
// the upper indent level.
return Style.IndentAccessModifiers ? -Style.IndentWidth
: Style.AccessModifierOffset;
}
-
return 0;
}
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index efb22bcdbe53f..6854e224c2631 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -3386,75 +3386,15 @@ void UnwrappedLineParser::parseSwitch(bool IsExpr) {
NestedTooDeep.pop_back();
}
-// Operators that can follow a C variable.
-static bool isCOperatorFollowingVar(tok::TokenKind Kind) {
- switch (Kind) {
- case tok::ampamp:
- case tok::ampequal:
- case tok::arrow:
- case tok::caret:
- case tok::caretequal:
- case tok::comma:
- case tok::ellipsis:
- case tok::equal:
- case tok::equalequal:
- case tok::exclaim:
- case tok::exclaimequal:
- case tok::greater:
- case tok::greaterequal:
- case tok::greatergreater:
- case tok::greatergreaterequal:
- case tok::l_paren:
- case tok::l_square:
- case tok::less:
- case tok::lessequal:
- case tok::lessless:
- case tok::lesslessequal:
- case tok::minus:
- case tok::minusequal:
- case tok::minusminus:
- case tok::percent:
- case tok::percentequal:
- case tok::period:
- case tok::pipe:
- case tok::pipeequal:
- case tok::pipepipe:
- case tok::plus:
- case tok::plusequal:
- case tok::plusplus:
- case tok::question:
- case tok::r_brace:
- case tok::r_paren:
- case tok::r_square:
- case tok::semi:
- case tok::slash:
- case tok::slashequal:
- case tok::star:
- case tok::starequal:
- return true;
- default:
- return false;
- }
-}
-
void UnwrappedLineParser::parseAccessSpecifier() {
- FormatToken *AccessSpecifierCandidate = FormatTok;
nextToken();
// Understand Qt's slots.
if (FormatTok->isOneOf(Keywords.kw_slots, Keywords.kw_qslots))
nextToken();
// Otherwise, we don't know what it is, and we'd better keep the next token.
- if (FormatTok->is(tok::colon)) {
+ if (FormatTok->is(tok::colon))
nextToken();
- addUnwrappedLine();
- } else if (FormatTok->isNot(tok::coloncolon) &&
- !isCOperatorFollowingVar(FormatTok->Tok.getKind())) {
- // Not a variable name nor namespace name.
- addUnwrappedLine();
- } else if (AccessSpecifierCandidate) {
- // Consider the access specifier to be a C identifier.
- AccessSpecifierCandidate->Tok.setKind(tok::identifier);
- }
+ addUnwrappedLine();
}
/// \brief Parses a requires, decides if it is a clause or an expression.
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index ae2eaf70de1c2..bd335f4b6a21b 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -3501,46 +3501,47 @@ TEST_F(FormatTest, UnderstandsAccessSpecifiers) {
"label:\n"
" signals.baz();\n"
"}");
- verifyFormat("private[1];");
+
+ const auto Style = getLLVMStyle(FormatStyle::LK_C);
+ verifyFormat("private[1];", Style);
verifyFormat("testArray[public] = 1;");
- verifyFormat("public();");
+ verifyFormat("public();", Style);
verifyFormat("myFunc(public);");
verifyFormat("std::vector<int> testVec = {private};");
- verifyFormat("private.p = 1;");
+ verifyFormat("private.p = 1;", Style);
verifyFormat("void function(private...) {};");
verifyFormat("if (private && public)");
- verifyFormat("private &= true;");
+ verifyFormat("private &= true;", Style);
verifyFormat("int x = private * public;");
- verifyFormat("public *= private;");
+ verifyFormat("public *= private;", Style);
verifyFormat("int x = public + private;");
- verifyFormat("private++;");
+ verifyFormat("private++;", Style);
verifyFormat("++private;");
- verifyFormat("public += private;");
- verifyFormat("public = public - private;");
- verifyFormat("public->foo();");
- verifyFormat("private--;");
+ verifyFormat("public += private;", Style);
+ verifyFormat("public = public - private;", Style);
+ verifyFormat("public->foo();", Style);
+ verifyFormat("private--;", Style);
verifyFormat("--private;");
- verifyFormat("public -= 1;");
+ verifyFormat("public -= 1;", Style);
verifyFormat("if (!private && !public)");
- verifyFormat("public != private;");
+ verifyFormat("public != private;", Style);
verifyFormat("int x = public / private;");
- verifyFormat("public /= 2;");
- verifyFormat("public = public % 2;");
- verifyFormat("public %= 2;");
+ verifyFormat("public /= 2;", Style);
+ verifyFormat("public = public % 2;", Style);
+ verifyFormat("public %= 2;", Style);
verifyFormat("if (public < private)");
- verifyFormat("public << private;");
- verifyFormat("public <<= private;");
+ verifyFormat("public << private;", Style);
+ verifyFormat("public <<= private;", Style);
verifyFormat("if (public > private)");
- verifyFormat("public >> private;");
- verifyFormat("public >>= private;");
- verifyFormat("public ^ private;");
- verifyFormat("public ^= private;");
- verifyFormat("public | private;");
- verifyFormat("public |= private;");
+ verifyFormat("public >> private;", Style);
+ verifyFormat("public >>= private;", Style);
+ verifyFormat("public ^ private;", Style);
+ verifyFormat("public ^= private;", Style);
+ verifyFormat("public | private;", Style);
+ verifyFormat("public |= private;", Style);
verifyFormat("auto x = private ? 1 : 2;");
verifyFormat("if (public == private)");
verifyFormat("void foo(public, private)");
- verifyFormat("public::foo();");
verifyFormat("class A {\n"
"public:\n"
>From c8fd7a8a3aa1c13a71e0b90d0c60743d4901bcf4 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 6 Mar 2025 21:18:57 -0800
Subject: [PATCH 10/23] [ctxprof] Profile section for flat profiles (#129932)
A section for flat profiles (i.e. non-contextual). This is useful for debugging or for intentional cases where a root isn't identified.
This patch adds the reader/writer support. `compiler-rt` changes follow in a subsequent change.
---
.../llvm/ProfileData/PGOCtxProfReader.h | 3 +
.../llvm/ProfileData/PGOCtxProfWriter.h | 11 ++-
llvm/lib/ProfileData/PGOCtxProfReader.cpp | 73 ++++++++++++++++---
llvm/lib/ProfileData/PGOCtxProfWriter.cpp | 42 +++++++++++
.../Inputs/invalid-flat.yaml | 2 +
.../Inputs/valid-ctx-only.yaml | 14 ++++
.../Inputs/valid-flat-first.yaml | 19 +++++
.../Inputs/valid-flat-only.yaml | 6 ++
.../tools/llvm-ctxprof-util/Inputs/valid.yaml | 5 ++
.../llvm-ctxprof-util-negative.test | 2 +
.../llvm-ctxprof-util/llvm-ctxprof-util.test | 27 ++++++-
11 files changed, 190 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-flat.yaml
create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/valid-ctx-only.yaml
create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-first.yaml
create mode 100644 llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-only.yaml
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
index 33f03120a835a..4b0c944a5258c 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
@@ -174,6 +174,7 @@ using CtxProfContextualProfiles =
std::map<GlobalValue::GUID, PGOCtxProfContext>;
struct PGOCtxProfile {
CtxProfContextualProfiles Contexts;
+ CtxProfFlatProfile FlatProfiles;
PGOCtxProfile() = default;
PGOCtxProfile(const PGOCtxProfile &) = delete;
@@ -192,10 +193,12 @@ class PGOCtxProfileReader final {
Expected<std::pair<std::optional<uint32_t>, PGOCtxProfContext>>
readProfile(PGOCtxProfileBlockIDs Kind);
+ bool tryGetNextKnownBlockID(PGOCtxProfileBlockIDs &ID);
bool canEnterBlockWithID(PGOCtxProfileBlockIDs ID);
Error enterBlockWithID(PGOCtxProfileBlockIDs ID);
Error loadContexts(CtxProfContextualProfiles &P);
+ Error loadFlatProfiles(CtxProfFlatProfile &P);
public:
PGOCtxProfileReader(StringRef Buffer)
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index 0ff11c998f02a..82cf787153cd9 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -22,10 +22,14 @@ namespace llvm {
enum PGOCtxProfileRecords { Invalid = 0, Version, Guid, CalleeIndex, Counters };
enum PGOCtxProfileBlockIDs {
- ProfileMetadataBlockID = bitc::FIRST_APPLICATION_BLOCKID,
+ FIRST_VALID = bitc::FIRST_APPLICATION_BLOCKID,
+ ProfileMetadataBlockID = FIRST_VALID,
ContextsSectionBlockID = ProfileMetadataBlockID + 1,
ContextRootBlockID = ContextsSectionBlockID + 1,
ContextNodeBlockID = ContextRootBlockID + 1,
+ FlatProfilesSectionBlockID = ContextNodeBlockID + 1,
+ FlatProfileBlockID = FlatProfilesSectionBlockID + 1,
+ LAST_VALID = FlatProfileBlockID
};
/// Write one or more ContextNodes to the provided raw_fd_stream.
@@ -83,6 +87,11 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {
void writeContextual(const ctx_profile::ContextNode &RootNode) override;
void endContextSection() override;
+ void startFlatSection();
+ void writeFlatSection(ctx_profile::GUID Guid, const uint64_t *Buffer,
+ size_t BufferSize);
+ void endFlatSection();
+
// constants used in writing which a reader may find useful.
static constexpr unsigned CodeLen = 2;
static constexpr uint32_t CurrentVersion = 2;
diff --git a/llvm/lib/ProfileData/PGOCtxProfReader.cpp b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
index 63a6b013baba6..5cc4c94c74b76 100644
--- a/llvm/lib/ProfileData/PGOCtxProfReader.cpp
+++ b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
@@ -57,13 +57,24 @@ Error PGOCtxProfileReader::unsupported(const Twine &Msg) {
return make_error<InstrProfError>(instrprof_error::unsupported_version, Msg);
}
-bool PGOCtxProfileReader::canEnterBlockWithID(PGOCtxProfileBlockIDs ID) {
+bool PGOCtxProfileReader::tryGetNextKnownBlockID(PGOCtxProfileBlockIDs &ID) {
auto Blk = advance();
if (!Blk) {
consumeError(Blk.takeError());
return false;
}
- return Blk->Kind == BitstreamEntry::SubBlock && Blk->ID == ID;
+ if (Blk->Kind != BitstreamEntry::SubBlock)
+ return false;
+ if (PGOCtxProfileBlockIDs::FIRST_VALID > Blk->ID ||
+ PGOCtxProfileBlockIDs::LAST_VALID < Blk->ID)
+ return false;
+ ID = static_cast<PGOCtxProfileBlockIDs>(Blk->ID);
+ return true;
+}
+
+bool PGOCtxProfileReader::canEnterBlockWithID(PGOCtxProfileBlockIDs ID) {
+ PGOCtxProfileBlockIDs Test = {};
+ return tryGetNextKnownBlockID(Test) && Test == ID;
}
Error PGOCtxProfileReader::enterBlockWithID(PGOCtxProfileBlockIDs ID) {
@@ -71,10 +82,14 @@ Error PGOCtxProfileReader::enterBlockWithID(PGOCtxProfileBlockIDs ID) {
return Error::success();
}
+// Note: we use PGOCtxProfContext for flat profiles also, as the latter are
+// structurally similar. Alternative modeling here seems a bit overkill at the
+// moment.
Expected<std::pair<std::optional<uint32_t>, PGOCtxProfContext>>
PGOCtxProfileReader::readProfile(PGOCtxProfileBlockIDs Kind) {
assert((Kind == PGOCtxProfileBlockIDs::ContextRootBlockID ||
- Kind == PGOCtxProfileBlockIDs::ContextNodeBlockID) &&
+ Kind == PGOCtxProfileBlockIDs::ContextNodeBlockID ||
+ Kind == PGOCtxProfileBlockIDs::FlatProfileBlockID) &&
"Unexpected profile kind");
RET_ON_ERR(enterBlockWithID(Kind));
@@ -176,14 +191,24 @@ Error PGOCtxProfileReader::readMetadata() {
}
Error PGOCtxProfileReader::loadContexts(CtxProfContextualProfiles &P) {
- if (canEnterBlockWithID(PGOCtxProfileBlockIDs::ContextsSectionBlockID)) {
- RET_ON_ERR(enterBlockWithID(PGOCtxProfileBlockIDs::ContextsSectionBlockID));
- while (canEnterBlockWithID(PGOCtxProfileBlockIDs::ContextRootBlockID)) {
- EXPECT_OR_RET(E, readProfile(PGOCtxProfileBlockIDs::ContextRootBlockID));
- auto Key = E->second.guid();
- if (!P.insert({Key, std::move(E->second)}).second)
- return wrongValue("Duplicate roots");
- }
+ RET_ON_ERR(enterBlockWithID(PGOCtxProfileBlockIDs::ContextsSectionBlockID));
+ while (canEnterBlockWithID(PGOCtxProfileBlockIDs::ContextRootBlockID)) {
+ EXPECT_OR_RET(E, readProfile(PGOCtxProfileBlockIDs::ContextRootBlockID));
+ auto Key = E->second.guid();
+ if (!P.insert({Key, std::move(E->second)}).second)
+ return wrongValue("Duplicate roots");
+ }
+ return Error::success();
+}
+
+Error PGOCtxProfileReader::loadFlatProfiles(CtxProfFlatProfile &P) {
+ RET_ON_ERR(
+ enterBlockWithID(PGOCtxProfileBlockIDs::FlatProfilesSectionBlockID));
+ while (canEnterBlockWithID(PGOCtxProfileBlockIDs::FlatProfileBlockID)) {
+ EXPECT_OR_RET(E, readProfile(PGOCtxProfileBlockIDs::FlatProfileBlockID));
+ auto Guid = E->second.guid();
+ if (!P.insert({Guid, std::move(E->second.counters())}).second)
+ return wrongValue("Duplicate flat profile entries");
}
return Error::success();
}
@@ -191,7 +216,19 @@ Error PGOCtxProfileReader::loadContexts(CtxProfContextualProfiles &P) {
Expected<PGOCtxProfile> PGOCtxProfileReader::loadProfiles() {
RET_ON_ERR(readMetadata());
PGOCtxProfile Ret;
- RET_ON_ERR(loadContexts(Ret.Contexts));
+ PGOCtxProfileBlockIDs Test = {};
+ for (auto I = 0; I < 2; ++I) {
+ if (!tryGetNextKnownBlockID(Test))
+ break;
+ if (Test == PGOCtxProfileBlockIDs::ContextsSectionBlockID) {
+ RET_ON_ERR(loadContexts(Ret.Contexts));
+ } else if (Test == PGOCtxProfileBlockIDs::FlatProfilesSectionBlockID) {
+ RET_ON_ERR(loadFlatProfiles(Ret.FlatProfiles));
+ } else {
+ return wrongValue("Unexpected section");
+ }
+ }
+
return std::move(Ret);
}
@@ -287,5 +324,17 @@ void llvm::convertCtxProfToYaml(raw_ostream &OS, const PGOCtxProfile &Profile) {
toYaml(Out, Profile.Contexts);
Out.postflightKey(nullptr);
}
+ if (!Profile.FlatProfiles.empty()) {
+ Out.preflightKey("FlatProfiles", false, false, UseDefault, SaveInfo);
+ Out.beginSequence();
+ size_t ElemID = 0;
+ for (const auto &[Guid, Counters] : Profile.FlatProfiles) {
+ Out.preflightElement(ElemID++, SaveInfo);
+ toYaml(Out, Guid, Counters, {});
+ Out.postflightElement(nullptr);
+ }
+ Out.endSequence();
+ Out.postflightKey(nullptr);
+ }
Out.endMapping();
}
diff --git a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
index d4184da1c2509..d099572fc152a 100644
--- a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
+++ b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
@@ -13,6 +13,7 @@
#include "llvm/ProfileData/PGOCtxProfWriter.h"
#include "llvm/Bitstream/BitCodeEnums.h"
#include "llvm/ProfileData/CtxInstrContextNode.h"
+#include "llvm/ProfileData/PGOCtxProfReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/YAMLTraits.h"
@@ -59,6 +60,11 @@ PGOCtxProfileWriter::PGOCtxProfileWriter(
DescribeRecord(PGOCtxProfileRecords::Guid, "GUID");
DescribeRecord(PGOCtxProfileRecords::CalleeIndex, "CalleeIndex");
DescribeRecord(PGOCtxProfileRecords::Counters, "Counters");
+ DescribeBlock(PGOCtxProfileBlockIDs::FlatProfilesSectionBlockID,
+ "FlatProfiles");
+ DescribeBlock(PGOCtxProfileBlockIDs::FlatProfileBlockID, "Flat");
+ DescribeRecord(PGOCtxProfileRecords::Guid, "GUID");
+ DescribeRecord(PGOCtxProfileRecords::Counters, "Counters");
}
Writer.ExitBlock();
Writer.EnterSubblock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID, CodeLen);
@@ -108,12 +114,27 @@ void PGOCtxProfileWriter::startContextSection() {
Writer.EnterSubblock(PGOCtxProfileBlockIDs::ContextsSectionBlockID, CodeLen);
}
+void PGOCtxProfileWriter::startFlatSection() {
+ Writer.EnterSubblock(PGOCtxProfileBlockIDs::FlatProfilesSectionBlockID,
+ CodeLen);
+}
+
void PGOCtxProfileWriter::endContextSection() { Writer.ExitBlock(); }
+void PGOCtxProfileWriter::endFlatSection() { Writer.ExitBlock(); }
void PGOCtxProfileWriter::writeContextual(const ContextNode &RootNode) {
writeImpl(std::nullopt, RootNode);
}
+void PGOCtxProfileWriter::writeFlatSection(ctx_profile::GUID Guid,
+ const uint64_t *Buffer,
+ size_t Size) {
+ Writer.EnterSubblock(PGOCtxProfileBlockIDs::FlatProfileBlockID, CodeLen);
+ writeGuid(Guid);
+ writeCounters({Buffer, Size});
+ Writer.ExitBlock();
+}
+
namespace {
/// Representation of the context node suitable for yaml serialization /
@@ -123,8 +144,13 @@ struct SerializableCtxRepresentation {
std::vector<uint64_t> Counters;
std::vector<std::vector<SerializableCtxRepresentation>> Callsites;
};
+
+using SerializableFlatProfileRepresentation =
+ std::pair<ctx_profile::GUID, std::vector<uint64_t>>;
+
struct SerializableProfileRepresentation {
std::vector<SerializableCtxRepresentation> Contexts;
+ std::vector<SerializableFlatProfileRepresentation> FlatProfiles;
};
ctx_profile::ContextNode *
@@ -164,6 +190,7 @@ createNode(std::vector<std::unique_ptr<char[]>> &Nodes,
LLVM_YAML_IS_SEQUENCE_VECTOR(SerializableCtxRepresentation)
LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<SerializableCtxRepresentation>)
+LLVM_YAML_IS_SEQUENCE_VECTOR(SerializableFlatProfileRepresentation)
template <> struct yaml::MappingTraits<SerializableCtxRepresentation> {
static void mapping(yaml::IO &IO, SerializableCtxRepresentation &SCR) {
IO.mapRequired("Guid", SCR.Guid);
@@ -175,6 +202,15 @@ template <> struct yaml::MappingTraits<SerializableCtxRepresentation> {
template <> struct yaml::MappingTraits<SerializableProfileRepresentation> {
static void mapping(yaml::IO &IO, SerializableProfileRepresentation &SPR) {
IO.mapOptional("Contexts", SPR.Contexts);
+ IO.mapOptional("FlatProfiles", SPR.FlatProfiles);
+ }
+};
+
+template <> struct yaml::MappingTraits<SerializableFlatProfileRepresentation> {
+ static void mapping(yaml::IO &IO,
+ SerializableFlatProfileRepresentation &SFPR) {
+ IO.mapRequired("Guid", SFPR.first);
+ IO.mapRequired("Counters", SFPR.second);
}
};
@@ -201,6 +237,12 @@ Error llvm::createCtxProfFromYAML(StringRef Profile, raw_ostream &Out) {
}
Writer.endContextSection();
}
+ if (!SPR.FlatProfiles.empty()) {
+ Writer.startFlatSection();
+ for (const auto &[Guid, Counters] : SPR.FlatProfiles)
+ Writer.writeFlatSection(Guid, Counters.data(), Counters.size());
+ Writer.endFlatSection();
+ }
if (EC)
return createStringError(EC, "failed to write output");
return Error::success();
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-flat.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-flat.yaml
new file mode 100644
index 0000000000000..c3bc89a9a3519
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/invalid-flat.yaml
@@ -0,0 +1,2 @@
+FlatProfiles:
+ - Guid: 1
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-ctx-only.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-ctx-only.yaml
new file mode 100644
index 0000000000000..0de489dd0b1eb
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-ctx-only.yaml
@@ -0,0 +1,14 @@
+
+Contexts:
+ - Guid: 1000
+ Counters: [ 1, 2, 3 ]
+ Callsites:
+ - [ ]
+ - - Guid: 2000
+ Counters: [ 4, 5 ]
+ - Guid: 18446744073709551613
+ Counters: [ 6, 7, 8 ]
+ - - Guid: 3000
+ Counters: [ 40, 50 ]
+ - Guid: 18446744073709551612
+ Counters: [ 5, 9, 10 ]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-first.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-first.yaml
new file mode 100644
index 0000000000000..5567faaa9e0a4
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-first.yaml
@@ -0,0 +1,19 @@
+
+FlatProfiles:
+ - Guid: 1234
+ Counters: [ 5, 6, 7 ]
+ - Guid: 5555
+ Counters: [ 1 ]
+Contexts:
+ - Guid: 1000
+ Counters: [ 1, 2, 3 ]
+ Callsites:
+ - [ ]
+ - - Guid: 2000
+ Counters: [ 4, 5 ]
+ - Guid: 18446744073709551613
+ Counters: [ 6, 7, 8 ]
+ - - Guid: 3000
+ Counters: [ 40, 50 ]
+ - Guid: 18446744073709551612
+ Counters: [ 5, 9, 10 ]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-only.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-only.yaml
new file mode 100644
index 0000000000000..98231ed70d0ec
--- /dev/null
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid-flat-only.yaml
@@ -0,0 +1,6 @@
+
+FlatProfiles:
+ - Guid: 1234
+ Counters: [ 5, 6, 7 ]
+ - Guid: 5555
+ Counters: [ 1 ]
diff --git a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml
index 0de489dd0b1eb..1541b0d136514 100644
--- a/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml
+++ b/llvm/test/tools/llvm-ctxprof-util/Inputs/valid.yaml
@@ -12,3 +12,8 @@ Contexts:
Counters: [ 40, 50 ]
- Guid: 18446744073709551612
Counters: [ 5, 9, 10 ]
+FlatProfiles:
+ - Guid: 1234
+ Counters: [ 5, 6, 7 ]
+ - Guid: 5555
+ Counters: [ 1 ]
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
index 487d5ae1d17be..f312f50ffee8e 100644
--- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util-negative.test
@@ -9,6 +9,7 @@
; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-no-ctx.yaml 2>&1 | FileCheck %s --check-prefix=NO_CTX
; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-no-counters.yaml 2>&1 | FileCheck %s --check-prefix=NO_COUNTERS
; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-bad-subctx.yaml 2>&1 | FileCheck %s --check-prefix=BAD_SUBCTX
+; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/invalid-flat.yaml 2>&1 | FileCheck %s --check-prefix=BAD_FLAT
; RUN: rm -rf %t
; RUN: not llvm-ctxprof-util fromYAML --input %S/Inputs/valid.yaml --output %t/output.bitstream 2>&1 | FileCheck %s --check-prefix=NO_DIR
@@ -21,4 +22,5 @@
; NO_CTX: YAML:1:1: error: not a mapping
; NO_COUNTERS: YAML:2:5: error: missing required key 'Counters'
; BAD_SUBCTX: YAML:4:18: error: not a sequence
+; BAD_FLAT: YAML:2:5: error: missing required key 'Counters'
; NO_DIR: failed to open output
diff --git a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
index 07cbdd97210fb..a9e350388577c 100644
--- a/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
+++ b/llvm/test/tools/llvm-ctxprof-util/llvm-ctxprof-util.test
@@ -8,6 +8,21 @@
; RUN: llvm-ctxprof-util toYAML -input %t/valid.bitstream -output %t/valid2.yaml
; RUN: diff %t/valid2.yaml %S/Inputs/valid.yaml
+
+; RUN: llvm-ctxprof-util fromYAML -input %S/Inputs/valid-ctx-only.yaml -output %t/valid-ctx-only.bitstream
+; RUN: llvm-ctxprof-util toYAML -input %t/valid-ctx-only.bitstream -output %t/valid-ctx-only.yaml
+; RUN: diff %t/valid-ctx-only.yaml %S/Inputs/valid-ctx-only.yaml
+
+; RUN: llvm-ctxprof-util fromYAML -input %S/Inputs/valid-flat-only.yaml -output %t/valid-flat-only.bitstream
+; RUN: llvm-ctxprof-util toYAML -input %t/valid-flat-only.bitstream -output %t/valid-flat-only.yaml
+; RUN: diff %t/valid-flat-only.yaml %S/Inputs/valid-flat-only.yaml
+
+; This case is the "valid.yaml" case but with the flat profile first.
+; The output, though, should match valid.yaml.
+; RUN: llvm-ctxprof-util fromYAML -input %S/Inputs/valid-flat-first.yaml -output %t/valid-flat-first.bitstream
+; RUN: llvm-ctxprof-util toYAML -input %t/valid-flat-first.bitstream -output %t/valid-flat-first.yaml
+; RUN: diff %t/valid-flat-first.yaml %S/Inputs/valid.yaml
+
; For the valid case, check against a reference output.
; Note that uint64_t are printed as signed values by llvm-bcanalyzer:
; * 18446744073709551613 in yaml is -3 in the output
@@ -22,7 +37,7 @@
; EMPTY-NEXT: </Metadata>
; VALID: <BLOCKINFO_BLOCK/>
-; VALID-NEXT: <Metadata NumWords=33 BlockCodeSize=2>
+; VALID-NEXT: <Metadata NumWords=45 BlockCodeSize=2>
; VALID-NEXT: <Version op0=2/>
; VALID-NEXT: <Contexts NumWords=29 BlockCodeSize=2>
; VALID-NEXT: <Root NumWords=20 BlockCodeSize=2>
@@ -49,4 +64,14 @@
; VALID-NEXT: <Counters op0=5 op1=9 op2=10/>
; VALID-NEXT: </Root>
; VALID-NEXT: </Contexts>
+; VALID-NEXT: <FlatProfiles NumWords=10 BlockCodeSize=2>
+; VALID-NEXT: <Flat NumWords=3 BlockCodeSize=2>
+; VALID-NEXT: <GUID op0=1234/>
+; VALID-NEXT: <Counters op0=5 op1=6 op2=7/>
+; VALID-NEXT: </Flat>
+; VALID-NEXT: <Flat NumWords=2 BlockCodeSize=2>
+; VALID-NEXT: <GUID op0=5555/>
+; VALID-NEXT: <Counters op0=1/>
+; VALID-NEXT: </Flat>
+; VALID-NEXT: </FlatProfiles>
; VALID-NEXT: </Metadata>
\ No newline at end of file
>From 21b261102504c97fc0b81c101898c0f1c1a7e79c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 21 Jan 2025 15:50:32 +0700
Subject: [PATCH 11/23] AMDGPU: Add baseline tests for shufflevector of physreg
copy combine
---
.../AMDGPU/shufflevector-physreg-copy.ll | 795 ++++++++++++++++++
1 file changed, 795 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
new file mode 100644
index 0000000000000..e130ce0a45467
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -0,0 +1,795 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s
+
+; Test that we can form v_pk_mov_b32 in certain shuffles when they
+; originate from 32-bit physreg copy sequences.
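+;
+; For example, reversing an even-aligned pair could use a single
+;   v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0]
+; in place of two v_mov_b32 copies; the GFX90A/GFX940 "1132" case
+; below already forms this shape.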
+
+; TODO: Test 16-bit paired cases
+
+define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: global_store_dwordx2 v0, v[3:4], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v2i32_10_physreg_odd_vgpr_pair_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_odd_vgpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v5, v6
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: global_store_dwordx2 v0, v[4:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_odd_vgpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v5, v6
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: global_store_dwordx2 v0, v[4:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_odd_vgpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v5, v6
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v5},={v6}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v2i32_10_physreg_even_disjoint_even_vgpr_pair(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_even_disjoint_even_vgpr_pair:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v6
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v6
+; GFX900-NEXT: global_store_dwordx2 v0, v[3:4], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_disjoint_even_vgpr_pair:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v6
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[6:7], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_disjoint_even_vgpr_pair:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v6
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[6:7], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v6}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v2i32_00_physreg_even_vgpr_pair_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_00_physreg_even_vgpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v5, v4
+; GFX900-NEXT: global_store_dwordx2 v0, v[4:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_00_physreg_even_vgpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[4:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_00_physreg_even_vgpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> zeroinitializer
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, v5
+; GFX900-NEXT: global_store_dwordx2 v0, v[5:6], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> splat (i32 1)
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5, v6, v7
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: v_mov_b32_e32 v2, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v7
+; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5, v6, v7
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5, v6, v7
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
+ %asm.0 = extractvalue { i32, i32, i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32, i32, i32 } %asm, 1
+ %asm.2 = extractvalue { i32, i32, i32, i32 } %asm, 2
+ %asm.3 = extractvalue { i32, i32, i32, i32 } %asm, 3
+
+ %insert0 = insertelement <4 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <4 x i32> %insert0, i32 %asm.1, i32 1
+ %insert2 = insertelement <4 x i32> %insert1, i32 %asm.2, i32 2
+ %insert3 = insertelement <4 x i32> %insert2, i32 %asm.3, i32 3
+
+ %shuffle = shufflevector <4 x i32> %insert3, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ store <4 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+
+
+define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5, v6, v7
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: global_store_dwordx4 v0, v[3:6], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5, v6, v7
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5, v6, v7
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
+ %asm.0 = extractvalue { i32, i32, i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32, i32, i32 } %asm, 1
+ %asm.2 = extractvalue { i32, i32, i32, i32 } %asm, 2
+ %asm.3 = extractvalue { i32, i32, i32, i32 } %asm, 3
+
+ %insert0 = insertelement <4 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <4 x i32> %insert0, i32 %asm.1, i32 1
+ %insert2 = insertelement <4 x i32> %insert1, i32 %asm.2, i32 2
+ %insert3 = insertelement <4 x i32> %insert2, i32 %asm.3, i32 3
+
+ %shuffle = shufflevector <4 x i32> %insert3, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ store <4 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v4i32_1132_physreg_even_vgpr_quad_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v4i32_1132_physreg_even_vgpr_quad_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5, v6, v7
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v8, v6
+; GFX900-NEXT: v_mov_b32_e32 v6, v5
+; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v4i32_1132_physreg_even_vgpr_quad_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5, v6, v7
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v4i32_1132_physreg_even_vgpr_quad_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5, v6, v7
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
+ %asm.0 = extractvalue { i32, i32, i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32, i32, i32 } %asm, 1
+ %asm.2 = extractvalue { i32, i32, i32, i32 } %asm, 2
+ %asm.3 = extractvalue { i32, i32, i32, i32 } %asm, 3
+
+ %insert0 = insertelement <4 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <4 x i32> %insert0, i32 %asm.1, i32 1
+ %insert2 = insertelement <4 x i32> %insert1, i32 %asm.2, i32 2
+ %insert3 = insertelement <4 x i32> %insert2, i32 %asm.3, i32 3
+
+ %shuffle = shufflevector <4 x i32> %insert3, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 2>
+ store <4 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v4i32_3201_physreg_even_vgpr_quad_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v4i32_3201_physreg_even_vgpr_quad_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5, v6, v7
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, v6
+; GFX900-NEXT: v_mov_b32_e32 v2, v7
+; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v4i32_3201_physreg_even_vgpr_quad_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5, v6, v7
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v7
+; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v4i32_3201_physreg_even_vgpr_quad_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5, v6, v7
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v3, v6
+; GFX940-NEXT: v_mov_b32_e32 v2, v7
+; GFX940-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
+ %asm.0 = extractvalue { i32, i32, i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32, i32, i32 } %asm, 1
+ %asm.2 = extractvalue { i32, i32, i32, i32 } %asm, 2
+ %asm.3 = extractvalue { i32, i32, i32, i32 } %asm, 3
+
+ %insert0 = insertelement <4 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <4 x i32> %insert0, i32 %asm.1, i32 1
+ %insert2 = insertelement <4 x i32> %insert1, i32 %asm.2, i32 2
+ %insert3 = insertelement <4 x i32> %insert2, i32 %asm.3, i32 3
+
+ %shuffle = shufflevector <4 x i32> %insert3, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+ store <4 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v2i32_10_physreg_even_sgpr_pair_copy() {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_even_sgpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s4, s5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_mov_b32 s6, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[6:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_sgpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s4, s5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s7, s4
+; GFX90A-NEXT: s_mov_b32 s6, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[6:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_sgpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s4, s5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s1, s4
+; GFX940-NEXT: s_mov_b32 s0, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={s4},={s5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ call void asm sideeffect "; use $0", "s"(<2 x i32> %shuffle)
+ ret void
+}
+
+define void @shufflevector_v2i32_10_physreg_odd_sgpr_pair_copy() {
+; GFX9-LABEL: shufflevector_v2i32_10_physreg_odd_sgpr_pair_copy:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s5, s6
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[6:7]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={s5},={s6}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ call void asm sideeffect "; use $0", "s"(<2 x i32> %shuffle)
+ ret void
+}
+
+define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_even_agpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def a4, a5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_accvgpr_read_b32 v0, a5
+; GFX900-NEXT: v_accvgpr_read_b32 v1, a4
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_agpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a4, a5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a4
+; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a5
+; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_agpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def a4, a5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_accvgpr_mov_b32 a1, a4
+; GFX940-NEXT: v_accvgpr_mov_b32 a0, a5
+; GFX940-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={a4},={a5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define void @shufflevector_v2i32_10_physreg_odd_agpr_pair_copy(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_odd_agpr_pair_copy:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def a5, a6
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_accvgpr_read_b32 v0, a6
+; GFX900-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_odd_agpr_pair_copy:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a5, a6
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a6
+; GFX90A-NEXT: global_store_dwordx2 v0, a[4:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_odd_agpr_pair_copy:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def a5, a6
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_accvgpr_mov_b32 a4, a6
+; GFX940-NEXT: global_store_dwordx2 v0, a[4:5], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={a5},={a6}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ store <2 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret void
+}
+
+define i32 @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ ret i32 %asm.0 ; other use of copy
+}
+
+define i32 @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy_other_use_elt1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
+ %asm.0 = extractvalue { i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32 } %asm, 1
+ %insert0 = insertelement <2 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <2 x i32> %insert0, i32 %asm.1, i32 1
+ %shuffle = shufflevector <2 x i32> %insert1, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ ret i32 %asm.1 ; other use of copy
+}
+
+define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v4, v5, v6, v7
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v9, v5
+; GFX900-NEXT: v_mov_b32_e32 v8, v6
+; GFX900-NEXT: v_mov_b32_e32 v10, v4
+; GFX900-NEXT: global_store_dwordx4 v0, v[7:10], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v4, v5, v6, v7
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v7
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v4, v5, v6, v7
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
+ %asm.0 = extractvalue { i32, i32, i32, i32 } %asm, 0
+ %asm.1 = extractvalue { i32, i32, i32, i32 } %asm, 1
+ %asm.2 = extractvalue { i32, i32, i32, i32 } %asm, 2
+ %asm.3 = extractvalue { i32, i32, i32, i32 } %asm, 3
+
+ %insert0 = insertelement <4 x i32> poison, i32 %asm.0, i32 0
+ %insert1 = insertelement <4 x i32> %insert0, i32 %asm.1, i32 1
+ %insert2 = insertelement <4 x i32> %insert1, i32 %asm.2, i32 2
+ %insert3 = insertelement <4 x i32> %insert2, i32 %asm.3, i32 3
+
+ %shuffle = shufflevector <4 x i32> %insert3, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ store <4 x i32> %shuffle, ptr addrspace(1) %ptr, align 8
+ ret i32 %asm.2
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX90APLUS: {{.*}}
>From 1a31bb38a4bb2bc94fbbb43fe04d878cb4a5a05b Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 6 Mar 2025 21:29:25 -0800
Subject: [PATCH 12/23] [lldb][Mach-O] Don't read symbol table of specially
marked binary (#129967)
We have a binary image on Darwin that has no code, only metadata. It has
a large symbol table with many external symbol names that will not be
needed in the debugger. It is also possible that this binary is not
present on the debugger system, so lldb must read all of the symbol
names out of memory, one at a time, which can be quite slow.
We're adding a section, __TEXT,__lldb_no_nlist, to this binary to
indicate that lldb should not read its nlist symbols when we are
reading it out of memory. If lldb is run with an on-disk version of the
binary, we will load the symbol table as we normally would; there's no
benefit to handling this binary differently.
I added a test that creates a dylib with this specially named section
and launches a process using it. The main binary deletes the dylib from
disk so lldb is forced to read it out of memory. lldb attaches to the
process and confirms that the dylib is present and is a memory Module.
If the binary is not present, or lldb found the on-disk copy because it
hadn't been deleted yet, we delete the target, flush the Debugger's
module cache, sleep, and retry, up to ten times. I create the specially
named section by compiling an assembly file that puts a byte in the
section, which makes for a bit of a messy Makefile (the pre-canned
actions to build a dylib don't quite handle this case), but I don't
think it's much of a problem. This is a purely skipUnlessDarwin test case.
Relanding this change with restructured Makefiles for the test case,
which should pass on the CI bots.
rdar://146167816
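For reference, a minimal sketch of how a binary could opt in from C
rather than hand-written assembly (the attribute uses the usual Mach-O
"segment,section" form; the marker name and byte value here are
illustrative, not part of the patch):

  /* Emit one byte into __TEXT,__lldb_no_nlist; when lldb has to read
     this image out of memory, the presence of the section tells it to
     skip the nlist symbol table entirely. */
  __attribute__((used, section("__TEXT,__lldb_no_nlist")))
  static const char lldb_no_nlist_marker = 0x10;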
---
.../ObjectFile/Mach-O/ObjectFileMachO.cpp | 68 +++++++++++-------
.../ObjectFile/Mach-O/ObjectFileMachO.h | 1 +
.../macosx/no-nlist-memory-module/Makefile | 13 ++++
.../macosx/no-nlist-memory-module/NoNlists.mk | 12 ++++
.../TestNoNlistsDylib.py | 71 +++++++++++++++++++
.../no-nlist-memory-module/has-nlists.c | 1 +
.../API/macosx/no-nlist-memory-module/main.c | 62 ++++++++++++++++
.../no-nlist-memory-module/no-nlist-sect.s | 3 +
.../macosx/no-nlist-memory-module/no-nlists.c | 1 +
9 files changed, 208 insertions(+), 24 deletions(-)
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/Makefile
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/NoNlists.mk
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/TestNoNlistsDylib.py
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/has-nlists.c
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/main.c
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/no-nlist-sect.s
create mode 100644 lldb/test/API/macosx/no-nlist-memory-module/no-nlists.c
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index a19322ff1e263..f31b56b9f81e6 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -918,6 +918,11 @@ ConstString ObjectFileMachO::GetSectionNameEHFrame() {
return g_section_name_eh_frame;
}
+ConstString ObjectFileMachO::GetSectionNameLLDBNoNlist() {
+ static ConstString g_section_name_lldb_no_nlist("__lldb_no_nlist");
+ return g_section_name_lldb_no_nlist;
+}
+
bool ObjectFileMachO::MagicBytesMatch(DataBufferSP data_sp,
lldb::addr_t data_offset,
lldb::addr_t data_length) {
@@ -2394,8 +2399,39 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) {
uint32_t memory_module_load_level = eMemoryModuleLoadLevelComplete;
bool is_shared_cache_image = IsSharedCacheBinary();
bool is_local_shared_cache_image = is_shared_cache_image && !IsInMemory();
+
+ ConstString g_segment_name_TEXT = GetSegmentNameTEXT();
+ ConstString g_segment_name_DATA = GetSegmentNameDATA();
+ ConstString g_segment_name_DATA_DIRTY = GetSegmentNameDATA_DIRTY();
+ ConstString g_segment_name_DATA_CONST = GetSegmentNameDATA_CONST();
+ ConstString g_segment_name_OBJC = GetSegmentNameOBJC();
+ ConstString g_section_name_eh_frame = GetSectionNameEHFrame();
+ ConstString g_section_name_lldb_no_nlist = GetSectionNameLLDBNoNlist();
+ SectionSP text_section_sp(
+ section_list->FindSectionByName(g_segment_name_TEXT));
+ SectionSP data_section_sp(
+ section_list->FindSectionByName(g_segment_name_DATA));
SectionSP linkedit_section_sp(
section_list->FindSectionByName(GetSegmentNameLINKEDIT()));
+ SectionSP data_dirty_section_sp(
+ section_list->FindSectionByName(g_segment_name_DATA_DIRTY));
+ SectionSP data_const_section_sp(
+ section_list->FindSectionByName(g_segment_name_DATA_CONST));
+ SectionSP objc_section_sp(
+ section_list->FindSectionByName(g_segment_name_OBJC));
+ SectionSP eh_frame_section_sp;
+ SectionSP lldb_no_nlist_section_sp;
+ if (text_section_sp.get()) {
+ eh_frame_section_sp = text_section_sp->GetChildren().FindSectionByName(
+ g_section_name_eh_frame);
+ lldb_no_nlist_section_sp = text_section_sp->GetChildren().FindSectionByName(
+ g_section_name_lldb_no_nlist);
+ } else {
+ eh_frame_section_sp =
+ section_list->FindSectionByName(g_section_name_eh_frame);
+ lldb_no_nlist_section_sp =
+ section_list->FindSectionByName(g_section_name_lldb_no_nlist);
+ }
if (process && m_header.filetype != llvm::MachO::MH_OBJECT &&
!is_local_shared_cache_image) {
@@ -2403,6 +2439,14 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) {
memory_module_load_level = target.GetMemoryModuleLoadLevel();
+ // If the __TEXT,__lldb_no_nlist section is present in this binary,
+ // and we're reading it out of memory, do not read any of the
+ // nlist entries. They are not needed by lldb and may be expensive
+ // to load. This handles a dylib consisting only of metadata, with
+ // no code but many nlist entries.
+ if (lldb_no_nlist_section_sp)
+ memory_module_load_level = eMemoryModuleLoadLevelMinimal;
+
// Reading mach file from memory in a process or core file...
if (linkedit_section_sp) {
@@ -2526,30 +2570,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) {
const bool have_strtab_data = strtab_data.GetByteSize() > 0;
- ConstString g_segment_name_TEXT = GetSegmentNameTEXT();
- ConstString g_segment_name_DATA = GetSegmentNameDATA();
- ConstString g_segment_name_DATA_DIRTY = GetSegmentNameDATA_DIRTY();
- ConstString g_segment_name_DATA_CONST = GetSegmentNameDATA_CONST();
- ConstString g_segment_name_OBJC = GetSegmentNameOBJC();
- ConstString g_section_name_eh_frame = GetSectionNameEHFrame();
- SectionSP text_section_sp(
- section_list->FindSectionByName(g_segment_name_TEXT));
- SectionSP data_section_sp(
- section_list->FindSectionByName(g_segment_name_DATA));
- SectionSP data_dirty_section_sp(
- section_list->FindSectionByName(g_segment_name_DATA_DIRTY));
- SectionSP data_const_section_sp(
- section_list->FindSectionByName(g_segment_name_DATA_CONST));
- SectionSP objc_section_sp(
- section_list->FindSectionByName(g_segment_name_OBJC));
- SectionSP eh_frame_section_sp;
- if (text_section_sp.get())
- eh_frame_section_sp = text_section_sp->GetChildren().FindSectionByName(
- g_section_name_eh_frame);
- else
- eh_frame_section_sp =
- section_list->FindSectionByName(g_section_name_eh_frame);
-
const bool is_arm = (m_header.cputype == llvm::MachO::CPU_TYPE_ARM);
const bool always_thumb = GetArchitecture().IsAlwaysThumbInstructions();
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
index 27b2078b5a3fc..7f67f5e04f1d6 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
@@ -286,6 +286,7 @@ class ObjectFileMachO : public lldb_private::ObjectFile {
static lldb_private::ConstString GetSegmentNameDWARF();
static lldb_private::ConstString GetSegmentNameLLVM_COV();
static lldb_private::ConstString GetSectionNameEHFrame();
+ static lldb_private::ConstString GetSectionNameLLDBNoNlist();
llvm::MachO::dysymtab_command m_dysymtab;
std::vector<llvm::MachO::section_64> m_mach_sections;
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/Makefile b/lldb/test/API/macosx/no-nlist-memory-module/Makefile
new file mode 100644
index 0000000000000..456e57ac8e65c
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/Makefile
@@ -0,0 +1,13 @@
+C_SOURCES := main.c
+LD_EXTRAS = -Wl,-rpath "-Wl,$(shell pwd)" -L. -lno-nlists -lhas-nlists
+
+include Makefile.rules
+
+a.out: dylib_HasNlists dylib_NoNlists
+
+dylib_HasNlists:
+ "$(MAKE)" -f $(MAKEFILE_RULES) \
+ DYLIB_ONLY=YES DYLIB_NAME=has-nlists DYLIB_C_SOURCES=has-nlists.c
+
+dylib_NoNlists:
+ "$(MAKE)" VPATH=$(SRCDIR) -I $(SRCDIR) -f $(SRCDIR)/NoNlists.mk
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/NoNlists.mk b/lldb/test/API/macosx/no-nlist-memory-module/NoNlists.mk
new file mode 100644
index 0000000000000..046390c028186
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/NoNlists.mk
@@ -0,0 +1,12 @@
+DYLIB_ONLY := YES
+DYLIB_NAME := no-nlists
+DYLIB_C_SOURCES := no-nlists.c
+DYLIB_OBJECTS += no-nlist-sect.o
+
+no-nlist-sect.o:
+ $(CC) $(CFLAGS) -c -o no-nlist-sect.o $(SRCDIR)/no-nlist-sect.s
+
+include Makefile.rules
+
+clean::
+ rm -rf *.o *.dylib a.out *.dSYM
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/TestNoNlistsDylib.py b/lldb/test/API/macosx/no-nlist-memory-module/TestNoNlistsDylib.py
new file mode 100644
index 0000000000000..9216cf2eab164
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/TestNoNlistsDylib.py
@@ -0,0 +1,71 @@
+"""
+Test that we don't read the nlist symbols for a specially marked dylib
+when it is read from memory.
+"""
+
+import os
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+import time
+from time import sleep
+
+
+class NoNlistsTestCase(TestBase):
+ NO_DEBUG_INFO_TESTCASE = True
+
+ @skipIfRemote
+ @skipUnlessDarwin
+ def test_no_nlist_symbols(self):
+ self.build()
+
+ exe = os.path.realpath(self.getBuildArtifact("a.out"))
+
+ # Use a file as a synchronization point between test and inferior.
+ pid_file_path = lldbutil.append_to_process_working_directory(
+ self, "pid_file_%d" % (int(time.time()))
+ )
+ self.addTearDownHook(
+ lambda: self.run_platform_command("rm %s" % (pid_file_path))
+ )
+
+ # Spawn a new process
+ popen = self.spawnSubprocess(exe, [pid_file_path])
+
+ pid = lldbutil.wait_for_file_on_target(self, pid_file_path)
+
+ os.unlink(self.getBuildArtifact("libno-nlists.dylib"))
+ os.unlink(self.getBuildArtifact("libhas-nlists.dylib"))
+
+ self.runCmd("process attach -p " + str(pid))
+ target = self.dbg.GetSelectedTarget()
+ process = target.GetProcess()
+ m_no_nlist = target.FindModule(lldb.SBFileSpec("libno-nlists.dylib"))
+ m_has_nlist = target.FindModule(lldb.SBFileSpec("libhas-nlists.dylib"))
+
+ self.assertTrue(process, PROCESS_IS_VALID)
+
+ if self.TraceOn():
+ self.runCmd("image list")
+ self.runCmd("target modules dump symtab libno-nlists.dylib")
+ self.runCmd("target modules dump symtab libhas-nlists.dylib")
+
+ # Test that we found libno-nlists.dylib, it is a memory
+ # module, and that it has no symbols.
+ self.assertTrue(m_no_nlist.IsValid())
+ self.assertFalse(m_no_nlist.IsFileBacked())
+ self.assertEqual(m_no_nlist.GetNumSymbols(), 0)
+
+ # Test that we found libhas-nlists.dylib, it is a memory
+ # module, and that it has more than zero symbols.
+ self.assertTrue(m_has_nlist.IsValid())
+ self.assertFalse(m_has_nlist.IsFileBacked())
+ self.assertGreater(m_has_nlist.GetNumSymbols(), 0)
+
+ # And as a sanity check, get the main binary's module,
+ # test that it is file backed and that it has more than
+ # zero symbols.
+ m_exe = target.FindModule(lldb.SBFileSpec("a.out"))
+ self.assertTrue(m_exe.IsValid())
+ self.assertTrue(m_exe.IsFileBacked())
+ self.assertGreater(m_exe.GetNumSymbols(), 0)
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/has-nlists.c b/lldb/test/API/macosx/no-nlist-memory-module/has-nlists.c
new file mode 100644
index 0000000000000..5cfcb5118bbb7
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/has-nlists.c
@@ -0,0 +1 @@
+int get_return_value2() { return 20; }
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/main.c b/lldb/test/API/macosx/no-nlist-memory-module/main.c
new file mode 100644
index 0000000000000..16a8e4e7ae34b
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/main.c
@@ -0,0 +1,62 @@
+#include <fcntl.h>
+#include <libgen.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+int get_return_value();
+int get_return_value2();
+
+// Create \a file_name with the c-string of our
+// pid in it. Initially open & write the contents
+// to a temporary file, then move it to the actual
+// filename once writing is completed.
+bool writePid(const char *file_name, const pid_t pid) {
+ char *tmp_file_name = (char *)malloc(strlen(file_name) + 16);
+ strcpy(tmp_file_name, file_name);
+ strcat(tmp_file_name, "_tmp");
+ int fd = open(tmp_file_name, O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ fprintf(stderr, "open(%s) failed: %s\n", tmp_file_name, strerror(errno));
+ free(tmp_file_name);
+ return false;
+ }
+ char buffer[64];
+ snprintf(buffer, sizeof(buffer), "%ld", (long)pid);
+ bool res = true;
+ if (write(fd, buffer, strlen(buffer)) == -1) {
+ fprintf(stderr, "write(%s) failed: %s\n", buffer, strerror(errno));
+ res = false;
+ }
+ close(fd);
+
+ if (rename(tmp_file_name, file_name) == -1) {
+ fprintf(stderr, "rename(%s, %s) failed: %s\n", tmp_file_name, file_name,
+ strerror(errno));
+ res = false;
+ }
+ free(tmp_file_name);
+
+ return res;
+}
+
+int main(int argc, char **argv) {
+ if (writePid(argv[1], getpid())) {
+ // We've signaled lldb that we are ready to be attached to;
+ // this sleep() call will be interrupted when lldb
+ // attaches.
+ sleep(200);
+ } else {
+ printf("Error writing pid to '%s', exiting.\n", argv[1]);
+ exit(3);
+ }
+
+ int retval = get_return_value();
+ return retval + get_return_value2();
+}
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/no-nlist-sect.s b/lldb/test/API/macosx/no-nlist-memory-module/no-nlist-sect.s
new file mode 100644
index 0000000000000..0a7c974f9362c
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/no-nlist-sect.s
@@ -0,0 +1,3 @@
+ .section __TEXT,__lldb_no_nlist,regular,pure_instructions
+ .p2align 2
+ .byte 0x10
diff --git a/lldb/test/API/macosx/no-nlist-memory-module/no-nlists.c b/lldb/test/API/macosx/no-nlist-memory-module/no-nlists.c
new file mode 100644
index 0000000000000..ff81940087ab7
--- /dev/null
+++ b/lldb/test/API/macosx/no-nlist-memory-module/no-nlists.c
@@ -0,0 +1 @@
+int get_return_value() { return 10; }
>From ed6bde93f050b9314e54ecbc69c08ab78f4251af Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy at gmail.com>
Date: Fri, 7 Mar 2025 06:39:03 +0100
Subject: [PATCH 13/23] [X86] Use fence(seq_cst) in IdempotentRMWIntoFencedLoad
(#126521)
This extends the optimization to scenarios where the subtarget
has `!hasMFence` or the operation uses SyncScope SingleThread, by
avoiding direct use of `llvm.x86.sse2.mfence`.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +-
.../X86/atomic-idempotent-syncscope.ll | 615 ++++++++++++++++++
llvm/test/CodeGen/X86/atomic-idempotent.ll | 162 ++---
3 files changed, 665 insertions(+), 129 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index deab638b7e546..74d96a0219a0e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31905,21 +31905,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SSID == SyncScope::SingleThread)
- // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
- // the IR level, so we must wrap it in an intrinsic.
- return nullptr;
-
- if (!Subtarget.hasMFence())
- // FIXME: it might make sense to use a locked operation here but on a
- // different cache-line to prevent cache-line bouncing. In practice it
- // is probably a small win, and x86 processors without mfence are rare
- // enough that we do not bother.
- return nullptr;
- Function *MFence =
- llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
- Builder.CreateCall(MFence, {});
+ // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
+ // lowering for SSID == SyncScope::SingleThread and !hasMFence.
+ Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
new file mode 100644
index 0000000000000..9e20fdb59f552
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
@@ -0,0 +1,615 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SSE2
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=slm -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=goldmont -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=knl -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
+; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=atom -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-ATOM
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by a fence followed by a mov; at
+; single-thread scope the fence becomes a compiler-only barrier (#MEMBARRIER).
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
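+;
+; As a sketch (the load ordering follows the existing lowering), an
+; idempotent RMW such as
+;   %v = atomicrmw add ptr %p, i8 0 syncscope("singlethread") monotonic
+; is expanded to
+;   fence syncscope("singlethread") seq_cst
+;   %v = load atomic i8, ptr %p monotonic, align 1
+; and the single-thread fence lowers to #MEMBARRIER rather than mfence.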
+
+define i8 @add8(ptr %p) #0 {
+; X64-LABEL: add8:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: movzbl (%rdi), %eax
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: add8:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: movzbl (%eax), %eax
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: add8:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: movzbl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ %1 = atomicrmw add ptr %p, i8 0 syncscope("singlethread") monotonic
+ ret i8 %1
+}
+
+define i16 @or16(ptr %p) #0 {
+; X64-LABEL: or16:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or16:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: movzwl (%eax), %eax
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or16:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: movzwl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ %1 = atomicrmw or ptr %p, i16 0 syncscope("singlethread") acquire
+ ret i16 %1
+}
+
+define i32 @xor32(ptr %p) #0 {
+; X64-LABEL: xor32:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: xor32:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: movl (%eax), %eax
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: xor32:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ %1 = atomicrmw xor ptr %p, i32 0 syncscope("singlethread") release
+ ret i32 %1
+}
+
+define i64 @sub64(ptr %p) #0 {
+; X64-LABEL: sub64:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: sub64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl (%esi), %eax
+; X86-NEXT: movl 4(%esi), %edx
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB3_1: # %atomicrmw.start
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: lock cmpxchg8b (%esi)
+; X86-NEXT: jne .LBB3_1
+; X86-NEXT: # %bb.2: # %atomicrmw.end
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ %1 = atomicrmw sub ptr %p, i64 0 syncscope("singlethread") seq_cst
+ ret i64 %1
+}
+
+define i128 @or128(ptr %p) #0 {
+; X64-LABEL: or128:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: callq __atomic_fetch_or_16 at PLT
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or128:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: pushl %ebp
+; X86-GENERIC-NEXT: movl %esp, %ebp
+; X86-GENERIC-NEXT: pushl %ebx
+; X86-GENERIC-NEXT: pushl %edi
+; X86-GENERIC-NEXT: pushl %esi
+; X86-GENERIC-NEXT: andl $-16, %esp
+; X86-GENERIC-NEXT: subl $48, %esp
+; X86-GENERIC-NEXT: movl 12(%ebp), %edi
+; X86-GENERIC-NEXT: movl 12(%edi), %ecx
+; X86-GENERIC-NEXT: movl 8(%edi), %edx
+; X86-GENERIC-NEXT: movl (%edi), %ebx
+; X86-GENERIC-NEXT: movl 4(%edi), %esi
+; X86-GENERIC-NEXT: .p2align 4
+; X86-GENERIC-NEXT: .LBB4_1: # %atomicrmw.start
+; X86-GENERIC-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-GENERIC-NEXT: movl %ebx, (%esp)
+; X86-GENERIC-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: pushl $0
+; X86-GENERIC-NEXT: pushl $0
+; X86-GENERIC-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: pushl %eax
+; X86-GENERIC-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: pushl %eax
+; X86-GENERIC-NEXT: pushl %edi
+; X86-GENERIC-NEXT: pushl $16
+; X86-GENERIC-NEXT: calll __atomic_compare_exchange at PLT
+; X86-GENERIC-NEXT: addl $24, %esp
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-GENERIC-NEXT: movl (%esp), %ebx
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-GENERIC-NEXT: testb %al, %al
+; X86-GENERIC-NEXT: je .LBB4_1
+; X86-GENERIC-NEXT: # %bb.2: # %atomicrmw.end
+; X86-GENERIC-NEXT: movl 8(%ebp), %eax
+; X86-GENERIC-NEXT: movl %ebx, (%eax)
+; X86-GENERIC-NEXT: movl %esi, 4(%eax)
+; X86-GENERIC-NEXT: movl %edx, 8(%eax)
+; X86-GENERIC-NEXT: movl %ecx, 12(%eax)
+; X86-GENERIC-NEXT: leal -12(%ebp), %esp
+; X86-GENERIC-NEXT: popl %esi
+; X86-GENERIC-NEXT: popl %edi
+; X86-GENERIC-NEXT: popl %ebx
+; X86-GENERIC-NEXT: popl %ebp
+; X86-GENERIC-NEXT: retl $4
+;
+; X86-ATOM-LABEL: or128:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: pushl %ebp
+; X86-ATOM-NEXT: movl %esp, %ebp
+; X86-ATOM-NEXT: pushl %ebx
+; X86-ATOM-NEXT: pushl %edi
+; X86-ATOM-NEXT: pushl %esi
+; X86-ATOM-NEXT: andl $-16, %esp
+; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT: movl 12(%ebp), %edi
+; X86-ATOM-NEXT: movl 12(%edi), %ecx
+; X86-ATOM-NEXT: movl 8(%edi), %edx
+; X86-ATOM-NEXT: movl (%edi), %esi
+; X86-ATOM-NEXT: movl 4(%edi), %ebx
+; X86-ATOM-NEXT: .p2align 4
+; X86-ATOM-NEXT: .LBB4_1: # %atomicrmw.start
+; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT: movl %esi, (%esp)
+; X86-ATOM-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: pushl $0
+; X86-ATOM-NEXT: pushl $0
+; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: pushl %eax
+; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: pushl %eax
+; X86-ATOM-NEXT: pushl %edi
+; X86-ATOM-NEXT: pushl $16
+; X86-ATOM-NEXT: calll __atomic_compare_exchange at PLT
+; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-ATOM-NEXT: testb %al, %al
+; X86-ATOM-NEXT: movl (%esp), %esi
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-ATOM-NEXT: je .LBB4_1
+; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl 8(%ebp), %eax
+; X86-ATOM-NEXT: movl %esi, (%eax)
+; X86-ATOM-NEXT: movl %ebx, 4(%eax)
+; X86-ATOM-NEXT: movl %edx, 8(%eax)
+; X86-ATOM-NEXT: movl %ecx, 12(%eax)
+; X86-ATOM-NEXT: leal -12(%ebp), %esp
+; X86-ATOM-NEXT: popl %esi
+; X86-ATOM-NEXT: popl %edi
+; X86-ATOM-NEXT: popl %ebx
+; X86-ATOM-NEXT: popl %ebp
+; X86-ATOM-NEXT: retl $4
+ %1 = atomicrmw or ptr %p, i128 0 syncscope("singlethread") monotonic
+ ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
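+; A brief illustration: `atomicrmw and ptr %p, i32 -1` leaves memory unchanged,
+; just like `atomicrmw or ptr %p, i32 0`, so the same barrier-plus-load
+; lowering applies, assuming the backend recognizes the idempotent pattern.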
+define i32 @and32 (ptr %p) #0 {
+; X64-LABEL: and32:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: and32:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: movl (%eax), %eax
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: and32:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ %1 = atomicrmw and ptr %p, i32 -1 syncscope("singlethread") acq_rel
+ ret i32 %1
+}
+
+define void @or32_nouse_monotonic(ptr %p) #0 {
+; X64-LABEL: or32_nouse_monotonic:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or32_nouse_monotonic:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or32_nouse_monotonic:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i32 0 syncscope("singlethread") monotonic
+ ret void
+}
+
+
+define void @or32_nouse_acquire(ptr %p) #0 {
+; X64-LABEL: or32_nouse_acquire:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or32_nouse_acquire:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or32_nouse_acquire:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i32 0 acquire
+ ret void
+}
+
+define void @or32_nouse_release(ptr %p) #0 {
+; X64-LABEL: or32_nouse_release:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or32_nouse_release:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or32_nouse_release:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i32 0 syncscope("singlethread") release
+ ret void
+}
+
+define void @or32_nouse_acq_rel(ptr %p) #0 {
+; X64-LABEL: or32_nouse_acq_rel:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or32_nouse_acq_rel:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or32_nouse_acq_rel:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i32 0 syncscope("singlethread") acq_rel
+ ret void
+}
+
+define void @or32_nouse_seq_cst(ptr %p) #0 {
+; X64-LABEL: or32_nouse_seq_cst:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or32_nouse_seq_cst:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or32_nouse_seq_cst:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i32 0 syncscope("singlethread") seq_cst
+ ret void
+}
+
+; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
+define void @or64_nouse_seq_cst(ptr %p) #0 {
+; X64-LABEL: or64_nouse_seq_cst:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-LABEL: or64_nouse_seq_cst:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl (%esi), %eax
+; X86-NEXT: movl 4(%esi), %edx
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB11_1: # %atomicrmw.start
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: lock cmpxchg8b (%esi)
+; X86-NEXT: jne .LBB11_1
+; X86-NEXT: # %bb.2: # %atomicrmw.end
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+ atomicrmw or ptr %p, i64 0 syncscope("singlethread") seq_cst
+ ret void
+}
+
+; TODO: Don't need to lower as sync_and_fetch call
+define void @or128_nouse_seq_cst(ptr %p) #0 {
+; X64-LABEL: or128_nouse_seq_cst:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: movl $5, %ecx
+; X64-NEXT: callq __atomic_fetch_or_16 at PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or128_nouse_seq_cst:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: pushl %ebp
+; X86-GENERIC-NEXT: movl %esp, %ebp
+; X86-GENERIC-NEXT: pushl %ebx
+; X86-GENERIC-NEXT: pushl %edi
+; X86-GENERIC-NEXT: pushl %esi
+; X86-GENERIC-NEXT: andl $-16, %esp
+; X86-GENERIC-NEXT: subl $48, %esp
+; X86-GENERIC-NEXT: movl 8(%ebp), %esi
+; X86-GENERIC-NEXT: movl 12(%esi), %ecx
+; X86-GENERIC-NEXT: movl 8(%esi), %edi
+; X86-GENERIC-NEXT: movl (%esi), %edx
+; X86-GENERIC-NEXT: movl 4(%esi), %ebx
+; X86-GENERIC-NEXT: .p2align 4
+; X86-GENERIC-NEXT: .LBB12_1: # %atomicrmw.start
+; X86-GENERIC-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-GENERIC-NEXT: movl %edx, (%esp)
+; X86-GENERIC-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT: pushl $5
+; X86-GENERIC-NEXT: pushl $5
+; X86-GENERIC-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: pushl %eax
+; X86-GENERIC-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT: pushl %eax
+; X86-GENERIC-NEXT: pushl %esi
+; X86-GENERIC-NEXT: pushl $16
+; X86-GENERIC-NEXT: calll __atomic_compare_exchange at PLT
+; X86-GENERIC-NEXT: addl $24, %esp
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-GENERIC-NEXT: movl (%esp), %edx
+; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-GENERIC-NEXT: testb %al, %al
+; X86-GENERIC-NEXT: je .LBB12_1
+; X86-GENERIC-NEXT: # %bb.2: # %atomicrmw.end
+; X86-GENERIC-NEXT: leal -12(%ebp), %esp
+; X86-GENERIC-NEXT: popl %esi
+; X86-GENERIC-NEXT: popl %edi
+; X86-GENERIC-NEXT: popl %ebx
+; X86-GENERIC-NEXT: popl %ebp
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or128_nouse_seq_cst:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: pushl %ebp
+; X86-ATOM-NEXT: movl %esp, %ebp
+; X86-ATOM-NEXT: pushl %ebx
+; X86-ATOM-NEXT: pushl %edi
+; X86-ATOM-NEXT: pushl %esi
+; X86-ATOM-NEXT: andl $-16, %esp
+; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT: movl 8(%ebp), %esi
+; X86-ATOM-NEXT: movl %esp, %ebx
+; X86-ATOM-NEXT: movl 12(%esi), %ecx
+; X86-ATOM-NEXT: movl 8(%esi), %edx
+; X86-ATOM-NEXT: movl (%esi), %eax
+; X86-ATOM-NEXT: movl 4(%esi), %edi
+; X86-ATOM-NEXT: .p2align 4
+; X86-ATOM-NEXT: .LBB12_1: # %atomicrmw.start
+; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-ATOM-NEXT: movl %eax, (%esp)
+; X86-ATOM-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT: pushl $5
+; X86-ATOM-NEXT: pushl $5
+; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: pushl %eax
+; X86-ATOM-NEXT: pushl %ebx
+; X86-ATOM-NEXT: pushl %esi
+; X86-ATOM-NEXT: pushl $16
+; X86-ATOM-NEXT: calll __atomic_compare_exchange at PLT
+; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %esp
+; X86-ATOM-NEXT: testb %al, %al
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-ATOM-NEXT: movl (%esp), %eax
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-ATOM-NEXT: je .LBB12_1
+; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: leal -12(%ebp), %esp
+; X86-ATOM-NEXT: popl %esi
+; X86-ATOM-NEXT: popl %edi
+; X86-ATOM-NEXT: popl %ebx
+; X86-ATOM-NEXT: popl %ebp
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i128 0 syncscope("singlethread") seq_cst
+ ret void
+}
+
+
+define void @or16_nouse_seq_cst(ptr %p) #0 {
+; X64-LABEL: or16_nouse_seq_cst:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or16_nouse_seq_cst:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or16_nouse_seq_cst:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i16 0 syncscope("singlethread") seq_cst
+ ret void
+}
+
+define void @or8_nouse_seq_cst(ptr %p) #0 {
+; X64-LABEL: or8_nouse_seq_cst:
+; X64: # %bb.0:
+; X64-NEXT: #MEMBARRIER
+; X64-NEXT: retq
+;
+; X86-GENERIC-LABEL: or8_nouse_seq_cst:
+; X86-GENERIC: # %bb.0:
+; X86-GENERIC-NEXT: #MEMBARRIER
+; X86-GENERIC-NEXT: retl
+;
+; X86-ATOM-LABEL: or8_nouse_seq_cst:
+; X86-ATOM: # %bb.0:
+; X86-ATOM-NEXT: #MEMBARRIER
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: retl
+ atomicrmw or ptr %p, i8 0 syncscope("singlethread") seq_cst
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X86-SLM: {{.*}}
+; X86-SSE2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 55b4d1af094f6..91355bd64cade 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -11,7 +11,7 @@
; This is explained (with the motivation for such an optimization) in
; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
-define i8 @add8(ptr %p) {
+define i8 @add8(ptr %p) #0 {
; X64-LABEL: add8:
; X64: # %bb.0:
; X64-NEXT: mfence
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
;
; X86-SLM-LABEL: add8:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: xorl %eax, %eax
-; X86-SLM-NEXT: lock xaddb %al, (%ecx)
-; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movzbl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: add8:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: xorl %eax, %eax
-; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
-; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movzbl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
@@ -46,7 +44,7 @@ define i8 @add8(ptr %p) {
ret i8 %1
}
-define i16 @or16(ptr %p) {
+define i16 @or16(ptr %p) #0 {
; X64-LABEL: or16:
; X64: # %bb.0:
; X64-NEXT: mfence
@@ -62,32 +60,24 @@ define i16 @or16(ptr %p) {
;
; X86-SLM-LABEL: or16:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movzwl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4
-; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
-; X86-SLM-NEXT: jne .LBB1_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movzwl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: or16:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movzwl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4
-; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB1_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movzwl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw or ptr %p, i16 0 acquire
ret i16 %1
}
-define i32 @xor32(ptr %p) {
+define i32 @xor32(ptr %p) #0 {
; X64-LABEL: xor32:
; X64: # %bb.0:
; X64-NEXT: mfence
@@ -103,32 +93,24 @@ define i32 @xor32(ptr %p) {
;
; X86-SLM-LABEL: xor32:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4
-; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT: jne .LBB2_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: xor32:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4
-; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB2_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw xor ptr %p, i32 0 release
ret i32 %1
}
-define i64 @sub64(ptr %p) {
+define i64 @sub64(ptr %p) #0 {
; X64-LABEL: sub64:
; X64: # %bb.0:
; X64-NEXT: mfence
@@ -138,11 +120,7 @@ define i64 @sub64(ptr %p) {
; X86-LABEL: sub64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 12
-; X86-NEXT: .cfi_offset %esi, -12
-; X86-NEXT: .cfi_offset %ebx, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %eax
; X86-NEXT: movl 4(%esi), %edx
@@ -155,42 +133,32 @@ define i64 @sub64(ptr %p) {
; X86-NEXT: jne .LBB3_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
; X86-NEXT: popl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebx
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
%1 = atomicrmw sub ptr %p, i64 0 seq_cst
ret i64 %1
}
-define i128 @or128(ptr %p) {
+define i128 @or128(ptr %p) #0 {
; X64-LABEL: or128:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
-; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: callq __atomic_fetch_or_16 at PLT
; X64-NEXT: popq %rcx
-; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
;
; X86-GENERIC-LABEL: or128:
; X86-GENERIC: # %bb.0:
; X86-GENERIC-NEXT: pushl %ebp
-; X86-GENERIC-NEXT: .cfi_def_cfa_offset 8
-; X86-GENERIC-NEXT: .cfi_offset %ebp, -8
; X86-GENERIC-NEXT: movl %esp, %ebp
-; X86-GENERIC-NEXT: .cfi_def_cfa_register %ebp
; X86-GENERIC-NEXT: pushl %ebx
; X86-GENERIC-NEXT: pushl %edi
; X86-GENERIC-NEXT: pushl %esi
; X86-GENERIC-NEXT: andl $-16, %esp
; X86-GENERIC-NEXT: subl $48, %esp
-; X86-GENERIC-NEXT: .cfi_offset %esi, -20
-; X86-GENERIC-NEXT: .cfi_offset %edi, -16
-; X86-GENERIC-NEXT: .cfi_offset %ebx, -12
; X86-GENERIC-NEXT: movl 12(%ebp), %edi
; X86-GENERIC-NEXT: movl 12(%edi), %ecx
; X86-GENERIC-NEXT: movl 8(%edi), %edx
@@ -234,24 +202,17 @@ define i128 @or128(ptr %p) {
; X86-GENERIC-NEXT: popl %edi
; X86-GENERIC-NEXT: popl %ebx
; X86-GENERIC-NEXT: popl %ebp
-; X86-GENERIC-NEXT: .cfi_def_cfa %esp, 4
; X86-GENERIC-NEXT: retl $4
;
; X86-ATOM-LABEL: or128:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: pushl %ebp
-; X86-ATOM-NEXT: .cfi_def_cfa_offset 8
-; X86-ATOM-NEXT: .cfi_offset %ebp, -8
; X86-ATOM-NEXT: movl %esp, %ebp
-; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp
; X86-ATOM-NEXT: pushl %ebx
; X86-ATOM-NEXT: pushl %edi
; X86-ATOM-NEXT: pushl %esi
; X86-ATOM-NEXT: andl $-16, %esp
; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
-; X86-ATOM-NEXT: .cfi_offset %esi, -20
-; X86-ATOM-NEXT: .cfi_offset %edi, -16
-; X86-ATOM-NEXT: .cfi_offset %ebx, -12
; X86-ATOM-NEXT: movl 12(%ebp), %edi
; X86-ATOM-NEXT: movl 12(%edi), %ecx
; X86-ATOM-NEXT: movl 8(%edi), %edx
@@ -295,14 +256,13 @@ define i128 @or128(ptr %p) {
; X86-ATOM-NEXT: popl %edi
; X86-ATOM-NEXT: popl %ebx
; X86-ATOM-NEXT: popl %ebp
-; X86-ATOM-NEXT: .cfi_def_cfa %esp, 4
; X86-ATOM-NEXT: retl $4
%1 = atomicrmw or ptr %p, i128 0 monotonic
ret i128 %1
}
; For 'and', the idempotent value is (-1)
-define i32 @and32 (ptr %p) {
+define i32 @and32 (ptr %p) #0 {
; X64-LABEL: and32:
; X64: # %bb.0:
; X64-NEXT: mfence
@@ -318,32 +278,24 @@ define i32 @and32 (ptr %p) {
;
; X86-SLM-LABEL: and32:
; X86-SLM: # %bb.0:
-; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLM-NEXT: movl (%ecx), %eax
-; X86-SLM-NEXT: .p2align 4
-; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
-; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-SLM-NEXT: jne .LBB5_1
-; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLM-NEXT: lock orl $0, (%esp)
+; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: and32:
; X86-ATOM: # %bb.0:
-; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-ATOM-NEXT: movl (%ecx), %eax
-; X86-ATOM-NEXT: .p2align 4
-; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
-; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
-; X86-ATOM-NEXT: jne .LBB5_1
-; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
+; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT: lock orl $0, (%esp)
+; X86-ATOM-NEXT: movl (%eax), %eax
+; X86-ATOM-NEXT: nop
+; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
ret i32 %1
}
-define void @or32_nouse_monotonic(ptr %p) {
+define void @or32_nouse_monotonic(ptr %p) #0 {
; X64-LABEL: or32_nouse_monotonic:
; X64: # %bb.0:
; X64-NEXT: #MEMBARRIER
@@ -371,7 +323,7 @@ define void @or32_nouse_monotonic(ptr %p) {
}
-define void @or32_nouse_acquire(ptr %p) {
+define void @or32_nouse_acquire(ptr %p) #0 {
; X64-LABEL: or32_nouse_acquire:
; X64: # %bb.0:
; X64-NEXT: #MEMBARRIER
@@ -398,7 +350,7 @@ define void @or32_nouse_acquire(ptr %p) {
ret void
}
-define void @or32_nouse_release(ptr %p) {
+define void @or32_nouse_release(ptr %p) #0 {
; X64-LABEL: or32_nouse_release:
; X64: # %bb.0:
; X64-NEXT: #MEMBARRIER
@@ -425,7 +377,7 @@ define void @or32_nouse_release(ptr %p) {
ret void
}
-define void @or32_nouse_acq_rel(ptr %p) {
+define void @or32_nouse_acq_rel(ptr %p) #0 {
; X64-LABEL: or32_nouse_acq_rel:
; X64: # %bb.0:
; X64-NEXT: #MEMBARRIER
@@ -452,7 +404,7 @@ define void @or32_nouse_acq_rel(ptr %p) {
ret void
}
-define void @or32_nouse_seq_cst(ptr %p) {
+define void @or32_nouse_seq_cst(ptr %p) #0 {
; X64-LABEL: or32_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
@@ -478,7 +430,7 @@ define void @or32_nouse_seq_cst(ptr %p) {
}
; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
-define void @or64_nouse_seq_cst(ptr %p) {
+define void @or64_nouse_seq_cst(ptr %p) #0 {
; X64-LABEL: or64_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
@@ -487,11 +439,7 @@ define void @or64_nouse_seq_cst(ptr %p) {
; X86-LABEL: or64_nouse_seq_cst:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 12
-; X86-NEXT: .cfi_offset %esi, -12
-; X86-NEXT: .cfi_offset %ebx, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %eax
; X86-NEXT: movl 4(%esi), %edx
@@ -504,43 +452,33 @@ define void @or64_nouse_seq_cst(ptr %p) {
; X86-NEXT: jne .LBB11_1
; X86-NEXT: # %bb.2: # %atomicrmw.end
; X86-NEXT: popl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebx
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
atomicrmw or ptr %p, i64 0 seq_cst
ret void
}
; TODO: Don't need to lower as sync_and_fetch call
-define void @or128_nouse_seq_cst(ptr %p) {
+define void @or128_nouse_seq_cst(ptr %p) #0 {
; X64-LABEL: or128_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
-; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: movl $5, %ecx
; X64-NEXT: callq __atomic_fetch_or_16 at PLT
; X64-NEXT: popq %rax
-; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
;
; X86-GENERIC-LABEL: or128_nouse_seq_cst:
; X86-GENERIC: # %bb.0:
; X86-GENERIC-NEXT: pushl %ebp
-; X86-GENERIC-NEXT: .cfi_def_cfa_offset 8
-; X86-GENERIC-NEXT: .cfi_offset %ebp, -8
; X86-GENERIC-NEXT: movl %esp, %ebp
-; X86-GENERIC-NEXT: .cfi_def_cfa_register %ebp
; X86-GENERIC-NEXT: pushl %ebx
; X86-GENERIC-NEXT: pushl %edi
; X86-GENERIC-NEXT: pushl %esi
; X86-GENERIC-NEXT: andl $-16, %esp
; X86-GENERIC-NEXT: subl $48, %esp
-; X86-GENERIC-NEXT: .cfi_offset %esi, -20
-; X86-GENERIC-NEXT: .cfi_offset %edi, -16
-; X86-GENERIC-NEXT: .cfi_offset %ebx, -12
; X86-GENERIC-NEXT: movl 8(%ebp), %esi
; X86-GENERIC-NEXT: movl 12(%esi), %ecx
; X86-GENERIC-NEXT: movl 8(%esi), %edi
@@ -579,24 +517,17 @@ define void @or128_nouse_seq_cst(ptr %p) {
; X86-GENERIC-NEXT: popl %edi
; X86-GENERIC-NEXT: popl %ebx
; X86-GENERIC-NEXT: popl %ebp
-; X86-GENERIC-NEXT: .cfi_def_cfa %esp, 4
; X86-GENERIC-NEXT: retl
;
; X86-ATOM-LABEL: or128_nouse_seq_cst:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: pushl %ebp
-; X86-ATOM-NEXT: .cfi_def_cfa_offset 8
-; X86-ATOM-NEXT: .cfi_offset %ebp, -8
; X86-ATOM-NEXT: movl %esp, %ebp
-; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp
; X86-ATOM-NEXT: pushl %ebx
; X86-ATOM-NEXT: pushl %edi
; X86-ATOM-NEXT: pushl %esi
; X86-ATOM-NEXT: andl $-16, %esp
; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
-; X86-ATOM-NEXT: .cfi_offset %esi, -20
-; X86-ATOM-NEXT: .cfi_offset %edi, -16
-; X86-ATOM-NEXT: .cfi_offset %ebx, -12
; X86-ATOM-NEXT: movl 8(%ebp), %esi
; X86-ATOM-NEXT: movl %esp, %ebx
; X86-ATOM-NEXT: movl 12(%esi), %ecx
@@ -635,14 +566,13 @@ define void @or128_nouse_seq_cst(ptr %p) {
; X86-ATOM-NEXT: popl %edi
; X86-ATOM-NEXT: popl %ebx
; X86-ATOM-NEXT: popl %ebp
-; X86-ATOM-NEXT: .cfi_def_cfa %esp, 4
; X86-ATOM-NEXT: retl
atomicrmw or ptr %p, i128 0 seq_cst
ret void
}
-define void @or16_nouse_seq_cst(ptr %p) {
+define void @or16_nouse_seq_cst(ptr %p) #0 {
; X64-LABEL: or16_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
@@ -667,7 +597,7 @@ define void @or16_nouse_seq_cst(ptr %p) {
ret void
}
-define void @or8_nouse_seq_cst(ptr %p) {
+define void @or8_nouse_seq_cst(ptr %p) #0 {
; X64-LABEL: or8_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
@@ -691,3 +621,5 @@ define void @or8_nouse_seq_cst(ptr %p) {
atomicrmw or ptr %p, i8 0 seq_cst
ret void
}
+
+attributes #0 = { nounwind }
>From 289471931480bf17ee8c2b2c62dd3c6218137b4e Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn at google.com>
Date: Fri, 7 Mar 2025 06:42:26 +0100
Subject: [PATCH 14/23] [lldb-dap] Updating naming and documentation to follow
style guide. (#130202)
Updating the naming and adding documentation to better follow the style
guide.
---
lldb/tools/lldb-dap/OutputRedirector.cpp | 6 +++---
lldb/tools/lldb-dap/OutputRedirector.h | 8 +++++++-
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/lldb/tools/lldb-dap/OutputRedirector.cpp b/lldb/tools/lldb-dap/OutputRedirector.cpp
index 9ff25ad4149dd..fe278faca87bf 100644
--- a/lldb/tools/lldb-dap/OutputRedirector.cpp
+++ b/lldb/tools/lldb-dap/OutputRedirector.cpp
@@ -38,7 +38,7 @@ Expected<int> OutputRedirector::GetWriteFileDescriptor() {
return m_fd;
}
-Error OutputRedirector::RedirectTo(std::FILE *fileOverride,
+Error OutputRedirector::RedirectTo(std::FILE *file_override,
std::function<void(StringRef)> callback) {
assert(m_fd == kInvalidDescriptor && "Output redirector already started.");
int new_fd[2];
@@ -56,8 +56,8 @@ Error OutputRedirector::RedirectTo(std::FILE *fileOverride,
int read_fd = new_fd[0];
m_fd = new_fd[1];
- if (fileOverride) {
- int override_fd = fileno(fileOverride);
+ if (file_override) {
+ int override_fd = fileno(file_override);
// Backup the FD to restore once redirection is complete.
m_original_fd = override_fd;
diff --git a/lldb/tools/lldb-dap/OutputRedirector.h b/lldb/tools/lldb-dap/OutputRedirector.h
index 45571c0d5f344..77b1b76ec4d89 100644
--- a/lldb/tools/lldb-dap/OutputRedirector.h
+++ b/lldb/tools/lldb-dap/OutputRedirector.h
@@ -24,10 +24,16 @@ class OutputRedirector {
/// Creates writable file descriptor that will invoke the given callback on
/// each write in a background thread.
///
+ /// \param[in] file_override
+ /// Updates the file descriptor to the redirection pipe, if not null.
+ ///
+ /// \param[in] callback
+ /// A callback invoked when any data is written to the file handle.
+ ///
/// \return
/// \a Error::success if the redirection was set up correctly, or an error
/// otherwise.
- llvm::Error RedirectTo(std::FILE *overrideFile,
+ llvm::Error RedirectTo(std::FILE *file_override,
std::function<void(llvm::StringRef)> callback);
llvm::Expected<int> GetWriteFileDescriptor();
>From c02019141cfe4e2bacdfa67262e84eee838f2e38 Mon Sep 17 00:00:00 2001
From: Kiran Kumar T P <50909805+kiranktp at users.noreply.github.com>
Date: Fri, 7 Mar 2025 11:15:22 +0530
Subject: [PATCH 15/23] [LLVM-FLANG] [OpenMP] [Taskloop] - Add test case with
cancel construct inside taskloop (#129862)
Added a test case with a cancel construct inside a taskloop. Currently,
taskloop lowering is not supported, so the following error is issued: "not yet
implemented: Taskloop construct".
Once the lowering patch is merged, a TODO error should be issued for the
cancel construct instead: "not yet implemented: OpenMPCancelConstruct".
---
flang/test/Lower/OpenMP/Todo/taskloop-cancel.f90 | 14 ++++++++++++++
1 file changed, 14 insertions(+)
create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-cancel.f90
diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-cancel.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-cancel.f90
new file mode 100644
index 0000000000000..5045c621e4d77
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/taskloop-cancel.f90
@@ -0,0 +1,14 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Taskloop construct
+subroutine omp_taskloop
+integer :: i
+!$omp parallel
+ !$omp taskloop
+ do i = 1, 10
+ !$omp cancel taskgroup
+ end do
+ !$omp end taskloop
+!$omp end parallel
+end subroutine omp_taskloop
>From bedb9077c38cf01a3f9303d68599ea95677be5b7 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 6 Mar 2025 21:55:47 -0800
Subject: [PATCH 16/23] [RISCV] Simplify how we find combinable cm.pop+ret.
(#130204)
Instead of scanning the whole basic block for a POP, find the RET and
then look backwards for the POP. Using getFirstTerminator, we can do
this with less code and it's probably faster.
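In assembly terms, the peephole this pass combines is roughly (a sketch with
illustrative operands; the actual register list and stack adjustment vary):

  cm.pop    {ra, s0-s1}, 32
  ret
  -->
  cm.popret {ra, s0-s1}, 32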
---
.../Target/RISCV/RISCVPushPopOptimizer.cpp | 29 +++++++++----------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp
index 0ead9a4009fab..eae7e8697f0ad 100644
--- a/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPushPopOptimizer.cpp
@@ -69,16 +69,6 @@ static unsigned getPopRetOpcode(unsigned PopOpcode, bool IsReturnZero) {
}
}
-// Check if POP instruction was inserted into the MBB and return iterator to it.
-static MachineBasicBlock::iterator containsPop(MachineBasicBlock &MBB) {
- for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBB.end();
- MBBI = next_nodbg(MBBI, MBB.end()))
- if (MBBI->getFlag(MachineInstr::FrameDestroy) && isPop(MBBI->getOpcode()))
- return MBBI;
-
- return MBB.end();
-}
-
bool RISCVPushPopOpt::usePopRet(MachineBasicBlock::iterator &MBBI,
MachineBasicBlock::iterator &NextI,
bool IsReturnZero) {
@@ -150,19 +140,28 @@ bool RISCVPushPopOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = Subtarget->getInstrInfo();
TRI = Subtarget->getRegisterInfo();
+
// Resize the modified and used register unit trackers. We do this once
// per function and then clear the register units each time we determine
// correct return value for the POP.
ModifiedRegUnits.init(*TRI);
UsedRegUnits.init(*TRI);
+
bool Modified = false;
for (auto &MBB : Fn) {
- MachineBasicBlock::iterator MBBI = containsPop(MBB);
- MachineBasicBlock::iterator NextI = next_nodbg(MBBI, MBB.end());
- if (MBBI != MBB.end() && NextI != MBB.end() &&
- NextI->getOpcode() == RISCV::PseudoRET)
- Modified |= usePopRet(MBBI, NextI, adjustRetVal(MBBI));
+ // RET should be the only terminator.
+ auto RetMBBI = MBB.getFirstTerminator();
+ if (RetMBBI == MBB.end() || RetMBBI->getOpcode() != RISCV::PseudoRET ||
+ RetMBBI == MBB.begin())
+ continue;
+
+ // The previous instruction should be a POP.
+ auto PopMBBI = prev_nodbg(RetMBBI, MBB.begin());
+ if (isPop(PopMBBI->getOpcode()) &&
+ PopMBBI->getFlag(MachineInstr::FrameDestroy))
+ Modified |= usePopRet(PopMBBI, RetMBBI, adjustRetVal(PopMBBI));
}
+
return Modified;
}
>From 733ad3fdebf782be5afffdb8310a0ce15675086c Mon Sep 17 00:00:00 2001
From: Kito Cheng <kito.cheng at sifive.com>
Date: Fri, 7 Mar 2025 14:09:26 +0800
Subject: [PATCH 17/23] [LTO] Override TargetABI from module flags if present
when creating TargetMachine (#126497)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RISC-V's data layout is determined by the ABI, not just the target
triple. However, the TargetMachine is created using the data layout from
the target triple, which is not always correct. This patch uses the
target ABI from the module and passes it to the TargetMachine, ensuring
that the data layout is set correctly according to the ABI.
The same problem will happen with other targets like MIPS, but
unfortunately, MIPS didn't emit the target-abi into the module flags, so
this patch only fixes the issue for RISC-V.
NOTE: MIPS with -mabi=n32 can trigger the same issue.
Another possible solution is add new parameter to the TargetMachine
constructor, but that would require changes in all the targets.
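For reference, a minimal sketch of the module flag this patch consumes (the
same form appears in the new RISC-V test below):

  !llvm.module.flags = !{!0}
  !0 = !{i32 1, !"target-abi", !"ilp32e"}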
---
llvm/include/llvm/IR/Module.h | 3 +++
llvm/lib/IR/Module.cpp | 8 ++++++++
llvm/lib/LTO/LTOBackend.cpp | 7 ++++++-
llvm/test/LTO/RISCV/lit.local.cfg | 2 ++
llvm/test/LTO/RISCV/riscv-ilp32e.ll | 21 +++++++++++++++++++++
5 files changed, 40 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/LTO/RISCV/lit.local.cfg
create mode 100644 llvm/test/LTO/RISCV/riscv-ilp32e.ll
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 2fd2d6887022c..91ccd76c41e07 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -1067,6 +1067,9 @@ class LLVM_ABI Module {
/// Set the target variant version build SDK version metadata.
void setDarwinTargetVariantSDKVersion(VersionTuple Version);
+
+ /// Returns target-abi from MDString, null if target-abi is absent.
+ StringRef getTargetABIFromMD();
};
/// Given "llvm.used" or "llvm.compiler.used" as a global name, collect the
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index c7b9f8744d8d3..c7daaafe13e3f 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -915,3 +915,11 @@ VersionTuple Module::getDarwinTargetVariantSDKVersion() const {
void Module::setDarwinTargetVariantSDKVersion(VersionTuple Version) {
addSDKVersionMD(Version, *this, "darwin.target_variant.SDK Version");
}
+
+StringRef Module::getTargetABIFromMD() {
+ StringRef TargetABI;
+ if (auto *TargetABIMD =
+ dyn_cast_or_null<MDString>(getModuleFlag("target-abi")))
+ TargetABI = TargetABIMD->getString();
+ return TargetABI;
+}
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index b38252a3272e8..139c39abf8e6b 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -221,8 +221,13 @@ createTargetMachine(const Config &Conf, const Target *TheTarget, Module &M) {
else
CodeModel = M.getCodeModel();
+ TargetOptions TargetOpts = Conf.Options;
+ if (TargetOpts.MCOptions.ABIName.empty()) {
+ TargetOpts.MCOptions.ABIName = M.getTargetABIFromMD();
+ }
+
std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine(
- TheTriple.str(), Conf.CPU, Features.getString(), Conf.Options, RelocModel,
+ TheTriple.str(), Conf.CPU, Features.getString(), TargetOpts, RelocModel,
CodeModel, Conf.CGOptLevel));
assert(TM && "Failed to create target machine");
diff --git a/llvm/test/LTO/RISCV/lit.local.cfg b/llvm/test/LTO/RISCV/lit.local.cfg
new file mode 100644
index 0000000000000..a3d2298159063
--- /dev/null
+++ b/llvm/test/LTO/RISCV/lit.local.cfg
@@ -0,0 +1,2 @@
+if "RISCV" not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/LTO/RISCV/riscv-ilp32e.ll b/llvm/test/LTO/RISCV/riscv-ilp32e.ll
new file mode 100644
index 0000000000000..bbca58e65556d
--- /dev/null
+++ b/llvm/test/LTO/RISCV/riscv-ilp32e.ll
@@ -0,0 +1,21 @@
+; Check that we don't crash on DataLayout incompatibility issue.
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-lto2 run -r %t.o,_start %t.o -o %t.elf
+; RUN: llvm-readobj -h %t.elf.0 | FileCheck %s --check-prefixes=CHECK
+; CHECK: Machine: EM_RISCV (0xF3)
+; CHECK: EF_RISCV_RVE (0x8)
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32-S32"
+target triple = "riscv32-unknown-unknown-elf"
+
+define dso_local i32 @_start() #0 {
+entry:
+ ret i32 0
+}
+
+attributes #0 = { "target-cpu"="generic-rv32" "target-features"="+32bit,+e" }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"target-abi", !"ilp32e"}
>From d933882ed369a68d118ca661488bb2c89028a2de Mon Sep 17 00:00:00 2001
From: Yanzuo Liu <zwuis at outlook.com>
Date: Fri, 7 Mar 2025 14:16:51 +0800
Subject: [PATCH 18/23] [Clang] Add test for CWG2285 "Issues with structured
bindings" (#126421)
The resolution of [CWG2285](https://wg21.link/cwg2285) adds the point of
declaration of a structured binding; it was implemented in
https://github.com/llvm/llvm-project/commit/bdb84f374cde7736ca68d5db2c2ecf5468346710.
Drive-by changes: modify comment and diagnostic messages mentioned in
CWG2285.
---
.../clang/Basic/DiagnosticSemaKinds.td | 2 +-
clang/lib/Sema/SemaDeclCXX.cpp | 7 +++++--
clang/test/CXX/drs/cwg22xx.cpp | 19 +++++++++++++++----
clang/test/Parser/cxx1z-decomposition.cpp | 4 ++--
clang/www/cxx_dr_status.html | 2 +-
5 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index cf0e9846d4259..1b46920e09619 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -551,7 +551,7 @@ def err_decomp_decl_constraint : Error<
def err_decomp_decl_parens : Error<
"decomposition declaration cannot be declared with parentheses">;
def err_decomp_decl_template : Error<
- "decomposition declaration template not supported">;
+ "decomposition declaration cannot be a template">;
def err_decomp_decl_not_alone : Error<
"decomposition declaration must be the only declaration in its group">;
def err_decomp_decl_requires_init : Error<
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index a3a028b9485d6..fd5f0443fa894 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -733,8 +733,11 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
}
if (!TemplateParamLists.empty()) {
- // FIXME: There's no rule against this, but there are also no rules that
- // would actually make it usable, so we reject it for now.
+ // C++17 [temp]/1:
+ // A template defines a family of classes, functions, or variables, or an
+ // alias for a family of types.
+ //
+ // Structured bindings are not included.
Diag(TemplateParamLists.front()->getTemplateLoc(),
diag::err_decomp_decl_template);
return nullptr;
diff --git a/clang/test/CXX/drs/cwg22xx.cpp b/clang/test/CXX/drs/cwg22xx.cpp
index d93070ef3804d..8c8ad9f7f74ee 100644
--- a/clang/test/CXX/drs/cwg22xx.cpp
+++ b/clang/test/CXX/drs/cwg22xx.cpp
@@ -1,10 +1,10 @@
// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
namespace cwg2211 { // cwg2211: 8
@@ -196,6 +196,17 @@ void g() {
#endif
} // namespace cwg2277
+namespace cwg2285 { // cwg2285: 4
+// Note: Clang 4 implements this DR, but it sets a wrong value of `__cplusplus`
+#if __cplusplus >= 201703L
+ void test() {
+ using T = int[1];
+ auto [a] = T{a};
+ // since-cxx17-error at -1 {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}
+ }
+#endif
+} // namespace cwg2285
+
namespace cwg2292 { // cwg2292: 9
#if __cplusplus >= 201103L
template<typename T> using id = T;
diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp
index acf3f99069185..f4a4dc5375bdc 100644
--- a/clang/test/Parser/cxx1z-decomposition.cpp
+++ b/clang/test/Parser/cxx1z-decomposition.cpp
@@ -136,8 +136,8 @@ namespace MultiDeclarator {
namespace Template {
int n[3];
- // FIXME: There's no actual rule against this...
- template<typename T> auto [a, b, c] = n; // expected-error {{decomposition declaration template not supported}}
+ // Structured binding template is not allowed.
+ template<typename T> auto [a, b, c] = n; // expected-error {{decomposition declaration cannot be a template}}
}
namespace Init {
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 919996bd700b4..b7888d2365acc 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -13537,7 +13537,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
<td><a href="https://cplusplus.github.io/CWG/issues/2285.html">2285</a></td>
<td>CD5</td>
<td>Issues with structured bindings</td>
- <td class="unknown" align="center">Unknown</td>
+ <td class="full" align="center">Clang 4</td>
</tr>
<tr id="2286">
<td><a href="https://cplusplus.github.io/CWG/issues/2286.html">2286</a></td>
>From 5997cdb4bcace6d24e7c2fa04dd4f726c48cc561 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 7 Mar 2025 12:55:03 +0700
Subject: [PATCH 19/23] AMDGPU: Switch an undef in a test for poison
---
llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
index 9558d9f0bc4c9..0aea41a190f1e 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
@@ -23,15 +23,15 @@ entry:
br label %loop
loop:
- %promotealloca = phi <16 x i32> [ undef, %entry ], [ %0, %loop ]
+ %promotealloca = phi <16 x i32> [ poison, %entry ], [ %insert, %loop ]
%inc = phi i32 [ 0, %entry ], [ %inc.i, %loop ]
- %0 = insertelement <16 x i32> %promotealloca, i32 %inc, i32 %inc
+ %insert = insertelement <16 x i32> %promotealloca, i32 %inc, i32 %inc
%inc.i = add i32 %inc, %B
%cnd = icmp uge i32 %inc.i, 16
br i1 %cnd, label %done, label %loop
done:
- %1 = extractelement <16 x i32> %0, i32 0
- store i32 %1, ptr addrspace(1) %out, align 4
+ %extract.0 = extractelement <16 x i32> %insert, i32 0
+ store i32 %extract.0, ptr addrspace(1) %out, align 4
ret void
}
>From e4cbbd323c98c9d67c393f00d5a255c60a06025e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 7 Mar 2025 12:58:21 +0700
Subject: [PATCH 20/23] AMDGPU: Switch a test to generated checks
---
llvm/test/CodeGen/AMDGPU/copy-to-reg.ll | 63 +++++++++++++++++++++++--
1 file changed, 60 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
index e517f1c8e6628..931a14473c340 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -1,12 +1,69 @@
-; RUN: llc -mtriple=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -verify-machineinstrs < %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -disable-promote-alloca-to-vector -disable-promote-alloca-to-lds < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -disable-promote-alloca-to-vector -disable-promote-alloca-to-lds < %s | FileCheck -check-prefix=GFX8 %s
; Test that CopyToReg instructions don't have non-register operands prior
; to being emitted.
; Make sure this doesn't crash
-; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
+
define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GFX7-LABEL: copy_to_reg_frameindex:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s14, -1
+; GFX7-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7-NEXT: s_add_u32 s12, s12, s11
+; GFX7-NEXT: s_addc_u32 s13, s13, 0
+; GFX7-NEXT: s_mov_b32 s0, 0
+; GFX7-NEXT: s_mov_b32 s1, 0
+; GFX7-NEXT: .LBB0_1: ; %loop
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_i32 s1, s1, 1
+; GFX7-NEXT: s_add_i32 s0, s0, 4
+; GFX7-NEXT: s_cmp_lt_u32 s1, 16
+; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; GFX7-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX7-NEXT: ; %bb.2: ; %done
+; GFX7-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: copy_to_reg_frameindex:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s90, -1
+; GFX8-NEXT: s_mov_b32 s91, 0xe80000
+; GFX8-NEXT: s_add_u32 s88, s88, s11
+; GFX8-NEXT: s_addc_u32 s89, s89, 0
+; GFX8-NEXT: s_mov_b32 s0, 0
+; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: .LBB0_1: ; %loop
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_add_i32 s1, s1, 1
+; GFX8-NEXT: s_add_i32 s0, s0, 4
+; GFX8-NEXT: s_cmp_lt_u32 s1, 16
+; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
+; GFX8-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX8-NEXT: ; %bb.2: ; %done
+; GFX8-NEXT: buffer_load_dword v2, off, s[88:91], 0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
entry:
%alloca = alloca [16 x i32], addrspace(5)
br label %loop
>From d12b388ae2159b22641fc288ed5c766bd13deb41 Mon Sep 17 00:00:00 2001
From: Alaa Ali <alaaali at ah-alaaali-l.dhcp.mathworks.com>
Date: Thu, 6 Mar 2025 03:36:47 -0500
Subject: [PATCH 21/23] tosa.cast: fix answer mismatch to cast f64/f32 max
value to i64/i32
---
.../Conversion/TosaToLinalg/TosaToLinalg.cpp | 17 ++++++++---------
.../TosaToLinalg/tosa-to-linalg.mlir | 19 ++++++++++---------
2 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index a99cf293b9eac..8854b4690bdf5 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -618,12 +618,8 @@ static Value createLinalgBodyCalculationForElementwiseOp(
loc, rewriter.getIntegerAttr(
getElementTypeOrSelf(dstTy),
APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth())));
- auto intMax = rewriter.create<arith::ConstantOp>(
- loc, rewriter.getIntegerAttr(
- getElementTypeOrSelf(dstTy),
- APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())));
auto maxClamped =
- rewriter.create<arith::SelectOp>(loc, overflow, intMax, conv);
+ rewriter.create<arith::SelectOp>(loc, overflow, intMin, conv);
return rewriter.create<arith::SelectOp>(loc, underflow, intMin,
maxClamped);
}
@@ -647,8 +643,11 @@ static Value createLinalgBodyCalculationForElementwiseOp(
APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())
.getSExtValue()));
+ auto overflow = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
+ Value maxClampedFP = rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
+
Value clamped =
- clampFloatHelper(loc, rounded, intMinFP, intMaxFP, rewriter);
+ clampFloatHelper(loc, maxClampedFP, intMinFP, intMaxFP, rewriter);
return rewriter.create<arith::FPToSIOp>(loc, dstTy, clamped);
}
@@ -664,17 +663,17 @@ static Value createLinalgBodyCalculationForElementwiseOp(
.getSExtValue()) +
1.0f));
- auto intMax = rewriter.create<arith::ConstantOp>(
+ auto intMin = rewriter.create<arith::ConstantOp>(
loc, rewriter.getIntegerAttr(
getElementTypeOrSelf(dstTy),
- APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())));
+ APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth())));
auto minClampedFP =
rewriter.create<arith::MaximumFOp>(loc, rounded, intMinFP);
auto minClamped =
rewriter.create<arith::FPToSIOp>(loc, dstTy, minClampedFP);
auto overflow = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::UGE, rounded, intMaxPlusOneFP);
- return rewriter.create<arith::SelectOp>(loc, overflow, intMax,
+ return rewriter.create<arith::SelectOp>(loc, overflow, intMin,
minClamped);
}
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 9ba9965315fd3..bd6381bedf65c 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -541,13 +541,13 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () {
// CHECK: linalg.generic
// CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f32
- // CHECK: [[CSTMIN:%.+]] = arith.constant -2.14748365E+9 : f32
+ // CHECK: [[CSTMINF:%.+]] = arith.constant -2.14748365E+9 : f32
// CHECK: [[CSTMAXP1:%.+]] = arith.constant 2.14748365E+9 : f32
- // CHECK: [[CSTMAX:%.+]] = arith.constant 2147483647 : i32
- // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMIN]] : f32
+ // CHECK: [[CSTMIN:%.+]] = arith.constant -2147483648 : i32
+ // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMINF]] : f32
// CHECK: [[CONV:%.+]] = arith.fptosi [[MAX]] : f32 to i32
// CHECK: [[CMP:%.+]] = arith.cmpf uge, [[ROUND]], [[CSTMAXP1]] : f32
- // CHECK: arith.select [[CMP]], [[CSTMAX]], [[CONV]] : i32
+ // CHECK: arith.select [[CMP]], [[CSTMIN]], [[CONV]] : i32
%20 = tosa.cast %0 : (tensor<1xf32>) -> tensor<1xi32>
// CHECK: linalg.generic
@@ -591,7 +591,9 @@ func.func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
// CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f16
// CHECK: [[CSTMIN:%.+]] = arith.constant -1.280000e+02 : f16
// CHECK: [[CSTMAX:%.+]] = arith.constant 1.270000e+02 : f16
- // CHECK: [[MIN:%.+]] = arith.minimumf [[ROUND]], [[CSTMAX]] : f16
+ // CHECK: [[OVERFLOW:%.+]] = arith.cmpf ugt, [[ROUND]], [[CSTMAX]] : f16
+ // CHECK: [[CLAMPMAX:%.+]] = arith.select [[OVERFLOW]], [[CSTMIN]], [[ROUND]] : f16
+ // CHECK: [[MIN:%.+]] = arith.minimumf [[CLAMPMAX]], [[CSTMAX]] : f16
// CHECK: [[CLAMP:%.+]] = arith.maximumf [[MIN]], [[CSTMIN]] : f16
// CHECK: arith.fptosi [[CLAMP]] : f16 to i8
%1 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xi8>
@@ -604,8 +606,7 @@ func.func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
// CHECK: [[OVERFLOW:%.+]] = arith.cmpf ueq, [[ROUND]], [[POSINF]] : f16
// CHECK: [[UNDERFLOW:%.+]] = arith.cmpf ueq, [[ROUND]], [[NEGINF]] : f16
// CHECK: [[MININT:%.+]] = arith.constant -2147483648 : i32
- // CHECK: [[MAXINT:%.+]] = arith.constant 2147483647 : i32
- // CHECK: [[CLAMPPOSINF:%.+]] = arith.select [[OVERFLOW]], [[MAXINT]], [[CONV]] : i32
+ // CHECK: [[CLAMPPOSINF:%.+]] = arith.select [[OVERFLOW]], [[MININT]], [[CONV]] : i32
// CHECK: arith.select [[UNDERFLOW]], [[MININT]], [[CLAMPPOSINF]] : i32
%2 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xi32>
return
@@ -1980,11 +1981,11 @@ func.func @test_dynamic_fft2d(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>
// CHECK: %[[ROUND_EVEN:.*]] = math.roundeven %[[IN]] : f32
// CHECK: %[[FP_INT_MIN:.*]] = arith.constant -9.22337203E+18 : f32
// CHECK: %[[FP_INT_MAX_PLUS_ONE:.*]] = arith.constant 9.22337203E+18 : f32
-// CHECK: %[[INT_MAX:.*]] = arith.constant 9223372036854775807 : i64
+// CHECK: %[[INT_MIN:.*]] = arith.constant -9223372036854775808 : i64
// CHECK: %[[MAX:.*]] = arith.maximumf %[[ROUND_EVEN]], %[[FP_INT_MIN]] : f32
// CHECK: %[[FPTOSI:.*]] = arith.fptosi %[[MAX]] : f32 to i64
// CHECK: %[[CMPF:.*]] = arith.cmpf uge, %[[ROUND_EVEN]], %[[FP_INT_MAX_PLUS_ONE]] : f32
-// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[INT_MAX]], %[[FPTOSI]] : i64
+// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[INT_MIN]], %[[FPTOSI]] : i64
// CHECK: linalg.yield %[[SELECT]] : i64
// CHECK: } -> tensor<1xi64>
// CHECK: return %[[RESULT]] : tensor<1xi64>
>From 5aa0854deff01c9ad7dde3781ed568acb97f984e Mon Sep 17 00:00:00 2001
From: Alaa Ali <alaaali at ah-alaaali-l.dhcp.mathworks.com>
Date: Thu, 6 Mar 2025 22:46:57 -0500
Subject: [PATCH 22/23] clear the code formatting errors
---
mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 8854b4690bdf5..17ebc7dc32372 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -643,8 +643,10 @@ static Value createLinalgBodyCalculationForElementwiseOp(
APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth())
.getSExtValue()));
- auto overflow = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
- Value maxClampedFP = rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
+ auto overflow = rewriter.create<arith::CmpFOp>(
+ loc, arith::CmpFPredicate::UGT, rounded, intMaxFP);
+ Value maxClampedFP =
+ rewriter.create<arith::SelectOp>(loc, overflow, intMinFP, rounded);
Value clamped =
clampFloatHelper(loc, maxClampedFP, intMinFP, intMaxFP, rewriter);
>From b810ec3f047ba0074708d0d324040ef1363e7213 Mon Sep 17 00:00:00 2001
From: Alaa Ali <alaaali at ah-alaaali-l.dhcp.mathworks.com>
Date: Fri, 7 Mar 2025 01:23:35 -0500
Subject: [PATCH 23/23] Add test case to cast f64 to i64/i32
---
.../TosaToLinalg/tosa-to-linalg.mlir | 28 +++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index bd6381bedf65c..180db212b5448 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -1996,6 +1996,34 @@ func.func @test_cast_fp32_i64(%arg0: tensor<1xf32>) -> (tensor<1xi64>) {
// -----
+// CHECK-LABEL: @test_simple_f64
+func.func @test_simple_f64(%arg0: tensor<1xf64>) -> () {
+ // CHECK: linalg.generic
+ // CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f64
+ // CHECK: [[CSTMINF:%.+]] = arith.constant -9.2233720368547758E+18 : f64
+ // CHECK: [[CSTMAXP1:%.+]] = arith.constant 9.2233720368547758E+18 : f64
+ // CHECK: [[CSTMIN:%.+]] = arith.constant -9223372036854775808 : i64
+ // CHECK: [[MAX:%.+]] = arith.maximumf [[ROUND]], [[CSTMINF]] : f64
+ // CHECK: [[CONV:%.+]] = arith.fptosi [[MAX]] : f64 to i64
+ // CHECK: [[CMP:%.+]] = arith.cmpf uge, [[ROUND]], [[CSTMAXP1]] : f64
+ // CHECK: arith.select [[CMP]], [[CSTMIN]], [[CONV]] : i64
+ %0 = tosa.cast %arg0 : (tensor<1xf64>) -> tensor<1xi64>
+
+ // CHECK: linalg.generic
+ // CHECK: [[ROUND:%.+]] = math.roundeven {{%.+}} : f64
+ // CHECK: [[CSTMIN:%.+]] = arith.constant 0xC1E0000000000000 : f64
+ // CHECK: [[CSTMAX:%.+]] = arith.constant 0x41DFFFFFFFC00000 : f64
+ // CHECK: [[OVERFLOW:%.+]] = arith.cmpf ugt, [[ROUND]], [[CSTMAX]] : f64
+ // CHECK: [[CLAMPMAX:%.+]] = arith.select [[OVERFLOW]], [[CSTMIN]], [[ROUND]] : f64
+ // CHECK: [[MIN:%.+]] = arith.minimumf [[CLAMPMAX]], [[CSTMAX]] : f64
+ // CHECK: [[CLAMP:%.+]] = arith.maximumf [[MIN]], [[CSTMIN]] : f64
+ // CHECK: arith.fptosi [[CLAMP]] : f64 to i32
+ %1 = tosa.cast %arg0 : (tensor<1xf64>) -> tensor<1xi32>
+ return
+}
+
+// -----
+
// CHECK-LABEL: @reduce_min_nan_propagate
func.func @reduce_min_nan_propagate(%arg0: tensor<5x4xf32>, %arg1: tensor<5x4xf32>) -> () {
// CHECK: linalg.reduce