[llvm] [mlir] [openmp] [Flang][OpenMP] Add support for schedule clause for GPU (PR #81618)

Fri Mar 1 11:01:44 PST 2024

https://github.com/DominikAdamski updated https://github.com/llvm/llvm-project/pull/81618

>From df9980af2f70634b4de13ec505b4af607bf24a90 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Fri, 9 Feb 2024 08:16:06 -0600
Subject: [PATCH 1/2] [Flang][OpenMP] Add support for schedule clause for GPU

Scope of changes:
1) Fix the handling of loop chunking in the OpenMP device runtime.
2) Pass the chunk size from MLIR to the OpenMP device runtime.
3) Add an explicit check that only static schedules are supported
   for target loops.
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  6 ++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 49 ++++++++++++++-----
 mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 18 +++++++
 .../libomptarget/DeviceRTL/src/Workshare.cpp  | 17 ++++---
 ...rget-parallel-do-schedule-static-chunk.f90 | 33 +++++++++++++
 5 files changed, 104 insertions(+), 19 deletions(-)
 create mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c4..8d74b12dbc4ba1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -925,11 +925,15 @@ class OpenMPIRBuilder {
   ///                 preheader of the loop.
   /// \param LoopType Information about type of loop worksharing.
   ///                 It corresponds to type of loop workshare OpenMP pragma.
+  /// \param ScheduleType Information about scheduling type.
+  /// \param ChunkSize    Value of chunk size for static schedule.
   ///
   /// \returns Point where to insert code after the workshare construct.
   InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
-                                         omp::WorksharingLoopType LoopType);
+                                         omp::WorksharingLoopType LoopType,
+                                         omp::OMPScheduleType ScheduleType,
+                                         Value *ChunkSize);
 
   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
   ///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..f9cbc39a24016d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
 static void createTargetLoopWorkshareCall(
     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
-    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
+    Value *ThreadChunkSize) {
   Type *TripCountTy = TripCount->getType();
   Module &M = OMPBuilder->M;
   IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(
 
   RealArgs.push_back(
       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
-  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
-  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+  switch (LoopType) {
+  case WorksharingLoopType::DistributeForStaticLoop:
+    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::DistributeStaticLoop:
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
+  case WorksharingLoopType::ForStaticLoop:
+    ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+                          ThreadChunkSize, TripCountTy))
+                    : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    break;
   }
 
   Builder.CreateCall(RTLFn, RealArgs);
@@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
                             CanonicalLoopInfo *CLI, Value *Ident,
                             Function &OutlinedFn, Type *ParallelTaskPtr,
                             const SmallVector<Instruction *, 4> &ToBeDeleted,
-                            WorksharingLoopType LoopType) {
+                            WorksharingLoopType LoopType, Value *ChunkSize) {
   IRBuilder<> &Builder = OMPIRBuilder->Builder;
   BasicBlock *Preheader = CLI->getPreheader();
   Value *TripCount = CLI->getTripCount();
@@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
 
   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                 LoopBodyArg, ParallelTaskPtr, TripCount,
-                                OutlinedFn);
+                                OutlinedFn, ChunkSize);
 
   for (auto &ToBeDeletedItem : ToBeDeleted)
     ToBeDeletedItem->eraseFromParent();
   CLI->invalidate();
 }
 
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
-                                          InsertPointTy AllocaIP,
-                                          WorksharingLoopType LoopType) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
+    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+    WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
+    Value *ChunkSize) {
+
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   // Instructions which need to be deleted at the end of code generation
   SmallVector<Instruction *, 4> ToBeDeleted;
 
+  // TODO: Add support for dynamic scheduling
+  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
+  case OMPScheduleType::BaseStatic:
+  case OMPScheduleType::BaseStaticChunked:
+    break;
+  default:
+    report_fatal_error(
+        "Unknown/unimplemented schedule kind for target workshare loop", false);
+  }
+
   OI.OuterAllocaBB = AllocaIP.getBlock();
 
   // Mark the body loop as region which needs to be extracted
@@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   OI.PostOutlineCB = [=, ToBeDeletedVec =
                              std::move(ToBeDeleted)](Function &OutlinedFn) {
     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
-                                ToBeDeletedVec, LoopType);
+                                ToBeDeletedVec, LoopType, ChunkSize);
   };
   addOutlineInfo(std::move(OI));
   return CLI->getAfterIP();
@@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
     bool HasSimdModifier, bool HasMonotonicModifier,
     bool HasNonmonotonicModifier, bool HasOrderedClause,
     WorksharingLoopType LoopType) {
-  if (Config.isTargetDevice())
-    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
       HasNonmonotonicModifier, HasOrderedClause);
+  if (Config.isTargetDevice())
+    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
+                                    EffectiveScheduleType, ChunkSize);
 
   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                    OMPScheduleType::ModifierOrdered;
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 220eb85b3483ec..a5f5d07262c8d9 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
       }
     llvm.return
   }
+
+  llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
+      %loop_ub = llvm.mlir.constant(9 : i32) : i32
+      %loop_lb = llvm.mlir.constant(0 : i32) : i32
+      %loop_step = llvm.mlir.constant(1 : i32) : i32
+      %chunk = llvm.mlir.constant(2 : i32) : i32
+      omp.wsloop schedule(static = %chunk : i32) for  (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+        %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+        llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+        omp.yield
+      }
+    llvm.return
+  }
 }
 
 // CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
@@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
 
 // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
+
+// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
+// CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)
+
+// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a185..836d4f7f4934b4 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -685,17 +685,22 @@ template <typename Ty> class StaticLoopChunker {
     Ty KernelIteration = NumBlocks * BlockChunk;
 
     // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId;
+    Ty IV = BId * BlockChunk + TId * ThreadChunk;
     ASSERT(IV >= 0, "Bad index");
 
+    // Make sure the starting index is within the kernel iteration boundaries.
+    if (IV >= KernelIteration)
+      return;
+
     // Cover the entire iteration space, assumptions in the caller might allow
     // to simplify this loop to a conditional.
     do {
 
       Ty BlockChunkLeft =
           BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty ThreadChunkLeft =
+      Ty EffectiveThreadChunk =
           ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+      Ty ThreadChunkLeft = EffectiveThreadChunk;
 
       while (ThreadChunkLeft--) {
 
@@ -711,8 +716,8 @@ template <typename Ty> class StaticLoopChunker {
 
         ++IV;
       }
-
-      IV += KernelIteration;
+      // Start the new kernel iteration before the first thread chunk
+      IV += (KernelIteration - EffectiveThreadChunk);
 
     } while (IV < NumIters);
   }
@@ -731,8 +736,8 @@ template <typename Ty> class StaticLoopChunker {
     // from the `omp` getter and not the mapping directly.
     Ty TId = omp_get_thread_num();
 
-    // There are no blocks involved here.
-    Ty BlockChunk = 0;
+    // There is only one block for the whole iteration space.
+    Ty BlockChunk = NumIters;
     Ty NumBlocks = 1;
     Ty BId = 0;
 
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
new file mode 100644
index 00000000000000..f0b444f6ddc66f
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
@@ -0,0 +1,33 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+   use omp_lib
+   integer :: x(100)
+   integer :: errors = 0
+   integer :: i
+
+   !$omp target parallel do schedule(static, 5) map(from: x)
+   do i = 1, 100
+       x(i) = i
+   end do
+   !$omp end target parallel do
+   do i = 1, 100
+       if ( x(i) .ne. i ) then
+           errors = errors + 1
+       end if
+   end do
+
+   print *,"number of errors: ", errors
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  number of errors: 0

>From b7b16308d9c56766ef1735c1d7bec2575c3102bf Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Tue, 27 Feb 2024 06:50:08 -0600
Subject: [PATCH 2/2] Adjust chunking policy

---
 .../libomptarget/DeviceRTL/src/Workshare.cpp  | 57 +++++++++----------
 ...rget-parallel-do-schedule-static-chunk.f90 | 33 -----------
 .../target_workshare_loop_static_chunk.f90    | 46 +++++++++++++++
 3 files changed, 72 insertions(+), 64 deletions(-)
 delete mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
 create mode 100644 openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90

diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 836d4f7f4934b4..ee9ee9a14056d8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -683,43 +683,38 @@ template <typename Ty> class StaticLoopChunker {
                                         Ty NumIters,
                                         bool OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * BlockChunk;
+    Ty BlockIV = BId * BlockChunk;
 
-    // Start index in the chunked space.
-    Ty IV = BId * BlockChunk + TId * ThreadChunk;
-    ASSERT(IV >= 0, "Bad index");
-
-    // Make sure the starting index is within the kernel iteration boundaries.
-    if (IV >= KernelIteration)
-      return;
-
+    ASSERT((BlockIV + TId * ThreadChunk) >= 0, "Bad index");
     // Cover the entire iteration space, assumptions in the caller might allow
     // to simplify this loop to a conditional.
     do {
-
-      Ty BlockChunkLeft =
-          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
-      Ty EffectiveThreadChunk =
-          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
-      Ty ThreadChunkLeft = EffectiveThreadChunk;
-
-      while (ThreadChunkLeft--) {
-
-        // Given the blocking it's hard to keep track of what to execute.
-        if (IV >= NumIters)
-          return;
-
-        // Execute the loop body.
-        LoopBody(IV, Arg);
-
-        if (OneIterationPerThread)
-          return;
-
-        ++IV;
+      Ty ThreadIV = TId * ThreadChunk;
+      // Cover the block space
+      while (ThreadIV < BlockChunk) {
+        Ty ThreadCnt = 0;
+        // Cover the thread space
+        while ((ThreadCnt < ThreadChunk) &&
+               ((ThreadIV + ThreadCnt) < BlockChunk)) {
+          // Index in the chunked space.
+          Ty IV = BlockIV + ThreadIV + ThreadCnt;
+
+          // Given the blocking it's hard to keep track of what to execute.
+          if (IV >= NumIters)
+            return;
+
+          // Execute the loop body.
+          LoopBody(IV, Arg);
+
+          if (OneIterationPerThread)
+            return;
+          ++ThreadCnt;
+        };
+        ThreadIV += (NumThreads * ThreadChunk);
       }
-      // Start the new kernel iteration before the first thread chunk
-      IV += (KernelIteration - EffectiveThreadChunk);
 
-    } while (IV < NumIters);
+      BlockIV += KernelIteration;
+    } while (BlockIV < NumIters);
   }
 
 public:
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
deleted file mode 100644
index f0b444f6ddc66f..00000000000000
--- a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
+++ /dev/null
@@ -1,33 +0,0 @@
-! Basic offloading test with a target region
-! REQUIRES: flang
-! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-! UNSUPPORTED: aarch64-unknown-linux-gnu
-! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-! UNSUPPORTED: x86_64-pc-linux-gnu
-! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-! RUN: %libomptarget-compile-fortran-generic
-! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
-program main
-   use omp_lib
-   integer :: x(100)
-   integer :: errors = 0
-   integer :: i
-
-   !$omp target parallel do schedule(static, 5) map(from: x)
-   do i = 1, 100
-       x(i) = i
-   end do
-   !$omp end target parallel do
-   do i = 1, 100
-       if ( x(i) .ne. i ) then
-           errors = errors + 1
-       end if
-   end do
-
-   print *,"number of errors: ", errors
-
-end program main
-
-! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
-! CHECK:  number of errors: 0
diff --git a/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
new file mode 100644
index 00000000000000..6b539e7a3837be
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target_workshare_loop_static_chunk.f90
@@ -0,0 +1,46 @@
+! Offloading test with a target region and chunks
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+program main
+  use omp_lib
+        integer :: A(100)
+!$omp target map(from:A)
+!$omp parallel do schedule(static,2) num_threads(10)
+        do index_ = 1, 100
+          A(index_) = omp_get_team_num() * 1000 + omp_get_thread_num()
+        end do
+!$omp end target
+        write(*,"(A)"), "omp target parallel for thread chunk size 2"
+        call printArray(A)
+
+end program main
+
+subroutine printArray(Array)
+        integer :: Array(*)
+        do i = 1, 100
+            write(*, "(A, I0, A, I0, A)", advance="no") "B",Array(i)/1000,"T",modulo(Array(i),1000)," "
+        end do
+        write(*,'(/)')
+end subroutine printArray
+
+!CHECK:      omp target parallel for thread chunk size 2
+
+!CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+