[flang-commits] [flang] [OpenMPIRBuilder] Added `createTeams` (PR #65767)

via flang-commits flang-commits at lists.llvm.org
Fri Sep 8 10:00:08 PDT 2023


=?utf-8?q?Björn?= Pettersson <bjorn.a.pettersson at ericsson.com>,David
 Spickett <david.spickett at linaro.org>,Aaron Ballman <aaron at aaronballman.com>,Tobias
 Hieta <tobias at hieta.se>,Philip Reames <preames at rivosinc.com>,Tom Stellard
 <tstellar at redhat.com>,"Mikhail R. Gadelha" <mikhail at igalia.com>,yronglin
 <yronglin777 at gmail.com>,Tom Stellard <tstellar at redhat.com>,Hongtao Yu
 <hoy at fb.com>,Shraiysh <Shraiysh.Vaishay at amd.com>,Shraiysh Vaishay
 <shraiysh.vaishay at amd.com>,Shraiysh Vaishay <shraiysh.vaishay at amd.com>,Shraiysh
 Vaishay <shraiysh.vaishay at amd.com>


https://github.com/shraiysh updated https://github.com/llvm/llvm-project/pull/65767:

>From 2a01a4999621f79e08ad6fef6544413c90c3012f Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <shraiysh.vaishay at amd.com>
Date: Thu, 7 Sep 2023 15:45:52 -0500
Subject: [PATCH 01/32] [OpenMPIRBuilder] Added `createTeams`

This patch adds a generator for the teams construct. The generated IR looks like the following:

```
current_fn() {
  ...
  call @__kmpc_fork_teams(ptr @ident, i32 num_args, ptr @outlined_omp_teams, ...args)
  ...
}
outlined_omp_teams(ptr %global_tid, ptr %bound_tid, ...args) {
  ; teams body
}
```

It does this by first generating the body in the current function. Then we outline the
body into a temporary function. We then create the @outlined_omp_teams function and embed
the temporary outlined function in it. Finally, we emit the call to the runtime
function.
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  11 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 188 ++++++++++++++++--
 .../Frontend/OpenMPIRBuilderTest.cpp          |  73 +++++++
 3 files changed, 248 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index ea1035f1907e492..d26ac60939031ec 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1159,8 +1159,8 @@ class OpenMPIRBuilder {
                                 InsertPointTy AllocaIP,
                                 BodyGenCallbackTy BodyGenCB);
 
-
-  using FileIdentifierInfoCallbackTy = std::function<std::tuple<std::string, uint64_t>()>;
+  using FileIdentifierInfoCallbackTy =
+      std::function<std::tuple<std::string, uint64_t>()>;
 
   /// Creates a unique info for a target entry when provided a filename and
   /// line number from.
@@ -2005,6 +2005,13 @@ class OpenMPIRBuilder {
   /// \param Loc The insert and source location description.
   void createTargetDeinit(const LocationDescription &Loc);
 
+  /// Generator for `#omp teams`
+  ///
+  /// \param Loc The location where the task construct was encountered.
+  /// \param BodyGenCB Callback that will generate the region code.
+  InsertPointTy createTeams(const LocationDescription &Loc,
+                            BodyGenCallbackTy BodyGenCB);
+
   ///}
 
 private:
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2cfb36d11dcf898..0f2203f0c1ac84c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -36,10 +36,12 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Target/TargetMachine.h"
@@ -390,9 +392,9 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
       if (Param) {
         if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
           FnAS = FnAS.addAttribute(Ctx, AK);
-      } else
-        if (auto AK = TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
-          FnAS = FnAS.addAttribute(Ctx, AK);
+      } else if (auto AK =
+                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
+        FnAS = FnAS.addAttribute(Ctx, AK);
     } else {
       FnAS = FnAS.addAttributes(Ctx, AS);
     }
@@ -406,7 +408,7 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
 #define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
   case Enum:                                                                   \
     FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
-    addAttrSet(RetAttrs, RetAttrSet, /*Param*/false);                          \
+    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
     for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
       addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
     Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
@@ -4927,8 +4929,8 @@ void OpenMPIRBuilder::emitOffloadingArrays(
             static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                 CombinedInfo.Types[I] &
                 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
-          ConstSizes[I] = ConstantInt::get(Int64Ty,
-                                           CombinedInfo.NonContigInfo.Dims[I]);
+          ConstSizes[I] =
+              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
         else
           ConstSizes[I] = CI;
         continue;
@@ -4991,8 +4993,8 @@ void OpenMPIRBuilder::emitOffloadingArrays(
         createOffloadMapnames(CombinedInfo.Names, MapnamesName);
     Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
   } else {
-    Info.RTArgs.MapNamesArray = Constant::getNullValue(
-        PointerType::getUnqual(Builder.getContext()));
+    Info.RTArgs.MapNamesArray =
+        Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
   }
 
   // If there's a present map type modifier, it must not be applied to the end
@@ -5017,10 +5019,10 @@ void OpenMPIRBuilder::emitOffloadingArrays(
   for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
     Value *BPVal = CombinedInfo.BasePointers[I];
     Value *BP = Builder.CreateConstInBoundsGEP2_32(
-        ArrayType::get(PtrTy, Info.NumberOfPtrs),
-        Info.RTArgs.BasePointersArray, 0, I);
-    Builder.CreateAlignedStore(
-        BPVal, BP, M.getDataLayout().getPrefTypeAlign(PtrTy));
+        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
+        0, I);
+    Builder.CreateAlignedStore(BPVal, BP,
+                               M.getDataLayout().getPrefTypeAlign(PtrTy));
 
     if (Info.requiresDevicePointerInfo()) {
       if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
@@ -5039,21 +5041,21 @@ void OpenMPIRBuilder::emitOffloadingArrays(
 
     Value *PVal = CombinedInfo.Pointers[I];
     Value *P = Builder.CreateConstInBoundsGEP2_32(
-        ArrayType::get(PtrTy, Info.NumberOfPtrs),
-        Info.RTArgs.PointersArray, 0, I);
+        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
+        I);
     // TODO: Check alignment correct.
-    Builder.CreateAlignedStore(
-        PVal, P, M.getDataLayout().getPrefTypeAlign(PtrTy));
+    Builder.CreateAlignedStore(PVal, P,
+                               M.getDataLayout().getPrefTypeAlign(PtrTy));
 
     if (RuntimeSizes.test(I)) {
       Value *S = Builder.CreateConstInBoundsGEP2_32(
           ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
           /*Idx0=*/0,
           /*Idx1=*/I);
-      Builder.CreateAlignedStore(
-          Builder.CreateIntCast(CombinedInfo.Sizes[I], Int64Ty,
-                                /*isSigned=*/true),
-          S, M.getDataLayout().getPrefTypeAlign(PtrTy));
+      Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
+                                                       Int64Ty,
+                                                       /*isSigned=*/true),
+                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
     }
     // Fill up the mapper array.
     unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
@@ -5655,8 +5657,8 @@ GlobalVariable *
 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                                        std::string VarName) {
   llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
-      llvm::ArrayType::get(
-          llvm::PointerType::getUnqual(M.getContext()), Names.size()),
+      llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
+                           Names.size()),
       Names);
   auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
       M, MapNamesArrayInit->getType(),
@@ -6106,6 +6108,148 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
   }
 }
 
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
+                             BodyGenCallbackTy BodyGenCB) {
+  if (!updateToLocation(Loc)) {
+    return Loc.IP;
+  }
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+  // Splitting a basic block expects a terminator. Hence, creating an
+  // unreachable instruction, which will be deleted later.
+  UnreachableInst *UI = Builder.CreateUnreachable();
+  BasicBlock *CurrentBasicBlock = Builder.GetInsertBlock();
+
+  // The current basic block is split into four basic blocks. After outlining,
+  // they will be mapped as follows:
+  // ```
+  // def current_fn() {
+  //   current_basic_block:
+  //     br label %teams.exit
+  //   teams.exit:
+  //     ; instructions after task
+  // }
+  // def outlined_fn() {
+  //   teams.alloca:
+  //     br label %teams.body
+  //   teams.body:
+  //     ; instructions within teams body
+  // }
+  // ```
+  BasicBlock *AllocaBB = CurrentBasicBlock->splitBasicBlock(UI, "teams.alloca");
+  BasicBlock *BodyBB = AllocaBB->splitBasicBlock(UI, "teams.body");
+  BasicBlock *ExitBB = BodyBB->splitBasicBlock(UI, "teams.exit");
+
+  UI->eraseFromParent();
+
+  // Generate the body of teams.
+  InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
+  InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
+  BodyGenCB(AllocaIP, CodeGenIP);
+
+  OutlineInfo OI;
+  OI.EntryBB = AllocaBB;
+  OI.ExitBB = ExitBB;
+  OI.PostOutlineCB = [this, Ident](Function &OutlinedFn) {
+    // The input IR here looks like the following-
+    // ```
+    // func @current_fn() {
+    //   outlined_fn(%args)
+    // }
+    // func @outlined_fn(%args) {
+    //   ; teams body
+    // }
+    // ```
+    //
+    // This is changed to the following-
+    //
+    // ```
+    // func @current_fn() {
+    //   runtime_call(..., wrapper_fn, ...)
+    // }
+    // func @wrapper_fn(..., %args) {
+    //   ; teams body
+    // }
+    // ```
+
+    // The outlined function has different inputs than what is expected from it.
+    // So, a wrapper function with expected signature is created and the
+    // required arguments are passed to the outlined function. The stale call
+    // instruction in current function will be replaced with a new call
+    // instruction for runtime call with the wrapper function. The outlined
+    // function is then inlined in the wrapper function and the call from the
+    // current function is removed.
+
+    assert(OutlinedFn.getNumUses() == 1 &&
+           "there must be a single user for the outlined function");
+    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+    assert(StaleCI && "Error while outlining - no CallInst user found for the "
+                      "outlined function.");
+    OutlinedFn.addFnAttr(Attribute::AttrKind::AlwaysInline);
+
+    // Create the wrapper function.
+    Builder.SetInsertPoint(StaleCI);
+    SmallVector<Type *> WrapperArgTys{Builder.getPtrTy(), Builder.getPtrTy()};
+    for (auto &Arg : OutlinedFn.args()) {
+      WrapperArgTys.push_back(Arg.getType());
+    }
+    FunctionCallee WrapperFuncVal = M.getOrInsertFunction(
+        "outlined_omp_teams",
+        FunctionType::get(Builder.getVoidTy(), WrapperArgTys, false));
+    Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee());
+    WrapperFunc->getArg(0)->setName("global_tid");
+    WrapperFunc->getArg(1)->setName("bound_tid");
+    WrapperFunc->getArg(2)->setName("data");
+
+    // Emit the body of the wrapper function - just a call to outlined function
+    // and return statement.
+    BasicBlock *WrapperEntryBB =
+        BasicBlock::Create(M.getContext(), "entrybb", WrapperFunc);
+    Builder.SetInsertPoint(WrapperEntryBB);
+    SmallVector<Value *> Args;
+    for (size_t ArgIndex = 2; ArgIndex < WrapperFunc->arg_size(); ArgIndex++) {
+      Args.push_back(WrapperFunc->getArg(ArgIndex));
+    }
+    CallInst *OutlinedFnCall = Builder.CreateCall(&OutlinedFn, Args);
+    Builder.CreateRetVoid();
+
+    // Call to the runtime function for teams in the current function.
+    Builder.SetInsertPoint(StaleCI);
+    Args = {Ident, Builder.getInt32(StaleCI->arg_size()), WrapperFunc};
+    for (Use &Arg : StaleCI->args()) {
+      Args.push_back(Arg);
+    }
+    Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
+                           omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
+                       Args);
+    StaleCI->eraseFromParent();
+
+    // Inlining the outlined teams function in the wrapper. This wrapper is the
+    // argument for the runtime call.
+    assert(OutlinedFn.getNumUses() == 1 &&
+           "More than one use for the outlined function found. Expected only "
+           "one use.");
+    InlineFunctionInfo IFI;
+    InlineResult IR = InlineFunction(*OutlinedFnCall, IFI);
+    LLVM_DEBUG(if (!IR.isSuccess()) {
+      dbgs() << "Attempt to merge the outlined function in the wrapper failed: "
+             << IR.getFailureReason() << "\n";
+    });
+    assert(IR.isSuccess() && "Inlining outlined omp teams failed");
+    OutlinedFn.eraseFromParent();
+  };
+
+  addOutlineInfo(std::move(OI));
+
+  Builder.SetInsertPoint(ExitBB);
+
+  return Builder.saveIP();
+}
+
 bool OffloadEntriesInfoManager::empty() const {
   return OffloadEntriesTargetRegion.empty() &&
          OffloadEntriesDeviceGlobalVar.empty();
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 5870457956b5433..b189559b3430461 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -6033,4 +6033,77 @@ TEST_F(OpenMPIRBuilderTest, createGPUOffloadEntry) {
   EXPECT_TRUE(Fn->hasFnAttribute(Attribute::MustProgress));
 }
 
+TEST_F(OpenMPIRBuilderTest, createTeams) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  AllocaInst *ValPtr32 = Builder.CreateAlloca(Builder.getInt32Ty());
+  AllocaInst *ValPtr128 = Builder.CreateAlloca(Builder.getInt128Ty());
+  Value *Val128 =
+      Builder.CreateLoad(Builder.getInt128Ty(), ValPtr128, "bodygen.load");
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    Builder.restoreIP(AllocaIP);
+    AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr,
+                                                "bodygen.alloca128");
+
+    Builder.restoreIP(CodeGenIP);
+    // Loading and storing captured pointer and values
+    Builder.CreateStore(Val128, Local128);
+    Value *Val32 = Builder.CreateLoad(ValPtr32->getAllocatedType(), ValPtr32,
+                                      "bodygen.load32");
+
+    LoadInst *PrivLoad128 = Builder.CreateLoad(
+        Local128->getAllocatedType(), Local128, "bodygen.local.load128");
+    Value *Cmp = Builder.CreateICmpNE(
+        Val32, Builder.CreateTrunc(PrivLoad128, Val32->getType()));
+    Instruction *ThenTerm, *ElseTerm;
+    SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
+                                  &ThenTerm, &ElseTerm);
+  };
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+  Builder.restoreIP(OMPBuilder.createTeams(Builder, BodyGenCB));
+  OMPBuilder.finalize();
+  Builder.CreateRetVoid();
+
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  CallInst *TeamsForkCall = dyn_cast<CallInst>(
+      OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_teams)
+          ->user_back());
+
+  // Verify the Ident argument
+  GlobalVariable *Ident = cast<GlobalVariable>(TeamsForkCall->getArgOperand(0));
+  ASSERT_NE(Ident, nullptr);
+  EXPECT_TRUE(Ident->hasInitializer());
+  Constant *Initializer = Ident->getInitializer();
+  GlobalVariable *SrcStrGlob =
+      cast<GlobalVariable>(Initializer->getOperand(4)->stripPointerCasts());
+  ASSERT_NE(SrcStrGlob, nullptr);
+  ConstantDataArray *SrcSrc =
+      dyn_cast<ConstantDataArray>(SrcStrGlob->getInitializer());
+  ASSERT_NE(SrcSrc, nullptr);
+
+  // Verify the outlined function signature.
+  Function *OutlinedFn =
+      dyn_cast<Function>(TeamsForkCall->getArgOperand(2)->stripPointerCasts());
+  ASSERT_NE(OutlinedFn, nullptr);
+  EXPECT_FALSE(OutlinedFn->isDeclaration());
+  EXPECT_TRUE(OutlinedFn->arg_size() >= 3);
+  EXPECT_EQ(OutlinedFn->getArg(0)->getType(), Builder.getPtrTy()); // global_tid
+  EXPECT_EQ(OutlinedFn->getArg(1)->getType(), Builder.getPtrTy()); // bound_tid
+  EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
+            Builder.getPtrTy()); // captured args
+
+  // Check for TruncInst and ICmpInst in the outlined function.
+  EXPECT_TRUE(any_of(instructions(OutlinedFn),
+                     [](Instruction &inst) { return isa<TruncInst>(&inst); }));
+  EXPECT_TRUE(any_of(instructions(OutlinedFn),
+                     [](Instruction &inst) { return isa<ICmpInst>(&inst); }));
+}
+
 } // namespace

>From 4bb902439806f023a0eb30841ef485ee3aed478b Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Fri, 8 Sep 2023 22:59:45 +0800
Subject: [PATCH 02/32] [NFC][Clang][RISCV] Remove trailing whitespaces in
 riscv_vector.td (#65762)

These trailing whitespaces were breaking the `Format` CI.
```
$ trap 'kill -- $$' INT TERM QUIT; clang/utils/ci/run-buildbot check-format
+ set -o pipefail
+ unset LANG
+ unset LC_ALL
+ unset LC_COLLATE
++ basename clang/utils/ci/run-buildbot
+ PROGNAME=run-buildbot
+ [[ 1 == 0 ]]
+ [[ 1 -gt 0 ]]
+ case ${1} in
+ BUILDER=check-format
+ shift
+ [[ 0 -gt 0 ]]
++ git rev-parse --show-toplevel
+ MONOREPO_ROOT=/var/lib/buildkite-agent/builds/linux-56-fdf668759-vtv88-1/llvm-project/clang-ci
+ BUILD_DIR=/var/lib/buildkite-agent/builds/linux-56-fdf668759-vtv88-1/llvm-project/clang-ci/build/check-format
+ INSTALL_DIR=/var/lib/buildkite-agent/builds/linux-56-fdf668759-vtv88-1/llvm-project/clang-ci/build/check-format/install
+ cmake --version
cmake version 3.23.3
CMake suite maintained and supported by Kitware (kitware.com/cmake).
+ ninja --version
1.10.1
+ case "${BUILDER}" in
+ grep -rnI '[[:blank:]]$' clang/lib clang/include clang/docs
clang/include/clang/Basic/riscv_vector.td:580:        let Name = op # eew # "_v", IRName = op, MaskedIRName = op # "_mask",
clang/include/clang/Basic/riscv_vector.td:1786:    let RequiredFeatures = ["ZvfhminOrZvfh"] in
```

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/include/clang/Basic/riscv_vector.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td
index e52ac661a61333a..9b941e1cca85014 100644
--- a/clang/include/clang/Basic/riscv_vector.td
+++ b/clang/include/clang/Basic/riscv_vector.td
@@ -577,7 +577,7 @@ multiclass RVVIndexedLoad<string op> {
       foreach eew_list = EEWList[0-2] in {
         defvar eew = eew_list[0];
         defvar eew_type = eew_list[1];
-        let Name = op # eew # "_v", IRName = op, MaskedIRName = op # "_mask", 
+        let Name = op # eew # "_v", IRName = op, MaskedIRName = op # "_mask",
             RequiredFeatures = !if(!eq(type, "x"), ["ZvfhminOrZvfh"],
                                                    []<string>) in {
           def: RVVOutOp1Builtin<"v", "vPCe" # eew_type # "Uv", type>;
@@ -1783,7 +1783,7 @@ let HasMasked = false,
                                    [["v", "Uv", "UvUv"]]>;
     defm vmv_v : RVVOutBuiltinSet<"vmv_v_v", "csilfd",
                                    [["v", "v", "vv"]]>;
-    let RequiredFeatures = ["ZvfhminOrZvfh"] in                               
+    let RequiredFeatures = ["ZvfhminOrZvfh"] in
       defm vmv_v : RVVOutBuiltinSet<"vmv_v_v", "x",
                                     [["v", "v", "vv"]]>;
   let SupportOverloading = false in

>From 0f50d0108c7ee8b081574a76816a428e30c6701a Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Fri, 8 Sep 2023 07:58:51 -0700
Subject: [PATCH 03/32] Revert RTSanitizerCommonSymbolizerInternal changes

This reverts the following commits:

 - 5d7b75e2a5846f72f04a6fdb25a0df338f1825a1
  [NFC][memprof] Temporarly remove RTSanitizerCommonSymbolizerInternal

 - edb211cb78317ad73aa4bd2d3df75194b7f23a72
   [NFC][memprof] Temporarly remove RTSanitizerCommonSymbolizerInternal

 - 4d14b4a872577bf7ab5ef5bb6f8a2f10781a5f18
   [sanitizer] Add CMake flag to build with internal symbolizer

They break macOS nodes because CMake can't evaluate generator expressions:

  Error evaluating generator expression:

    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.osx>
    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.ios>
    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.iossim>
---
 compiler-rt/CMakeLists.txt                    |  3 --
 compiler-rt/lib/hwasan/CMakeLists.txt         |  4 ---
 compiler-rt/lib/memprof/CMakeLists.txt        |  5 +--
 .../lib/sanitizer_common/CMakeLists.txt       |  4 ---
 .../symbolizer/CMakeLists.txt                 | 27 ----------------
 .../symbolizer/scripts/build_symbolizer.sh    | 32 ++++++++++++++++---
 compiler-rt/lib/stats/CMakeLists.txt          |  1 -
 compiler-rt/lib/ubsan/CMakeLists.txt          |  7 ----
 8 files changed, 29 insertions(+), 54 deletions(-)
 delete mode 100644 compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt

diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 737ba774a172805..3888995811d8945 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -727,9 +727,6 @@ endif()
 pythonize_bool(COMPILER_RT_HAS_LLD)
 pythonize_bool(COMPILER_RT_TEST_USE_LLD)
 
-option(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER "Build Compiler RT linked with in LLVM symbolizer" OFF)
-mark_as_advanced(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
-
 add_subdirectory(lib)
 
 if(COMPILER_RT_INCLUDE_TESTS)
diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt
index e7b3f5e005f8e2e..1b5775d9435d7e0 100644
--- a/compiler-rt/lib/hwasan/CMakeLists.txt
+++ b/compiler-rt/lib/hwasan/CMakeLists.txt
@@ -164,8 +164,6 @@ function(add_hwasan_runtimes arch use_aliases)
                 RTSanitizerCommonLibc
                 RTSanitizerCommonCoverage
                 RTSanitizerCommonSymbolizer
-                # FIXME: disable tagging when in symbolizer.
-                # RTSanitizerCommonSymbolizerInternal
                 RTLSanCommon
                 RTUbsan
     CFLAGS ${hwasan_rtl_flags}
@@ -203,8 +201,6 @@ function(add_hwasan_runtimes arch use_aliases)
             RTSanitizerCommonLibc
             RTSanitizerCommonCoverage
             RTSanitizerCommonSymbolizer
-            # FIXME: disable tagging when in symbolizer.
-            # RTSanitizerCommonSymbolizerInternal
             RTLSanCommon
             RTUbsan
             RTUbsan_cxx
diff --git a/compiler-rt/lib/memprof/CMakeLists.txt b/compiler-rt/lib/memprof/CMakeLists.txt
index 3f55c2f5e075eed..2459ce13ab743cb 100644
--- a/compiler-rt/lib/memprof/CMakeLists.txt
+++ b/compiler-rt/lib/memprof/CMakeLists.txt
@@ -127,10 +127,7 @@ set(MEMPROF_COMMON_RUNTIME_OBJECT_LIBS
   RTSanitizerCommon
   RTSanitizerCommonLibc
   RTSanitizerCommonCoverage
-  RTSanitizerCommonSymbolizer
-  # FIXME: hangs.
-  # RTSanitizerCommonSymbolizerInternal
-)
+  RTSanitizerCommonSymbolizer)
 
 add_compiler_rt_runtime(clang_rt.memprof
   STATIC
diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
index 25304b71c0c7061..d8517f3de06a493 100644
--- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
@@ -362,7 +362,3 @@ foreach(arch ${SANITIZER_COMMON_SUPPORTED_ARCH})
   add_library(RTSanitizerCommonSymbolizerInternal.${arch}
               OBJECT IMPORTED GLOBAL)
 endforeach()
-
-if (COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
-  add_subdirectory(symbolizer)
-endif()
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt
deleted file mode 100644
index 49ff201066bcf92..000000000000000
--- a/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-foreach(arch ${SANITIZER_COMMON_SUPPORTED_ARCH})
-  get_target_flags_for_arch(${arch} TARGET_CFLAGS)
-
-  set(RTSanitizerCommonSymbolizerInternalDir
-      "${CMAKE_CURRENT_BINARY_DIR}/RTSanitizerCommonSymbolizerInternal.${arch}")
-  add_custom_command(OUTPUT ${RTSanitizerCommonSymbolizerInternalDir}
-                     COMMAND ${CMAKE_COMMAND} -E make_directory ${RTSanitizerCommonSymbolizerInternalDir})
-
-  add_custom_command(OUTPUT RTSanitizerCommonSymbolizerInternal.${arch}.o
-                     DEPENDS ${RTSanitizerCommonSymbolizerInternalDir} 
-                             clang lld llvm-tblgen opt llvm-ar llvm-link llvm-ranlib llvm-symbolizer
-                             sanitizer_wrappers.cpp
-                             sanitizer_symbolize.cpp
-                             scripts/build_symbolizer.sh
-                     WORKING_DIRECTORY ${RTSanitizerCommonSymbolizerInternalDir}
-                     COMMAND FLAGS=${TARGET_CFLAGS}
-                             CLANG=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang
-                             ${CMAKE_CURRENT_SOURCE_DIR}/scripts/build_symbolizer.sh
-                             ${CMAKE_CURRENT_BINARY_DIR}/RTSanitizerCommonSymbolizerInternal.${arch}.o
-                     USES_TERMINAL)
-
-  add_custom_target(RTSanitizerCommonSymbolizerInternalObj.${arch}
-                    DEPENDS RTSanitizerCommonSymbolizerInternal.${arch}.o)
-
-  set_property(TARGET RTSanitizerCommonSymbolizerInternal.${arch}
-               PROPERTY IMPORTED_OBJECTS ${CMAKE_CURRENT_BINARY_DIR}/RTSanitizerCommonSymbolizerInternal.${arch}.o)
-endforeach()
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
index 524ddca1b9f3e4f..ec3f73ab21fb4b2 100755
--- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
+++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
@@ -3,16 +3,26 @@
 # Run as: CLANG=bin/clang build_symbolizer.sh out.o
 # zlib can be downloaded from http://www.zlib.net.
 #
-# Script compiles self-contained object file with symbolization code.
+# Script compiles self-contained object file with symbolization code and injects
+# it into the given set of runtime libraries. Script updates only libraries
+# which has unresolved __sanitizer_symbolize_* symbols and matches architecture.
+# Object file is be compiled from LLVM sources with dependencies like libc++ and
+# zlib. Then it internalizes symbols in the file, so that it can be linked
+# into arbitrary programs, avoiding conflicts with the program own symbols and
+# avoiding dependencies on any program symbols. The only acceptable dependencies
+# are libc and __sanitizer::internal_* from sanitizer runtime.
 #
 # Symbols exported by the object file will be used by Sanitizer runtime
 # libraries to symbolize code/data in-process.
 #
+# The script will modify the output directory which is given as the first
+# argument to the script.
+#
 # FIXME: We should really be using a simpler approach to building this object
 # file, and it should be available as a regular cmake rule. Conceptually, we
 # want to be doing "ld -r" followed by "objcopy -G" to create a relocatable
 # object file with only our entry points exposed. However, this does not work at
-# present, see https://github.com/llvm/llvm-project/issues/30098.
+# present, see PR30750.
 
 set -x
 set -e
@@ -20,7 +30,7 @@ set -u
 
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 SRC_DIR=$(readlink -f $SCRIPT_DIR/..)
-OUTPUT=$(readlink -f $1)
+TARGE_DIR=$(readlink -f $1)
 COMPILER_RT_SRC=$(readlink -f ${SCRIPT_DIR}/../../../..)
 LLVM_SRC=${LLVM_SRC:-${COMPILER_RT_SRC}/../llvm}
 LLVM_SRC=$(readlink -f $LLVM_SRC)
@@ -176,6 +186,20 @@ nm -f posix -g symbolizer.o | cut -f 1,2 -d \  | LC_COLLATE=C sort -u > undefine
 (diff -u $SCRIPT_DIR/global_symbols.txt undefined.new | grep -E "^\+[^+]") && \
   (echo "Failed: unexpected symbols"; exit 1)
 
-cp -f symbolizer.o $OUTPUT
+arch() {
+  objdump -f $1 | grep -m1 -Po "(?<=file format ).*$"
+}
+
+SYMBOLIZER_FORMAT=$(arch symbolizer.o)
+echo "Injecting $SYMBOLIZER_FORMAT symbolizer..."
+for A in $TARGE_DIR/libclang_rt.*san*.a; do
+  A_FORMAT=$(arch $A)
+  if [[ "$A_FORMAT" != "$SYMBOLIZER_FORMAT" ]] ; then
+    continue
+  fi
+  (nm -u $A 2>/dev/null | grep -E "__sanitizer_symbolize_code" >/dev/null) || continue
+  echo "$A"
+  $AR rcs $A symbolizer.o
+done
 
 echo "Success!"
diff --git a/compiler-rt/lib/stats/CMakeLists.txt b/compiler-rt/lib/stats/CMakeLists.txt
index 61b5b4524896cbb..60c02556f80d615 100644
--- a/compiler-rt/lib/stats/CMakeLists.txt
+++ b/compiler-rt/lib/stats/CMakeLists.txt
@@ -29,7 +29,6 @@ add_compiler_rt_runtime(clang_rt.stats
   OBJECT_LIBS RTSanitizerCommon
               RTSanitizerCommonLibc
               RTSanitizerCommonSymbolizer
-              RTSanitizerCommonSymbolizerInternal
   CFLAGS ${SANITIZER_COMMON_CFLAGS}
   LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
   LINK_LIBS ${STATS_LINK_LIBS}
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
index 93730561523a94c..520a024fbedee59 100644
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -71,9 +71,6 @@ append_list_if(COMPILER_RT_HAS_LIBDL dl UBSAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBLOG log UBSAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBRT rt UBSAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBPTHREAD pthread UBSAN_DYNAMIC_LIBS)
-if (COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
-  append_list_if(COMPILER_RT_HAS_LIBM m UBSAN_DYNAMIC_LIBS)
-endif()
 
 add_compiler_rt_component(ubsan)
 
@@ -207,8 +204,6 @@ else()
               RTSanitizerCommonLibc
               RTSanitizerCommonCoverage
               RTSanitizerCommonSymbolizer
-              # FIXME: Some wrong with C++ demangling.
-              # RTSanitizerCommonSymbolizerInternal
               RTUbsan
               RTUbsan_standalone
               RTInterception
@@ -253,8 +248,6 @@ else()
                 RTSanitizerCommonLibc
                 RTSanitizerCommonCoverage
                 RTSanitizerCommonSymbolizer
-                # FIXME: Some wrong with C++ demangling.
-                # RTSanitizerCommonSymbolizerInternal
                 RTUbsan
                 RTUbsan_cxx
                 RTUbsan_standalone

>From 6f95737ed4645bf20b33f2d21c93aadd3dcaba27 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 8 Sep 2023 16:11:19 +0100
Subject: [PATCH 04/32] [mlir][TOSA] Fix interpretation of
 --tosa-validate='profile=undefined' (#65738)

Due to a copy-paste error in 32b7c1ff this was incorrectly mapped to
`TosaProfileEnum::MainTraining` rather than `TosaProfileEnum::Undefined`.
---
 mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
index bc30b88ea2af6a9..18402b3e70647a9 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
@@ -101,7 +101,7 @@ def TosaValidation : Pass<"tosa-validate", "func::FuncOp"> {
                 "Use Main Inference profile."),
                clEnumValN(mlir::tosa::TosaProfileEnum::MainTraining, "mt",
                 "Use Main Training profile."),
-               clEnumValN(mlir::tosa::TosaProfileEnum::MainTraining, "undefined",
+               clEnumValN(mlir::tosa::TosaProfileEnum::Undefined, "undefined",
                 "Do not define a profile.")
               )}]>,
       Option<"StrictOperationSpecAlignment", "strict-op-spec-alignment", "bool",

>From 8669a9f93adb2d3d3ca44a0b0873000927762177 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 8 Sep 2023 16:16:02 +0100
Subject: [PATCH 05/32] [AMDGPU] Cope with SelectionDAG::UpdateNodeOperands
 returning a different SDNode (#65765)

SITargetLowering::adjustWritemask calls SelectionDAG::UpdateNodeOperands
to update an EXTRACT_SUBREG node in-place to refer to a new IMAGE_LOAD
instruction, before we delete the old IMAGE_LOAD instruction. But in
UpdateNodeOperands can do CSE on the fly and return a different
EXTRACT_SUBREG node, so the original EXTRACT_SUBREG node would still
exist and would refer to the old deleted IMAGE_LOAD instruction. This
caused errors like:

t31: v3i32,ch = <<Deleted Node!>> # D:1
This target-independent node should have been selected!
UNREACHABLE executed at lib/CodeGen/SelectionDAG/InstrEmitter.cpp:1209!

Fix it by detecting the CSE case and replacing all uses of the original
EXTRACT_SUBREG node with the CSE'd one.

Recommit with a fix for a use-after-free bug in the first version of
this patch (#65340) which was caught by asan.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  6 +++-
 .../CodeGen/AMDGPU/adjust-writemask-cse.ll    | 32 +++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 85c9ed489e926ce..b5af88af1d558f5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13464,7 +13464,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
         continue;
     } else {
       SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+      if (NewUser != User) {
+        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
+        DAG.RemoveDeadNode(User);
+      }
     }
 
     switch (Idx) {
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
new file mode 100644
index 000000000000000..fbbb786576e55ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -stop-after=finalize-isel < %s | FileCheck %s -check-prefix=GFX10
+
+define float @test() {
+  ; GFX10-LABEL: name: test
+  ; GFX10: bb.0.bb:
+  ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX10-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
+  ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; GFX10-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX10-NEXT:   [[IMAGE_LOAD_V2_V2_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_gfx10_]].sub1
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_gfx10_]].sub0
+  ; GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
+  ; GFX10-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+  ; GFX10-NEXT:   $vgpr0 = COPY [[V_ADD_F32_e64_1]]
+  ; GFX10-NEXT:   SI_RETURN implicit $vgpr0
+bb:
+  %v0 = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 3, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
+  %v1 = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 7, i32 0, i32 0, <8 x i32> zeroinitializer, i32 0, i32 0)
+  %e0 = extractelement <2 x float> %v0, i64 1
+  %e1 = extractelement <3 x float> %v1, i64 0
+  %e2 = extractelement <3 x float> %v1, i64 1
+  %a0 = fadd float %e0, %e1
+  %a1 = fadd float %a0, %e2
+  ret float %a1
+}
+
+declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
+declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)

>From ffb8434f6a514fb3c06e0bdaff6d4ee410924ff6 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Fri, 8 Sep 2023 17:15:22 +0200
Subject: [PATCH 06/32] [clang] Prevent unnecessary copies in
 `SymbolGraphSerializer` (NFC)

---
 clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
index 229bf04c77fae44..6ee6e72d99ec57b 100644
--- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
+++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
@@ -597,7 +597,7 @@ std::optional<Object> serializeTemplateMixinImpl(const RecordTy &Record,
 
   Object Generics;
   Array GenericParameters;
-  for (const auto Param : Template.getParameters()) {
+  for (const auto &Param : Template.getParameters()) {
     Object Parameter;
     Parameter["name"] = Param.Name;
     Parameter["index"] = Param.Index;
@@ -608,7 +608,7 @@ std::optional<Object> serializeTemplateMixinImpl(const RecordTy &Record,
     Generics["parameters"] = std::move(GenericParameters);
 
   Array GenericConstraints;
-  for (const auto Constr : Template.getConstraints()) {
+  for (const auto &Constr : Template.getConstraints()) {
     Object Constraint;
     Constraint["kind"] = Constr.Kind;
     Constraint["lhs"] = Constr.LHS;

>From 8c03239934d38790620bc222144280939a21f866 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Fri, 8 Sep 2023 16:24:10 +0100
Subject: [PATCH 07/32] [AMDGPU] New intrinsic void llvm.amdgcn.s.nop(i16)
 (#65757)

This allows front ends to insert s_nops - this is most often needed when
a delay of less than s_sleep 1 is required.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  5 ++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  3 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll | 30 +++++++++++++++++++
 3 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e9b13c3adcbaa54..124f22c1a9b27c7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1677,6 +1677,11 @@ def int_amdgcn_s_sleep :
                                 IntrHasSideEffects]> {
 }
 
+def int_amdgcn_s_nop :
+  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
+                                IntrHasSideEffects]> {
+}
+
 def int_amdgcn_s_incperflevel :
   ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
   DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 38fa90bdc937276..175045a8a893e92 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1161,7 +1161,8 @@ multiclass SOPP_With_Relaxation <string opName, dag ins,
   def _pad_s_nop : SOPP_Pseudo <opName # "_pad_s_nop", ins, asmOps, pattern, " ", opName>;
 }
 
-def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16"> {
+def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16",
+  [(int_amdgcn_s_nop timm:$simm16)]> {
   let hasSideEffects = 1;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll
new file mode 100644
index 000000000000000..a625f973c0b8f55
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.nop.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_s_nop() {
+; GCN-LABEL: test_s_nop:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 4
+; GCN-NEXT:    s_nop 5
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 63
+; GCN-NEXT:    s_endpgm
+  call void @llvm.amdgcn.s.nop(i16 0)
+  call void @llvm.amdgcn.s.nop(i16 1)
+  call void @llvm.amdgcn.s.nop(i16 2)
+  call void @llvm.amdgcn.s.nop(i16 3)
+  call void @llvm.amdgcn.s.nop(i16 4)
+  call void @llvm.amdgcn.s.nop(i16 5)
+  call void @llvm.amdgcn.s.nop(i16 6)
+  call void @llvm.amdgcn.s.nop(i16 7)
+  call void @llvm.amdgcn.s.nop(i16 63)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.nop(i16)

>From e5fe3d27fc1d5f27fe107f3380f3727362bac66c Mon Sep 17 00:00:00 2001
From: Pavel Iliin <61020334+ilinpv at users.noreply.github.com>
Date: Fri, 8 Sep 2023 16:26:39 +0100
Subject: [PATCH 08/32] [AArch64] Fix FMV crash on unspecified number of
 parameters function (#65671)

Fix Function Multi Versioning crash reported in
https://github.com/llvm/llvm-project/issues/65669
---
 clang/lib/Sema/SemaDecl.cpp           | 28 ++++++++++++++-------------
 clang/test/Sema/attr-target-version.c |  5 +++++
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index d94366dac102a2a..027c6c3e4222f07 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11680,20 +11680,22 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD,
   FunctionDecl *OldFD = OldDecl->getAsFunction();
 
   if (!OldFD->isMultiVersion() && MVKind == MultiVersionKind::None) {
-    // No target_version attributes mean default
-    if (!NewTVA) {
-      const auto *OldTVA = OldFD->getAttr<TargetVersionAttr>();
-      if (OldTVA) {
-        NewFD->addAttr(TargetVersionAttr::CreateImplicit(
-            S.Context, "default", NewFD->getSourceRange()));
-        NewFD->setIsMultiVersion();
-        OldFD->setIsMultiVersion();
-        OldDecl = OldFD;
-        Redeclaration = true;
-        return true;
-      }
+    if (NewTVA || !OldFD->getAttr<TargetVersionAttr>())
+      return false;
+    if (!NewFD->getType()->getAs<FunctionProtoType>()) {
+      // Multiversion declaration doesn't have prototype.
+      S.Diag(NewFD->getLocation(), diag::err_multiversion_noproto);
+      NewFD->setInvalidDecl();
+    } else {
+      // No "target_version" attribute is equivalent to "default" attribute.
+      NewFD->addAttr(TargetVersionAttr::CreateImplicit(
+          S.Context, "default", NewFD->getSourceRange()));
+      NewFD->setIsMultiVersion();
+      OldFD->setIsMultiVersion();
+      OldDecl = OldFD;
+      Redeclaration = true;
     }
-    return false;
+    return true;
   }
 
   // Multiversioned redeclarations aren't allowed to omit the attribute, except
diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c
index 0cfec5ecb49ce7c..587c721de5e3226 100644
--- a/clang/test/Sema/attr-target-version.c
+++ b/clang/test/Sema/attr-target-version.c
@@ -89,3 +89,8 @@ float __attribute__((target_version("rdm"))) rtype(int);
 int __attribute__((target_version("sha2"))) combine(void) { return 1; }
 // expected-error at +1 {{multiversioned function declaration has a different calling convention}}
 int __attribute__((aarch64_vector_pcs, target_version("sha3"))) combine(void) { return 2; }
+
+int __attribute__((target_version("fp+aes+pmull+rcpc"))) unspec_args() { return -1; }
+// expected-error at +1 {{multiversioned function must have a prototype}}
+int __attribute__((target_version("default"))) unspec_args() { return 0; }
+int cargs() { return unspec_args(); }

>From 08de6508ab6af53779d2daf276295473c5b0906e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 8 Sep 2023 16:29:09 +0100
Subject: [PATCH 09/32] [LV] Return debug loc directly from
 getDebugLocFromInstrOrOps (NFCI)

The return value of the function is only used to get the debug location.
Directly return the debug location, as this avoids an extra null
check in the caller.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a36c5fb95d38f0b..123636f5bc38300 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -932,21 +932,21 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
 
 /// Look for a meaningful debug location on the instruction or it's
 /// operands.
-static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
   if (!I)
-    return I;
+    return DebugLoc();
 
   DebugLoc Empty;
   if (I->getDebugLoc() != Empty)
-    return I;
+    return I->getDebugLoc();
 
   for (Use &Op : I->operands()) {
     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
       if (OpInst->getDebugLoc() != Empty)
-        return OpInst;
+        return OpInst->getDebugLoc();
   }
 
-  return I;
+  return I->getDebugLoc();
 }
 
 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
@@ -8814,10 +8814,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   for (ElementCount VF : Range)
     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
 
-  Instruction *DLInst =
-      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
-  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
-                        DLInst ? DLInst->getDebugLoc() : DebugLoc(),
+  DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DL,
                         CM.getTailFoldingStyle(IVUpdateMayOverflow));
 
   // Proactively create header mask. Masks for other blocks are created on

>From 679c0b48d7418b40996e5dcab61c0ffa73089718 Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern at gmail.com>
Date: Fri, 8 Sep 2023 09:19:02 -0400
Subject: [PATCH 10/32] [libc++abi] Refactor around __dynamic_cast

This commit contains refactorings around __dynamic_cast without changing
its behavior. Some important changes include:

- Refactor __dynamic_cast into various small helper functions;
- Move dynamic_cast_stress.pass.cpp to libcxx/benchmarks and refactor
  it into a benchmark. The benchmark performance numbers are updated
  as well.

Differential Revision: https://reviews.llvm.org/D138006
---
 libcxx/benchmarks/CMakeLists.txt              |   1 +
 .../dynamic_cast_old_stress.bench.cpp         |  77 ++++
 libcxxabi/src/private_typeinfo.cpp            | 422 +++++++++++-------
 libcxxabi/test/dynamic_cast_stress.pass.cpp   |  82 ----
 4 files changed, 349 insertions(+), 233 deletions(-)
 create mode 100644 libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
 delete mode 100644 libcxxabi/test/dynamic_cast_stress.pass.cpp

diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index d79c6478c266ed7..66cc2930257643b 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -183,6 +183,7 @@ set(BENCHMARK_TESTS
     algorithms/sort_heap.bench.cpp
     algorithms/stable_sort.bench.cpp
     libcxxabi/dynamic_cast.bench.cpp
+    libcxxabi/dynamic_cast_old_stress.bench.cpp
     allocation.bench.cpp
     deque.bench.cpp
     deque_iterator.bench.cpp
diff --git a/libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp b/libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
new file mode 100644
index 000000000000000..df4daf7409b8f79
--- /dev/null
+++ b/libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+
+#include "benchmark/benchmark.h"
+
+template <std::size_t Indx, std::size_t Depth>
+struct C : public virtual C<Indx, Depth - 1>, public virtual C<Indx + 1, Depth - 1> {
+  virtual ~C() {}
+};
+
+template <std::size_t Indx>
+struct C<Indx, 0> {
+  virtual ~C() {}
+};
+
+template <std::size_t Indx, std::size_t Depth>
+struct B : public virtual C<Indx, Depth - 1>, public virtual C<Indx + 1, Depth - 1> {};
+
+template <class Indx, std::size_t Depth>
+struct makeB;
+
+template <std::size_t... Indx, std::size_t Depth>
+struct makeB<std::index_sequence<Indx...>, Depth> : public B<Indx, Depth>... {};
+
+template <std::size_t Width, std::size_t Depth>
+struct A : public makeB<std::make_index_sequence<Width>, Depth> {};
+
+constexpr std::size_t Width = 10;
+constexpr std::size_t Depth = 5;
+
+template <typename Destination>
+void CastTo(benchmark::State& state) {
+  A<Width, Depth> a;
+  auto base = static_cast<C<Width / 2, 0>*>(&a);
+
+  Destination* b = nullptr;
+  for (auto _ : state) {
+    b = dynamic_cast<Destination*>(base);
+    benchmark::DoNotOptimize(b);
+  }
+
+  assert(b != 0);
+}
+
+BENCHMARK(CastTo<B<Width / 2, Depth>>);
+BENCHMARK(CastTo<A<Width, Depth>>);
+
+BENCHMARK_MAIN();
+
+/**
+ * Benchmark results: (release builds)
+ *
+ * libcxxabi:
+ * ----------------------------------------------------------------------
+ * Benchmark                            Time             CPU   Iterations
+ * ----------------------------------------------------------------------
+ * CastTo<B<Width / 2, Depth>>       1997 ns         1997 ns       349247
+ * CastTo<A<Width, Depth>>            256 ns          256 ns      2733871
+ *
+ * libsupc++:
+ * ----------------------------------------------------------------------
+ * Benchmark                            Time             CPU   Iterations
+ * ----------------------------------------------------------------------
+ * CastTo<B<Width / 2, Depth>>       5240 ns         5240 ns       133091
+ * CastTo<A<Width, Depth>>            866 ns          866 ns       808600
+ *
+ *
+ */
diff --git a/libcxxabi/src/private_typeinfo.cpp b/libcxxabi/src/private_typeinfo.cpp
index c737d96f418d8a8..82db4bbec1ada2e 100644
--- a/libcxxabi/src/private_typeinfo.cpp
+++ b/libcxxabi/src/private_typeinfo.cpp
@@ -75,6 +75,254 @@ static inline ptrdiff_t update_offset_to_base(const char* vtable,
 namespace __cxxabiv1
 {
 
+namespace {
+
+struct derived_object_info {
+    const void* dynamic_ptr;
+    const __class_type_info* dynamic_type;
+    std::ptrdiff_t offset_to_derived;
+};
+
+/// A helper function that gets (dynamic_ptr, dynamic_type, offset_to_derived) from static_ptr.
+void dyn_cast_get_derived_info(derived_object_info* info, const void* static_ptr)
+{
+#if __has_feature(cxx_abi_relative_vtable)
+    // The vtable address will point to the first virtual function, which is 8
+    // bytes after the start of the vtable (4 for the offset from top + 4 for
+    // the typeinfo component).
+    const int32_t* vtable =
+        *reinterpret_cast<const int32_t* const*>(static_ptr);
+    info->offset_to_derived = static_cast<std::ptrdiff_t>(vtable[-2]);
+    info->dynamic_ptr = static_cast<const char*>(static_ptr) + info->offset_to_derived;
+
+    // The typeinfo component is now a relative offset to a proxy.
+    int32_t offset_to_ti_proxy = vtable[-1];
+    const uint8_t* ptr_to_ti_proxy =
+        reinterpret_cast<const uint8_t*>(vtable) + offset_to_ti_proxy;
+    info->dynamic_type = *(reinterpret_cast<const __class_type_info* const*>(ptr_to_ti_proxy));
+#else
+    void **vtable = *static_cast<void ** const *>(static_ptr);
+    info->offset_to_derived = reinterpret_cast<ptrdiff_t>(vtable[-2]);
+    info->dynamic_ptr = static_cast<const char*>(static_ptr) + info->offset_to_derived;
+    info->dynamic_type = static_cast<const __class_type_info*>(vtable[-1]);
+#endif
+}
+
+/// A helper function for __dynamic_cast that casts a base sub-object pointer
+/// to the object's dynamic type.
+///
+/// This function returns the casting result directly. No further processing
+/// required.
+///
+/// Specifically, this function can only be called if the following pre-
+/// condition holds:
+///   * The dynamic type of the object pointed to by `static_ptr` is exactly
+///     the same as `dst_type`.
+const void* dyn_cast_to_derived(const void* static_ptr,
+                                const void* dynamic_ptr,
+                                const __class_type_info* static_type,
+                                const __class_type_info* dst_type,
+                                std::ptrdiff_t offset_to_derived,
+                                std::ptrdiff_t src2dst_offset)
+{
+    // We're downcasting from src_type to the complete object's dynamic type.
+    //   This is a really hot path that can be further optimized with the
+    //   `src2dst_offset` hint.
+    // In such a case, dynamic_ptr already gives the casting result if the
+    //   casting ever succeeds. All we have to do now is to check static_ptr
+    //   points to a public base sub-object of dynamic_ptr.
+
+    if (src2dst_offset >= 0)
+    {
+        // The static type is a unique public non-virtual base type of
+        //   dst_type at offset `src2dst_offset` from the origin of dst.
+        // Note that there might be other non-public static_type bases. The
+        //   hint only guarantees that the public base is non-virtual and
+        //   unique. So we have to check whether static_ptr points to that
+        //   unique public base sub-object.
+        if (offset_to_derived != -src2dst_offset)
+            return nullptr;
+        return dynamic_ptr;
+    }
+
+    if (src2dst_offset == -2)
+    {
+        // static_type is not a public base of dst_type.
+        return nullptr;
+    }
+
+    // If src2dst_offset == -3, then:
+    //   src_type is a multiple public base type but never a virtual
+    //   base type. We can't conclude that static_ptr points to those
+    //   public base sub-objects because there might be other non-
+    //   public static_type bases. The search is inevitable.
+
+    // Fallback to the slow path to check that static_type is a public
+    //   base type of dynamic_type.
+    // Using giant short cut.  Add that information to info.
+    __dynamic_cast_info info = {
+        dst_type,
+        static_ptr,
+        static_type,
+        src2dst_offset,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        1,  // number_of_dst_type
+        false, false, false
+    };
+    // Do the  search
+    dst_type->search_above_dst(&info, dynamic_ptr, dynamic_ptr, public_path, false);
+#ifdef _LIBCXXABI_FORGIVING_DYNAMIC_CAST
+    // The following if should always be false because we should
+    //   definitely find (static_ptr, static_type), either on a public
+    //   or private path
+    if (info.path_dst_ptr_to_static_ptr == unknown)
+    {
+        // We get here only if there is some kind of visibility problem
+        //   in client code.
+        static_assert(std::atomic<size_t>::is_always_lock_free, "");
+        static std::atomic<size_t> error_count(0);
+        size_t error_count_snapshot = error_count.fetch_add(1, std::memory_order_relaxed);
+        if ((error_count_snapshot & (error_count_snapshot-1)) == 0)
+            syslog(LOG_ERR, "dynamic_cast error 1: Both of the following type_info's "
+                    "should have public visibility. At least one of them is hidden. %s"
+                    ", %s.\n", static_type->name(), dst_type->name());
+        // Redo the search comparing type_info's using strcmp
+        info = {
+            dst_type,
+            static_ptr,
+            static_type,
+            src2dst_offset,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, false, false, false
+        };
+        info.number_of_dst_type = 1;
+        dst_type->search_above_dst(&info, dynamic_ptr, dynamic_ptr, public_path, true);
+    }
+#endif // _LIBCXXABI_FORGIVING_DYNAMIC_CAST
+    // Query the search.
+    if (info.path_dst_ptr_to_static_ptr != public_path)
+        return nullptr;
+
+    return dynamic_ptr;
+}
+
+/// A helper function for __dynamic_cast that tries to perform a downcast
+/// before giving up and falling back to the slow path.
+const void* dyn_cast_try_downcast(const void* static_ptr,
+                                  const void* dynamic_ptr,
+                                  const __class_type_info* dst_type,
+                                  const __class_type_info* dynamic_type,
+                                  std::ptrdiff_t src2dst_offset)
+{
+    if (src2dst_offset < 0)
+    {
+        // We can only optimize the case if the static type is a unique public
+        //   base of dst_type. Give up.
+        return nullptr;
+    }
+
+    // Pretend there is a dst_type object that leads to static_ptr. Later we
+    //   will check whether this imagined dst_type object exists. If it exists
+    //   then it will be the casting result.
+    const void* dst_ptr_to_static = reinterpret_cast<const char*>(static_ptr) - src2dst_offset;
+
+    if (reinterpret_cast<std::intptr_t>(dst_ptr_to_static) < reinterpret_cast<std::intptr_t>(dynamic_ptr))
+    {
+        // The imagined dst_type object does not exist. Bail-out quickly.
+        return nullptr;
+    }
+
+    // Try to search a path from dynamic_type to dst_type.
+    __dynamic_cast_info dynamic_to_dst_info = {
+        dynamic_type,
+        dst_ptr_to_static,
+        dst_type,
+        src2dst_offset,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        1,  // number_of_dst_type
+        false, false, false
+    };
+    dynamic_type->search_above_dst(&dynamic_to_dst_info, dynamic_ptr, dynamic_ptr, public_path, false);
+    if (dynamic_to_dst_info.path_dst_ptr_to_static_ptr != unknown) {
+        // We have found at least one path from dynamic_ptr to dst_ptr. The
+        //   downcast can succeed.
+        return dst_ptr_to_static;
+    }
+
+    return nullptr;
+}
+
+const void* dyn_cast_slow(const void* static_ptr,
+                          const void* dynamic_ptr,
+                          const __class_type_info* static_type,
+                          const __class_type_info* dst_type,
+                          const __class_type_info* dynamic_type,
+                          std::ptrdiff_t src2dst_offset)
+{
+    // Not using giant short cut.  Do the search
+
+    // Initialize info struct for this search.
+    __dynamic_cast_info info = {
+        dst_type,
+        static_ptr,
+        static_type,
+        src2dst_offset,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, false, false, false
+    };
+
+    dynamic_type->search_below_dst(&info, dynamic_ptr, public_path, false);
+#ifdef _LIBCXXABI_FORGIVING_DYNAMIC_CAST
+    // The following if should always be false because we should
+    //   definitely find (static_ptr, static_type), either on a public
+    //   or private path
+    if (info.path_dst_ptr_to_static_ptr == unknown &&
+        info.path_dynamic_ptr_to_static_ptr == unknown)
+    {
+        static_assert(std::atomic<size_t>::is_always_lock_free, "");
+        static std::atomic<size_t> error_count(0);
+        size_t error_count_snapshot = error_count.fetch_add(1, std::memory_order_relaxed);
+        if ((error_count_snapshot & (error_count_snapshot-1)) == 0)
+            syslog(LOG_ERR, "dynamic_cast error 2: One or more of the following type_info's "
+                            "has hidden visibility or is defined in more than one translation "
+                            "unit. They should all have public visibility. "
+                            "%s, %s, %s.\n", static_type->name(), dynamic_type->name(),
+                    dst_type->name());
+        // Redo the search comparing type_info's using strcmp
+        info = {
+            dst_type,
+            static_ptr,
+            static_type,
+            src2dst_offset,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, false, false, false
+        };
+        dynamic_type->search_below_dst(&info, dynamic_ptr, public_path, true);
+    }
+#endif // _LIBCXXABI_FORGIVING_DYNAMIC_CAST
+    // Query the search.
+    switch (info.number_to_static_ptr)
+    {
+    case 0:
+        if (info.number_to_dst_ptr == 1 &&
+                info.path_dynamic_ptr_to_static_ptr == public_path &&
+                info.path_dynamic_ptr_to_dst_ptr == public_path)
+            return info.dst_ptr_not_leading_to_static_ptr;
+        break;
+    case 1:
+        if (info.path_dst_ptr_to_static_ptr == public_path ||
+            (
+                info.number_to_dst_ptr == 0 &&
+                info.path_dynamic_ptr_to_static_ptr == public_path &&
+                info.path_dynamic_ptr_to_dst_ptr == public_path
+            )
+        )
+            return info.dst_ptr_leading_to_static_ptr;
+        break;
+    }
+
+    return nullptr;
+}
+
+}  // namespace
+
 // __shim_type_info
 
 __shim_type_info::~__shim_type_info()
@@ -623,174 +871,46 @@ extern "C" _LIBCXXABI_FUNC_VIS void *
 __dynamic_cast(const void *static_ptr, const __class_type_info *static_type,
                const __class_type_info *dst_type,
                std::ptrdiff_t src2dst_offset) {
-    // Possible future optimization:  Take advantage of src2dst_offset
-
     // Get (dynamic_ptr, dynamic_type) from static_ptr
-#if __has_feature(cxx_abi_relative_vtable)
-    // The vtable address will point to the first virtual function, which is 8
-    // bytes after the start of the vtable (4 for the offset from top + 4 for the typeinfo component).
-    const int32_t* vtable =
-        *reinterpret_cast<const int32_t* const*>(static_ptr);
-    int32_t offset_to_derived = vtable[-2];
-    const void* dynamic_ptr = static_cast<const char*>(static_ptr) + offset_to_derived;
-
-    // The typeinfo component is now a relative offset to a proxy.
-    int32_t offset_to_ti_proxy = vtable[-1];
-    const uint8_t* ptr_to_ti_proxy =
-        reinterpret_cast<const uint8_t*>(vtable) + offset_to_ti_proxy;
-    const __class_type_info* dynamic_type =
-        *(reinterpret_cast<const __class_type_info* const*>(ptr_to_ti_proxy));
-#else
-    void **vtable = *static_cast<void ** const *>(static_ptr);
-    ptrdiff_t offset_to_derived = reinterpret_cast<ptrdiff_t>(vtable[-2]);
-    const void* dynamic_ptr = static_cast<const char*>(static_ptr) + offset_to_derived;
-    const __class_type_info* dynamic_type = static_cast<const __class_type_info*>(vtable[-1]);
-#endif
+    derived_object_info derived_info;
+    dyn_cast_get_derived_info(&derived_info, static_ptr);
 
     // Initialize answer to nullptr.  This will be changed from the search
     //    results if a non-null answer is found.  Regardless, this is what will
     //    be returned.
     const void* dst_ptr = 0;
-    // Initialize info struct for this search.
-    __dynamic_cast_info info = {dst_type, static_ptr, static_type, src2dst_offset, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,};
 
     // Find out if we can use a giant short cut in the search
-    if (is_equal(dynamic_type, dst_type, false))
+    if (is_equal(derived_info.dynamic_type, dst_type, false))
     {
-        // We're downcasting from src_type to the complete object's dynamic
-        //   type. This is a really hot path that can be further optimized
-        //   with the `src2dst_offset` hint.
-        // In such a case, dynamic_ptr already gives the casting result if the
-        //   casting ever succeeds. All we have to do now is to check
-        //   static_ptr points to a public base sub-object of dynamic_ptr.
-
-        if (src2dst_offset >= 0)
-        {
-            // The static type is a unique public non-virtual base type of
-            //   dst_type at offset `src2dst_offset` from the origin of dst.
-            // Note that there might be other non-public static_type bases. The
-            //   hint only guarantees that the public base is non-virtual and
-            //   unique. So we have to check whether static_ptr points to that
-            //   unique public base sub-object.
-            if (offset_to_derived == -src2dst_offset)
-                dst_ptr = dynamic_ptr;
-        }
-        else if (src2dst_offset == -2)
-        {
-            // static_type is not a public base of dst_type.
-            dst_ptr = nullptr;
-        }
-        else
-        {
-            // If src2dst_offset == -3, then:
-            //   src_type is a multiple public base type but never a virtual
-            //   base type. We can't conclude that static_ptr points to those
-            //   public base sub-objects because there might be other non-
-            //   public static_type bases. The search is inevitable.
-
-            // Fallback to the slow path to check that static_type is a public
-            //   base type of dynamic_type.
-            // Using giant short cut.  Add that information to info.
-            info.number_of_dst_type = 1;
-            // Do the  search
-            dynamic_type->search_above_dst(&info, dynamic_ptr, dynamic_ptr, public_path, false);
-#ifdef _LIBCXXABI_FORGIVING_DYNAMIC_CAST
-            // The following if should always be false because we should
-            //   definitely find (static_ptr, static_type), either on a public
-            //   or private path
-            if (info.path_dst_ptr_to_static_ptr == unknown)
-            {
-                // We get here only if there is some kind of visibility problem
-                //   in client code.
-                static_assert(std::atomic<size_t>::is_always_lock_free, "");
-                static std::atomic<size_t> error_count(0);
-                size_t error_count_snapshot = error_count.fetch_add(1, std::memory_order_relaxed);
-                if ((error_count_snapshot & (error_count_snapshot-1)) == 0)
-                    syslog(LOG_ERR, "dynamic_cast error 1: Both of the following type_info's "
-                            "should have public visibility. At least one of them is hidden. %s"
-                            ", %s.\n", static_type->name(), dynamic_type->name());
-                // Redo the search comparing type_info's using strcmp
-                info = {dst_type, static_ptr, static_type, src2dst_offset, 0};
-                info.number_of_dst_type = 1;
-                dynamic_type->search_above_dst(&info, dynamic_ptr, dynamic_ptr, public_path, true);
-            }
-#endif // _LIBCXXABI_FORGIVING_DYNAMIC_CAST
-            // Query the search.
-            if (info.path_dst_ptr_to_static_ptr == public_path)
-                dst_ptr = dynamic_ptr;
-        }
+        dst_ptr = dyn_cast_to_derived(static_ptr,
+                                      derived_info.dynamic_ptr,
+                                      static_type,
+                                      dst_type,
+                                      derived_info.offset_to_derived,
+                                      src2dst_offset);
     }
     else
     {
-        if (src2dst_offset >= 0)
-        {
-            // Optimize toward downcasting: dst_type has one unique public
-            //   static_type bases. Let's first try to do a downcast before
-            //   falling back to the slow path. The downcast succeeds if there
-            //   is at least one path regardless of visibility from
-            //   dynamic_type to dst_type.
-            const void* dst_ptr_to_static = reinterpret_cast<const char*>(static_ptr) - src2dst_offset;
-            if (reinterpret_cast<std::intptr_t>(dst_ptr_to_static) >= reinterpret_cast<std::intptr_t>(dynamic_ptr))
-            {
-                // Try to search a path from dynamic_type to dst_type.
-                __dynamic_cast_info dynamic_to_dst_info = {dynamic_type, dst_ptr_to_static, dst_type, src2dst_offset};
-                dynamic_to_dst_info.number_of_dst_type = 1;
-                dynamic_type->search_above_dst(&dynamic_to_dst_info, dynamic_ptr, dynamic_ptr, public_path, false);
-                if (dynamic_to_dst_info.path_dst_ptr_to_static_ptr != unknown) {
-                    // We have found at least one path from dynamic_ptr to
-                    //   dst_ptr. The downcast can succeed.
-                    dst_ptr = dst_ptr_to_static;
-                }
-            }
-        }
+        // Optimize toward downcasting: let's first try to do a downcast before
+        //   falling back to the slow path.
+        dst_ptr = dyn_cast_try_downcast(static_ptr,
+                                        derived_info.dynamic_ptr,
+                                        dst_type,
+                                        derived_info.dynamic_type,
+                                        src2dst_offset);
 
         if (!dst_ptr)
         {
-            // Not using giant short cut.  Do the search
-            dynamic_type->search_below_dst(&info, dynamic_ptr, public_path, false);
-#ifdef _LIBCXXABI_FORGIVING_DYNAMIC_CAST
-            // The following if should always be false because we should
-            //   definitely find (static_ptr, static_type), either on a public
-            //   or private path
-            if (info.path_dst_ptr_to_static_ptr == unknown &&
-                info.path_dynamic_ptr_to_static_ptr == unknown)
-            {
-                static_assert(std::atomic<size_t>::is_always_lock_free, "");
-                static std::atomic<size_t> error_count(0);
-                size_t error_count_snapshot = error_count.fetch_add(1, std::memory_order_relaxed);
-                if ((error_count_snapshot & (error_count_snapshot-1)) == 0)
-                    syslog(LOG_ERR, "dynamic_cast error 2: One or more of the following type_info's "
-                                    "has hidden visibility or is defined in more than one translation "
-                                    "unit. They should all have public visibility. "
-                                    "%s, %s, %s.\n", static_type->name(), dynamic_type->name(),
-                            dst_type->name());
-                // Redo the search comparing type_info's using strcmp
-                info = {dst_type, static_ptr, static_type, src2dst_offset, 0};
-                dynamic_type->search_below_dst(&info, dynamic_ptr, public_path, true);
-            }
-#endif // _LIBCXXABI_FORGIVING_DYNAMIC_CAST
-            // Query the search.
-            switch (info.number_to_static_ptr)
-            {
-            case 0:
-                if (info.number_to_dst_ptr == 1 &&
-                        info.path_dynamic_ptr_to_static_ptr == public_path &&
-                        info.path_dynamic_ptr_to_dst_ptr == public_path)
-                    dst_ptr = info.dst_ptr_not_leading_to_static_ptr;
-                break;
-            case 1:
-                if (info.path_dst_ptr_to_static_ptr == public_path ||
-                    (
-                        info.number_to_dst_ptr == 0 &&
-                        info.path_dynamic_ptr_to_static_ptr == public_path &&
-                        info.path_dynamic_ptr_to_dst_ptr == public_path
-                    )
-                )
-                    dst_ptr = info.dst_ptr_leading_to_static_ptr;
-                break;
-            }
+            dst_ptr = dyn_cast_slow(static_ptr,
+                                    derived_info.dynamic_ptr,
+                                    static_type,
+                                    dst_type,
+                                    derived_info.dynamic_type,
+                                    src2dst_offset);
         }
     }
+
     return const_cast<void*>(dst_ptr);
 }
 
diff --git a/libcxxabi/test/dynamic_cast_stress.pass.cpp b/libcxxabi/test/dynamic_cast_stress.pass.cpp
deleted file mode 100644
index 19dba5fdcfbeab5..000000000000000
--- a/libcxxabi/test/dynamic_cast_stress.pass.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03
-
-#include <cassert>
-#include <tuple>
-#include "support/timer.h"
-
-template <std::size_t Indx, std::size_t Depth>
-struct C
-    : public virtual C<Indx, Depth-1>,
-      public virtual C<Indx+1, Depth-1>
-{
-    virtual ~C() {}
-};
-
-template <std::size_t Indx>
-struct C<Indx, 0>
-{
-    virtual ~C() {}
-};
-
-template <std::size_t Indx, std::size_t Depth>
-struct B
-    : public virtual C<Indx, Depth-1>,
-      public virtual C<Indx+1, Depth-1>
-{
-};
-
-template <class Indx, std::size_t Depth>
-struct makeB;
-
-template <std::size_t ...Indx, std::size_t Depth>
-struct makeB<std::__tuple_indices<Indx...>, Depth>
-    : public B<Indx, Depth>...
-{
-};
-
-template <std::size_t Width, std::size_t Depth>
-struct A
-    : public makeB<typename std::__make_tuple_indices<Width>::type, Depth>
-{
-};
-
-void test()
-{
-    const std::size_t Width = 10;
-    const std::size_t Depth = 5;
-    A<Width, Depth> a;
-    typedef B<Width/2, Depth> Destination;
-//    typedef A<Width, Depth> Destination;
-    Destination *b = nullptr;
-    {
-        timer t;
-        b = dynamic_cast<Destination*>((C<Width/2, 0>*)&a);
-    }
-    assert(b != 0);
-}
-
-int main(int, char**)
-{
-    test();
-
-    return 0;
-}
-
-/*
-Timing results I'm seeing (median of 3 microseconds):
-
-                          libc++abi    gcc's dynamic_cast
-B<Width/2, Depth> -O3      48.334         93.190           libc++abi 93% faster
-B<Width/2, Depth> -Os      58.535         94.103           libc++abi 61% faster
-A<Width, Depth>   -O3      11.515         33.134           libc++abi 188% faster
-A<Width, Depth>   -Os      12.631         31.553           libc++abi 150% faster
-
-*/

>From 3ed9e9e3ace6f9ce320cf4e75cffa04a7c7241b5 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Tue, 29 Aug 2023 19:53:19 +0200
Subject: [PATCH 11/32] [Clang] Add captures to the instantiation scope of
 lambda call operators

Like concepts checking, a trailing return type of a lambda
in a dependent context may refer to captures in which case
they may need to be rebuilt, so the map of local decl
should include captures.

This patch reveals a pre-existing issue.
`this` is always recomputed by TreeTransform.

`*this` (like all captures) only become `const`
after the parameter list.

However, if we try to recompute the value of `this` (in a parameter)
during template instantiation while determining the type of the call operator,
we will determine it to be const (unless the lambda is mutable).

There is no good way to know at that point whether we are in a parameter,
so the easiest/best solution is to transform the type of `this`.

Note that doing so breaks a handful of HLSL tests.
So this is a prototype at this point.

Fixes #65067
Fixes #63675

Reviewed By: erichkeane

Differential Revision: https://reviews.llvm.org/D159126
---
 clang/docs/ReleaseNotes.rst                   |  5 ++
 clang/include/clang/Sema/Sema.h               |  8 +++
 clang/lib/Sema/SemaConcept.cpp                | 37 ++-----------
 clang/lib/Sema/SemaDecl.cpp                   |  4 ++
 clang/lib/Sema/SemaLambda.cpp                 | 32 +++++++++++
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  3 ++
 clang/lib/Sema/TreeTransform.h                | 11 +++-
 .../SemaCXX/lambda-capture-type-deduction.cpp | 37 +++++++++++++
 .../SemaCXX/this-type-deduction-concept.cpp   | 54 +++++++++++++++++++
 9 files changed, 158 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/SemaCXX/this-type-deduction-concept.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2d0302c399fb6f3..6a3a6bb8ad425b0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -270,6 +270,11 @@ Bug Fixes to C++ Support
 - Fix crash when parsing the requires clause of some generic lambdas.
   (`#64689 <https://github.com/llvm/llvm-project/issues/64689>`_)
 
+- Fix crash when the trailing return type of a generic and dependent
+  lambda refers to an init-capture.
+  (`#65067 <https://github.com/llvm/llvm-project/issues/65067>`_ and
+   `#63675 <https://github.com/llvm/llvm-project/issues/63675>`_)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed an import failure of recursive friend class template.
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 1bb096c667e39c3..566655818a85baf 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -7365,6 +7365,14 @@ class Sema final {
 
   sema::LambdaScopeInfo *RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator);
 
+  class LambdaScopeForCallOperatorInstantiationRAII
+      : private FunctionScopeRAII {
+  public:
+    LambdaScopeForCallOperatorInstantiationRAII(
+        Sema &SemasRef, FunctionDecl *FD, MultiLevelTemplateArgumentList MLTAL,
+        LocalInstantiationScope &Scope);
+  };
+
   /// Check whether the given expression is a valid constraint expression.
   /// A diagnostic is emitted if it is not, false is returned, and
   /// PossibleNonPrimary will be set to true if the failure might be due to a
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index fa3dadf68229ee8..d1fa8e7831225b7 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -600,11 +600,6 @@ bool Sema::SetupConstraintScope(
       if (addInstantiatedParametersToScope(FD, FromMemTempl->getTemplatedDecl(),
                                            Scope, MLTAL))
         return true;
-      // Make sure the captures are also added to the instantiation scope.
-      if (isLambdaCallOperator(FD) &&
-          addInstantiatedCapturesToScope(FD, FromMemTempl->getTemplatedDecl(),
-                                         Scope, MLTAL))
-        return true;
     }
 
     return false;
@@ -629,11 +624,6 @@ bool Sema::SetupConstraintScope(
     // child-function.
     if (addInstantiatedParametersToScope(FD, InstantiatedFrom, Scope, MLTAL))
       return true;
-
-    // Make sure the captures are also added to the instantiation scope.
-    if (isLambdaCallOperator(FD) &&
-        addInstantiatedCapturesToScope(FD, InstantiatedFrom, Scope, MLTAL))
-      return true;
   }
 
   return false;
@@ -712,20 +702,8 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
   }
   CXXThisScopeRAII ThisScope(*this, Record, ThisQuals, Record != nullptr);
 
-  // When checking the constraints of a lambda, we need to restore a
-  // LambdaScopeInfo populated with correct capture information so that the type
-  // of a variable referring to a capture is correctly const-adjusted.
-  FunctionScopeRAII FuncScope(*this);
-  if (isLambdaCallOperator(FD)) {
-    LambdaScopeInfo *LSI = RebuildLambdaScopeInfo(
-        const_cast<CXXMethodDecl *>(cast<CXXMethodDecl>(FD)));
-    // Constraints are checked from the parent context of the lambda, so we set
-    // AfterParameterList to false, so that `tryCaptureVariable` finds
-    // explicit captures in the appropriate context.
-    LSI->AfterParameterList = false;
-  } else {
-    FuncScope.disable();
-  }
+  LambdaScopeForCallOperatorInstantiationRAII LambdaScope(
+      *this, const_cast<FunctionDecl *>(FD), *MLTAL, Scope);
 
   return CheckConstraintSatisfaction(
       FD, {FD->getTrailingRequiresClause()}, *MLTAL,
@@ -913,15 +891,10 @@ bool Sema::CheckInstantiatedFunctionTemplateConstraints(
     ThisQuals = Method->getMethodQualifiers();
     Record = Method->getParent();
   }
-  CXXThisScopeRAII ThisScope(*this, Record, ThisQuals, Record != nullptr);
-  FunctionScopeRAII FuncScope(*this);
 
-  if (isLambdaCallOperator(Decl)) {
-    LambdaScopeInfo *LSI = RebuildLambdaScopeInfo(cast<CXXMethodDecl>(Decl));
-    LSI->AfterParameterList = false;
-  } else {
-    FuncScope.disable();
-  }
+  CXXThisScopeRAII ThisScope(*this, Record, ThisQuals, Record != nullptr);
+  LambdaScopeForCallOperatorInstantiationRAII LambdaScope(
+      *this, const_cast<FunctionDecl *>(Decl), *MLTAL, Scope);
 
   llvm::SmallVector<Expr *, 1> Converted;
   return CheckConstraintSatisfaction(Template, TemplateAC, Converted, *MLTAL,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 027c6c3e4222f07..99806054260965f 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -15382,6 +15382,10 @@ LambdaScopeInfo *Sema::RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator) {
   LSI->CallOperator = CallOperator;
   LSI->Lambda = LambdaClass;
   LSI->ReturnType = CallOperator->getReturnType();
+  // This function is called in situations where the context of the call operator
+  // is not entered, so we set AfterParameterList to false, so that
+  // `tryCaptureVariable` finds explicit captures in the appropriate context.
+  LSI->AfterParameterList = false;
   const LambdaCaptureDefault LCD = LambdaClass->getLambdaCaptureDefault();
 
   if (LCD == LCD_None)
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 5256d91a19a0d0b..1702ddb3ee0fbf0 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -20,6 +20,7 @@
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/SemaLambda.h"
+#include "clang/Sema/Template.h"
 #include "llvm/ADT/STLExtras.h"
 #include <optional>
 using namespace clang;
@@ -2254,3 +2255,34 @@ ExprResult Sema::BuildBlockForLambdaConversion(SourceLocation CurrentLocation,
 
   return BuildBlock;
 }
+
+Sema::LambdaScopeForCallOperatorInstantiationRAII::
+    LambdaScopeForCallOperatorInstantiationRAII(
+        Sema &SemasRef, FunctionDecl *FD, MultiLevelTemplateArgumentList MLTAL,
+        LocalInstantiationScope &Scope)
+    : FunctionScopeRAII(SemasRef) {
+  if (!isLambdaCallOperator(FD)) {
+    FunctionScopeRAII::disable();
+    return;
+  }
+
+  if (FD->isTemplateInstantiation() && FD->getPrimaryTemplate()) {
+    FunctionTemplateDecl *PrimaryTemplate = FD->getPrimaryTemplate();
+    if (const auto *FromMemTempl =
+            PrimaryTemplate->getInstantiatedFromMemberTemplate()) {
+      SemasRef.addInstantiatedCapturesToScope(
+          FD, FromMemTempl->getTemplatedDecl(), Scope, MLTAL);
+    }
+  }
+
+  else if (FD->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization ||
+           FD->getTemplatedKind() == FunctionDecl::TK_DependentNonTemplate) {
+    FunctionDecl *InstantiatedFrom =
+        FD->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization
+            ? FD->getInstantiatedFromMemberFunction()
+            : FD->getInstantiatedFromDecl();
+    SemasRef.addInstantiatedCapturesToScope(FD, InstantiatedFrom, Scope, MLTAL);
+  }
+
+  SemasRef.RebuildLambdaScopeInfo(cast<CXXMethodDecl>(FD));
+}
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 63f022d5c2ff094..37a7d6204413a38 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2426,6 +2426,9 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl(
       cast<Decl>(Owner)->isDefinedOutsideFunctionOrMethod());
   LocalInstantiationScope Scope(SemaRef, MergeWithParentScope);
 
+  Sema::LambdaScopeForCallOperatorInstantiationRAII LambdaScope(
+      SemaRef, const_cast<CXXMethodDecl *>(D), TemplateArgs, Scope);
+
   // Instantiate enclosing template arguments for friends.
   SmallVector<TemplateParameterList *, 4> TempParamLists;
   unsigned NumTempParamLists = 0;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 7323140bc336bc2..603a23275889f21 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -12325,7 +12325,16 @@ TreeTransform<Derived>::TransformCXXNullPtrLiteralExpr(
 template<typename Derived>
 ExprResult
 TreeTransform<Derived>::TransformCXXThisExpr(CXXThisExpr *E) {
-  QualType T = getSema().getCurrentThisType();
+
+  // In lambdas, the qualifiers of the type depend on where in
+  // the call operator `this` appears, and we do not have a good way to
+  // rebuild this information, so we transform the type.
+  //
+  // In other contexts, the type of `this` may be overridden
+  // for type deduction, so we need to recompute it.
+  QualType T = getSema().getCurLambda() ?
+                   getDerived().TransformType(E->getType())
+                 : getSema().getCurrentThisType();
 
   if (!getDerived().AlwaysRebuild() && T == E->getType()) {
     // Mark it referenced in the new context regardless.
diff --git a/clang/test/SemaCXX/lambda-capture-type-deduction.cpp b/clang/test/SemaCXX/lambda-capture-type-deduction.cpp
index 9855122c962722c..7bf36a6a9cab732 100644
--- a/clang/test/SemaCXX/lambda-capture-type-deduction.cpp
+++ b/clang/test/SemaCXX/lambda-capture-type-deduction.cpp
@@ -260,3 +260,40 @@ void f(int) {
 void test() { f<int>(0);  }
 
 }
+
+namespace GH65067 {
+
+template <typename> class a {
+public:
+  template <typename b> void c(b f) { d<int>(f)(0); }
+  template <typename, typename b> auto d(b f) {
+    return [f = f](auto arg) -> a<decltype(f(arg))> { return {}; };
+  }
+};
+a<void> e;
+auto fn1() {
+  e.c([](int) {});
+}
+
+}
+
+namespace GH63675 {
+
+template <class _Tp> _Tp __declval();
+struct __get_tag {
+  template <class _Tag> void operator()(_Tag);
+};
+template <class _ImplFn> struct __basic_sender {
+  using __tag_t = decltype(__declval<_ImplFn>()(__declval<__get_tag>()));
+  _ImplFn __impl_;
+};
+auto __make_basic_sender = []<class... _Children>(
+                               _Children... __children) {
+  return __basic_sender{[... __children = __children]<class _Fun>(
+                     _Fun __fun) -> decltype(__fun(__children...)) {}};
+};
+void __trans_tmp_1() {
+  __make_basic_sender(__trans_tmp_1);
+}
+
+}
diff --git a/clang/test/SemaCXX/this-type-deduction-concept.cpp b/clang/test/SemaCXX/this-type-deduction-concept.cpp
new file mode 100644
index 000000000000000..a0c1f605ccefd76
--- /dev/null
+++ b/clang/test/SemaCXX/this-type-deduction-concept.cpp
@@ -0,0 +1,54 @@
+
+// This test case came up in the review of
+// https://reviews.llvm.org/D159126
+// when transforming `this` within a
+// requires expression, we need to make sure
+// the type of this (and its qualifiers) is respected.
+namespace D159126 {
+
+template <class _Tp>
+concept __member_begin = requires(_Tp __t) {
+  __t.begin();
+};
+
+struct {
+  template <class _Tp>
+  requires __member_begin<_Tp>
+  auto operator()(_Tp &&) {}
+} inline begin;
+
+template <class>
+concept range = requires {
+  begin;
+};
+
+template <class _Tp>
+concept __can_compare_begin = requires(_Tp __t) {
+  begin(__t);
+};
+
+struct {
+  template <__can_compare_begin _Tp> void operator()(_Tp &&);
+} empty;
+
+template <range _Rp> struct owning_view {
+  _Rp __r_;
+public:
+  void empty() const requires requires { empty(__r_); };
+};
+
+template <class T>
+concept HasEmpty = requires(T t) {
+  t.empty();
+};
+
+struct ComparableIters {
+    void begin();
+};
+
+static_assert(HasEmpty<owning_view<ComparableIters&>>);
+static_assert(HasEmpty<owning_view<ComparableIters&&>>);
+static_assert(!HasEmpty<owning_view<const ComparableIters&>>);
+static_assert(!HasEmpty<owning_view<const ComparableIters&&>>);
+
+}

>From 5f29ed16548159a575068214ca95cae6f0ad24e7 Mon Sep 17 00:00:00 2001
From: Colin Finck <colin at reactos.org>
Date: Fri, 8 Sep 2023 11:50:26 -0400
Subject: [PATCH 12/32] [libc++] Fix warnings when compiling libc++ for Windows
 with clang-cl /W4

Differential Revision: https://reviews.llvm.org/D96408
---
 libcxx/src/atomic.cpp                     | 2 +-
 libcxx/src/support/win32/locale_win32.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp
index a55249a15c21702..fc90a1f5b2ba04c 100644
--- a/libcxx/src/atomic.cpp
+++ b/libcxx/src/atomic.cpp
@@ -147,7 +147,7 @@ static void __libcpp_contention_notify(__cxx_atomic_contention_t volatile* __con
         // We only call 'wake' if we consumed a contention bit here.
         __libcpp_platform_wake_by_address(__platform_state, __notify_one);
 }
-static __cxx_contention_t __libcpp_contention_monitor_for_wait(__cxx_atomic_contention_t volatile* __contention_state,
+static __cxx_contention_t __libcpp_contention_monitor_for_wait(__cxx_atomic_contention_t volatile* /*__contention_state*/,
                                                                __cxx_atomic_contention_t const volatile* __platform_state)
 {
     // We will monitor this value.
diff --git a/libcxx/src/support/win32/locale_win32.cpp b/libcxx/src/support/win32/locale_win32.cpp
index 2543686c907d8c4..c9d1d63b2508afb 100644
--- a/libcxx/src/support/win32/locale_win32.cpp
+++ b/libcxx/src/support/win32/locale_win32.cpp
@@ -17,8 +17,8 @@ int __libcpp_vasprintf(char **sptr, const char *__restrict fmt, va_list ap);
 
 using std::__libcpp_locale_guard;
 
-// FIXME: base currently unused. Needs manual work to construct the new locale
-locale_t newlocale( int mask, const char * locale, locale_t /*base*/ )
+// FIXME: base and mask currently unused. Needs manual work to construct the new locale
+locale_t newlocale(int /*mask*/, const char * locale, locale_t /*base*/)
 {
     return {_create_locale( LC_ALL, locale ), locale};
 }

>From 8dd87a5f57d62a7d81f527c46b28cf2e9820409f Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 8 Sep 2023 08:48:49 -0700
Subject: [PATCH 13/32] [RISCV] Add gather test coverage for non-intptr index
 widths

Note that these are non-canonical.  At IR, we will generally canonicalize to the intptrty width form if knownbits allows.
---
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  | 279 +++++++++++++-----
 .../rvv/fixed-vectors-strided-load-store.ll   |  44 +++
 2 files changed, 255 insertions(+), 68 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index cb3ee899dde7d27..fd51949c6023aa5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -13193,12 +13193,155 @@ define <4 x i32> @mgather_unit_stride_load_with_offset(ptr %base) {
   ret <4 x i32> %v
 }
 
+define <4 x i32> @mgather_unit_stride_load_narrow_idx(ptr %base) {
+; RV32-LABEL: mgather_unit_stride_load_narrow_idx:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vluxei32.v v8, (a0), v8
+; RV32-NEXT:    ret
+;
+; RV64V-LABEL: mgather_unit_stride_load_narrow_idx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64V-NEXT:    vid.v v8
+; RV64V-NEXT:    vsll.vi v10, v8, 2
+; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vluxei64.v v8, (a0), v10
+; RV64V-NEXT:    ret
+;
+; RV64ZVE32F-LABEL: mgather_unit_stride_load_narrow_idx:
+; RV64ZVE32F:       # %bb.0:
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vmset.m v8
+; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV64ZVE32F-NEXT:    # implicit-def: $v8
+; RV64ZVE32F-NEXT:    beqz zero, .LBB104_5
+; RV64ZVE32F-NEXT:  # %bb.1: # %else
+; RV64ZVE32F-NEXT:    andi a2, a1, 2
+; RV64ZVE32F-NEXT:    bnez a2, .LBB104_6
+; RV64ZVE32F-NEXT:  .LBB104_2: # %else2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
+; RV64ZVE32F-NEXT:    bnez a2, .LBB104_7
+; RV64ZVE32F-NEXT:  .LBB104_3: # %else5
+; RV64ZVE32F-NEXT:    andi a1, a1, 8
+; RV64ZVE32F-NEXT:    bnez a1, .LBB104_8
+; RV64ZVE32F-NEXT:  .LBB104_4: # %else8
+; RV64ZVE32F-NEXT:    ret
+; RV64ZVE32F-NEXT:  .LBB104_5: # %cond.load
+; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32F-NEXT:    vlse32.v v8, (a0), zero
+; RV64ZVE32F-NEXT:    andi a2, a1, 2
+; RV64ZVE32F-NEXT:    beqz a2, .LBB104_2
+; RV64ZVE32F-NEXT:  .LBB104_6: # %cond.load1
+; RV64ZVE32F-NEXT:    addi a2, a0, 4
+; RV64ZVE32F-NEXT:    lw a2, 0(a2)
+; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
+; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
+; RV64ZVE32F-NEXT:    beqz a2, .LBB104_3
+; RV64ZVE32F-NEXT:  .LBB104_7: # %cond.load4
+; RV64ZVE32F-NEXT:    addi a2, a0, 8
+; RV64ZVE32F-NEXT:    lw a2, 0(a2)
+; RV64ZVE32F-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
+; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
+; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a1, a1, 8
+; RV64ZVE32F-NEXT:    beqz a1, .LBB104_4
+; RV64ZVE32F-NEXT:  .LBB104_8: # %cond.load7
+; RV64ZVE32F-NEXT:    addi a0, a0, 12
+; RV64ZVE32F-NEXT:    lw a0, 0(a0)
+; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32F-NEXT:    vmv.s.x v9, a0
+; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 3
+; RV64ZVE32F-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i8>  <i8 0, i8 1, i8 2, i8 3>
+  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %allones, <4 x i32> poison)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @mgather_unit_stride_load_wide_idx(ptr %base) {
+; RV32-LABEL: mgather_unit_stride_load_wide_idx:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vluxei32.v v8, (a0), v8
+; RV32-NEXT:    ret
+;
+; RV64V-LABEL: mgather_unit_stride_load_wide_idx:
+; RV64V:       # %bb.0:
+; RV64V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64V-NEXT:    vid.v v8
+; RV64V-NEXT:    vsll.vi v10, v8, 2
+; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64V-NEXT:    vluxei64.v v8, (a0), v10
+; RV64V-NEXT:    ret
+;
+; RV64ZVE32F-LABEL: mgather_unit_stride_load_wide_idx:
+; RV64ZVE32F:       # %bb.0:
+; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT:    vmset.m v8
+; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV64ZVE32F-NEXT:    # implicit-def: $v8
+; RV64ZVE32F-NEXT:    beqz zero, .LBB105_5
+; RV64ZVE32F-NEXT:  # %bb.1: # %else
+; RV64ZVE32F-NEXT:    andi a2, a1, 2
+; RV64ZVE32F-NEXT:    bnez a2, .LBB105_6
+; RV64ZVE32F-NEXT:  .LBB105_2: # %else2
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
+; RV64ZVE32F-NEXT:    bnez a2, .LBB105_7
+; RV64ZVE32F-NEXT:  .LBB105_3: # %else5
+; RV64ZVE32F-NEXT:    andi a1, a1, 8
+; RV64ZVE32F-NEXT:    bnez a1, .LBB105_8
+; RV64ZVE32F-NEXT:  .LBB105_4: # %else8
+; RV64ZVE32F-NEXT:    ret
+; RV64ZVE32F-NEXT:  .LBB105_5: # %cond.load
+; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32F-NEXT:    vlse32.v v8, (a0), zero
+; RV64ZVE32F-NEXT:    andi a2, a1, 2
+; RV64ZVE32F-NEXT:    beqz a2, .LBB105_2
+; RV64ZVE32F-NEXT:  .LBB105_6: # %cond.load1
+; RV64ZVE32F-NEXT:    addi a2, a0, 4
+; RV64ZVE32F-NEXT:    lw a2, 0(a2)
+; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
+; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; RV64ZVE32F-NEXT:    andi a2, a1, 4
+; RV64ZVE32F-NEXT:    beqz a2, .LBB105_3
+; RV64ZVE32F-NEXT:  .LBB105_7: # %cond.load4
+; RV64ZVE32F-NEXT:    addi a2, a0, 8
+; RV64ZVE32F-NEXT:    lw a2, 0(a2)
+; RV64ZVE32F-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
+; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
+; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 2
+; RV64ZVE32F-NEXT:    andi a1, a1, 8
+; RV64ZVE32F-NEXT:    beqz a1, .LBB105_4
+; RV64ZVE32F-NEXT:  .LBB105_8: # %cond.load7
+; RV64ZVE32F-NEXT:    addi a0, a0, 12
+; RV64ZVE32F-NEXT:    lw a0, 0(a0)
+; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32F-NEXT:    vmv.s.x v9, a0
+; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 3
+; RV64ZVE32F-NEXT:    ret
+  %head = insertelement <4 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i128>  <i128 0, i128 1, i128 2, i128 3>
+  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %allones, <4 x i32> poison)
+  ret <4 x i32> %v
+}
+
+
 ; TODO: Recognize as strided load with SEW=32
 define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV32-LABEL: mgather_strided_2xSEW:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, %hi(.LCPI104_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI104_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI106_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI106_0)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a1)
 ; RV32-NEXT:    vluxei32.v v8, (a0), v10
@@ -13206,8 +13349,8 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ;
 ; RV64V-LABEL: mgather_strided_2xSEW:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    lui a1, %hi(.LCPI104_0)
-; RV64V-NEXT:    addi a1, a1, %lo(.LCPI104_0)
+; RV64V-NEXT:    lui a1, %hi(.LCPI106_0)
+; RV64V-NEXT:    addi a1, a1, %lo(.LCPI106_0)
 ; RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV64V-NEXT:    vle64.v v12, (a1)
 ; RV64V-NEXT:    vluxei64.v v8, (a0), v12
@@ -13219,35 +13362,35 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vmset.m v8
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    # implicit-def: $v8
-; RV64ZVE32F-NEXT:    beqz zero, .LBB104_9
+; RV64ZVE32F-NEXT:    beqz zero, .LBB106_9
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a2, a1, 2
-; RV64ZVE32F-NEXT:    bnez a2, .LBB104_10
-; RV64ZVE32F-NEXT:  .LBB104_2: # %else2
+; RV64ZVE32F-NEXT:    bnez a2, .LBB106_10
+; RV64ZVE32F-NEXT:  .LBB106_2: # %else2
 ; RV64ZVE32F-NEXT:    andi a2, a1, 4
-; RV64ZVE32F-NEXT:    bnez a2, .LBB104_11
-; RV64ZVE32F-NEXT:  .LBB104_3: # %else5
+; RV64ZVE32F-NEXT:    bnez a2, .LBB106_11
+; RV64ZVE32F-NEXT:  .LBB106_3: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
-; RV64ZVE32F-NEXT:    bnez a2, .LBB104_12
-; RV64ZVE32F-NEXT:  .LBB104_4: # %else8
+; RV64ZVE32F-NEXT:    bnez a2, .LBB106_12
+; RV64ZVE32F-NEXT:  .LBB106_4: # %else8
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
-; RV64ZVE32F-NEXT:    bnez a2, .LBB104_13
-; RV64ZVE32F-NEXT:  .LBB104_5: # %else11
+; RV64ZVE32F-NEXT:    bnez a2, .LBB106_13
+; RV64ZVE32F-NEXT:  .LBB106_5: # %else11
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
-; RV64ZVE32F-NEXT:    bnez a2, .LBB104_14
-; RV64ZVE32F-NEXT:  .LBB104_6: # %else14
+; RV64ZVE32F-NEXT:    bnez a2, .LBB106_14
+; RV64ZVE32F-NEXT:  .LBB106_6: # %else14
 ; RV64ZVE32F-NEXT:    andi a2, a1, 64
-; RV64ZVE32F-NEXT:    bnez a2, .LBB104_15
-; RV64ZVE32F-NEXT:  .LBB104_7: # %else17
+; RV64ZVE32F-NEXT:    bnez a2, .LBB106_15
+; RV64ZVE32F-NEXT:  .LBB106_7: # %else17
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
-; RV64ZVE32F-NEXT:    bnez a1, .LBB104_16
-; RV64ZVE32F-NEXT:  .LBB104_8: # %else20
+; RV64ZVE32F-NEXT:    bnez a1, .LBB106_16
+; RV64ZVE32F-NEXT:  .LBB106_8: # %else20
 ; RV64ZVE32F-NEXT:    ret
-; RV64ZVE32F-NEXT:  .LBB104_9: # %cond.load
+; RV64ZVE32F-NEXT:  .LBB106_9: # %cond.load
 ; RV64ZVE32F-NEXT:    vlse16.v v8, (a0), zero
 ; RV64ZVE32F-NEXT:    andi a2, a1, 2
-; RV64ZVE32F-NEXT:    beqz a2, .LBB104_2
-; RV64ZVE32F-NEXT:  .LBB104_10: # %cond.load1
+; RV64ZVE32F-NEXT:    beqz a2, .LBB106_2
+; RV64ZVE32F-NEXT:  .LBB106_10: # %cond.load1
 ; RV64ZVE32F-NEXT:    addi a2, a0, 2
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -13255,48 +13398,48 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
 ; RV64ZVE32F-NEXT:    andi a2, a1, 4
-; RV64ZVE32F-NEXT:    beqz a2, .LBB104_3
-; RV64ZVE32F-NEXT:  .LBB104_11: # %cond.load4
+; RV64ZVE32F-NEXT:    beqz a2, .LBB106_3
+; RV64ZVE32F-NEXT:  .LBB106_11: # %cond.load4
 ; RV64ZVE32F-NEXT:    addi a2, a0, 8
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 3, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 2
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
-; RV64ZVE32F-NEXT:    beqz a2, .LBB104_4
-; RV64ZVE32F-NEXT:  .LBB104_12: # %cond.load7
+; RV64ZVE32F-NEXT:    beqz a2, .LBB106_4
+; RV64ZVE32F-NEXT:  .LBB106_12: # %cond.load7
 ; RV64ZVE32F-NEXT:    addi a2, a0, 10
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 3
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
-; RV64ZVE32F-NEXT:    beqz a2, .LBB104_5
-; RV64ZVE32F-NEXT:  .LBB104_13: # %cond.load10
+; RV64ZVE32F-NEXT:    beqz a2, .LBB106_5
+; RV64ZVE32F-NEXT:  .LBB106_13: # %cond.load10
 ; RV64ZVE32F-NEXT:    addi a2, a0, 16
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 5, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
-; RV64ZVE32F-NEXT:    beqz a2, .LBB104_6
-; RV64ZVE32F-NEXT:  .LBB104_14: # %cond.load13
+; RV64ZVE32F-NEXT:    beqz a2, .LBB106_6
+; RV64ZVE32F-NEXT:  .LBB106_14: # %cond.load13
 ; RV64ZVE32F-NEXT:    addi a2, a0, 18
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 64
-; RV64ZVE32F-NEXT:    beqz a2, .LBB104_7
-; RV64ZVE32F-NEXT:  .LBB104_15: # %cond.load16
+; RV64ZVE32F-NEXT:    beqz a2, .LBB106_7
+; RV64ZVE32F-NEXT:  .LBB106_15: # %cond.load16
 ; RV64ZVE32F-NEXT:    addi a2, a0, 24
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 6
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
-; RV64ZVE32F-NEXT:    beqz a1, .LBB104_8
-; RV64ZVE32F-NEXT:  .LBB104_16: # %cond.load19
+; RV64ZVE32F-NEXT:    beqz a1, .LBB106_8
+; RV64ZVE32F-NEXT:  .LBB106_16: # %cond.load19
 ; RV64ZVE32F-NEXT:    addi a0, a0, 26
 ; RV64ZVE32F-NEXT:    lh a0, 0(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
@@ -13314,8 +13457,8 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV32-LABEL: mgather_gather_2xSEW:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a1, %hi(.LCPI105_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI105_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI107_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI107_0)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a1)
 ; RV32-NEXT:    vluxei32.v v8, (a0), v10
@@ -13323,8 +13466,8 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ;
 ; RV64V-LABEL: mgather_gather_2xSEW:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    lui a1, %hi(.LCPI105_0)
-; RV64V-NEXT:    addi a1, a1, %lo(.LCPI105_0)
+; RV64V-NEXT:    lui a1, %hi(.LCPI107_0)
+; RV64V-NEXT:    addi a1, a1, %lo(.LCPI107_0)
 ; RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV64V-NEXT:    vle64.v v12, (a1)
 ; RV64V-NEXT:    vluxei64.v v8, (a0), v12
@@ -13336,35 +13479,35 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vmset.m v8
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    # implicit-def: $v8
-; RV64ZVE32F-NEXT:    beqz zero, .LBB105_9
+; RV64ZVE32F-NEXT:    beqz zero, .LBB107_9
 ; RV64ZVE32F-NEXT:  # %bb.1: # %else
 ; RV64ZVE32F-NEXT:    andi a2, a1, 2
-; RV64ZVE32F-NEXT:    bnez a2, .LBB105_10
-; RV64ZVE32F-NEXT:  .LBB105_2: # %else2
+; RV64ZVE32F-NEXT:    bnez a2, .LBB107_10
+; RV64ZVE32F-NEXT:  .LBB107_2: # %else2
 ; RV64ZVE32F-NEXT:    andi a2, a1, 4
-; RV64ZVE32F-NEXT:    bnez a2, .LBB105_11
-; RV64ZVE32F-NEXT:  .LBB105_3: # %else5
+; RV64ZVE32F-NEXT:    bnez a2, .LBB107_11
+; RV64ZVE32F-NEXT:  .LBB107_3: # %else5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
-; RV64ZVE32F-NEXT:    bnez a2, .LBB105_12
-; RV64ZVE32F-NEXT:  .LBB105_4: # %else8
+; RV64ZVE32F-NEXT:    bnez a2, .LBB107_12
+; RV64ZVE32F-NEXT:  .LBB107_4: # %else8
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
-; RV64ZVE32F-NEXT:    bnez a2, .LBB105_13
-; RV64ZVE32F-NEXT:  .LBB105_5: # %else11
+; RV64ZVE32F-NEXT:    bnez a2, .LBB107_13
+; RV64ZVE32F-NEXT:  .LBB107_5: # %else11
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
-; RV64ZVE32F-NEXT:    bnez a2, .LBB105_14
-; RV64ZVE32F-NEXT:  .LBB105_6: # %else14
+; RV64ZVE32F-NEXT:    bnez a2, .LBB107_14
+; RV64ZVE32F-NEXT:  .LBB107_6: # %else14
 ; RV64ZVE32F-NEXT:    andi a2, a1, 64
-; RV64ZVE32F-NEXT:    bnez a2, .LBB105_15
-; RV64ZVE32F-NEXT:  .LBB105_7: # %else17
+; RV64ZVE32F-NEXT:    bnez a2, .LBB107_15
+; RV64ZVE32F-NEXT:  .LBB107_7: # %else17
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
-; RV64ZVE32F-NEXT:    bnez a1, .LBB105_16
-; RV64ZVE32F-NEXT:  .LBB105_8: # %else20
+; RV64ZVE32F-NEXT:    bnez a1, .LBB107_16
+; RV64ZVE32F-NEXT:  .LBB107_8: # %else20
 ; RV64ZVE32F-NEXT:    ret
-; RV64ZVE32F-NEXT:  .LBB105_9: # %cond.load
+; RV64ZVE32F-NEXT:  .LBB107_9: # %cond.load
 ; RV64ZVE32F-NEXT:    vlse16.v v8, (a0), zero
 ; RV64ZVE32F-NEXT:    andi a2, a1, 2
-; RV64ZVE32F-NEXT:    beqz a2, .LBB105_2
-; RV64ZVE32F-NEXT:  .LBB105_10: # %cond.load1
+; RV64ZVE32F-NEXT:    beqz a2, .LBB107_2
+; RV64ZVE32F-NEXT:  .LBB107_10: # %cond.load1
 ; RV64ZVE32F-NEXT:    addi a2, a0, 2
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -13372,48 +13515,48 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
 ; RV64ZVE32F-NEXT:    andi a2, a1, 4
-; RV64ZVE32F-NEXT:    beqz a2, .LBB105_3
-; RV64ZVE32F-NEXT:  .LBB105_11: # %cond.load4
+; RV64ZVE32F-NEXT:    beqz a2, .LBB107_3
+; RV64ZVE32F-NEXT:  .LBB107_11: # %cond.load4
 ; RV64ZVE32F-NEXT:    addi a2, a0, 4
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 3, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 2
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
-; RV64ZVE32F-NEXT:    beqz a2, .LBB105_4
-; RV64ZVE32F-NEXT:  .LBB105_12: # %cond.load7
+; RV64ZVE32F-NEXT:    beqz a2, .LBB107_4
+; RV64ZVE32F-NEXT:  .LBB107_12: # %cond.load7
 ; RV64ZVE32F-NEXT:    addi a2, a0, 6
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 3
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
-; RV64ZVE32F-NEXT:    beqz a2, .LBB105_5
-; RV64ZVE32F-NEXT:  .LBB105_13: # %cond.load10
+; RV64ZVE32F-NEXT:    beqz a2, .LBB107_5
+; RV64ZVE32F-NEXT:  .LBB107_13: # %cond.load10
 ; RV64ZVE32F-NEXT:    addi a2, a0, 16
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 5, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 4
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
-; RV64ZVE32F-NEXT:    beqz a2, .LBB105_6
-; RV64ZVE32F-NEXT:  .LBB105_14: # %cond.load13
+; RV64ZVE32F-NEXT:    beqz a2, .LBB107_6
+; RV64ZVE32F-NEXT:  .LBB107_14: # %cond.load13
 ; RV64ZVE32F-NEXT:    addi a2, a0, 18
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 6, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 5
 ; RV64ZVE32F-NEXT:    andi a2, a1, 64
-; RV64ZVE32F-NEXT:    beqz a2, .LBB105_7
-; RV64ZVE32F-NEXT:  .LBB105_15: # %cond.load16
+; RV64ZVE32F-NEXT:    beqz a2, .LBB107_7
+; RV64ZVE32F-NEXT:  .LBB107_15: # %cond.load16
 ; RV64ZVE32F-NEXT:    addi a2, a0, 20
 ; RV64ZVE32F-NEXT:    lh a2, 0(a2)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
 ; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
 ; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 6
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
-; RV64ZVE32F-NEXT:    beqz a1, .LBB105_8
-; RV64ZVE32F-NEXT:  .LBB105_16: # %cond.load19
+; RV64ZVE32F-NEXT:    beqz a1, .LBB107_8
+; RV64ZVE32F-NEXT:  .LBB107_16: # %cond.load19
 ; RV64ZVE32F-NEXT:    addi a0, a0, 22
 ; RV64ZVE32F-NEXT:    lh a0, 0(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index e449b6f2280e2c3..b359f71be0e67f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -963,3 +963,47 @@ entry:
 }
 
 declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i8>)
+
+define void @gather_narrow_idx(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
+; CHECK-LABEL: @gather_narrow_idx(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I:%.*]] = mul nuw nsw <32 x i16> [[VEC_IND]], <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i16> [[I]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[I1]], i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i16> [[VEC_IND]], <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %i = mul nuw nsw <32 x i16> %vec.ind, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+  %i1 = getelementptr inbounds i8, ptr %B, <32 x i16> %i
+  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
+  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
+  %wide.load = load <32 x i8>, ptr %i2, align 1
+  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
+  store <32 x i8> %i4, ptr %i2, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i16> %vec.ind, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  %i6 = icmp eq i64 %index.next, 1024
+  br i1 %i6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+

>From df97eaa4a4308165226651e02b8f560a3b0dae55 Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <shraiysh.vaishay at amd.com>
Date: Fri, 8 Sep 2023 10:56:17 -0500
Subject: [PATCH 14/32] Addressed comments

---
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp        | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d26ac60939031ec..037e0a5662be5bb 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2007,7 +2007,7 @@ class OpenMPIRBuilder {
 
   /// Generator for `#omp teams`
   ///
-  /// \param Loc The location where the task construct was encountered.
+  /// \param Loc The location where the teams construct was encountered.
   /// \param BodyGenCB Callback that will generate the region code.
   InsertPointTy createTeams(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0f2203f0c1ac84c..a873d95366bc9a6 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6111,9 +6111,8 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB) {
-  if (!updateToLocation(Loc)) {
+  if (!updateToLocation(Loc))
     return Loc.IP;
-  }
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
@@ -6198,7 +6197,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
       WrapperArgTys.push_back(Arg.getType());
     }
     FunctionCallee WrapperFuncVal = M.getOrInsertFunction(
-        "outlined_omp_teams",
+        (Twine(OutlinedFn.getName()) + ".teams").str(),
         FunctionType::get(Builder.getVoidTy(), WrapperArgTys, false));
     Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee());
     WrapperFunc->getArg(0)->setName("global_tid");

>From 2b5bc20c45a14a1bbf3e8b0920f06950d6562550 Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <shraiysh.vaishay at amd.com>
Date: Fri, 8 Sep 2023 10:58:50 -0500
Subject: [PATCH 15/32] Replace task -> teams

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a873d95366bc9a6..136590989f99c44 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6130,7 +6130,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
   //   current_basic_block:
   //     br label %teams.exit
   //   teams.exit:
-  //     ; instructions after task
+  //     ; instructions after teams
   // }
   // def outlined_fn() {
   //   teams.alloca:

>From 7beb65ae2d9e909c4a63acb808451757e2fb1d07 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Fri, 8 Sep 2023 09:01:10 -0700
Subject: [PATCH 16/32] [flang] Fixed LoopVersioning for array slices. (#65703)

The first test case added in the LIT test demonstrates the problem.
Even though we did not consider the inner loop as a candidate for
the transformation due to the array_coor with a slice, we decided to
version the outer loop for the same function argument.
During the cloning of the outer loop, we dropped the slicing completely,
producing invalid code.

I restructured the code so that we record all arg uses that cannot be
transformed (regardless of the reason), and then fix up the usage
information across the loop nests. I also noticed that we may generate
redundant contiguity checks for the inner loops, so I fixed it
since it was easy with the new way of keeping the usage data.
---
 .../Optimizer/Transforms/LoopVersioning.cpp   | 217 ++++++++++++++----
 flang/test/Transforms/loop-versioning.fir     | 172 +++++++++++++-
 2 files changed, 346 insertions(+), 43 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
index b524b11f5966443..4d3ea51ae1a5f71 100644
--- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -77,6 +77,72 @@ class LoopVersioningPass
   void runOnOperation() override;
 };
 
+/// @struct ArgInfo
+/// A structure to hold an argument, the size of the argument and dimension
+/// information.
+struct ArgInfo {
+  mlir::Value arg;
+  size_t size;
+  unsigned rank;
+  fir::BoxDimsOp dims[CFI_MAX_RANK];
+};
+
+/// @struct ArgsUsageInLoop
+/// A structure providing information about the function arguments
+/// usage by the instructions immediately nested in a loop.
+struct ArgsUsageInLoop {
+  /// Mapping between the memref operand of an array indexing
+  /// operation (e.g. fir.coordinate_of) and the argument information.
+  llvm::DenseMap<mlir::Value, ArgInfo> usageInfo;
+  /// Some array indexing operations inside a loop cannot be transformed.
+  /// This vector holds the memref operands of such operations.
+  /// The vector is used to make sure that we do not try to transform
+  /// any outer loop, since this will imply the operation rewrite
+  /// in this loop.
+  llvm::SetVector<mlir::Value> cannotTransform;
+
+  // Debug dump of the structure members assuming that
+  // the information has been collected for the given loop.
+  void dump(fir::DoLoopOp loop) const {
+    // clang-format off
+    LLVM_DEBUG(
+        mlir::OpPrintingFlags printFlags;
+        printFlags.skipRegions();
+        llvm::dbgs() << "Arguments usage info for loop:\n";
+        loop.print(llvm::dbgs(), printFlags);
+        llvm::dbgs() << "\nUsed args:\n";
+        for (auto &use : usageInfo) {
+          mlir::Value v = use.first;
+          v.print(llvm::dbgs(), printFlags);
+          llvm::dbgs() << "\n";
+        }
+        llvm::dbgs() << "\nCannot transform args:\n";
+        for (mlir::Value arg : cannotTransform) {
+          arg.print(llvm::dbgs(), printFlags);
+          llvm::dbgs() << "\n";
+        }
+        llvm::dbgs() << "====\n"
+    );
+    // clang-format on
+  }
+
+  // Erase usageInfo and cannotTransform entries for a set
+  // of given arguments.
+  void eraseUsage(const llvm::SetVector<mlir::Value> &args) {
+    for (auto &arg : args)
+      usageInfo.erase(arg);
+    cannotTransform.set_subtract(args);
+  }
+
+  // Erase usageInfo and cannotTransform entries for a set
+  // of given arguments provided in the form of usageInfo map.
+  void eraseUsage(const llvm::DenseMap<mlir::Value, ArgInfo> &args) {
+    for (auto &arg : args) {
+      usageInfo.erase(arg.first);
+      cannotTransform.remove(arg.first);
+    }
+  }
+};
 } // namespace
 
 /// @c replaceOuterUses - replace uses outside of @c op with result of @c
@@ -179,16 +245,6 @@ void LoopVersioningPass::runOnOperation() {
   LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
   mlir::func::FuncOp func = getOperation();
 
-  /// @c ArgInfo
-  /// A structure to hold an argument, the size of the argument and dimension
-  /// information.
-  struct ArgInfo {
-    mlir::Value arg;
-    size_t size;
-    unsigned rank;
-    fir::BoxDimsOp dims[CFI_MAX_RANK];
-  };
-
   // First look for arguments with assumed shape = unknown extent in the lowest
   // dimension.
   LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
@@ -224,58 +280,137 @@ void LoopVersioningPass::runOnOperation() {
     }
   }
 
-  if (argsOfInterest.empty())
+  if (argsOfInterest.empty()) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "No suitable arguments.\n=== End " DEBUG_TYPE " ===\n");
     return;
+  }
 
-  struct OpsWithArgs {
-    mlir::Operation *op;
-    mlir::SmallVector<ArgInfo, 4> argsAndDims;
-  };
-  // Now see if those arguments are used inside any loop.
-  mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;
+  // A list of all loops in the function in post-order.
+  mlir::SmallVector<fir::DoLoopOp> originalLoops;
+  // Information about the arguments usage by the instructions
+  // immediately nested in a loop.
+  llvm::DenseMap<fir::DoLoopOp, ArgsUsageInLoop> argsInLoops;
 
+  // Traverse the loops in post-order and see
+  // if those arguments are used inside any loop.
   func.walk([&](fir::DoLoopOp loop) {
     mlir::Block &body = *loop.getBody();
-    mlir::SmallVector<ArgInfo, 4> argsInLoop;
+    auto &argsInLoop = argsInLoops[loop];
+    originalLoops.push_back(loop);
     body.walk([&](mlir::Operation *op) {
-      // support either fir.array_coor or fir.coordinate_of
-      if (auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op)) {
-        // no support currently for sliced arrays
-        if (arrayCoor.getSlice())
-          return;
-      } else if (!mlir::isa<fir::CoordinateOp>(op)) {
+      // Support either fir.array_coor or fir.coordinate_of.
+      if (!mlir::isa<fir::ArrayCoorOp, fir::CoordinateOp>(op))
         return;
-      }
-
-      // The current operation could be inside another loop than
-      // the one we're currently processing. Skip it, we'll get
-      // to it later.
+      // Process only operations immediately nested in the current loop.
       if (op->getParentOfType<fir::DoLoopOp>() != loop)
         return;
       mlir::Value operand = op->getOperand(0);
       for (auto a : argsOfInterest) {
         if (a.arg == normaliseVal(operand)) {
-          // use the reboxed value, not the block arg when re-creating the loop:
+          // Use the reboxed value, not the block arg when re-creating the loop.
+          // TODO: should we check that the operand dominates the loop?
+          // If this might be the case, we should record such operands in
+          // argsInLoop.cannotTransform, so that they disable the transformation
+          // for the parent loops as well.
           a.arg = operand;
-          // Only add if it's not already in the list.
-          if (std::find_if(argsInLoop.begin(), argsInLoop.end(), [&](auto it) {
-                return it.arg == a.arg;
-              }) == argsInLoop.end()) {
 
-            argsInLoop.push_back(a);
+          // No support currently for sliced arrays.
+          // This means that we cannot properly transform
+          // instructions referencing a.arg in the whole loop
+          // nest that this loop is located in.
+          if (auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op))
+            if (arrayCoor.getSlice())
+              argsInLoop.cannotTransform.insert(a.arg);
+
+          if (argsInLoop.cannotTransform.contains(a.arg)) {
+            // Remove any previously recorded usage, if any.
+            argsInLoop.usageInfo.erase(a.arg);
             break;
           }
+
+          // Record the a.arg usage, if not recorded yet.
+          argsInLoop.usageInfo.try_emplace(a.arg, a);
+          break;
         }
       }
     });
-
-    if (!argsInLoop.empty()) {
-      OpsWithArgs ops = {loop, argsInLoop};
-      loopsOfInterest.push_back(ops);
-    }
   });
-  if (loopsOfInterest.empty())
+
+  // Dump loops info after initial collection.
+  // clang-format off
+  LLVM_DEBUG(
+      llvm::dbgs() << "Initial usage info:\n";
+      for (fir::DoLoopOp loop : originalLoops) {
+        auto &argsInLoop = argsInLoops[loop];
+        argsInLoop.dump(loop);
+      }
+  );
+  // clang-format on
+
+  // Clear argument usage for parent loops if an inner loop
+  // contains a non-transformable usage.
+  for (fir::DoLoopOp loop : originalLoops) {
+    auto &argsInLoop = argsInLoops[loop];
+    if (argsInLoop.cannotTransform.empty())
+      continue;
+
+    fir::DoLoopOp parent = loop;
+    while ((parent = parent->getParentOfType<fir::DoLoopOp>()))
+      argsInLoops[parent].eraseUsage(argsInLoop.cannotTransform);
+  }
+
+  // If an argument access can be optimized in a loop and
+  // its descendant loop, then it does not make sense to
+  // generate the contiguity check for the descendant loop.
+  // The check will be produced as part of the ancestor
+  // loop's transformation. So we can clear the argument
+  // usage for all descendant loops.
+  for (fir::DoLoopOp loop : originalLoops) {
+    auto &argsInLoop = argsInLoops[loop];
+    if (argsInLoop.usageInfo.empty())
+      continue;
+
+    loop.getBody()->walk([&](fir::DoLoopOp dloop) {
+      argsInLoops[dloop].eraseUsage(argsInLoop.usageInfo);
+    });
+  }
+
+  // clang-format off
+  LLVM_DEBUG(
+      llvm::dbgs() << "Final usage info:\n";
+      for (fir::DoLoopOp loop : originalLoops) {
+        auto &argsInLoop = argsInLoops[loop];
+        argsInLoop.dump(loop);
+      }
+  );
+  // clang-format on
+
+  // Reduce the collected information to a list of loops
+  // with attached arguments usage information.
+  // The list must hold the loops in post order, so that
+  // the inner loops are transformed before the outer loops.
+  struct OpsWithArgs {
+    mlir::Operation *op;
+    mlir::SmallVector<ArgInfo, 4> argsAndDims;
+  };
+  mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;
+  for (fir::DoLoopOp loop : originalLoops) {
+    auto &argsInLoop = argsInLoops[loop];
+    if (argsInLoop.usageInfo.empty())
+      continue;
+    OpsWithArgs info;
+    info.op = loop;
+    for (auto &arg : argsInLoop.usageInfo)
+      info.argsAndDims.push_back(arg.second);
+    loopsOfInterest.emplace_back(std::move(info));
+  }
+
+  if (loopsOfInterest.empty()) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "No loops to transform.\n=== End " DEBUG_TYPE " ===\n");
     return;
+  }
 
   // If we get here, there are loops to process.
   fir::FirOpBuilder builder{module, std::move(kindMap)};
diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir
index 566903d0897f237..f2768d7325f7407 100644
--- a/flang/test/Transforms/loop-versioning.fir
+++ b/flang/test/Transforms/loop-versioning.fir
@@ -118,8 +118,6 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
 
 // -----
 
-// RUN: fir-opt --loop-versioning %s | FileCheck %s
-
 // Check that "no result" from a versioned loop works correctly
 // This code was the basis for this, but `read` is replaced with a function called Func
 // subroutine test3(x, y)
@@ -1266,4 +1264,174 @@ func.func @test_optional_arg(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name
 // CHECK:           fir.store %[[VAL_166:.*]]#1 to %[[VAL_18]] : !fir.ref<i32>
 // CHECK:           return
 // CHECK:         }
+
+// ! Verify that neither of the loops is versioned
+// ! due to the array section in the inner loop:
+// subroutine test_slice(x)
+//   real :: x(:,:)
+//   do i=10,100
+//      x(i,7) = 1.0
+//      x(i,3:5) = 2.0
+//   end do
+// end subroutine test_slice
+func.func @_QPtest_slice(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}) {
+  %c10 = arith.constant 10 : index
+  %c100 = arith.constant 100 : index
+  %c6_i64 = arith.constant 6 : i64
+  %c3 = arith.constant 3 : index
+  %c2 = arith.constant 2 : index
+  %c5 = arith.constant 5 : index
+  %cst = arith.constant 2.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1_i64 = arith.constant 1 : i64
+  %cst_0 = arith.constant 1.000000e+00 : f32
+  %c1 = arith.constant 1 : index
+  %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_sliceEi"}
+  %1 = fir.convert %c10 : (index) -> i32
+  %2:2 = fir.do_loop %arg1 = %c10 to %c100 step %c1 iter_args(%arg2 = %1) -> (index, i32) {
+    fir.store %arg2 to %0 : !fir.ref<i32>
+    %3 = fir.load %0 : !fir.ref<i32>
+    %4 = fir.convert %3 : (i32) -> i64
+    %5 = arith.subi %4, %c1_i64 : i64
+    %6 = fir.coordinate_of %arg0, %5, %c6_i64 : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+    fir.store %cst_0 to %6 : !fir.ref<f32>
+    %7 = fir.load %0 : !fir.ref<i32>
+    %8 = fir.convert %7 : (i32) -> i64
+    %9 = fir.undefined index
+    %10 = fir.convert %7 : (i32) -> index
+    %11 = fir.slice %8, %9, %9, %c3, %c5, %c1 : (i64, index, index, index, index, index) -> !fir.slice<2>
+    %12 = fir.undefined !fir.array<?x?xf32>
+    %13 = fir.do_loop %arg3 = %c0 to %c2 step %c1 unordered iter_args(%arg4 = %12) -> (!fir.array<?x?xf32>) {
+      %18 = arith.addi %arg3, %c1 : index
+      %19 = fir.array_coor %arg0 [%11] %10, %18 : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>, index, index) -> !fir.ref<f32>
+      fir.store %cst to %19 : !fir.ref<f32>
+      fir.result %12 : !fir.array<?x?xf32>
+    }
+    %14 = arith.addi %arg1, %c1 : index
+    %15 = fir.convert %c1 : (index) -> i32
+    %16 = fir.load %0 : !fir.ref<i32>
+    %17 = arith.addi %16, %15 : i32
+    fir.result %14, %17 : index, i32
+  }
+  fir.store %2#1 to %0 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_slice(
+// CHECK-NOT: fir.if
+
+// ! Verify versioning for argument 'x' but not for 'y':
+// subroutine test_independent_args(x, y)
+//   real :: x(:,:), y(:,:)
+//   do i=10,100
+//      x(i,7) = 1.0
+//      y(i,3:5) = 2.0
+//   end do
+// end subroutine test_independent_args
+func.func @_QPtest_independent_args(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
+  %c10 = arith.constant 10 : index
+  %c100 = arith.constant 100 : index
+  %c6_i64 = arith.constant 6 : i64
+  %c3 = arith.constant 3 : index
+  %c2 = arith.constant 2 : index
+  %c5 = arith.constant 5 : index
+  %cst = arith.constant 2.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1_i64 = arith.constant 1 : i64
+  %cst_0 = arith.constant 1.000000e+00 : f32
+  %c1 = arith.constant 1 : index
+  %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_independent_argsEi"}
+  %1 = fir.convert %c10 : (index) -> i32
+  %2:2 = fir.do_loop %arg2 = %c10 to %c100 step %c1 iter_args(%arg3 = %1) -> (index, i32) {
+    fir.store %arg3 to %0 : !fir.ref<i32>
+    %3 = fir.load %0 : !fir.ref<i32>
+    %4 = fir.convert %3 : (i32) -> i64
+    %5 = arith.subi %4, %c1_i64 : i64
+    %6 = fir.coordinate_of %arg0, %5, %c6_i64 : (!fir.box<!fir.array<?x?xf32>>, i64, i64) -> !fir.ref<f32>
+    fir.store %cst_0 to %6 : !fir.ref<f32>
+    %7 = fir.load %0 : !fir.ref<i32>
+    %8 = fir.convert %7 : (i32) -> i64
+    %9 = fir.undefined index
+    %10 = fir.convert %7 : (i32) -> index
+    %11 = fir.slice %8, %9, %9, %c3, %c5, %c1 : (i64, index, index, index, index, index) -> !fir.slice<2>
+    %12 = fir.undefined !fir.array<?x?xf32>
+    %13 = fir.do_loop %arg4 = %c0 to %c2 step %c1 unordered iter_args(%arg5 = %12) -> (!fir.array<?x?xf32>) {
+      %18 = arith.addi %arg4, %c1 : index
+      %19 = fir.array_coor %arg1 [%11] %10, %18 : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>, index, index) -> !fir.ref<f32>
+      fir.store %cst to %19 : !fir.ref<f32>
+      fir.result %12 : !fir.array<?x?xf32>
+    }
+    %14 = arith.addi %arg2, %c1 : index
+    %15 = fir.convert %c1 : (index) -> i32
+    %16 = fir.load %0 : !fir.ref<i32>
+    %17 = arith.addi %16, %15 : i32
+    fir.result %14, %17 : index, i32
+  }
+  fir.store %2#1 to %0 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_independent_args(
+// CHECK-SAME:        %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"},
+// CHECK-SAME:        %[[VAL_1:.*]]: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
+// CHECK:           %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %{{.*}} : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_19:.*]] = arith.constant 4 : index
+// CHECK:           %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_16]]#2, %[[VAL_19]] : index
+// CHECK:           %[[VAL_21:.*]]:2 = fir.if %[[VAL_20]] -> (index, i32) {
+// CHECK-NOT: fir.if
+
+
+// ! Verify that the whole loop nest is versioned
+// ! without additional contiguity check for the inner loop:
+// subroutine test_loop_nest(x)
+//   real :: x(:)
+//   do i=10,100
+//      x(i) = 1.0
+//      do j=10,100
+//         x(j) = 2.0
+//      end do
+//   end do
+// end subroutine test_loop_nest
+func.func @_QPtest_loop_nest(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+  %c10 = arith.constant 10 : index
+  %c100 = arith.constant 100 : index
+  %cst = arith.constant 2.000000e+00 : f32
+  %c1_i64 = arith.constant 1 : i64
+  %cst_0 = arith.constant 1.000000e+00 : f32
+  %c1 = arith.constant 1 : index
+  %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_loop_nestEi"}
+  %1 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFtest_loop_nestEj"}
+  %2 = fir.convert %c10 : (index) -> i32
+  %3:2 = fir.do_loop %arg1 = %c10 to %c100 step %c1 iter_args(%arg2 = %2) -> (index, i32) {
+    fir.store %arg2 to %0 : !fir.ref<i32>
+    %4 = fir.load %0 : !fir.ref<i32>
+    %5 = fir.convert %4 : (i32) -> i64
+    %6 = arith.subi %5, %c1_i64 : i64
+    %7 = fir.coordinate_of %arg0, %6 : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+    fir.store %cst_0 to %7 : !fir.ref<f32>
+    %8:2 = fir.do_loop %arg3 = %c10 to %c100 step %c1 iter_args(%arg4 = %2) -> (index, i32) {
+      fir.store %arg4 to %1 : !fir.ref<i32>
+      %13 = fir.load %1 : !fir.ref<i32>
+      %14 = fir.convert %13 : (i32) -> i64
+      %15 = arith.subi %14, %c1_i64 : i64
+      %16 = fir.coordinate_of %arg0, %15 : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+      fir.store %cst to %16 : !fir.ref<f32>
+      %17 = arith.addi %arg3, %c1 : index
+      %18 = fir.convert %c1 : (index) -> i32
+      %19 = fir.load %1 : !fir.ref<i32>
+      %20 = arith.addi %19, %18 : i32
+      fir.result %17, %20 : index, i32
+    }
+    fir.store %8#1 to %1 : !fir.ref<i32>
+    %9 = arith.addi %arg1, %c1 : index
+    %10 = fir.convert %c1 : (index) -> i32
+    %11 = fir.load %0 : !fir.ref<i32>
+    %12 = arith.addi %11, %10 : i32
+    fir.result %9, %12 : index, i32
+  }
+  fir.store %3#1 to %0 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_loop_nest(
+// CHECK: fir.if
+// CHECK-NOT: fir.if
+
 } // End module

>From 39b6c82c5d38630910304ca88cdcd82ef88ea40d Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Fri, 8 Sep 2023 09:01:37 -0700
Subject: [PATCH 17/32] [flang][hlfir] Better recognize non-overlapping array
 sections. (#65707)

This is a copy of the corresponding ArrayValueCopy analysis
for non-overlapping array slices. It is required to achieve
the same performance for Polyhedron/nf, though additional
changes are needed in the alias analysis for disambiguating
host-associated accesses.
---
 .../Transforms/OptimizedBufferization.cpp     |  73 ++++-
 flang/test/HLFIR/opt-array-slice-assign.fir   | 254 ++++++++++++++++++
 2 files changed, 320 insertions(+), 7 deletions(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 437455b3defb1b9..748b91d9f457e3d 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -188,7 +188,6 @@ static bool areIdenticalOrDisjointSlices(mlir::Value ref1, mlir::Value ref2) {
       des1.getComponentShape() != des2.getComponentShape() ||
       des1.getSubstring() != des2.getSubstring() ||
       des1.getComplexPart() != des2.getComplexPart() ||
-      des1.getShape() != des2.getShape() ||
       des1.getTypeparams() != des2.getTypeparams()) {
     LLVM_DEBUG(llvm::dbgs() << "Different designator specs for:\n"
                             << des1 << "and:\n"
@@ -211,12 +210,9 @@ static bool areIdenticalOrDisjointSlices(mlir::Value ref1, mlir::Value ref2) {
   // If all the triplets (section speficiers) are the same, then
   // we do not care if %0 is equal to %1 - the slices are either
   // identical or completely disjoint.
-  //
-  // TODO: if we can prove that all non-triplet subscripts are different
-  // (by value), then we may return true regardless of the triplet
-  // values - the sections must be completely disjoint.
   auto des1It = des1.getIndices().begin();
   auto des2It = des2.getIndices().begin();
+  bool identicalTriplets = true;
   for (bool isTriplet : des1.getIsTriplet()) {
     if (isTriplet) {
       for (int i = 0; i < 3; ++i)
@@ -224,14 +220,77 @@ static bool areIdenticalOrDisjointSlices(mlir::Value ref1, mlir::Value ref2) {
           LLVM_DEBUG(llvm::dbgs() << "Triplet mismatch for:\n"
                                   << des1 << "and:\n"
                                   << des2 << "\n");
-          return false;
+          identicalTriplets = false;
+          break;
         }
     } else {
       ++des1It;
       ++des2It;
     }
   }
-  return true;
+  if (identicalTriplets)
+    return true;
+
+  // See if we can prove that any of the triplets do not overlap.
+  // This is mostly a Polyhedron/nf performance hack that looks for
+  // particular relations between the lower and upper bounds
+  // of the array sections, e.g. for any positive constant C:
+  //   X:Y does not overlap with (Y+C):Z
+  //   X:Y does not overlap with Z:(X-C)
+  auto displacedByConstant = [](mlir::Value v1, mlir::Value v2) {
+    auto removeConvert = [](mlir::Value v) -> mlir::Operation * {
+      auto *op = v.getDefiningOp();
+      while (auto conv = mlir::dyn_cast_or_null<fir::ConvertOp>(op))
+        op = conv.getValue().getDefiningOp();
+      return op;
+    };
+
+    auto isPositiveConstant = [](mlir::Value v) -> bool {
+      if (auto conOp =
+              mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
+        if (auto iattr = conOp.getValue().dyn_cast<mlir::IntegerAttr>())
+          return iattr.getInt() > 0;
+      return false;
+    };
+
+    auto *op1 = removeConvert(v1);
+    auto *op2 = removeConvert(v2);
+    if (!op1 || !op2)
+      return false;
+    if (auto addi = mlir::dyn_cast<mlir::arith::AddIOp>(op2))
+      if ((addi.getLhs().getDefiningOp() == op1 &&
+           isPositiveConstant(addi.getRhs())) ||
+          (addi.getRhs().getDefiningOp() == op1 &&
+           isPositiveConstant(addi.getLhs())))
+        return true;
+    if (auto subi = mlir::dyn_cast<mlir::arith::SubIOp>(op1))
+      if (subi.getLhs().getDefiningOp() == op2 &&
+          isPositiveConstant(subi.getRhs()))
+        return true;
+    return false;
+  };
+
+  des1It = des1.getIndices().begin();
+  des2It = des2.getIndices().begin();
+  for (bool isTriplet : des1.getIsTriplet()) {
+    if (isTriplet) {
+      mlir::Value des1Lb = *des1It++;
+      mlir::Value des1Ub = *des1It++;
+      mlir::Value des2Lb = *des2It++;
+      mlir::Value des2Ub = *des2It++;
+      // Ignore strides.
+      ++des1It;
+      ++des2It;
+      if (displacedByConstant(des1Ub, des2Lb) ||
+          displacedByConstant(des2Ub, des1Lb))
+        return true;
+    } else {
+      ++des1It;
+      ++des2It;
+    }
+  }
+
+  return false;
 }
 
 std::optional<ElementalAssignBufferization::MatchInfo>
diff --git a/flang/test/HLFIR/opt-array-slice-assign.fir b/flang/test/HLFIR/opt-array-slice-assign.fir
index dc42cbd302b87e1..11bd97c1158342b 100644
--- a/flang/test/HLFIR/opt-array-slice-assign.fir
+++ b/flang/test/HLFIR/opt-array-slice-assign.fir
@@ -128,3 +128,257 @@ func.func @_QPtest3(%arg0: !fir.ref<!fir.array<10x!fir.type<_QMtypesTt{x:!fir.ar
 // CHECK:               hlfir.assign %[[VAL_28]] to %[[VAL_29]] : f32, !fir.ref<f32>
 // CHECK:             }
 // CHECK:           }
+
+// ! ub == lb - 1
+// subroutine test4(x, i1, i2, nx)
+//   real :: x(i2), f
+//   do i=i1,i2,nx
+//      x(i:i+nx-1) = (x(i-nx:i-1))
+//   end do
+// end subroutine test4
+func.func @_QPtest4(%arg0: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i1"}, %arg2: !fir.ref<i32> {fir.bindc_name = "i2"}, %arg3: !fir.ref<i32> {fir.bindc_name = "nx"}) {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = arith.constant 1 : i32
+  %c0 = arith.constant 0 : index
+  %0 = fir.alloca f32 {bindc_name = "f", uniq_name = "_QFtest4Ef"}
+  %1:2 = hlfir.declare %0 {uniq_name = "_QFtest4Ef"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+  %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest4Ei"}
+  %3:2 = hlfir.declare %2 {uniq_name = "_QFtest4Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg1 {uniq_name = "_QFtest4Ei1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %5:2 = hlfir.declare %arg2 {uniq_name = "_QFtest4Ei2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %6:2 = hlfir.declare %arg3 {uniq_name = "_QFtest4Enx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %7 = fir.load %5#0 : !fir.ref<i32>
+  %8 = fir.convert %7 : (i32) -> index
+  %9 = arith.cmpi sgt, %8, %c0 : index
+  %10 = arith.select %9, %8, %c0 : index
+  %11 = fir.shape %10 : (index) -> !fir.shape<1>
+  %12:2 = hlfir.declare %arg0(%11) {uniq_name = "_QFtest4Ex"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+  %13 = fir.load %4#0 : !fir.ref<i32>
+  %14 = fir.convert %13 : (i32) -> index
+  %15 = fir.load %5#0 : !fir.ref<i32>
+  %16 = fir.convert %15 : (i32) -> index
+  %17 = fir.load %6#0 : !fir.ref<i32>
+  %18 = fir.convert %17 : (i32) -> index
+  %19 = fir.convert %14 : (index) -> i32
+  %20:2 = fir.do_loop %arg4 = %14 to %16 step %18 iter_args(%arg5 = %19) -> (index, i32) {
+    fir.store %arg5 to %3#1 : !fir.ref<i32>
+    %21 = fir.load %3#0 : !fir.ref<i32>
+    %22 = fir.load %6#0 : !fir.ref<i32>
+    %23 = arith.subi %21, %22 : i32
+    %24 = arith.subi %21, %c1_i32 : i32
+    %25 = fir.convert %23 : (i32) -> index
+    %26 = fir.convert %24 : (i32) -> index
+    %27 = arith.subi %26, %25 : index
+    %28 = arith.addi %27, %c1 : index
+    %29 = arith.cmpi sgt, %28, %c0 : index
+    %30 = arith.select %29, %28, %c0 : index
+    %31 = fir.shape %30 : (index) -> !fir.shape<1>
+    %32 = hlfir.designate %12#0 (%25:%26:%c1)  shape %31 : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+    %33 = hlfir.elemental %31 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+    ^bb0(%arg6: index):
+      %48 = hlfir.designate %32 (%arg6)  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      %49 = fir.load %48 : !fir.ref<f32>
+      %50 = hlfir.no_reassoc %49 : f32
+      hlfir.yield_element %50 : f32
+    }
+    %34 = arith.addi %21, %22 : i32
+    %35 = arith.subi %34, %c1_i32 : i32
+    %36 = fir.convert %21 : (i32) -> index
+    %37 = fir.convert %35 : (i32) -> index
+    %38 = arith.subi %37, %36 : index
+    %39 = arith.addi %38, %c1 : index
+    %40 = arith.cmpi sgt, %39, %c0 : index
+    %41 = arith.select %40, %39, %c0 : index
+    %42 = fir.shape %41 : (index) -> !fir.shape<1>
+    %43 = hlfir.designate %12#0 (%36:%37:%c1)  shape %42 : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+    hlfir.assign %33 to %43 : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
+    hlfir.destroy %33 : !hlfir.expr<?xf32>
+    %44 = arith.addi %arg4, %18 : index
+    %45 = fir.convert %18 : (index) -> i32
+    %46 = fir.load %3#1 : !fir.ref<i32>
+    %47 = arith.addi %46, %45 : i32
+    fir.result %44, %47 : index, i32
+  }
+  fir.store %20#1 to %3#1 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest4(
+// CHECK-NOT: hlfir.elemental
+
+// ! lb == ub + 1
+// subroutine test5(x, i1, i2, nx)
+//   real :: x(i2), f
+//   do i=i1,i2,nx
+//      x(i+1:i+nx-1) = (x(i-nx:i))
+//   end do
+// end subroutine test5
+func.func @_QPtest5(%arg0: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i1"}, %arg2: !fir.ref<i32> {fir.bindc_name = "i2"}, %arg3: !fir.ref<i32> {fir.bindc_name = "nx"}) {
+  %c1_i32 = arith.constant 1 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.alloca f32 {bindc_name = "f", uniq_name = "_QFtest5Ef"}
+  %1:2 = hlfir.declare %0 {uniq_name = "_QFtest5Ef"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+  %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest5Ei"}
+  %3:2 = hlfir.declare %2 {uniq_name = "_QFtest5Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg1 {uniq_name = "_QFtest5Ei1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %5:2 = hlfir.declare %arg2 {uniq_name = "_QFtest5Ei2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %6:2 = hlfir.declare %arg3 {uniq_name = "_QFtest5Enx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %7 = fir.load %5#0 : !fir.ref<i32>
+  %8 = fir.convert %7 : (i32) -> index
+  %9 = arith.cmpi sgt, %8, %c0 : index
+  %10 = arith.select %9, %8, %c0 : index
+  %11 = fir.shape %10 : (index) -> !fir.shape<1>
+  %12:2 = hlfir.declare %arg0(%11) {uniq_name = "_QFtest5Ex"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+  %13 = fir.load %4#0 : !fir.ref<i32>
+  %14 = fir.convert %13 : (i32) -> index
+  %15 = fir.load %5#0 : !fir.ref<i32>
+  %16 = fir.convert %15 : (i32) -> index
+  %17 = fir.load %6#0 : !fir.ref<i32>
+  %18 = fir.convert %17 : (i32) -> index
+  %19 = fir.convert %14 : (index) -> i32
+  %20:2 = fir.do_loop %arg4 = %14 to %16 step %18 iter_args(%arg5 = %19) -> (index, i32) {
+    fir.store %arg5 to %3#1 : !fir.ref<i32>
+    %21 = fir.load %3#0 : !fir.ref<i32>
+    %22 = fir.load %6#0 : !fir.ref<i32>
+    %23 = arith.subi %21, %22 : i32
+    %24 = fir.convert %23 : (i32) -> index
+    %25 = fir.convert %21 : (i32) -> index
+    %26 = arith.subi %25, %24 : index
+    %27 = arith.addi %26, %c1 : index
+    %28 = arith.cmpi sgt, %27, %c0 : index
+    %29 = arith.select %28, %27, %c0 : index
+    %30 = fir.shape %29 : (index) -> !fir.shape<1>
+    %31 = hlfir.designate %12#0 (%24:%25:%c1)  shape %30 : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+    %32 = hlfir.elemental %30 unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+    ^bb0(%arg6: index):
+      %48 = hlfir.designate %31 (%arg6)  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      %49 = fir.load %48 : !fir.ref<f32>
+      %50 = hlfir.no_reassoc %49 : f32
+      hlfir.yield_element %50 : f32
+    }
+    %33 = arith.addi %21, %c1_i32 : i32
+    %34 = arith.addi %21, %22 : i32
+    %35 = arith.subi %34, %c1_i32 : i32
+    %36 = fir.convert %33 : (i32) -> index
+    %37 = fir.convert %35 : (i32) -> index
+    %38 = arith.subi %37, %36 : index
+    %39 = arith.addi %38, %c1 : index
+    %40 = arith.cmpi sgt, %39, %c0 : index
+    %41 = arith.select %40, %39, %c0 : index
+    %42 = fir.shape %41 : (index) -> !fir.shape<1>
+    %43 = hlfir.designate %12#0 (%36:%37:%c1)  shape %42 : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+    hlfir.assign %32 to %43 : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
+    hlfir.destroy %32 : !hlfir.expr<?xf32>
+    %44 = arith.addi %arg4, %18 : index
+    %45 = fir.convert %18 : (index) -> i32
+    %46 = fir.load %3#1 : !fir.ref<i32>
+    %47 = arith.addi %46, %45 : i32
+    fir.result %44, %47 : index, i32
+  }
+  fir.store %20#1 to %3#1 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest5(
+// CHECK-NOT: hlfir.elemental
+
+// ! ub = lb - 1 and dim1 is unknown
+// ! FIR lowering produces a temp.
+// subroutine test6(x, i1, i2, nx)
+//   real :: x(i2,i2), f
+//   integer n1, n2, n3, n4
+//   do i=i1,i2,nx
+//      x(i:i+nx-1,n1:n2) = (x(i-nx:i-1,n3:n4))
+//   end do
+// end subroutine test6
+func.func @_QPtest6(%arg0: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.ref<i32> {fir.bindc_name = "i1"}, %arg2: !fir.ref<i32> {fir.bindc_name = "i2"}, %arg3: !fir.ref<i32> {fir.bindc_name = "nx"}) {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = arith.constant 1 : i32
+  %c0 = arith.constant 0 : index
+  %0 = fir.alloca f32 {bindc_name = "f", uniq_name = "_QFtest6Ef"}
+  %1:2 = hlfir.declare %0 {uniq_name = "_QFtest6Ef"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+  %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest6Ei"}
+  %3:2 = hlfir.declare %2 {uniq_name = "_QFtest6Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %4:2 = hlfir.declare %arg1 {uniq_name = "_QFtest6Ei1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %5:2 = hlfir.declare %arg2 {uniq_name = "_QFtest6Ei2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %6 = fir.alloca i32 {bindc_name = "n1", uniq_name = "_QFtest6En1"}
+  %7:2 = hlfir.declare %6 {uniq_name = "_QFtest6En1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %8 = fir.alloca i32 {bindc_name = "n2", uniq_name = "_QFtest6En2"}
+  %9:2 = hlfir.declare %8 {uniq_name = "_QFtest6En2"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %10 = fir.alloca i32 {bindc_name = "n3", uniq_name = "_QFtest6En3"}
+  %11:2 = hlfir.declare %10 {uniq_name = "_QFtest6En3"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %12 = fir.alloca i32 {bindc_name = "n4", uniq_name = "_QFtest6En4"}
+  %13:2 = hlfir.declare %12 {uniq_name = "_QFtest6En4"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %14:2 = hlfir.declare %arg3 {uniq_name = "_QFtest6Enx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %15 = fir.load %5#0 : !fir.ref<i32>
+  %16 = fir.convert %15 : (i32) -> index
+  %17 = arith.cmpi sgt, %16, %c0 : index
+  %18 = arith.select %17, %16, %c0 : index
+  %19 = fir.shape %18, %18 : (index, index) -> !fir.shape<2>
+  %20:2 = hlfir.declare %arg0(%19) {uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
+  %21 = fir.load %4#0 : !fir.ref<i32>
+  %22 = fir.convert %21 : (i32) -> index
+  %23 = fir.load %5#0 : !fir.ref<i32>
+  %24 = fir.convert %23 : (i32) -> index
+  %25 = fir.load %14#0 : !fir.ref<i32>
+  %26 = fir.convert %25 : (i32) -> index
+  %27 = fir.convert %22 : (index) -> i32
+  %28:2 = fir.do_loop %arg4 = %22 to %24 step %26 iter_args(%arg5 = %27) -> (index, i32) {
+    fir.store %arg5 to %3#1 : !fir.ref<i32>
+    %29 = fir.load %3#0 : !fir.ref<i32>
+    %30 = fir.load %14#0 : !fir.ref<i32>
+    %31 = arith.subi %29, %30 : i32
+    %32 = arith.subi %29, %c1_i32 : i32
+    %33 = fir.convert %31 : (i32) -> index
+    %34 = fir.convert %32 : (i32) -> index
+    %35 = arith.subi %34, %33 : index
+    %36 = arith.addi %35, %c1 : index
+    %37 = arith.cmpi sgt, %36, %c0 : index
+    %38 = arith.select %37, %36, %c0 : index
+    %39 = fir.load %11#0 : !fir.ref<i32>
+    %40 = fir.load %13#0 : !fir.ref<i32>
+    %41 = fir.convert %39 : (i32) -> index
+    %42 = fir.convert %40 : (i32) -> index
+    %43 = arith.subi %42, %41 : index
+    %44 = arith.addi %43, %c1 : index
+    %45 = arith.cmpi sgt, %44, %c0 : index
+    %46 = arith.select %45, %44, %c0 : index
+    %47 = fir.shape %38, %46 : (index, index) -> !fir.shape<2>
+    %48 = hlfir.designate %20#0 (%33:%34:%c1, %41:%42:%c1)  shape %47 : (!fir.box<!fir.array<?x?xf32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
+    %49 = hlfir.elemental %47 unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf32> {
+    ^bb0(%arg6: index, %arg7: index):
+      %72 = hlfir.designate %48 (%arg6, %arg7)  : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+      %73 = fir.load %72 : !fir.ref<f32>
+      %74 = hlfir.no_reassoc %73 : f32
+      hlfir.yield_element %74 : f32
+    }
+    %50 = arith.addi %29, %30 : i32
+    %51 = arith.subi %50, %c1_i32 : i32
+    %52 = fir.convert %29 : (i32) -> index
+    %53 = fir.convert %51 : (i32) -> index
+    %54 = arith.subi %53, %52 : index
+    %55 = arith.addi %54, %c1 : index
+    %56 = arith.cmpi sgt, %55, %c0 : index
+    %57 = arith.select %56, %55, %c0 : index
+    %58 = fir.load %7#0 : !fir.ref<i32>
+    %59 = fir.load %9#0 : !fir.ref<i32>
+    %60 = fir.convert %58 : (i32) -> index
+    %61 = fir.convert %59 : (i32) -> index
+    %62 = arith.subi %61, %60 : index
+    %63 = arith.addi %62, %c1 : index
+    %64 = arith.cmpi sgt, %63, %c0 : index
+    %65 = arith.select %64, %63, %c0 : index
+    %66 = fir.shape %57, %65 : (index, index) -> !fir.shape<2>
+    %67 = hlfir.designate %20#0 (%52:%53:%c1, %60:%61:%c1)  shape %66 : (!fir.box<!fir.array<?x?xf32>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf32>>
+    hlfir.assign %49 to %67 : !hlfir.expr<?x?xf32>, !fir.box<!fir.array<?x?xf32>>
+    hlfir.destroy %49 : !hlfir.expr<?x?xf32>
+    %68 = arith.addi %arg4, %26 : index
+    %69 = fir.convert %26 : (index) -> i32
+    %70 = fir.load %3#1 : !fir.ref<i32>
+    %71 = arith.addi %70, %69 : i32
+    fir.result %68, %71 : index, i32
+  }
+  fir.store %28#1 to %3#1 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest6(
+// CHECK-NOT: hlfir.elemental

>From 390b48675be80420f471bd3be74577495b1b1897 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane at nvidia.com>
Date: Fri, 8 Sep 2023 09:05:16 -0700
Subject: [PATCH 18/32] Add 'run' line and 'expected-no-diagnostics' to test
 added in 3ed9e9e3ace6f9ce320cf4e75cffa04a7c7241b5

---
 clang/test/SemaCXX/this-type-deduction-concept.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/test/SemaCXX/this-type-deduction-concept.cpp b/clang/test/SemaCXX/this-type-deduction-concept.cpp
index a0c1f605ccefd76..7f578440befdb23 100644
--- a/clang/test/SemaCXX/this-type-deduction-concept.cpp
+++ b/clang/test/SemaCXX/this-type-deduction-concept.cpp
@@ -1,9 +1,11 @@
-
+// RUN: %clang_cc1 -std=c++23 -verify -fsyntax-only %s
 // This test case came up in the review of
 // https://reviews.llvm.org/D159126
 // when transforming `this` within a
 // requires expression, we need to make sure
 // the type of this (and its qualifiers) is respected.
+
+// expected-no-diagnostics
 namespace D159126 {
 
 template <class _Tp>

>From 21c251aaca7e01cad7f97d8be548926e58e4d4e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= <bjorn.a.pettersson at ericsson.com>
Date: Fri, 8 Sep 2023 18:06:09 +0200
Subject: [PATCH 19/32] [LowerMatrixIntrinsics] Drop support for typed pointers
 (#65605)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 38 ++++---------------
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 3c31d4a4cd3786a..c173e3dd7d0e5d6 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -177,7 +177,6 @@ Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
   assert((!isa<ConstantInt>(Stride) ||
           cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
          "Stride must be >= the number of elements in the result vector.");
-  unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
 
   // Compute the start of the vector with index VecIdx as VecIdx * Stride.
   Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
@@ -189,11 +188,7 @@ Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
   else
     VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
 
-  // Cast elementwise vector start pointer to a pointer to a vector
-  // (EltType x NumElements)*.
-  auto *VecType = FixedVectorType::get(EltType, NumElements);
-  Type *VecPtrType = PointerType::get(VecType, AS);
-  return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
+  return VecStart;
 }
 
 /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
@@ -1060,13 +1055,6 @@ class LowerMatrixIntrinsics {
     return Changed;
   }
 
-  /// Turns \p BasePtr into an elementwise pointer to \p EltType.
-  Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
-    unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
-    Type *EltPtrType = PointerType::get(EltType, AS);
-    return Builder.CreatePointerCast(BasePtr, EltPtrType);
-  }
-
   /// Replace intrinsic calls
   bool VisitCallInst(CallInst *Inst) {
     if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
@@ -1118,7 +1106,7 @@ class LowerMatrixIntrinsics {
     auto *VType = cast<VectorType>(Ty);
     Type *EltTy = VType->getElementType();
     Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
-    Value *EltPtr = createElementPtr(Ptr, EltTy, Builder);
+    Value *EltPtr = Ptr;
     MatrixTy Result;
     for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
       Value *GEP = computeVectorAddr(
@@ -1144,17 +1132,11 @@ class LowerMatrixIntrinsics {
     Value *Offset = Builder.CreateAdd(
         Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
 
-    unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
-    Value *EltPtr =
-        Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
-    Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
     auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
                                                    ResultShape.NumColumns);
-    Type *TilePtrTy = PointerType::get(TileTy, AS);
-    Value *TilePtr =
-        Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
 
-    return loadMatrix(TileTy, TilePtr, Align,
+    return loadMatrix(TileTy, TileStart, Align,
                       Builder.getInt64(MatrixShape.getStride()), IsVolatile,
                       ResultShape, Builder);
   }
@@ -1190,17 +1172,11 @@ class LowerMatrixIntrinsics {
     Value *Offset = Builder.CreateAdd(
         Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
 
-    unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
-    Value *EltPtr =
-        Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
-    Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+    Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
     auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
                                                    StoreVal.getNumColumns());
-    Type *TilePtrTy = PointerType::get(TileTy, AS);
-    Value *TilePtr =
-        Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
 
-    storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
+    storeMatrix(TileTy, StoreVal, TileStart, MAlign,
                 Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
   }
 
@@ -1210,7 +1186,7 @@ class LowerMatrixIntrinsics {
                        MaybeAlign MAlign, Value *Stride, bool IsVolatile,
                        IRBuilder<> &Builder) {
     auto VType = cast<VectorType>(Ty);
-    Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+    Value *EltPtr = Ptr;
     for (auto Vec : enumerate(StoreVal.vectors())) {
       Value *GEP = computeVectorAddr(
           EltPtr,

>From 0f1a01807c137736236fcb9ea4253804e5ec7cf8 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Fri, 8 Sep 2023 17:08:02 +0100
Subject: [PATCH 20/32] [lldb] Add test to document alias tab completion
 behaviour (#65760)

While looking at https://github.com/llvm/llvm-project/issues/49528 I
found that, happily, aliases can now be tab completed.

However, if there is a built-in match, that will always be taken. This
is a bit surprising, though logical if we don't want people
accidentally breaking their commands.

Meaning "b" tab completes to our built-in breakpoint alias, before it
looks at any of the aliases. "bf" doesn't match "b", so it looks through
the aliases.

I didn't find any tests for this in the obvious places, so this adds
some.
---
 .../completion/TestCompletion.py              | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py
index df7c89032f29fa0..cc2a20dcd0dca76 100644
--- a/lldb/test/API/functionalities/completion/TestCompletion.py
+++ b/lldb/test/API/functionalities/completion/TestCompletion.py
@@ -622,6 +622,25 @@ def test_command_delete(self):
     def test_command_unalias(self):
         self.complete_from_to("command unalias ima", "command unalias image")
 
+    def test_command_aliases(self):
+        self.runCmd("command alias brkpt breakpoint")
+        # If there is an unambiguous completion from the built-in commands,
+        # we choose that.
+        self.complete_from_to("br", "breakpoint")
+        # Only if there is not, do we then look for an unambiguous completion
+        # from the user defined aliases.
+        self.complete_from_to("brk", "brkpt")
+
+        # Aliases are included when there's no exact match.
+        self.runCmd("command alias play breakpoint")
+        self.complete_from_to("pl", ["plugin", "platform", "play"])
+
+        # That list can also contain only aliases if there's no built-ins to
+        # match.
+        self.runCmd("command alias test_1 breakpoint")
+        self.runCmd("command alias test_2 breakpoint")
+        self.complete_from_to("test_", ["test_1", "test_2"])
+
     def test_completion_description_commands(self):
         """Test descriptions of top-level command completions"""
         self.check_completion_with_desc(

>From 38750d7ec179283e4798331f45cccbff122066dd Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Fri, 8 Sep 2023 12:19:09 -0400
Subject: [PATCH 21/32] Fix Clang Sphinx build

This addresses issues found by:
https://lab.llvm.org/buildbot/#/builders/92/builds/50415
---
 clang/docs/ReleaseNotes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6a3a6bb8ad425b0..5c2c5cf4fb2c349 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -273,7 +273,7 @@ Bug Fixes to C++ Support
 - Fix crash when the trailing return type of a generic and dependent
   lambda refers to an init-capture.
   (`#65067 <https://github.com/llvm/llvm-project/issues/65067>`_` and
-   `#63675 <https://github.com/llvm/llvm-project/issues/63675>`_`)
+  `#63675 <https://github.com/llvm/llvm-project/issues/63675>`_`)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^

>From d55ac38aced7b1672b9be85564b9bb4e56711d70 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias at hieta.se>
Date: Fri, 8 Sep 2023 18:22:20 +0200
Subject: [PATCH 22/32] Add GitHub workflow for check Python file formatting
 (#65482)

Using darker, which applies black formatting only to changed lines (diffs), similar to git-clang-format.
---
 .github/workflows/pr-python-format.yml | 39 ++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .github/workflows/pr-python-format.yml

diff --git a/.github/workflows/pr-python-format.yml b/.github/workflows/pr-python-format.yml
new file mode 100644
index 000000000000000..c6122958826545c
--- /dev/null
+++ b/.github/workflows/pr-python-format.yml
@@ -0,0 +1,39 @@
+name: "Check Python Formatting"
+on:
+  pull_request:
+    # run on .py
+    paths:
+      - '**.py'
+
+jobs:
+  python_formatting:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fetch LLVM sources
+        uses: actions/checkout at v4
+        with:
+          persist-credentials: false
+          fetch-depth: 2
+
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files at v39
+        with:
+          files: '**/*.py'
+
+      - name: "Listed files"
+        run: |
+          echo "Formatting files:"
+          echo "${{ steps.changed-files.outputs.all_changed_files }}"
+
+      - name: Setup Python env
+        uses: actions/setup-python at v4
+        with:
+          python-version: '3.11'
+
+      - name: Python Formatting
+        uses: akaihola/darker at 1.7.2
+        with:
+          options: "--check --diff --color"
+          version: "~=1.7.2"
+          src: "${{ steps.changed-files.outputs.all_changed_files }}"

>From 5ee7dc04dee5c8f285a846a054ae403de501dba5 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 8 Sep 2023 09:37:52 -0700
Subject: [PATCH 23/32] [RISCV] Match gather(splat(ptr)) as zero strided load
 (#65769)

We were already handling the case where the broadcast was being done via
a GEP, but we hadn't handled the case of a broadcast via a shuffle.
---
 .../RISCV/RISCVGatherScatterLowering.cpp      | 30 ++++++++----
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  | 47 ++-----------------
 2 files changed, 23 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index fac3526c43148d8..0e9244d0aefa813 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -67,7 +67,7 @@ class RISCVGatherScatterLowering : public FunctionPass {
   bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr,
                                  Value *AlignOp);
 
-  std::pair<Value *, Value *> determineBaseAndStride(GetElementPtrInst *GEP,
+  std::pair<Value *, Value *> determineBaseAndStride(Instruction *Ptr,
                                                      IRBuilderBase &Builder);
 
   bool matchStridedRecurrence(Value *Index, Loop *L, Value *&Stride,
@@ -321,9 +321,19 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
 }
 
 std::pair<Value *, Value *>
-RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP,
+RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr,
                                                    IRBuilderBase &Builder) {
 
+  // A gather/scatter of a splat is a zero strided load/store.
+  if (auto *BasePtr = getSplatValue(Ptr)) {
+    Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType());
+    return std::make_pair(BasePtr, ConstantInt::get(IntPtrTy, 0));
+  }
+
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP)
+    return std::make_pair(nullptr, nullptr);
+
   auto I = StridedAddrs.find(GEP);
   if (I != StridedAddrs.end())
     return I->second;
@@ -452,17 +462,17 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
   if (!TLI->isTypeLegal(DataTypeVT))
     return false;
 
-  // Pointer should be a GEP.
-  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
-  if (!GEP)
+  // Pointer should be an instruction.
+  auto *PtrI = dyn_cast<Instruction>(Ptr);
+  if (!PtrI)
     return false;
 
-  LLVMContext &Ctx = GEP->getContext();
+  LLVMContext &Ctx = PtrI->getContext();
   IRBuilder<InstSimplifyFolder> Builder(Ctx, *DL);
-  Builder.SetInsertPoint(GEP);
+  Builder.SetInsertPoint(PtrI);
 
   Value *BasePtr, *Stride;
-  std::tie(BasePtr, Stride) = determineBaseAndStride(GEP, Builder);
+  std::tie(BasePtr, Stride) = determineBaseAndStride(PtrI, Builder);
   if (!BasePtr)
     return false;
   assert(Stride != nullptr);
@@ -485,8 +495,8 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
   II->replaceAllUsesWith(Call);
   II->eraseFromParent();
 
-  if (GEP->use_empty())
-    RecursivelyDeleteTriviallyDeadInstructions(GEP);
+  if (PtrI->use_empty())
+    RecursivelyDeleteTriviallyDeadInstructions(PtrI);
 
   return true;
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index fd51949c6023aa5..30790064090c4ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -12918,60 +12918,19 @@ define <4 x i32> @mgather_broadcast_load_unmasked2(ptr %base) {
 ; RV32-LABEL: mgather_broadcast_load_unmasked2:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vluxei32.v v8, (zero), v8
+; RV32-NEXT:    vlse32.v v8, (a0), zero
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_broadcast_load_unmasked2:
 ; RV64V:       # %bb.0:
-; RV64V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64V-NEXT:    vmv.v.x v10, a0
-; RV64V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64V-NEXT:    vluxei64.v v8, (zero), v10
+; RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64V-NEXT:    vlse32.v v8, (a0), zero
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_broadcast_load_unmasked2:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT:    vmset.m v8
-; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV64ZVE32F-NEXT:    # implicit-def: $v8
-; RV64ZVE32F-NEXT:    beqz zero, .LBB100_5
-; RV64ZVE32F-NEXT:  # %bb.1: # %else
-; RV64ZVE32F-NEXT:    andi a2, a1, 2
-; RV64ZVE32F-NEXT:    bnez a2, .LBB100_6
-; RV64ZVE32F-NEXT:  .LBB100_2: # %else2
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
-; RV64ZVE32F-NEXT:    bnez a2, .LBB100_7
-; RV64ZVE32F-NEXT:  .LBB100_3: # %else5
-; RV64ZVE32F-NEXT:    andi a1, a1, 8
-; RV64ZVE32F-NEXT:    bnez a1, .LBB100_8
-; RV64ZVE32F-NEXT:  .LBB100_4: # %else8
-; RV64ZVE32F-NEXT:    ret
-; RV64ZVE32F-NEXT:  .LBB100_5: # %cond.load
 ; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vlse32.v v8, (a0), zero
-; RV64ZVE32F-NEXT:    andi a2, a1, 2
-; RV64ZVE32F-NEXT:    beqz a2, .LBB100_2
-; RV64ZVE32F-NEXT:  .LBB100_6: # %cond.load1
-; RV64ZVE32F-NEXT:    lw a2, 0(a0)
-; RV64ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
-; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 1
-; RV64ZVE32F-NEXT:    andi a2, a1, 4
-; RV64ZVE32F-NEXT:    beqz a2, .LBB100_3
-; RV64ZVE32F-NEXT:  .LBB100_7: # %cond.load4
-; RV64ZVE32F-NEXT:    lw a2, 0(a0)
-; RV64ZVE32F-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
-; RV64ZVE32F-NEXT:    vmv.s.x v9, a2
-; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 2
-; RV64ZVE32F-NEXT:    andi a1, a1, 8
-; RV64ZVE32F-NEXT:    beqz a1, .LBB100_4
-; RV64ZVE32F-NEXT:  .LBB100_8: # %cond.load7
-; RV64ZVE32F-NEXT:    lw a0, 0(a0)
-; RV64ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.s.x v9, a0
-; RV64ZVE32F-NEXT:    vslideup.vi v8, v9, 3
 ; RV64ZVE32F-NEXT:    ret
   %head = insertelement <4 x i1> poison, i1 true, i32 0
   %allones = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer

>From 5f16a3a489bd1730db49f635604bc1d833abed15 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar at redhat.com>
Date: Fri, 8 Sep 2023 09:40:37 -0700
Subject: [PATCH 24/32] workflows: Add a simple pull request subscription
 workflow (#64913)

This new workflow will make it possible for people to subscribe to pull
requests based on the labels that are added. Labels will be added
automatically to the pull requests based on the modified files and each
label will be associated with a GitHub team that will be notified when
the label is added.

See
https://discourse.llvm.org/t/changes-to-pull-request-subscription-system/73296
---
 .github/new-prs-labeler.yml         | 819 +++++++++++++++++++++-------
 .github/workflows/pr-subscriber.yml |  31 ++
 llvm/utils/git/github-automation.py |  39 ++
 3 files changed, 707 insertions(+), 182 deletions(-)
 create mode 100644 .github/workflows/pr-subscriber.yml

diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 79bd72c8f6602a1..bc4a30a4802a7c3 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -1,91 +1,6 @@
-backend:AArch64:
-  - llvm/include/llvm/IR/IntrinsicsAArch64.td
-  - llvm/lib/Target/AArch64/**/*
-  - llvm/test/**/*AArch64/**/*
-  - clang/lib/Basic/Targets/AArch64*
-  - clang/lib/Driver/ToolChains/Arch/AArch64*
-  - clang/lib/CodeGen/Targets/AArch64.cpp
-  - clang/include/clang/Basic/BuiltinsAArch64*
-
-backend:ARM:
-  - llvm/include/llvm/IR/IntrinsicsARM.td
-  - llvm/lib/Target/ARM/**/*
-  - llvm/test/**/*ARM/**/*
-  - clang/lib/Basic/Targets/ARM*
-  - clang/lib/Driver/ToolChains/Arch/ARM*
-  - clang/lib/CodeGen/Targets/ARM.cpp
-  - clang/include/clang/Basic/BuiltinsARM*
-
-backend:DirectX:
-  - llvm/include/llvm/IR/IntrinsicsDirectX.td
-  - llvm/**/Target/DirectX/**/*
-  - llvm/**/dxil-dis/**/*
-  - clang/lib/Basic/Targets/DirectX*
-
-backend:MSP430:
-  - llvm/include/llvm/IR/IntrinsicsMSP430.td
-  - llvm/lib/Target/MSP430/**/*
-  - llvm/test/**/MSP430/**/*
-  - clang/lib/Basic/Targets/MSP430*
-  - clang/lib/Driver/ToolChains/Arch/MSP430*
-  - clang/lib/CodeGen/Targets/MSP430.cpp
-  - clang/include/clang/Basic/BuiltinsMSP430*
-
-backend:RISC-V:
-  - llvm/**/*RISCV*/**/*
-  - llvm/**/*risv*/**/*
-  - clang/**/*RISCV*/**/*
-  - clang/**/*risv*/**/*
-
-backend:Sparc:
-  - llvm/include/llvm/IR/IntrinsicsSparc.td
-  - llvm/lib/Target/Sparc/**/*
-  - llvm/test/**/Sparc/**/*
-  - clang/lib/Basic/Targets/Sparc*
-  - clang/lib/Driver/ToolChains/Arch/Sparc*
-  - clang/lib/CodeGen/Targets/Sparc.cpp
-  - clang/include/clang/Basic/BuiltinsSparc*
-
-backend:X86:
-  - llvm/include/llvm/IR/IntrinsicsX86.td
-  - llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
-  - llvm/include/llvm/TargetParser/X86*
-  - llvm/lib/TargetParser/X86*
-  - llvm/**/**/X86/**/*
-  - llvm/utils/TableGen/X86*
-  - clang/lib/Basic/Targets/X86/**/*
-  - clang/lib/Driver/ToolChains/Arch/X86.*
-  - clang/lib/CodeGen/Targets/X86.*
-  - clang/test/CodeGen/X86/**/*
-  - clang/include/clang/Basic/BuiltinsX86*
-
-backend:loongarch:
-  - llvm/include/llvm/IR/IntrinsicsLoongArch.td
-  - llvm/lib/Target/LoongArch/**/*
-  - llvm/test/**/LoongArch/**/*
-  - clang/lib/Basic/Targets/LoongArch*
-  - clang/lib/Driver/ToolChains/Arch/LoongArch*
-  - clang/lib/CodeGen/Targets/LoongArch.cpp
-  - clang/include/clang/Basic/BuiltinsLoongArch*
-
-clang-format:
-  - clang/**/Format/**/*
-  - clang/tools/clang-format/**/*
-
-clang-tidy:
-  - clang-tools-extra/**/clang-tidy/**/*
-  - clang-tools-extra/clang-tidy/**/*
-
-clang:codegen:
-  - clang/lib/CodeGen/**/*
-  - clang/include/clang/CodeGen/**/*
-
 clang:dataflow:
   - clang/**/Analysis/**/*
 
-clang:driver:
-  - clang/**/Driver/**/*
-
 clang:frontend:
   - clang/lib/AST/**/*
   - clang/include/clang/AST/**/*
@@ -108,130 +23,43 @@ clang:static analyzer:
   - clang/utils/analyzer/**/*
   - clang/docs/analyzer/**/*
 
-clangd:
-  - clang-tools-extra/clangd/**/*
-
 compiler-rt:
   - compiler-rt/**/*
 
-coroutines:
-  - '**/*Coroutine*'
-  - '**/*Coroutine*/**/*'
-
 flang:
   - flang/**/*
 
-flang:driver:
-  - flang/tools/flang-driver/**/*
-  - flang/test/Driver/**/*
-
 flang:frontend:
   - flang/Parser/**/*
   - flang/Evaluate/**/*
   - flang/Semantics/**/*
 
-flang:runtime:
-  - flang/runtime/**/*
-
 HLSL:
   - clang/*HLSL*/**/*
   - clang/**/*HLSL*
   - llvm/**/Frontend/HLSL/**/*
 
-libc:
-  - libc/**/*
-  - utils/bazel/llvm-project-overlay/libc/**/*
-
 libc++:
   - libcxx/**/*
 
 libc++-abi:
   - libcxxabi/**/*
 
-libunwind:
-  - libunwind/**/*
-
 lld:
   - lld/**/*
 
-lldb:
-  - lldb/**/*
-
 llvm-lit:
   - llvm/utils/lit/**/*
 
-llvm:SelectionDAGP:
-  - llvm/include/llvm/CodeGen/DAGCombine.h
-  - llvm/include/llvm/CodeGen/ISDOpcodes.h
-  - llvm/include/llvm/CodeGen/SelectionDAG*.h
-  - llvm/include/llvm/CodeGen/SDNodeProperties.td
-  - llvm/include/llvm/Target/TargetSelectionDAG.td
-  - llvm/lib/CodeGen/SelectionDAG/**/*
-  - llvm/utils/TableGen/CodeGenDAG*
-  - llvm/utils/TableGen/DAGISel*
-
-llvm:globalisel:
-  - llvm/**/GlobalISel/**/*
-  - llvm/utils/TableGen/GlobalISelEmitter.cpp
-  - llvm/utils/TableGen/GICombinerEmitter.cpp
-
-LTO:
-  - llvm/**/LTO/**
-  - llvm/**/ThinLTO/**
-
 mc:
   - llvm/**/MC/**
 
-mlir:
-  - mlir/**
-
 mlir:afine:
   - mlir/**/Affine/**/*
 
-mlir:bufferization:
-  - mlir/**/Bufferization/**/*
-
-mlir:complex:
-  - mlir/**/Complex/**/*
-
-mlir:core:
-  - mlir/**/AsmParser/**/*
-  - mlir/**/Bytecode/**/*
-  - mlir/**/Debug/**/*
-  - mlir/**/IR/**/*
-  - mlir/**/Parser/**/*
-  - mlir/**/Pass/**/*
-  - mlir/**/Reducer/**/*
-  - mlir/**/Support/**/*
-  - mlir/**/Transforms/**/*
-  - mlir/**/tools/**/*
-  - mlir/tools/**/*
-
-mlir:gpu:
-  - mlir/**/*GPU*/**/*
-
-mlir:linalg:
-  - mlir/**/Linalg/**/*
-  - mlir/**/linalg/**/*
-
-mlir:llvm:
-  - mlir/**/LLVM/**/*
-
-mlir:memref:
-  - mlir/**/MemRef/**/*
-
 mlir:python:
   - mlir/python/**/*
 
-mlir:scf:
-  - mlir/**/SCF/**/*
-
-mlir:tensor:
-  - mlir/**/Tensor/**/*
-
-mlir:tosa:
-  - mlir/**/Tosa/**/*
-
 mlir:vectorops:
   - mlir/**/Vector/**/*
 
@@ -260,14 +88,641 @@ vectorization:
   - llvm/lib/Transforms/Vectorize/**/*
   - llvm/include/llvm/Transforms/Vectorize/**/*
 
+# IMPORTED FROM CODEOWNERS
+LTO:
+  - /llvm/*/LTO/
+  - /llvm/*/Linker/
+  - /llvm/*/ThinLTO/
+  - /llvm/lib/Transforms/*/FunctionImport*
+  - /llvm/tools/gold/
+
+mc:
+  - /llvm/*/MC/
+
+clang:driver:
+  - /clang/*/Driver/
+
+compiler-rt:sanitizer:
+  - /llvm/lib/Transforms/Instrumentation/*Sanitizer*
+  - /compiler-rt/lib/interception/
+  - /compiler-rt/lib/*san*
+  - /compiler-rt/test/*san*
+  - /compiler-rt/lib/fuzzer/
+  - /compiler-rt/test/fuzzer/
+  - /compiler-rt/lib/scudo/
+  - /compiler-rt/test/scudo/
+
 xray:
-  - llvm/tools/llvm-xray/**
-  - compiler-rt/**/xray/**
-  - clang/include/clang/Basic/XRay*
-  - clang/lib/Basic/XRay*
-  - compiler-rt/**/xray/*
-  - llvm/include/llvm/XRay/*
-  - llvm/lib/XRay/*
-  - llvm/tools/llvm-xray/*
-  - llvm/unittests/XRay/*
-  - compiler-rt/**/xray/*
+  - /llvm/tools/llvm-xray/
+  - /compiler-rt/*/xray/
+  - /clang/include/clang/Basic/XRay*
+  - /clang/lib/Basic/XRay*
+  - /compiler-rt/*/xray/
+  - /llvm/include/llvm/XRay/
+  - /llvm/lib/XRay/
+  - /llvm/tools/llvm-xray/
+  - /llvm/unittests/XRay/
+  - /compiler-rt/*/xray/
+
+clang:codegen:
+  - /clang/lib/CodeGen/**
+  - /clang/include/clang/CodeGen/
+
+mlir:
+  - /mlir/
+
+mlir:core:
+  - /mlir/**/Support/
+  - /mlir/**/Parser/
+  - /mlir/**/IR/
+  - /mlir/**/Bytecode/
+  - /mlir/**/AsmParser/
+  - /mlir/**/Pass/
+  - /mlir/**/tools/
+  - /mlir/**/Reducer/
+  - /mlir/**/Transforms/
+  - /mlir/**/Debug/
+  - /mlir/tools/
+
+mlir:ods:
+  - /mlir/TableGen/
+  - /mlir/tblgen/
+  - /mlir/include/mlir/IR/*.td
+
+mlir:bindings:
+  - /mlir/Bindings/
+
+mlir:gpu:
+  - /mlir/**/*GPU
+
+mlir:amdgpu:
+  - /mlir/**/AMDGPU/
+
+mlir:amx:
+  - /mlir/**/AMX/
+
+mlir:affine:
+  - /mlir/**/Affine/
+
+mlir:arith:
+  - /mlir/**/Arith/
+
+mlir:neon:
+  - /mlir/**/ArmNeon/
+
+mlir:sme:
+  - /mlir/**/ArmSME/
+
+mlir:sve:
+  - /mlir/**/ArmSVE/
+
+mlir:async:
+  - /mlir/**/Async/
+  - /mlir/**/Async/
+
+mlir:bufferization:
+  - /mlir/**/Bufferization/
+
+mlir:complex:
+  - /mlir/**/Complex/
+
+mlir:cf:
+  - /mlir/**/ControlFlow/
+
+mlir:dlti:
+  - /mlir/**/DLTI/
+
+mlir:emitc:
+  - /mlir/**/EmitC/
+
+mlir:func:
+  - /mlir/**/Func/
+
+mlir:irdl:
+  - /mlir/**/IRDL/
+
+mlir:index:
+  - /mlir/**/Index/
+
+mlir:llvm:
+  - /mlir/**/LLVM/
+
+mlir:linalg:
+  - /mlir/**/*linalg
+  - /mlir/**/*Linalg
+
+mlir:mlprogram:
+  - /mlir/**/MLProgram
+
+mlir:math:
+  - /mlir/**/Math/
+
+mlir:memref:
+  - /mlir/**/MemRef/
+
+mlir:nvgpu:
+  - /mlir/**/NVGPU/
+
+mlir:openacc:
+  - /mlir/**/*OpenACC
+
+mlir:openmp:
+  - /mlir/**/*OpenMP
+
+mlir:pdl:
+  - /mlir/**/PDL/
+
+mlir:quant:
+  - /mlir/**/Quant/
+
+mlir:scf:
+  - /mlir/**/SCF/
+
+mlir:spirv:
+  - /mlir/**/SPIRV/
+
+mlir:shape:
+  - /mlir/**/Shape/
+
+mlir:sparse:
+  - /mlir/**/SparseTensor/
+
+mlir:tensor:
+  - /mlir/**/Tensor/
+
+mlir:tosa:
+  - /mlir/**/Tosa/
+
+mlir:ub:
+  - /mlir/**/UB/
+
+mlir:vector:
+  - /mlir/**/*Vector/
+
+mlir:execution-engine:
+  - /mlir/**/ExecutionEngine/
+
+coroutines:
+  - /clang/docs/DebuggingCoroutines.rst
+  - /clang/lib/Sema/SemaCoroutine.cpp
+  - /clang/lib/CodeGen/CGCoroutine.cpp
+  - /clang/test/CodeGenCoroutines/
+  - /llvm/docs/Coroutines.rst
+  - /llvm/include/llvm/Transforms/Coroutines/
+  - /llvm/lib/Transforms/Coroutines/
+  - /llvm/test/Transforms/Coroutines/*
+
+clang:modules:
+  - /clang/docs/StandardCPlusPlusModules.rst
+  - /clang/include/clang/AST/AbstractBasicReader.h
+  - /clang/include/clang/AST/AbstractBasicWriter.h
+  - /clang/include/clang/AST/AbstractTypeReader.h
+  - /clang/include/clang/AST/AbstractTypeWriter.h
+  - /clang/include/clang/AST/PropertiesBase.td
+  - /clang/include/clang/AST/ODRHash.h
+  - /clang/include/clang/AST/TypeProperties.td
+  - /clang/include/clang/Basic/Module.h
+  - /clang/include/clang/Frontend/PrecompiledPreamble.h
+  - /clang/include/clang/Lex/ModuleLoader.h
+  - /clang/include/clang/Lex/ModuleMap.h
+  - /clang/include/clang/Serialization/
+  - /clang/lib/AST/ODRHash.cpp
+  - /clang/lib/AST/StmtProfile.cpp
+  - /clang/lib/Basic/Module.cpp
+  - /clang/lib/Frontend/ModuleDependencyCollector.cpp
+  - /clang/lib/Frontend/PrecompiledPreamble.cpp
+  - /clang/lib/Lex/ModuleMap.cpp
+  - /clang/lib/Sema/SemaModule.cpp
+  - /clang/lib/Serialization/
+  - /clang/test/CXX/module/
+  - /clang/test/Modules/
+  - /clang/unittests/Serialization/*
+
+clang-tidy:
+  - /clang-tools-extra/clang-tidy/
+  - /clang-tools-extra/docs/clang-tidy/
+  - /clang-tools-extra/test/clang-tidy/
+
+tools:llvm-mca:
+  - /llvm/tools/llvm-mca/
+  - /llvm/include/llvm/MCA/
+  - /llvm/lib/MCA/
+
+vectorizers:
+  - /llvm/lib/Transforms/Vectorize/
+  - /llvm/include/llvm/Transforms/Vectorize/
+
+clang:
+  - /clang/
+
+testing-tools:
+  - /llvm/include/llvm/FileCheck/
+  - /llvm/lib/FileCheck/
+  - /llvm/test/FileCheck/
+  - /llvm/unittests/FileCheck/
+  - /llvm/utils/lit/
+  - /llvm/utils/split-file/
+  - /llvm/utils/not/
+  - /llvm/utils/count/
+  - /llvm/utils/FileCheck/
+  - /llvm/docs/CommandGuide/FileCheck.rst
+  - /llvm/docs/CommandGuide/lit.rst
+  - /llvm/docs/TestingGuide.rst
+  - /llvm/test/Other/FileCheck-space.txt
+  - /llvm/utils/UpdateTestChecks/
+  - /llvm/utils/update*_test_checks.py
+
+debuginfo:
+  - /llvm/include/llvm/DebugInfo/
+  - /llvm/lib/DebugInfo/
+  - /llvm/tools/dsymutil/
+  - /llvm/tools/llvm-debuginfo-analyzer/
+  - /llvm/tools/llvm-dwarfdump/
+  - /llvm/tools/llvm-dwarfutil/
+  - /llvm/tools/llvm-dwp/
+  - /llvm/tools/llvm-gsymutil/
+  - /llvm/tools/llvm-pdbutil/
+  - /llvm/tools/llvm-debuginfod/
+  - /llvm/tools/llvm-debuginfod-find/
+  - /llvm/lib/CodeGen/AsmPrinter/
+  - /clang/lib/CodeGen/CGDebugInfo.cpp
+  - /llvm/include/llvm/BinaryFormat/Dwarf.*
+  - /llvm/test/DebugInfo/
+  - /llvm/test/tools/dsymutil/
+  - /llvm/test/tools/llvm-debuginfo-analyzer/
+  - /llvm/test/tools/llvm-debuginfod/
+  - /llvm/test/tools/llvm-debuginfod-find/
+  - /llvm/test/tools/llvm-dwarfdump/
+  - /llvm/test/tools/llvm-dwarfutil/
+  - /llvm/test/tools/llvm-dwp/
+  - /llvm/test/tools/llvm-gsymutil/
+  - /llvm/test/tools/llvm-pdbutil/
+  - /llvm/lib/IR/Debug*.cpp
+  - /llvm/include/llvm/IR/Debug*.h
+
+github:workflow:
+  - /.github/workflows/
+
+flang:driver:
+  - /flang/tools/flang-driver/
+  - /flang/unittests/Frontend/
+  - /flang/lib/FrontendTool/
+  - /flang/lib/Frontend/
+  - /flang/include/flang/Frontend/
+  - /flang/include/flang/FrontendTool/
+  - /flang/test/Driver/
+
+backend:m68k:
+  - /llvm/lib/Target/M68k/
+  - /clang/lib/Basic/Targets/M68k.*
+  - /clang/lib/CodeGen/Targets/M68k.cpp
+  - /llvm/test/CodeGen/M68k/
+  - /llvm/test/MC/Disassembler/M68k/
+  - /llvm/test/MC/M68k/
+
+libc++:
+  - /libcxx/
+  - /runtimes/
+
+libc++abi:
+  - /libcxxabi/
+  - /runtimes/
+
+libunwind:
+  - /libunwind/
+  - /runtimes/
+
+objectyaml:
+  - /llvm/include/llvm/ObjectYAML/
+  - /llvm/lib/ObjectYAML/
+  - /llvm/test/tools/obj2yaml/
+  - /llvm/test/tools/yaml2obj/
+  - /llvm/tools/obj2yaml/
+  - /llvm/tools/yaml2obj/
+
+clang:analysis:
+  - /clang/include/clang/Analysis/
+  - /clang/lib/Analysis/
+
+clang:static analyzer:
+  - /clang/include/clang/StaticAnalyzer/
+  - /clang/lib/StaticAnalyzer/
+  - /clang/tools/scan-build/
+  - /clang/utils/analyzer/
+  - /clang/docs/analyzer/
+
+pgo:
+  - /llvm/lib/Transforms/Instrumentation/CGProfile.cpp
+  - /llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+  - /llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+  - /llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+  - /llvm/lib/Transforms/Instrumentation/PGO*
+  - /llvm/lib/Transforms/Instrumentation/ValueProfile*
+  - /llvm/test/Instrumentation/InstrProfiling/
+  - /llvm/test/Transforms/PGOProfile/
+  - /compiler-rt/lib/profile/
+  - /compiler-rt/lib/memprof/
+  - /compiler-rt/test/profile/
+  - /compiler-rt/test/memprof/
+  - /llvm/tools/llvm-profdata/
+  - /llvm/tools/llvm-profgen/
+  - /llvm/test/tools/llvm-profdata/
+  - /llvm/test/tools/llvm-profgen/
+  - /llvm/unittests/ProfileData/*
+
+openacc:
+  - /flang/**/OpenACC/
+  - /flang/include/flang/Lower/OpenACC.h
+  - /flang/docs/OpenACC.md
+  - /flang/lib/Parser/openacc-parsers.cpp
+  - /flang/lib/Lower/OpenACC.cpp
+  - /llvm/**/Frontend/OpenACC/
+  - /llvm/unittests/Frontend/OpenACCTest.cpp
+  - /mlir/test/Target/LLVMIR/openacc-llvm.mlir
+  - /mlir/**/*OpenACC/
+
+flang:runtime:
+  - /flang/runtime/
+
+flang:parser:
+  - /flang/**/Parser/
+
+flang:semantics:
+  - /flang/**/Evaluate/
+  - /flang/**/Semantics/
+
+flang:fir-hlfir:
+  - /flang/**/Lower/
+  - /flang/**/Optimizer/
+
+flang:codegen:
+  - flang/**/CodeGen/
+
+llvm:globalisel:
+  - /llvm/**/GlobalISel/
+  - /llvm/utils/TableGen/GlobalISel*
+
+function-specialization:
+  - /llvm/include/llvm/Transforms/Utils/SCCPSolver.h
+  - /llvm/lib/Transforms/Utils/SCCPSolver.cpp
+  - /llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+  - /llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+  - /llvm/test/Transforms/FunctionSpecialization/*
+
+libc:
+  - libc/**
+  - utils/bazel/llvm-project-overlay/libc/**
+
+clang-format:
+  - /clang/**/Format/
+  - /clang/tools/clang-format/
+
+flang:openmp:
+  - /flang/test/**/OpenMP/
+  - /flang/lib/Lower/OpenMP.cpp
+  - /flang/lib/Semantics/resolve-directives.cpp
+  - /flang/lib/Semantics/check-omp-structure.cpp
+  - /flang/lib/Optimizer/Transforms/OMP*
+  - /flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+  - /flang/test/Lower/OpenMP/
+  - /flang/test/Transforms/omp*
+  - /mlir/**/*OpenMP*
+  - /mlir/test/Target/LLVMIR/openmp*
+  - /llvm/lib/Frontend/OpenMP/
+  - /llvm/include/llvm/Frontend/OpenMP/
+  - /llvm/unittests/Frontend/OpenMP*
+
+llvm:ir:
+  - /llvm/lib/IR/
+  - /llvm/include/llvm/IR/
+  - /llvm/docs/LangRef.rst
+
+llvm:analysis:
+  - /llvm/lib/Analysis/
+  - /llvm/include/llvm/Analysis/
+
+llvm:transforms:
+  - /llvm/lib/Transforms/
+  - /llvm/include/llvm/Transforms/
+
+clangd:
+  - /clang-tools-extra/clangd/
+
+hlsl:
+  - /clang/test/ParserHLSL/
+  - /clang/test/SemaHLSL/
+  - /clang/test/AST/HLSL/
+  - /clang/test/CodeGenHLSL/
+  - /clang/cmake/caches/HLSL.cmake
+  - /clang/include/clang/Basic/HLSL*.h
+  - /clang/include/clang/Sema/HLSL*.h
+  - /clang/docs/HLSL/
+  - /clang/lib/Driver/ToolChains/HLSL*
+  - /clang/lib/Parse/ParseHLSL.cpp
+  - /clang/lib/Sema/HLSLExternalSemaSource.cpp
+  - /clang/lib/Sema/SemaHLSL.cpp
+  - /clang/lib/CodeGen/CGHLSLRuntime.*
+  - /llvm/include/llvm/Frontend/HLSL/
+  - /llvm/lib/Frontend/HLSL/
+
+llvm:SelectionDAG:
+  - /llvm/include/llvm/CodeGen/SelectionDAG*.h
+  - /llvm/include/llvm/CodeGen/SDNodeProperties.td
+  - /llvm/include/llvm/Target/TargetSelectionDAG.td
+  - /llvm/lib/CodeGen/SelectionDAG/
+  - /llvm/utils/TableGen/CodeGenDAG*
+  - /llvm/utils/TableGen/DAGISel*
+  - /llvm/include/llvm/CodeGen/DAGCombine.h
+  - /llvm/include/llvm/CodeGen/ISDOpcodes.h
+
+backend:DirectX:
+  - /llvm/lib/Target/DirectX/
+  - /llvm/test/CodeGen/DirectX/
+  - /llvm/tools/dxil-dis
+  - /llvm/test/tools/dxil-dis
+  - /clang/lib/Basic/Targets/DirectX*
+  - /llvm/include/llvm/IR/IntrinsicsDirectX.td
+
+mlgo:
+  - /llvm/lib/Analysis/ML*
+  - /llvm/include/llvm/Analysis/ML*
+  - /llvm/lib/Analysis/*Runner.cpp
+  - /llvm/include/llvm/Analysis/*Runner.h
+  - /llvm/unittests/Analysis/ML*
+  - /llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp
+  - /llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h
+  - /llvm/test/Analysis/FunctionPropertiesAnalysis/*
+  - /llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
+  - /llvm/test/Transforms/inline/ML/
+  - /llvm/lib/CodeGen/ML*
+  - /llvm/unittests/CodeGen/ML*
+  - /llvm/test/CodeGen/MLRegAlloc/
+
+tools:llvm-exegesis:
+  - /llvm/tools/llvm-exegesis/
+  - /llvm/test/tools/llvm-exegesis/
+  - /llvm/unittests/tools/llvm-exegesis/
+
+platform:windows:
+  - /lld/COFF/
+  - /clang/lib/Driver/MSVC.cpp
+  - /clang/lib/Driver/MinGW.cpp
+  - /llvm/lib/DebugInfo/CodeView/
+  - /llvm/lib/DebugInfo/PDB/
+  - /llvm/lib/WindowsDriver/
+  - /llvm/lib/Support/Windows/
+  - /llvm/lib/BinaryFormat/COFF.cpp
+
+llvm:regalloc:
+  - /llvm/**/CodeGen/CalcSpillWeights*
+  - /llvm/**/CodeGen/InlineSpiller*
+  - /llvm/**/CodeGen/InterferenceCache*
+  - /llvm/**/CodeGen/LiveInterval*
+  - /llvm/**/CodeGen/LiveRange*
+  - /llvm/**/CodeGen/LiveReg*
+  - /llvm/**/CodeGen/LiveVariables*
+  - /llvm/**/CodeGen/MachineCopyPropagation*
+  - /llvm/**/CodeGen/PHIElimination*
+  - /llvm/**/CodeGen/ProcessImplicitDefs.cpp
+  - /llvm/**/CodeGen/Register*
+  - /llvm/**/CodeGen/RegUsage*
+  - /llvm/**/CodeGen/RenameIndependentSubregs.cpp
+  - /llvm/**/CodeGen/SlotIndexes.h
+  - /llvm/**/CodeGen/SpillPlacement*
+  - /llvm/**/CodeGen/SplitKit*
+  - /llvm/**/CodeGen/VirtRegMap.h
+  - /llvm/include/PBQP/
+  - /llvm/include/PBQPRAConstraint.h
+  - /llvm/include/llvm/CodeGen/Spiller.h
+  - /llvm/**/*RegAlloc
+
+mlir:presburger:
+  - /mlir/**/*Presburger
+
+lldb:
+  - /lldb/**
+
+backend:AMDGPU:
+  - '**/*amdgpu*'
+  - '**/*AMDGPU*'
+
+backend:RISC-V:
+  - /clang/**/*riscv*
+  - /clang/**/*RISCV*
+  - /llvm/**/*riscv*
+  - /llvm/**/*RISCV*
+
+lld:coff:
+  - /lld/**/COFF/
+  - /lld/Common/
+
+lld:elf:
+  - /lld/**/ELF/
+  - /lld/Common/
+
+lld:macho:
+  - /lld/**/MachO/
+  - /lld/Common/
+
+lld:wasm:
+  - /lld/**/wasm/
+  - /lld/Common/
+
+backend:ARM:
+  - /llvm/include/llvm/IR/IntrinsicsARM.td
+  - /llvm/test/MC/ARM/
+  - /llvm/lib/Target/ARM/
+  - /llvm/test/CodeGen/ARM/
+  - /clang/lib/Basic/Targets/ARM*
+  - /clang/lib/Driver/ToolChains/Arch/ARM.*
+  - /clang/lib/CodeGen/Targets/ARM.cpp
+  - /clang/include/clang/Basic/BuiltinsARM*
+  - /llvm/test/MC/Disassembler/ARM/
+
+backend:AArch64:
+  - /llvm/include/llvm/IR/IntrinsicsAArch64.td
+  - /llvm/test/MC/AArch64/
+  - /llvm/lib/Target/AArch64/
+  - /llvm/test/CodeGen/AArch64/
+  - /clang/lib/Basic/Targets/AArch64*
+  - /clang/lib/Driver/ToolChains/Arch/AArch64.*
+  - /clang/lib/CodeGen/Targets/AArch64.cpp
+  - /clang/include/clang/Basic/BuiltinsAArch64*
+  - /llvm/test/MC/Disassembler/AArch64/
+
+backend:loongarch:
+  - /llvm/include/llvm/IR/IntrinsicsLoongArch.td
+  - /llvm/test/MC/LoongArch/
+  - /llvm/lib/Target/LoongArch/
+  - /llvm/test/CodeGen/LoongArch/
+  - /clang/lib/Basic/Targets/LoongArch*
+  - /clang/lib/Driver/ToolChains/Arch/LoongArch.*
+  - /clang/lib/CodeGen/Targets/LoongArch.cpp
+  - /clang/include/clang/Basic/BuiltinsLoongArch*
+
+backend:MSP430:
+  - /llvm/include/llvm/IR/IntrinsicsMSP430.td
+  - /llvm/test/MC/MSP430/
+  - /llvm/lib/Target/MSP430/
+  - /llvm/test/CodeGen/MSP430/
+  - /clang/lib/Basic/Targets/MSP430*
+  - /clang/lib/Driver/ToolChains/Arch/MSP430.*
+  - /clang/lib/CodeGen/Targets/MSP430.cpp
+  - /clang/include/clang/Basic/BuiltinsMSP430*
+  - /llvm/test/MC/Disassembler/MSP430/
+
+backend:Sparc:
+  - /llvm/include/llvm/IR/IntrinsicsSparc.td
+  - /llvm/test/MC/Sparc/
+  - /llvm/lib/Target/Sparc/
+  - /llvm/test/CodeGen/Sparc/
+  - /clang/lib/Basic/Targets/Sparc*
+  - /clang/lib/Driver/ToolChains/Arch/Sparc.*
+  - /clang/lib/CodeGen/Targets/Sparc.cpp
+  - /clang/include/clang/Basic/BuiltinsSparc*
+  - /llvm/test/MC/Disassembler/Sparc/
+
+backend:WebAssembly:
+  - /llvm/lib/Target/WebAssembly/
+  - /llvm/test/CodeGen/WebAssembly/
+  - /clang/lib/Basic/Targets/WebAssembly*
+  - /clang/include/clang/Basic/BuiltinsWebAssembly.def
+  - /clang/include/clang/Basic/WebAssemblyReferenceTypes.def
+  - /clang/lib/CodeGen/Targets/WebAssembly*
+  - /llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+  - /llvm/include/llvm/Object/Wasm*
+  - /llvm/lib/CodeGen/AsmPrinter/Wasm*
+  - /llvm/lib/CodeGen/Wasm*
+  - /llvm/lib/MC/MCParser/Wasm*
+  - /llvm/lib/MC/Wasm*
+  - /llvm/lib/ObjCopy/wasm/
+  - /llvm/lib/Object/Wasm*
+  - /clang/lib/Driver/Toolchains/WebAssembly*
+  - /clang/lib/Headers/wasm_simd128.h
+  - /clang/test/CodeGen/WebAssembly/
+  - /clang/test/SemaCXX/*wasm*
+  - /clang/test/Sema/*wasm*
+  - /llvm/include/llvm/BinaryFormat/Wasm.h
+  - /llvm/unittests/Target/WebAssembly/
+  - /llvm/test/DebugInfo/WebAssembly/
+  - /llvm/test/MC/WebAssembly/
+
+backend:X86:
+  - /llvm/include/llvm/IR/IntrinsicsX86.td
+  - /llvm/lib/Target/X86/
+  - /llvm/test/CodeGen/X86/
+  - /llvm/test/MC/X86/
+  - /llvm/test/MC/Disassembler/X86/
+  - /llvm/test/Analysis/CostModel/X86/
+  - /llvm/test/tools/llvm-mca/X86/
+  - /clang/lib/Basic/Targets/X86/
+  - /clang/lib/Driver/ToolChains/Arch/X86.*
+  - /clang/lib/CodeGen/Targets/X86.*
+  - /clang/lib/Headers/
+  - /clang/test/CodeGen/X86/
+  - /clang/include/clang/Basic/BuiltinsX86*
+  - /llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+  - /llvm/include/llvm/TargetParser/X86*
+  - /llvm/lib/TargetParser/X86*
+  - /llvm/utils/TableGen/X86*
+
diff --git a/.github/workflows/pr-subscriber.yml b/.github/workflows/pr-subscriber.yml
new file mode 100644
index 000000000000000..af40f836811d6e8
--- /dev/null
+++ b/.github/workflows/pr-subscriber.yml
@@ -0,0 +1,31 @@
+name: PR Subscriber
+
+on:
+  pull_request:
+    types:
+      - labeled
+
+permissions:
+  contents: read
+
+jobs:
+  auto-subscribe:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Setup Automation Script
+        run: |
+          curl -O -L https://raw.githubusercontent.com/$GITHUB_REPOSITORY/$GITHUB_SHA/llvm/utils/git/github-automation.py
+          curl -O -L https://raw.githubusercontent.com/$GITHUB_REPOSITORY/$GITHUB_SHA/llvm/utils/git/requirements.txt
+          chmod a+x github-automation.py
+          pip install -r requirements.txt
+
+      - name: Update watchers
+        # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
+        env:
+          LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          ./github-automation.py \
+          --token '${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}' \
+          pr-subscriber \
+          --issue-number '${{ github.event.pull_request.number }}' \
+          --label-name "$LABEL_NAME"
diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py
index 1ec58fef573dcaf..6fcd29301230a54 100755
--- a/llvm/utils/git/github-automation.py
+++ b/llvm/utils/git/github-automation.py
@@ -65,6 +65,36 @@ def run(self) -> bool:
         return False
 
 
+class PRSubscriber:
+    @property
+    def team_name(self) -> str:
+        return self._team_name
+
+    def __init__(self, token: str, repo: str, pr_number: int, label_name: str):
+        self.repo = github.Github(token).get_repo(repo)
+        self.org = github.Github(token).get_organization(self.repo.organization.login)
+        self.pr = self.repo.get_issue(pr_number).as_pull_request()
+        self._team_name = "pr-subscribers-{}".format(label_name).lower()
+
+    def run(self) -> bool:
+        for team in self.org.get_teams():
+            if self.team_name != team.name.lower():
+                continue
+            try:
+                # GitHub limits comments to 65,536 characters, let's limit our comments to 20,000.
+                patch = requests.get(self.pr.diff_url).text[0:20000]
+            except requests.exceptions.RequestException:
+                patch = ""
+            comment = (
+                "@llvm/{}".format(team.slug)
+                + "\n\n<details><summary>Changes</summary><pre>\n"
+                + patch
+                + "\n</pre></details>"
+            )
+            self.pr.as_issue().create_comment(comment)
+        return True
+
+
 def setup_llvmbot_git(git_dir="."):
     """
     Configure the git repo in `git_dir` with the llvmbot account so
@@ -506,6 +536,10 @@ def execute_command(self) -> bool:
 issue_subscriber_parser.add_argument("--label-name", type=str, required=True)
 issue_subscriber_parser.add_argument("--issue-number", type=int, required=True)
 
+pr_subscriber_parser = subparsers.add_parser("pr-subscriber")
+pr_subscriber_parser.add_argument("--label-name", type=str, required=True)
+pr_subscriber_parser.add_argument("--issue-number", type=int, required=True)
+
 release_workflow_parser = subparsers.add_parser("release-workflow")
 release_workflow_parser.add_argument(
     "--llvm-project-dir",
@@ -551,6 +585,11 @@ def execute_command(self) -> bool:
         args.token, args.repo, args.issue_number, args.label_name
     )
     issue_subscriber.run()
+elif args.command == "pr-subscriber":
+    pr_subscriber = PRSubscriber(
+        args.token, args.repo, args.issue_number, args.label_name
+    )
+    pr_subscriber.run()
 elif args.command == "release-workflow":
     release_workflow = ReleaseWorkflow(
         args.token,

>From 123bf084029e4d35420a95e47ddd332d24c23a6d Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Fri, 8 Sep 2023 13:41:29 -0300
Subject: [PATCH 25/32] [libc] Unify gettime implementations (#65383)

Similar to D159208, this patch unifies the calls to a syscall, in this
patch it is the syscall SYS_clock_gettime/SYS_clock_gettime64.

This patch also fixes calls to SYS_clock_gettime64 by creating a
timespec64 object, passing it to the syscall and rewriting the timespec
given by the caller with timespec64 object's contents. This fixes cases
where timespec has a 4 bytes long time_t member, but SYS_clock_gettime
is not available (e.g., rv32).
---
 libc/src/time/CMakeLists.txt                  | 41 +++++++------------
 libc/src/time/linux/CMakeLists.txt            | 26 ++++++++++++
 libc/src/time/linux/clock.cpp                 | 18 +++-----
 .../clockGetTimeImpl.h}                       | 34 +++++++--------
 libc/src/time/linux/clock_gettime.cpp         | 35 ++++++++++++++++
 libc/src/time/{ => linux}/gettimeofday.cpp    | 26 +++++-------
 libc/src/time/linux/time.cpp                  | 15 ++-----
 7 files changed, 112 insertions(+), 83 deletions(-)
 rename libc/src/time/{clock_gettime.cpp => linux/clockGetTimeImpl.h} (63%)
 create mode 100644 libc/src/time/linux/clock_gettime.cpp
 rename libc/src/time/{ => linux}/gettimeofday.cpp (58%)

diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt
index 5a0b3ab31cf0f88..210f3b23432b49f 100644
--- a/libc/src/time/CMakeLists.txt
+++ b/libc/src/time/CMakeLists.txt
@@ -35,19 +35,6 @@ add_entrypoint_object(
     libc.include.time
 )
 
-add_entrypoint_object(
-  clock_gettime
-  SRCS
-    clock_gettime.cpp
-  HDRS
-    clock_gettime.h
-  DEPENDS
-    libc.include.time
-    libc.include.sys_syscall
-    libc.src.__support.OSUtil.osutil
-    libc.src.errno.errno
-)
-
 add_entrypoint_object(
   difftime
   SRCS
@@ -58,20 +45,6 @@ add_entrypoint_object(
     libc.include.time
 )
 
-add_entrypoint_object(
-  gettimeofday
-  SRCS
-    gettimeofday.cpp
-  HDRS
-    gettimeofday.h
-  DEPENDS
-    .clock_gettime
-    libc.include.time
-    libc.include.sys_syscall
-    libc.src.__support.OSUtil.osutil
-    libc.src.errno.errno
-)
-
 add_entrypoint_object(
   gmtime
   SRCS
@@ -126,3 +99,17 @@ add_entrypoint_object(
   DEPENDS
     .${LIBC_TARGET_OS}.nanosleep
 )
+
+add_entrypoint_object(
+  clock_gettime
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.clock_gettime
+)
+
+add_entrypoint_object(
+  gettimeofday
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.gettimeofday
+)
diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt
index 8b4976847f82af3..df79bf5986261cf 100644
--- a/libc/src/time/linux/CMakeLists.txt
+++ b/libc/src/time/linux/CMakeLists.txt
@@ -38,3 +38,29 @@ add_entrypoint_object(
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
+
+add_entrypoint_object(
+  clock_gettime
+  SRCS
+    clock_gettime.cpp
+  HDRS
+    ../clock_gettime.h
+  DEPENDS
+    libc.include.time
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
+
+add_entrypoint_object(
+  gettimeofday
+  SRCS
+    gettimeofday.cpp
+  HDRS
+    ../gettimeofday.h
+  DEPENDS
+    libc.include.time
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp
index 2b19f8e9c54cea7..cf27f5c2995712a 100644
--- a/libc/src/time/linux/clock.cpp
+++ b/libc/src/time/linux/clock.cpp
@@ -12,6 +12,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/errno/libc_errno.h"
+#include "src/time/linux/clockGetTimeImpl.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 #include <time.h>
@@ -20,19 +21,10 @@ namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(clock_t, clock, ()) {
   struct timespec ts;
-#if SYS_clock_gettime
-  int ret = __llvm_libc::syscall_impl<int>(
-      SYS_clock_gettime, CLOCK_PROCESS_CPUTIME_ID, reinterpret_cast<long>(&ts));
-#elif defined(SYS_clock_gettime64)
-  int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime64,
-                                           CLOCK_PROCESS_CPUTIME_ID,
-                                           reinterpret_cast<long>(&ts));
-#else
-#error "SYS_clock_gettime and SYS_clock_gettime64 syscalls not available."
-#endif
-  if (ret < 0) {
-    libc_errno = -ret;
-    return clock_t(-1);
+  auto result = internal::clock_gettimeimpl(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  if (!result.has_value()) {
+    libc_errno = result.error();
+    return -1;
   }
 
   // The above syscall gets the CPU time in seconds plus nanoseconds.
diff --git a/libc/src/time/clock_gettime.cpp b/libc/src/time/linux/clockGetTimeImpl.h
similarity index 63%
rename from libc/src/time/clock_gettime.cpp
rename to libc/src/time/linux/clockGetTimeImpl.h
index 94818c03768cc98..a5b7d659c7c14c2 100644
--- a/libc/src/time/clock_gettime.cpp
+++ b/libc/src/time/linux/clockGetTimeImpl.h
@@ -1,4 +1,4 @@
-//===---------- Linux implementation of the POSIX clock_gettime function --===//
+//===- Linux implementation of the POSIX clock_gettime function -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,40 +6,42 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/time/clock_gettime.h"
+#ifndef LLVM_LIBC_SRC_TIME_LINUX_CLOCKGETTIMEIMPL_H
+#define LLVM_LIBC_SRC_TIME_LINUX_CLOCKGETTIMEIMPL_H
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/error_or.h"
 #include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 #include <time.h>
 
 namespace __llvm_libc {
+namespace internal {
 
-// TODO(michaelrj): Move this into time/linux with the other syscalls.
-LLVM_LIBC_FUNCTION(int, clock_gettime,
-                   (clockid_t clockid, struct timespec *tp)) {
+LIBC_INLINE ErrorOr<int> clock_gettimeimpl(clockid_t clockid,
+                                           struct timespec *ts) {
 #if SYS_clock_gettime
   int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime,
                                            static_cast<long>(clockid),
-                                           reinterpret_cast<long>(tp));
+                                           reinterpret_cast<long>(ts));
 #elif defined(SYS_clock_gettime64)
+  struct timespec64 ts64;
   int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime64,
                                            static_cast<long>(clockid),
-                                           reinterpret_cast<long>(tp));
+                                           reinterpret_cast<long>(&ts64));
+  ts->tv_sec = static_cast<time_t>(ts64.tv_sec);
+  ts->tv_nsec = static_cast<long>(ts64.tv_nsec);
 #else
 #error "SYS_clock_gettime and SYS_clock_gettime64 syscalls not available."
 #endif
-
-  // A negative return value indicates an error with the magnitude of the
-  // value being the error code.
-  if (ret < 0) {
-    libc_errno = -ret;
-    return -1;
-  }
-
-  return 0;
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
 }
 
+} // namespace internal
 } // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_TIME_LINUX_CLOCKGETTIMEIMPL_H
diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp
new file mode 100644
index 000000000000000..33ec04eb352743e
--- /dev/null
+++ b/libc/src/time/linux/clock_gettime.cpp
@@ -0,0 +1,35 @@
+//===---------- Linux implementation of the POSIX clock_gettime function --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/time/clock_gettime.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+#include "src/time/linux/clockGetTimeImpl.h"
+
+#include <sys/syscall.h> // For syscall numbers.
+#include <time.h>
+
+namespace __llvm_libc {
+
+// TODO(michaelrj): Move this into time/linux with the other syscalls.
+LLVM_LIBC_FUNCTION(int, clock_gettime,
+                   (clockid_t clockid, struct timespec *ts)) {
+  auto result = internal::clock_gettimeimpl(clockid, ts);
+
+  // A negative return value indicates an error with the magnitude of the
+  // value being the error code.
+  if (!result.has_value()) {
+    libc_errno = result.error();
+    return -1;
+  }
+  return 0;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/time/gettimeofday.cpp b/libc/src/time/linux/gettimeofday.cpp
similarity index 58%
rename from libc/src/time/gettimeofday.cpp
rename to libc/src/time/linux/gettimeofday.cpp
index 8d44e630cc13b32..2df6429974164a9 100644
--- a/libc/src/time/gettimeofday.cpp
+++ b/libc/src/time/linux/gettimeofday.cpp
@@ -11,6 +11,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/errno/libc_errno.h"
+#include "src/time/linux/clockGetTimeImpl.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
@@ -21,26 +22,19 @@ LLVM_LIBC_FUNCTION(int, gettimeofday,
                    (struct timeval * tv, [[maybe_unused]] void *unused)) {
   if (tv == nullptr)
     return 0;
-  struct timespec tp;
-#if SYS_clock_gettime
-  int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime,
-                                           static_cast<long>(CLOCK_REALTIME),
-                                           reinterpret_cast<long>(&tp));
-#elif defined(SYS_clock_gettime64)
-  int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime64,
-                                           static_cast<long>(CLOCK_REALTIME),
-                                           reinterpret_cast<long>(&tp));
-#else
-#error "SYS_clock_gettime and SYS_clock_gettime64 syscalls not available."
-#endif
+
+  struct timespec ts;
+  auto result = internal::clock_gettimeimpl(CLOCK_REALTIME, &ts);
+
   // A negative return value indicates an error with the magnitude of the
   // value being the error code.
-  if (ret < 0) {
-    libc_errno = -ret;
+  if (!result.has_value()) {
+    libc_errno = result.error();
     return -1;
   }
-  tv->tv_sec = tp.tv_sec;
-  tv->tv_usec = static_cast<suseconds_t>(tp.tv_nsec / 1000);
+
+  tv->tv_sec = ts.tv_sec;
+  tv->tv_usec = static_cast<suseconds_t>(ts.tv_nsec / 1000);
   return 0;
 }
 
diff --git a/libc/src/time/linux/time.cpp b/libc/src/time/linux/time.cpp
index 34cca42adf5baf4..4dbcabc8963510a 100644
--- a/libc/src/time/linux/time.cpp
+++ b/libc/src/time/linux/time.cpp
@@ -11,6 +11,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/errno/libc_errno.h"
+#include "src/time/linux/clockGetTimeImpl.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 #include <time.h>
@@ -20,17 +21,9 @@ namespace __llvm_libc {
 LLVM_LIBC_FUNCTION(time_t, time, (time_t * tp)) {
   // TODO: Use the Linux VDSO to fetch the time and avoid the syscall.
   struct timespec ts;
-#if SYS_clock_gettime
-  int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime, CLOCK_REALTIME,
-                                           reinterpret_cast<long>(&ts));
-#elif defined(SYS_clock_gettime64)
-  int ret = __llvm_libc::syscall_impl<int>(SYS_clock_gettime64, CLOCK_REALTIME,
-                                           reinterpret_cast<long>(&ts));
-#else
-#error "SYS_clock_gettime and SYS_clock_gettime64 syscalls not available."
-#endif
-  if (ret < 0) {
-    libc_errno = -ret;
+  auto result = internal::clock_gettimeimpl(CLOCK_REALTIME, &ts);
+  if (!result.has_value()) {
+    libc_errno = result.error();
     return -1;
   }
 

>From 3cfdef37155d1f0e5b32abe7c84e8304ef77ca10 Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Sat, 9 Sep 2023 00:48:06 +0800
Subject: [PATCH 26/32] [Clang] Fix the for statement disappearing in AST when
 an error occurs in the conditional expression of the for statement (#65381)

Consider:
```
constexpr int f() {
    int sum = 0;
    for (int i = 0; undefined_var; ++i) {
        sum += i;
    }
    return sum;
}

static_assert(f());
```

The AST before this patch:
```
|-FunctionDecl <line:1:1, line:7:1> line:1:15 used constexpr f 'int ()' implicit-inline
| `-CompoundStmt <col:19, line:7:1>
|   |-DeclStmt <line:2:5, col:16>
|   | `-VarDecl <col:5, col:15> col:9 used sum 'int' cinit
|   |   `-IntegerLiteral <col:15> 'int' 0
|   `-ReturnStmt <line:6:5, col:12>
|     `-ImplicitCastExpr <col:12> 'int' <LValueToRValue>
|       `-DeclRefExpr <col:12> 'int' lvalue Var 0xb870518 'sum' 'int'
```

The AST after this patch:
```
|-FunctionDecl 0x11d0f63f8 <./main.cpp:1:1, line:7:1> line:1:15 used constexpr f 'int ()' implicit-inline
| `-CompoundStmt 0x11d110880 <col:19, line:7:1>
|   |-DeclStmt 0x11d0f65c8 <line:2:5, col:16>
|   | `-VarDecl 0x11d0f6528 <col:5, col:15> col:9 used sum 'int' cinit
|   |   `-IntegerLiteral 0x11d0f6590 <col:15> 'int' 0
|   |-ForStmt 0x11d110800 <line:3:5, line:5:5>
|   | |-DeclStmt 0x11d0f66a0 <line:3:10, col:19>
|   | | `-VarDecl 0x11d0f6600 <col:10, col:18> col:14 used i 'int' cinit
|   | |   `-IntegerLiteral 0x11d0f6668 <col:18> 'int' 0
|   | |-<<<NULL>>>
|   | |-RecoveryExpr 0x11d0f66e8 <col:21> 'bool' contains-errors
|   | |-UnaryOperator 0x11d0f6728 <col:36, col:38> 'int' lvalue prefix '++'
|   | | `-DeclRefExpr 0x11d0f6708 <col:38> 'int' lvalue Var 0x11d0f6600 'i' 'int'
|   | `-CompoundStmt 0x11d0f67c8 <col:41, line:5:5>
|   |   `-CompoundAssignOperator 0x11d0f6798 <line:4:9, col:16> 'int' lvalue '+=' ComputeLHSTy='int' ComputeResultTy='int'
|   |     |-DeclRefExpr 0x11d0f6740 <col:9> 'int' lvalue Var 0x11d0f6528 'sum' 'int'
|   |     `-ImplicitCastExpr 0x11d0f6780 <col:16> 'int' <LValueToRValue>
|   |       `-DeclRefExpr 0x11d0f6760 <col:16> 'int' lvalue Var 0x11d0f6600 'i' 'int'
|   `-ReturnStmt 0x11d110870 <line:6:5, col:12>
|     `-ImplicitCastExpr 0x11d110858 <col:12> 'int' <LValueToRValue>
|       `-DeclRefExpr 0x11d110838 <col:12> 'int' lvalue Var 0x11d0f6528 'sum' 'int'
```

---------

Co-authored-by: Shafik Yaghmour <shafik at users.noreply.github.com>
---
 clang/lib/Parse/ParseStmt.cpp                 | 19 +++++++++++++++++--
 clang/test/AST/ast-dump-recovery.cpp          | 15 +++++++++++++++
 .../constexpr-function-recovery-crash.cpp     |  1 +
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 110806ef0c77d63..fb883c08a745cfa 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -2158,11 +2158,13 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
         // for-range-declaration next.
         bool MightBeForRangeStmt = !ForRangeInfo.ParsedForRangeDecl();
         ColonProtectionRAIIObject ColonProtection(*this, MightBeForRangeStmt);
+        SourceLocation SecondPartStart = Tok.getLocation();
+        Sema::ConditionKind CK = Sema::ConditionKind::Boolean;
         SecondPart = ParseCXXCondition(
-            nullptr, ForLoc, Sema::ConditionKind::Boolean,
+            /*InitStmt=*/nullptr, ForLoc, CK,
             // FIXME: recovery if we don't see another semi!
             /*MissingOK=*/true, MightBeForRangeStmt ? &ForRangeInfo : nullptr,
-            /*EnterForConditionScope*/ true);
+            /*EnterForConditionScope=*/true);
 
         if (ForRangeInfo.ParsedForRangeDecl()) {
           Diag(FirstPart.get() ? FirstPart.get()->getBeginLoc()
@@ -2178,6 +2180,19 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
                 << FixItHint::CreateRemoval(EmptyInitStmtSemiLoc);
           }
         }
+
+        if (SecondPart.isInvalid()) {
+          ExprResult CondExpr = Actions.CreateRecoveryExpr(
+              SecondPartStart,
+              Tok.getLocation() == SecondPartStart ? SecondPartStart
+                                                   : PrevTokLocation,
+              {}, Actions.PreferredConditionType(CK));
+          if (!CondExpr.isInvalid())
+            SecondPart = Actions.ActOnCondition(getCurScope(), ForLoc,
+                                                CondExpr.get(), CK,
+                                                /*MissingOK=*/false);
+        }
+
       } else {
         // We permit 'continue' and 'break' in the condition of a for loop.
         getCurScope()->AddFlags(Scope::BreakScope | Scope::ContinueScope);
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index c882b4659a7310e..278b9fc000b5740 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -432,3 +432,18 @@ void RecoveryToDoWhileStmtCond() {
   // CHECK-NEXT:      `-IntegerLiteral {{.*}} 'int' 10
   do {} while (some_invalid_val + 1 < 10);
 }
+
+void RecoveryForStmtCond() {
+  // CHECK:FunctionDecl {{.*}} RecoveryForStmtCond
+  // CHECK-NEXT:`-CompoundStmt {{.*}}
+  // CHECK-NEXT:  `-ForStmt {{.*}}
+  // CHECK-NEXT:    |-DeclStmt {{.*}}
+  // CHECK-NEXT:    | `-VarDecl {{.*}}
+  // CHECK-NEXT:    |   `-IntegerLiteral {{.*}} <col:16> 'int' 0
+  // CHECK-NEXT:    |-<<<NULL>>>
+  // CHECK-NEXT:    |-RecoveryExpr {{.*}} 'bool' contains-errors
+  // CHECK-NEXT:    |-UnaryOperator {{.*}} 'int' lvalue prefix '++'
+  // CHECK-NEXT:    | `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'i' 'int'
+  // CHECK-NEXT:    `-CompoundStmt {{.*}}
+  for (int i = 0; i < invalid; ++i) {}
+}
diff --git a/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp b/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp
index e1d97ceafbe9d15..90ee7892b2fc2e7 100644
--- a/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp
+++ b/clang/test/SemaCXX/constexpr-function-recovery-crash.cpp
@@ -106,3 +106,4 @@ TEST_EVALUATE(ForCond, for (; !!;){};);// expected-error + {{}}
 TEST_EVALUATE(ForInc, for (;; !!){};);// expected-error + {{}}
                                       // expected-note at -1 + {{infinite loop}}
                                       // expected-note at -2 {{in call}}
+TEST_EVALUATE(ForCondUnDef, for (;some_cond;){};);        // expected-error + {{}}

>From b4ee25025bcce533abda3d73e51614e9760d88f2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar at redhat.com>
Date: Fri, 8 Sep 2023 09:48:26 -0700
Subject: [PATCH 27/32] Fix up some yaml errors after
 5f16a3a489bd1730db49f635604bc1d833abed15

---
 .github/new-prs-labeler.yml | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index bc4a30a4802a7c3..96e39fdf73e520f 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -18,11 +18,6 @@ clang:frontend:
 clang:headers:
   - clang/lib/Headers/**/*
 
-clang:static analyzer:
-  - clang/**/StaticAnalyzer/**/*
-  - clang/utils/analyzer/**/*
-  - clang/docs/analyzer/**/*
-
 compiler-rt:
   - compiler-rt/**/*
 
@@ -39,21 +34,12 @@ HLSL:
   - clang/**/*HLSL*
   - llvm/**/Frontend/HLSL/**/*
 
-libc++:
-  - libcxx/**/*
-
-libc++-abi:
-  - libcxxabi/**/*
-
 lld:
   - lld/**/*
 
 llvm-lit:
   - llvm/utils/lit/**/*
 
-mc:
-  - llvm/**/MC/**
-
 mlir:afine:
   - mlir/**/Affine/**/*
 
@@ -63,14 +49,6 @@ mlir:python:
 mlir:vectorops:
   - mlir/**/Vector/**/*
 
-tools:llvm-exegesis:
-  - '**/llvm-exegesis/**/*'
-
-tools:llvm-mca:
-  - llvm/tools/llvm-mca/**/*
-  - llvm/include/llvm/MCA/**/*
-  - llvm/lib/MCA/**/*
-
 PGO:
   - llvm/lib/Transforms/Instrumentation/CGProfile.cpp
   - llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -725,4 +703,3 @@ backend:X86:
   - /llvm/include/llvm/TargetParser/X86*
   - /llvm/lib/TargetParser/X86*
   - /llvm/utils/TableGen/X86*
-

>From 6b856abc6fec04b523c17aab96f9877f40e7b7ab Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy at fb.com>
Date: Fri, 8 Sep 2023 09:49:54 -0700
Subject: [PATCH 28/32] [PseudoProbe] Use probe id as the base dwarf
 discriminator for callsites (#65685)

With `-fpseudo-probe-for-profiling`, the dwarf discriminator for a
callsite will be overwritten to pseudo probe related information for
that callsite. The probe information is encoded in a special format
(i.e., with all lowest three digits be one) in order to be distinguished
from regular dwarf discriminator. The special encoding format will be
decoded to zero by the regular discriminator logic. This means all
callsites would have a zero discriminator in both the sample profile and
the compiler, for classic AutoFDO. This is inconvenient in that no
decent classic AutoFDO profile can be generated from a pseudo probe
build. I'm mitigating the issue by allowing the callsite probe id to be used as the
base dwarf discriminator for classic AutoFDO, since probe id is also
unique and can be used to differentiate callsites on the same source
line.
---
 llvm/include/llvm/IR/DebugInfoMetadata.h      |  9 ++
 .../SampleProfile/pseudo-probe-inline.ll      |  2 +
 .../tools/llvm-profgen/inline-probe-afdo.test | 82 +++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 llvm/test/tools/llvm-profgen/inline-probe-afdo.test

diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index b9f6d39a7491566..9beb514b87125af 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PseudoProbe.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
@@ -2075,6 +2076,14 @@ class DILocation : public MDNode {
   static unsigned
   getBaseDiscriminatorFromDiscriminator(unsigned D,
                                         bool IsFSDiscriminator = false) {
+    // Return the probe id instead of zero for a pseudo probe discriminator.
+    // This should help differentiate callsites with same line numbers to
+    // achieve a decent AutoFDO profile under -fpseudo-probe-for-profiling,
+    // where the original callsite dwarf discriminator is overwritten by
+    // callsite probe information.
+    if (isPseudoProbeDiscriminator(D))
+      return PseudoProbeDwarfDiscriminator::extractProbeIndex(D);
+
     if (IsFSDiscriminator)
       return getMaskedDiscriminator(D, getBaseDiscriminatorBits());
     return getUnsignedFromPrefixEncoding(D);
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
index 8157bebdc3777be..18cbd857d97bb2c 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -106,6 +106,8 @@ if.end:
 ;YAML-NEXT:    - Line:            '1'
 ;YAML-NEXT:    - String:          ':'
 ;YAML-NEXT:    - Column:          '11'
+;YAML-NEXT:    - String:          .
+;YAML-NEXT:    - Disc:            '2'
 ;YAML-NEXT:    - String:          ';'
 ;YAML-NEXT:  ...
 ;YAML:  --- !Analysis
diff --git a/llvm/test/tools/llvm-profgen/inline-probe-afdo.test b/llvm/test/tools/llvm-profgen/inline-probe-afdo.test
new file mode 100644
index 000000000000000..763bf3b0f8c394a
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/inline-probe-afdo.test
@@ -0,0 +1,82 @@
+; RUN: llvm-profgen --format=text --use-dwarf-correlation --ignore-stack-samples --perfscript=%S/Inputs/cs-preinline-cost.perfscript --binary=%S/Inputs/cs-preinline-cost.perfbin --output %t
+; RUN: FileCheck %s --input-file %t
+
+; CHECK:     main:947937:0
+; CHECK-NEXT:  2: 545
+; CHECK-NEXT:  3: 545
+; CHECK-NEXT:  5: 545
+; CHECK-NEXT:  7: 0
+; CHECK-NEXT:  65496: 545
+; CHECK-NEXT:  3.7: _Z3fooi:915794
+; CHECK-NEXT:   1: 545
+; CHECK-NEXT:   5: 545
+; CHECK-NEXT:   6: 272
+; CHECK-NEXT:   10: 273
+; CHECK-NEXT:   11: 180
+; CHECK-NEXT:   12: 6965
+; CHECK-NEXT:   13: 6965
+; CHECK-NEXT:   14: 6965
+; CHECK-NEXT:   15: 6965
+; CHECK-NEXT:   20: 182
+; CHECK-NEXT:   21: 6958
+; CHECK-NEXT:   22: 6958
+; CHECK-NEXT:   23: 6958
+; CHECK-NEXT:   24: 6958
+; CHECK-NEXT:   29: 272
+; CHECK-NEXT:   65529: 182
+; CHECK-NEXT:  4.8: _Z3fooi:16338
+; CHECK-NEXT:   1: 272
+; CHECK-NEXT:   6: 545
+
+
+
+
+; binary is built with the source below using the following command line:
+;   clang -O3 -g -fpseudo-probe-for-profiling test.cpp
+;
+;#include <stdio.h>
+;
+;volatile int state = 9000;
+;
+;int foo(int x) {
+;    if (x == 0) {
+;        return 7;
+;    }
+;
+;    if ((x & 1) == 0) {
+;        state--;
+;        return 9;
+;    }
+;
+;    if (state > 5000) {
+;        while (state > 5000) {
+;               for (int i = 50; i >= 0; i--) {
+;                state *= 6;
+;                state /= 7;
+;                state -= 1;
+;            }
+;        }
+;    }
+;    else {
+;        while (state < 5000) {
+;            for (int i = 50; i >= 0; i--) {
+;                state *= 6;
+;                state /= 5;
+;                state += 1;
+;            }
+;        }
+;    }
+;
+;    return state;
+;}
+;
+;volatile int cnt = 10000000;//10000000;
+;int main() {
+;    int r = 0;
+;    for (int i = 0; i < cnt; i++) {
+;      r += foo(i);
+;      r -= foo(i & (~1));
+;      r += foo(0);
+;    }
+;    return r;
+;}

>From 8929f3832049a60af7a3695377d1cac98bba0dec Mon Sep 17 00:00:00 2001
From: Shraiysh <Shraiysh.Vaishay at amd.com>
Date: Fri, 8 Sep 2023 11:54:52 -0500
Subject: [PATCH 29/32] [nfc][OpenMPIRBuilder] Formatting OMPIRBuilder.cpp and
 OMPIRBuilder.h (#65772)

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  4 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 44 +++++++++----------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index ea1035f1907e492..b6460267c08aaa1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1159,8 +1159,8 @@ class OpenMPIRBuilder {
                                 InsertPointTy AllocaIP,
                                 BodyGenCallbackTy BodyGenCB);
 
-
-  using FileIdentifierInfoCallbackTy = std::function<std::tuple<std::string, uint64_t>()>;
+  using FileIdentifierInfoCallbackTy =
+      std::function<std::tuple<std::string, uint64_t>()>;
 
   /// Creates a unique info for a target entry when provided a filename and
   /// line number from.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2cfb36d11dcf898..6cab70df269c170 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -390,9 +390,9 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
       if (Param) {
         if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
           FnAS = FnAS.addAttribute(Ctx, AK);
-      } else
-        if (auto AK = TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
-          FnAS = FnAS.addAttribute(Ctx, AK);
+      } else if (auto AK =
+                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
+        FnAS = FnAS.addAttribute(Ctx, AK);
     } else {
       FnAS = FnAS.addAttributes(Ctx, AS);
     }
@@ -406,7 +406,7 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
 #define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
   case Enum:                                                                   \
     FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
-    addAttrSet(RetAttrs, RetAttrSet, /*Param*/false);                          \
+    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
     for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
       addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
     Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
@@ -4927,8 +4927,8 @@ void OpenMPIRBuilder::emitOffloadingArrays(
             static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                 CombinedInfo.Types[I] &
                 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
-          ConstSizes[I] = ConstantInt::get(Int64Ty,
-                                           CombinedInfo.NonContigInfo.Dims[I]);
+          ConstSizes[I] =
+              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
         else
           ConstSizes[I] = CI;
         continue;
@@ -4991,8 +4991,8 @@ void OpenMPIRBuilder::emitOffloadingArrays(
         createOffloadMapnames(CombinedInfo.Names, MapnamesName);
     Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
   } else {
-    Info.RTArgs.MapNamesArray = Constant::getNullValue(
-        PointerType::getUnqual(Builder.getContext()));
+    Info.RTArgs.MapNamesArray =
+        Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
   }
 
   // If there's a present map type modifier, it must not be applied to the end
@@ -5017,10 +5017,10 @@ void OpenMPIRBuilder::emitOffloadingArrays(
   for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
     Value *BPVal = CombinedInfo.BasePointers[I];
     Value *BP = Builder.CreateConstInBoundsGEP2_32(
-        ArrayType::get(PtrTy, Info.NumberOfPtrs),
-        Info.RTArgs.BasePointersArray, 0, I);
-    Builder.CreateAlignedStore(
-        BPVal, BP, M.getDataLayout().getPrefTypeAlign(PtrTy));
+        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
+        0, I);
+    Builder.CreateAlignedStore(BPVal, BP,
+                               M.getDataLayout().getPrefTypeAlign(PtrTy));
 
     if (Info.requiresDevicePointerInfo()) {
       if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
@@ -5039,21 +5039,21 @@ void OpenMPIRBuilder::emitOffloadingArrays(
 
     Value *PVal = CombinedInfo.Pointers[I];
     Value *P = Builder.CreateConstInBoundsGEP2_32(
-        ArrayType::get(PtrTy, Info.NumberOfPtrs),
-        Info.RTArgs.PointersArray, 0, I);
+        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
+        I);
     // TODO: Check alignment correct.
-    Builder.CreateAlignedStore(
-        PVal, P, M.getDataLayout().getPrefTypeAlign(PtrTy));
+    Builder.CreateAlignedStore(PVal, P,
+                               M.getDataLayout().getPrefTypeAlign(PtrTy));
 
     if (RuntimeSizes.test(I)) {
       Value *S = Builder.CreateConstInBoundsGEP2_32(
           ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
           /*Idx0=*/0,
           /*Idx1=*/I);
-      Builder.CreateAlignedStore(
-          Builder.CreateIntCast(CombinedInfo.Sizes[I], Int64Ty,
-                                /*isSigned=*/true),
-          S, M.getDataLayout().getPrefTypeAlign(PtrTy));
+      Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
+                                                       Int64Ty,
+                                                       /*isSigned=*/true),
+                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
     }
     // Fill up the mapper array.
     unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
@@ -5655,8 +5655,8 @@ GlobalVariable *
 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                                        std::string VarName) {
   llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
-      llvm::ArrayType::get(
-          llvm::PointerType::getUnqual(M.getContext()), Names.size()),
+      llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
+                           Names.size()),
       Names);
   auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
       M, MapNamesArrayInit->getType(),

>From db08f22bfefac9ed7028cb65bd53d5ddcd119a17 Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <shraiysh.vaishay at amd.com>
Date: Thu, 7 Sep 2023 15:45:52 -0500
Subject: [PATCH 30/32] [OpenMPIRBuilder] Added `createTeams`

This patch adds a generator for the teams construct. The generated IR looks like the following:

```
current_fn() {
  ...
  call @__kmpc_fork_teams(ptr @ident, i32 num_args, ptr @outlined_omp_teams, ...args)
  ...
}
outlined_omp_teams(ptr %global_tid, ptr %bound_tid, ...args) {
  ; teams body
}
```

It does this by first generating the body in the current function. Then we outline the
body in a temporary function. We then create the @outlined_omp_teams function and embed
the temporary outlined function in this function. We then emit the call to runtime
function.
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   7 +
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 144 ++++++++++++++++++
 .../Frontend/OpenMPIRBuilderTest.cpp          |  73 +++++++++
 3 files changed, 224 insertions(+)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index b6460267c08aaa1..d26ac60939031ec 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2005,6 +2005,13 @@ class OpenMPIRBuilder {
   /// \param Loc The insert and source location description.
   void createTargetDeinit(const LocationDescription &Loc);
 
+  /// Generator for `#omp teams`
+  ///
+  /// \param Loc The location where the task construct was encountered.
+  /// \param BodyGenCB Callback that will generate the region code.
+  InsertPointTy createTeams(const LocationDescription &Loc,
+                            BodyGenCallbackTy BodyGenCB);
+
   ///}
 
 private:
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6cab70df269c170..0f2203f0c1ac84c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -36,10 +36,12 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Target/TargetMachine.h"
@@ -6106,6 +6108,148 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
   }
 }
 
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
+                             BodyGenCallbackTy BodyGenCB) {
+  if (!updateToLocation(Loc)) {
+    return Loc.IP;
+  }
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+  // Splitting a basic block expects a terminator. Hence, creating an
+  // unreachable instruction, which will be deleted later.
+  UnreachableInst *UI = Builder.CreateUnreachable();
+  BasicBlock *CurrentBasicBlock = Builder.GetInsertBlock();
+
+  // The current basic block is split into four basic blocks. After outlining,
+  // they will be mapped as follows:
+  // ```
+  // def current_fn() {
+  //   current_basic_block:
+  //     br label %teams.exit
+  //   teams.exit:
+  //     ; instructions after task
+  // }
+  // def outlined_fn() {
+  //   teams.alloca:
+  //     br label %teams.body
+  //   teams.body:
+  //     ; instructions within teams body
+  // }
+  // ```
+  BasicBlock *AllocaBB = CurrentBasicBlock->splitBasicBlock(UI, "teams.alloca");
+  BasicBlock *BodyBB = AllocaBB->splitBasicBlock(UI, "teams.body");
+  BasicBlock *ExitBB = BodyBB->splitBasicBlock(UI, "teams.exit");
+
+  UI->eraseFromParent();
+
+  // Generate the body of teams.
+  InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
+  InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
+  BodyGenCB(AllocaIP, CodeGenIP);
+
+  OutlineInfo OI;
+  OI.EntryBB = AllocaBB;
+  OI.ExitBB = ExitBB;
+  OI.PostOutlineCB = [this, Ident](Function &OutlinedFn) {
+    // The input IR here looks like the following-
+    // ```
+    // func @current_fn() {
+    //   outlined_fn(%args)
+    // }
+    // func @outlined_fn(%args) {
+    //   ; teams body
+    // }
+    // ```
+    //
+    // This is changed to the following-
+    //
+    // ```
+    // func @current_fn() {
+    //   runtime_call(..., wrapper_fn, ...)
+    // }
+    // func @wrapper_fn(..., %args) {
+    //   ; teams body
+    // }
+    // ```
+
+    // The outlined function has different inputs than what is expected from it.
+    // So, a wrapper function with expected signature is created and the
+    // required arguments are passed to the outlined function. The stale call
+    // instruction in current function will be replaced with a new call
+    // instruction for runtime call with the wrapper function. The outlined
+    // function is then inlined in the wrapper function and the call from the
+    // current function is removed.
+
+    assert(OutlinedFn.getNumUses() == 1 &&
+           "there must be a single user for the outlined function");
+    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+    assert(StaleCI && "Error while outlining - no CallInst user found for the "
+                      "outlined function.");
+    OutlinedFn.addFnAttr(Attribute::AttrKind::AlwaysInline);
+
+    // Create the wrapper function.
+    Builder.SetInsertPoint(StaleCI);
+    SmallVector<Type *> WrapperArgTys{Builder.getPtrTy(), Builder.getPtrTy()};
+    for (auto &Arg : OutlinedFn.args()) {
+      WrapperArgTys.push_back(Arg.getType());
+    }
+    FunctionCallee WrapperFuncVal = M.getOrInsertFunction(
+        "outlined_omp_teams",
+        FunctionType::get(Builder.getVoidTy(), WrapperArgTys, false));
+    Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee());
+    WrapperFunc->getArg(0)->setName("global_tid");
+    WrapperFunc->getArg(1)->setName("bound_tid");
+    WrapperFunc->getArg(2)->setName("data");
+
+    // Emit the body of the wrapper function - just a call to outlined function
+    // and return statement.
+    BasicBlock *WrapperEntryBB =
+        BasicBlock::Create(M.getContext(), "entrybb", WrapperFunc);
+    Builder.SetInsertPoint(WrapperEntryBB);
+    SmallVector<Value *> Args;
+    for (size_t ArgIndex = 2; ArgIndex < WrapperFunc->arg_size(); ArgIndex++) {
+      Args.push_back(WrapperFunc->getArg(ArgIndex));
+    }
+    CallInst *OutlinedFnCall = Builder.CreateCall(&OutlinedFn, Args);
+    Builder.CreateRetVoid();
+
+    // Call to the runtime function for teams in the current function.
+    Builder.SetInsertPoint(StaleCI);
+    Args = {Ident, Builder.getInt32(StaleCI->arg_size()), WrapperFunc};
+    for (Use &Arg : StaleCI->args()) {
+      Args.push_back(Arg);
+    }
+    Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
+                           omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
+                       Args);
+    StaleCI->eraseFromParent();
+
+    // Inlining the outlined teams function in the wrapper. This wrapper is the
+    // argument for the runtime call.
+    assert(OutlinedFn.getNumUses() == 1 &&
+           "More than one use for the outlined function found. Expected only "
+           "one use.");
+    InlineFunctionInfo IFI;
+    InlineResult IR = InlineFunction(*OutlinedFnCall, IFI);
+    LLVM_DEBUG(if (!IR.isSuccess()) {
+      dbgs() << "Attempt to merge the outlined function in the wrapper failed: "
+             << IR.getFailureReason() << "\n";
+    });
+    assert(IR.isSuccess() && "Inlining outlined omp teams failed");
+    OutlinedFn.eraseFromParent();
+  };
+
+  addOutlineInfo(std::move(OI));
+
+  Builder.SetInsertPoint(ExitBB);
+
+  return Builder.saveIP();
+}
+
 bool OffloadEntriesInfoManager::empty() const {
   return OffloadEntriesTargetRegion.empty() &&
          OffloadEntriesDeviceGlobalVar.empty();
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 5870457956b5433..b189559b3430461 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -6033,4 +6033,77 @@ TEST_F(OpenMPIRBuilderTest, createGPUOffloadEntry) {
   EXPECT_TRUE(Fn->hasFnAttribute(Attribute::MustProgress));
 }
 
+TEST_F(OpenMPIRBuilderTest, createTeams) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  AllocaInst *ValPtr32 = Builder.CreateAlloca(Builder.getInt32Ty());
+  AllocaInst *ValPtr128 = Builder.CreateAlloca(Builder.getInt128Ty());
+  Value *Val128 =
+      Builder.CreateLoad(Builder.getInt128Ty(), ValPtr128, "bodygen.load");
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    Builder.restoreIP(AllocaIP);
+    AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr,
+                                                "bodygen.alloca128");
+
+    Builder.restoreIP(CodeGenIP);
+    // Loading and storing captured pointer and values
+    Builder.CreateStore(Val128, Local128);
+    Value *Val32 = Builder.CreateLoad(ValPtr32->getAllocatedType(), ValPtr32,
+                                      "bodygen.load32");
+
+    LoadInst *PrivLoad128 = Builder.CreateLoad(
+        Local128->getAllocatedType(), Local128, "bodygen.local.load128");
+    Value *Cmp = Builder.CreateICmpNE(
+        Val32, Builder.CreateTrunc(PrivLoad128, Val32->getType()));
+    Instruction *ThenTerm, *ElseTerm;
+    SplitBlockAndInsertIfThenElse(Cmp, CodeGenIP.getBlock()->getTerminator(),
+                                  &ThenTerm, &ElseTerm);
+  };
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+  Builder.restoreIP(OMPBuilder.createTeams(Builder, BodyGenCB));
+  OMPBuilder.finalize();
+  Builder.CreateRetVoid();
+
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+
+  CallInst *TeamsForkCall = dyn_cast<CallInst>(
+      OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_teams)
+          ->user_back());
+
+  // Verify the Ident argument
+  GlobalVariable *Ident = cast<GlobalVariable>(TeamsForkCall->getArgOperand(0));
+  ASSERT_NE(Ident, nullptr);
+  EXPECT_TRUE(Ident->hasInitializer());
+  Constant *Initializer = Ident->getInitializer();
+  GlobalVariable *SrcStrGlob =
+      cast<GlobalVariable>(Initializer->getOperand(4)->stripPointerCasts());
+  ASSERT_NE(SrcStrGlob, nullptr);
+  ConstantDataArray *SrcSrc =
+      dyn_cast<ConstantDataArray>(SrcStrGlob->getInitializer());
+  ASSERT_NE(SrcSrc, nullptr);
+
+  // Verify the outlined function signature.
+  Function *OutlinedFn =
+      dyn_cast<Function>(TeamsForkCall->getArgOperand(2)->stripPointerCasts());
+  ASSERT_NE(OutlinedFn, nullptr);
+  EXPECT_FALSE(OutlinedFn->isDeclaration());
+  EXPECT_TRUE(OutlinedFn->arg_size() >= 3);
+  EXPECT_EQ(OutlinedFn->getArg(0)->getType(), Builder.getPtrTy()); // global_tid
+  EXPECT_EQ(OutlinedFn->getArg(1)->getType(), Builder.getPtrTy()); // bound_tid
+  EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
+            Builder.getPtrTy()); // captured args
+
+  // Check for TruncInst and ICmpInst in the outlined function.
+  EXPECT_TRUE(any_of(instructions(OutlinedFn),
+                     [](Instruction &inst) { return isa<TruncInst>(&inst); }));
+  EXPECT_TRUE(any_of(instructions(OutlinedFn),
+                     [](Instruction &inst) { return isa<ICmpInst>(&inst); }));
+}
+
 } // namespace

>From a1aa308c4e7bc0f7c8a266fd937ba7b2fd66609f Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <shraiysh.vaishay at amd.com>
Date: Fri, 8 Sep 2023 10:56:17 -0500
Subject: [PATCH 31/32] Addressed comments

---
 llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp        | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d26ac60939031ec..037e0a5662be5bb 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2007,7 +2007,7 @@ class OpenMPIRBuilder {
 
   /// Generator for `#omp teams`
   ///
-  /// \param Loc The location where the task construct was encountered.
+  /// \param Loc The location where the teams construct was encountered.
   /// \param BodyGenCB Callback that will generate the region code.
   InsertPointTy createTeams(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0f2203f0c1ac84c..a873d95366bc9a6 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6111,9 +6111,8 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB) {
-  if (!updateToLocation(Loc)) {
+  if (!updateToLocation(Loc))
     return Loc.IP;
-  }
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
@@ -6198,7 +6197,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
       WrapperArgTys.push_back(Arg.getType());
     }
     FunctionCallee WrapperFuncVal = M.getOrInsertFunction(
-        "outlined_omp_teams",
+        (Twine(OutlinedFn.getName()) + ".teams").str(),
         FunctionType::get(Builder.getVoidTy(), WrapperArgTys, false));
     Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee());
     WrapperFunc->getArg(0)->setName("global_tid");

>From 3f3d50a76c06e8ac6a3550b33d0622ae0e26c4ce Mon Sep 17 00:00:00 2001
From: Shraiysh Vaishay <shraiysh.vaishay at amd.com>
Date: Fri, 8 Sep 2023 10:58:50 -0500
Subject: [PATCH 32/32] Replace task -> teams

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a873d95366bc9a6..136590989f99c44 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6130,7 +6130,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
   //   current_basic_block:
   //     br label %teams.exit
   //   teams.exit:
-  //     ; instructions after task
+  //     ; instructions after teams
   // }
   // def outlined_fn() {
   //   teams.alloca:



More information about the flang-commits mailing list